Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r--  src/third_party/wiredtiger/src/async/async_api.c  604
-rw-r--r--  src/third_party/wiredtiger/src/async/async_op.c  359
-rw-r--r--  src/third_party/wiredtiger/src/async/async_worker.c  359
-rw-r--r--  src/third_party/wiredtiger/src/block/block_addr.c  202
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ckpt.c  842
-rw-r--r--  src/third_party/wiredtiger/src/block/block_compact.c  221
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ext.c  1437
-rw-r--r--  src/third_party/wiredtiger/src/block/block_map.c  65
-rw-r--r--  src/third_party/wiredtiger/src/block/block_mgr.c  433
-rw-r--r--  src/third_party/wiredtiger/src/block/block_open.c  330
-rw-r--r--  src/third_party/wiredtiger/src/block/block_read.c  212
-rw-r--r--  src/third_party/wiredtiger/src/block/block_session.c  305
-rw-r--r--  src/third_party/wiredtiger/src/block/block_slvg.c  190
-rw-r--r--  src/third_party/wiredtiger/src/block/block_vrfy.c  514
-rw-r--r--  src/third_party/wiredtiger/src/block/block_write.c  269
-rw-r--r--  src/third_party/wiredtiger/src/bloom/bloom.c  351
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_compact.c  215
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c  468
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c  560
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_cursor.c  1025
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c  1104
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_delete.c  339
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_discard.c  422
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_evict.c  1297
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_handle.c  770
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_huffman.c  340
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_io.c  304
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_misc.c  128
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ovfl.c  270
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_page.c  734
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_read.c  88
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ret.c  116
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_slvg.c  2520
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_stat.c  190
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_sync.c  373
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_upgrade.c  22
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy.c  666
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c  739
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_walk.c  285
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_modify.c  223
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_srch.c  199
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_evict.c  468
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_split.c  1121
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_track.c  904
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_write.c  5521
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_key.c  500
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_modify.c  346
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c  553
-rw-r--r--  src/third_party/wiredtiger/src/config/config.c  745
-rw-r--r--  src/third_party/wiredtiger/src/config/config_api.c  105
-rw-r--r--  src/third_party/wiredtiger/src/config/config_check.c  370
-rw-r--r--  src/third_party/wiredtiger/src/config/config_collapse.c  380
-rw-r--r--  src/third_party/wiredtiger/src/config/config_concat.c  71
-rw-r--r--  src/third_party/wiredtiger/src/config/config_def.c  744
-rw-r--r--  src/third_party/wiredtiger/src/config/config_ext.c  44
-rw-r--r--  src/third_party/wiredtiger/src/config/config_upgrade.c  32
-rw-r--r--  src/third_party/wiredtiger/src/conn/api_strerror.c  43
-rw-r--r--  src/third_party/wiredtiger/src/conn/api_version.c  24
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c  1573
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache.c  174
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache_pool.c  639
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_ckpt.c  228
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_dhandle.c  694
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_handle.c  142
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_log.c  284
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_open.c  244
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_stat.c  540
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_sweep.c  187
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_backup.c  540
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_bulk.c  287
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_config.c  65
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_ds.c  524
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_dump.c  400
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_file.c  471
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_index.c  447
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_json.c  931
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_log.c  380
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_metadata.c  444
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_stat.c  574
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_std.c  625
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_table.c  808
-rw-r--r--  src/third_party/wiredtiger/src/include/api.h  128
-rw-r--r--  src/third_party/wiredtiger/src/include/async.h  128
-rw-r--r--  src/third_party/wiredtiger/src/include/bitstring.i  316
-rw-r--r--  src/third_party/wiredtiger/src/include/block.h  337
-rw-r--r--  src/third_party/wiredtiger/src/include/bloom.h  28
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h  1015
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.h  155
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.i  1216
-rw-r--r--  src/third_party/wiredtiger/src/include/buf.i  133
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.h  139
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.i  174
-rw-r--r--  src/third_party/wiredtiger/src/include/cell.i  816
-rw-r--r--  src/third_party/wiredtiger/src/include/column.i  201
-rw-r--r--  src/third_party/wiredtiger/src/include/compact.h  12
-rw-r--r--  src/third_party/wiredtiger/src/include/config.h  85
-rw-r--r--  src/third_party/wiredtiger/src/include/connection.h  270
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.h  380
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.i  277
-rw-r--r--  src/third_party/wiredtiger/src/include/dhandle.h  73
-rw-r--r--  src/third_party/wiredtiger/src/include/dlh.h  15
-rw-r--r--  src/third_party/wiredtiger/src/include/error.h  141
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h  650
-rw-r--r--  src/third_party/wiredtiger/src/include/flags.h  88
-rw-r--r--  src/third_party/wiredtiger/src/include/gcc.h  152
-rw-r--r--  src/third_party/wiredtiger/src/include/hardware.h  60
-rw-r--r--  src/third_party/wiredtiger/src/include/intpack.i  371
-rw-r--r--  src/third_party/wiredtiger/src/include/lint.h  56
-rw-r--r--  src/third_party/wiredtiger/src/include/log.h  177
-rw-r--r--  src/third_party/wiredtiger/src/include/lsm.h  232
-rw-r--r--  src/third_party/wiredtiger/src/include/meta.h  58
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.h  221
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.i  32
-rw-r--r--  src/third_party/wiredtiger/src/include/msvc.h  70
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.h  73
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.i  368
-rw-r--r--  src/third_party/wiredtiger/src/include/os.h  72
-rw-r--r--  src/third_party/wiredtiger/src/include/os_windows.h  60
-rw-r--r--  src/third_party/wiredtiger/src/include/packing.i  685
-rw-r--r--  src/third_party/wiredtiger/src/include/posix.h  47
-rw-r--r--  src/third_party/wiredtiger/src/include/queue.h  559
-rw-r--r--  src/third_party/wiredtiger/src/include/schema.h  101
-rw-r--r--  src/third_party/wiredtiger/src/include/serial.i  329
-rw-r--r--  src/third_party/wiredtiger/src/include/session.h  156
-rw-r--r--  src/third_party/wiredtiger/src/include/stat.h  332
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h  139
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.i  382
-rw-r--r--  src/third_party/wiredtiger/src/include/verify_build.h  75
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger.in  3463
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger_ext.h  398
-rw-r--r--  src/third_party/wiredtiger/src/include/wt_internal.h  337
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c  1243
-rw-r--r--  src/third_party/wiredtiger/src/log/log_auto.c  437
-rw-r--r--  src/third_party/wiredtiger/src/log/log_slot.c  354
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c  1519
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_manager.c  667
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_merge.c  489
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_meta.c  238
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_stat.c  162
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_tree.c  1266
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_work_unit.c  625
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_worker.c  167
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_apply.c  62
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ckpt.c  528
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ext.c  103
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_table.c  206
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_track.c  365
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_turtle.c  318
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_abort.c  26
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_alloc.c  238
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_dir.c  94
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_dlopen.c  83
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_errno.c  22
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_exist.c  37
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fallocate.c  97
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_filesize.c  55
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_flock.c  37
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fsync.c  54
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_ftruncate.c  26
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_getline.c  48
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_getopt.c  150
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_map.c  136
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c  157
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c  227
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_once.c  20
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_open.c  253
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_path.c  28
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_priv.c  19
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_remove.c  66
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_rename.c  38
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_rw.c  86
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_sleep.c  23
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_strtouq.c  24
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_thread.c  59
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_time.c  53
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_yield.c  18
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_dir.c  111
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_dlopen.c  86
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_errno.c  27
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_exist.c  32
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fallocate.c  53
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_filesize.c  56
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_flock.c  46
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fsync.c  40
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_ftruncate.c  40
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_map.c  106
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_mtx_cond.c  155
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_mtx_rw.c  126
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_once.c  39
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_open.c  219
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_path.c  34
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_priv.c  19
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_remove.c  68
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_rename.c  51
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_rw.c  98
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_sleep.c  18
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_thread.c  51
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_time.c  62
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_vsnprintf.c  31
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_yield.c  18
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_api.c  137
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_impl.c  96
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_stream.c  296
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_create.c  595
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_drop.c  204
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_list.c  204
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_open.c  510
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_plan.c  394
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_project.c  474
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_rename.c  276
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_stat.c  114
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_truncate.c  183
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_util.c  84
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_worker.c  134
-rw-r--r--  src/third_party/wiredtiger/src/session/session_api.c  1054
-rw-r--r--  src/third_party/wiredtiger/src/session/session_compact.c  236
-rw-r--r--  src/third_party/wiredtiger/src/session/session_dhandle.c  478
-rw-r--r--  src/third_party/wiredtiger/src/session/session_salvage.c  58
-rw-r--r--  src/third_party/wiredtiger/src/support/cksum.c  1306
-rw-r--r--  src/third_party/wiredtiger/src/support/err.c  527
-rw-r--r--  src/third_party/wiredtiger/src/support/filename.c  49
-rw-r--r--  src/third_party/wiredtiger/src/support/global.c  118
-rw-r--r--  src/third_party/wiredtiger/src/support/hash_city.c  323
-rw-r--r--  src/third_party/wiredtiger/src/support/hash_fnv.c  161
-rw-r--r--  src/third_party/wiredtiger/src/support/hazard.c  244
-rw-r--r--  src/third_party/wiredtiger/src/support/hex.c  215
-rw-r--r--  src/third_party/wiredtiger/src/support/huffman.c  899
-rw-r--r--  src/third_party/wiredtiger/src/support/mutex.c  257
-rw-r--r--  src/third_party/wiredtiger/src/support/pow.c  130
-rw-r--r--  src/third_party/wiredtiger/src/support/rand.c  69
-rw-r--r--  src/third_party/wiredtiger/src/support/scratch.c  319
-rw-r--r--  src/third_party/wiredtiger/src/support/stat.c  567
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn.c  554
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ckpt.c  944
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ext.c  104
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_log.c  500
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_recover.c  491
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util.h  50
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_backup.c  205
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_compact.c  59
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_cpyright.c  35
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_create.c  53
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_drop.c  50
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_dump.c  701
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_list.c  193
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load.c  595
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load.h  27
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load_json.c  573
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_loadtext.c  157
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_main.c  262
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_misc.c  146
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_printlog.c  65
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_read.c  101
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_rename.c  60
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_salvage.c  68
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_stat.c  103
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_upgrade.c  63
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_verbose.c  62
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_verify.c  119
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_write.c  107
260 files changed, 88711 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
new file mode 100644
index 00000000000..3cb78e80b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -0,0 +1,604 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_get_format --
+ * Find or allocate the uri/config/format structure.
+ */
+static int
+__async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
+ const char *config, WT_ASYNC_OP_IMPL *op)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_FORMAT *af;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+ uint64_t cfg_hash, uri_hash;
+
+ async = conn->async;
+ c = NULL;
+ op->format = NULL;
+
+ if (uri != NULL)
+ uri_hash = __wt_hash_city64(uri, strlen(uri));
+ else
+ uri_hash = 0;
+ if (config != NULL)
+ cfg_hash = __wt_hash_city64(config, strlen(config));
+ else
+ cfg_hash = 0;
+
+ /*
+ * We don't need to hold a lock around this walk. The list is
+ * permanent and always valid. We may race with an insert, in which
+ * case a duplicate entry can end up on the list, but that is
+ * harmless.
+ */
+ STAILQ_FOREACH(af, &async->formatqh, q) {
+ if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash)
+ goto setup;
+ }
+ /*
+ * We didn't find one in the cache. Allocate and initialize one.
+ * Insert it at the head expecting LRU usage. We need a real session
+ * for the cursor.
+ */
+ WT_RET(
+ __wt_open_internal_session(conn, "async-cursor", 1, 1, &session));
+ __wt_spin_lock(session, &async->ops_lock);
+ WT_ERR(__wt_calloc_def(session, 1, &af));
+ WT_ERR(__wt_strdup(session, uri, &af->uri));
+ WT_ERR(__wt_strdup(session, config, &af->config));
+ af->uri_hash = uri_hash;
+ af->cfg_hash = cfg_hash;
+ /*
+ * Get the key_format and value_format for this URI and store
+ * it in the structure so that async->set_key/value work.
+ */
+ wt_session = &session->iface;
+ WT_ERR(wt_session->open_cursor(wt_session, uri, NULL, NULL, &c));
+ WT_ERR(__wt_strdup(session, c->key_format, &af->key_format));
+ WT_ERR(__wt_strdup(session, c->value_format, &af->value_format));
+ WT_ERR(c->close(c));
+ c = NULL;
+
+ STAILQ_INSERT_HEAD(&async->formatqh, af, q);
+ __wt_spin_unlock(session, &async->ops_lock);
+ WT_ERR(wt_session->close(wt_session, NULL));
+
+setup: op->format = af;
+ /*
+ * Copy the pointers for the formats. Items in the async format
+ * queue remain there until the connection is closed. We must
+ * initialize the format fields in the async_op, which are publicly
+ * visible, and its internal cursor used by internal key/value
+ * functions.
+ */
+ op->iface.c.key_format = op->iface.key_format = af->key_format;
+ op->iface.c.value_format = op->iface.value_format = af->value_format;
+ return (0);
+
+err:
+ if (c != NULL)
+ (void)c->close(c);
+ __wt_free(session, af->uri);
+ __wt_free(session, af->config);
+ __wt_free(session, af->key_format);
+ __wt_free(session, af->value_format);
+ __wt_free(session, af);
+ return (ret);
+}
+
+/*
+ * __async_new_op_alloc --
+ * Find and allocate the next available async op handle.
+ */
+static int
+__async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri,
+ const char *config, WT_ASYNC_OP_IMPL **opp)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t i, save_i, view;
+
+ conn = S2C(session);
+ async = conn->async;
+ WT_STAT_FAST_CONN_INCR(session, async_op_alloc);
+ *opp = NULL;
+
+retry:
+ op = NULL;
+ WT_ORDERED_READ(save_i, async->ops_index);
+ /*
+ * Search for a free op starting after the last one allocated. We'd
+ * expect ops to be freed mostly FIFO so we should quickly find one.
+ */
+ for (view = 1, i = save_i; i < conn->async_size; i++, view++) {
+ op = &async->async_ops[i];
+ if (op->state == WT_ASYNCOP_FREE)
+ break;
+ }
+
+ /*
+ * Loop around back to the beginning if we need to.
+ */
+ if (op == NULL || op->state != WT_ASYNCOP_FREE)
+ for (i = 0; i < save_i; i++, view++) {
+ op = &async->async_ops[i];
+ if (op->state == WT_ASYNCOP_FREE)
+ break;
+ }
+
+ /*
+ * We still haven't found one. Return an error.
+ */
+ if (op == NULL || op->state != WT_ASYNCOP_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, async_full);
+ WT_RET(EBUSY);
+ }
+ /*
+ * Set the state of this op handle as READY for the user to use.
+ * If we can set the state then the op entry is ours.
+ * Start the next search at the next entry after this one.
+ */
+ if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
+ WT_STAT_FAST_CONN_INCR(session, async_alloc_race);
+ goto retry;
+ }
+ WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view);
+ WT_RET(__async_get_format(conn, uri, config, op));
+ op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1);
+ op->optype = WT_AOP_NONE;
+ (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size);
+ *opp = op;
+ return (0);
+}
+
+/*
+ * __async_config --
+ * Parse and setup the async API options.
+ */
+static int
+__async_config(WT_SESSION_IMPL *session,
+ WT_CONNECTION_IMPL *conn, const char **cfg, int *runp)
+{
+ WT_CONFIG_ITEM cval;
+
+ /*
+ * The async configuration is off by default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "async.enabled", &cval));
+ *runp = cval.val != 0;
+
+ /*
+ * Even if async is turned off, we want to parse and store the
+ * default values so that reconfigure can just enable them.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "async.ops_max", &cval));
+ conn->async_size = (uint32_t)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "async.threads", &cval));
+ conn->async_workers = (uint32_t)cval.val;
+ /* Sanity check that api_data.py is in sync with async.h */
+ WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS);
+
+ return (0);
+}
+
+/*
+ * __wt_async_stats_update --
+ * Update the async stats for return to the application.
+ */
+void
+__wt_async_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ async = conn->async;
+ if (async == NULL)
+ return;
+ stats = &conn->stats;
+ WT_STAT_SET(stats, async_cur_queue, async->cur_queue);
+ WT_STAT_SET(stats, async_max_queue, async->max_queue);
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+}
+
+/*
+ * __async_start --
+ * Start the async subsystem. All configuration processing has
+ * already been done by the caller.
+ */
+static int
+__async_start(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t i;
+
+ conn = S2C(session);
+ conn->async_cfg = 1;
+ /*
+ * Async is on, allocate the WT_ASYNC structure and initialize the ops.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async));
+ async = conn->async;
+ STAILQ_INIT(&async->formatqh);
+ WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
+ WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond));
+ WT_RET(__wt_async_op_init(session));
+
+ /*
+ * Start up the worker threads.
+ */
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Each worker has its own session. We set both a general
+ * server flag in the connection and an individual flag
+ * in the session. The user may reconfigure the number of
+ * workers and we may want to selectively stop some workers
+ * while leaving the rest running.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "async-worker", 1, 1, &async->worker_sessions[i]));
+ F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC);
+ }
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Start the threads.
+ */
+ WT_RET(__wt_thread_create(session, &async->worker_tids[i],
+ __wt_async_worker, async->worker_sessions[i]));
+ }
+ __wt_async_stats_update(session);
+ return (0);
+}
+
+/*
+ * __wt_async_create --
+ * Start the async subsystem and worker threads.
+ */
+int
+__wt_async_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int run;
+
+ conn = S2C(session);
+
+ /* Handle configuration. */
+ run = 0;
+ WT_RET(__async_config(session, conn, cfg, &run));
+
+ /* If async is not configured, we're done. */
+ if (!run)
+ return (0);
+ return (__async_start(session));
+}
+
+/*
+ * __wt_async_reconfig --
+ * Reconfigure the async subsystem and worker threads.
+ */
+int
+__wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn, tmp_conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int run;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+ memset(&tmp_conn, 0, sizeof(tmp_conn));
+ tmp_conn.async_cfg = conn->async_cfg;
+ tmp_conn.async_workers = conn->async_workers;
+ tmp_conn.async_size = conn->async_size;
+
+ /* Handle configuration. */
+ run = conn->async_cfg;
+ WT_RET(__async_config(session, &tmp_conn, cfg, &run));
+
+ /*
+ * There are some restrictions on the live reconfiguration of async.
+ * Unlike other subsystems where we simply destroy anything existing
+ * and restart with the new configuration, async is not so easy.
+ * If the user is just changing the number of workers, we want the
+ * existing op handles and other state to remain valid, so we must
+ * handle the various combinations of changes individually.
+ *
+ * One restriction is that if async is currently on, the user cannot
+ * change the number of async op handles available. The user can try
+ * but we do nothing with it. However we must allow the ops_max config
+ * string so that a user can completely start async via reconfigure.
+ */
+
+ /*
+ * Easy cases:
+ * 1. If async is on and the user wants it off, shut it down.
+ * 2. If async is off, and the user wants it on, start it.
+ * 3. If not a toggle and async is off, we're done.
+ */
+ if (conn->async_cfg > 0 && !run) {
+ /* Case 1 */
+ WT_TRET(__wt_async_flush(session));
+ ret = __wt_async_destroy(session);
+ conn->async_cfg = 0;
+ return (ret);
+ } else if (conn->async_cfg == 0 && run)
+ /* Case 2 */
+ return (__async_start(session));
+ else if (conn->async_cfg == 0)
+ /* Case 3 */
+ return (0);
+
+ /*
+ * Running async worker modification cases:
+ * 4. If number of workers didn't change, we're done.
+ * 5. If more workers, start new ones.
+ * 6. If fewer workers, kill some.
+ */
+ if (conn->async_workers == tmp_conn.async_workers)
+ /* No change in the number of workers. */
+ return (0);
+ if (conn->async_workers < tmp_conn.async_workers) {
+ /* Case 5 */
+ /*
+ * The worker_sessions array is allocated for the maximum
+ * allowed number of workers, so starting more is easy.
+ */
+ for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+ /*
+ * Each worker has its own session.
+ */
+ WT_RET(__wt_open_internal_session(conn,
+ "async-worker", 1, 1, &async->worker_sessions[i]));
+ F_SET(async->worker_sessions[i],
+ WT_SESSION_SERVER_ASYNC);
+ }
+ for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+ /*
+ * Start the threads.
+ */
+ WT_RET(__wt_thread_create(session,
+ &async->worker_tids[i], __wt_async_worker,
+ async->worker_sessions[i]));
+ }
+ conn->async_workers = tmp_conn.async_workers;
+ }
+ if (conn->async_workers > tmp_conn.async_workers) {
+ /* Case 6 */
+ /*
+ * Stopping an individual async worker is the most complex case.
+ * We clear the session async flag on the targeted worker thread
+ * so that only that thread stops, and the others keep running.
+ */
+ for (i = conn->async_workers - 1;
+ i >= tmp_conn.async_workers; i--) {
+ /*
+ * Join any worker we're stopping.
+ * After the thread is stopped, close its session.
+ */
+ WT_ASSERT(session, async->worker_tids[i] != 0);
+ WT_ASSERT(session, async->worker_sessions[i] != NULL);
+ F_CLR(async->worker_sessions[i],
+ WT_SESSION_SERVER_ASYNC);
+ WT_TRET(__wt_thread_join(
+ session, async->worker_tids[i]));
+ async->worker_tids[i] = 0;
+ wt_session = &async->worker_sessions[i]->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ async->worker_sessions[i] = NULL;
+ }
+ conn->async_workers = tmp_conn.async_workers;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_async_destroy --
+ * Destroy the async worker threads and async subsystem.
+ */
+int
+__wt_async_destroy(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_FORMAT *af, *afnext;
+ WT_ASYNC_OP *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+
+ if (!conn->async_cfg)
+ return (0);
+
+ F_CLR(conn, WT_CONN_SERVER_ASYNC);
+ for (i = 0; i < conn->async_workers; i++)
+ if (async->worker_tids[i] != 0) {
+ WT_TRET(__wt_thread_join(
+ session, async->worker_tids[i]));
+ async->worker_tids[i] = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &async->flush_cond));
+
+ /* Close the server threads' sessions. */
+ for (i = 0; i < conn->async_workers; i++)
+ if (async->worker_sessions[i] != NULL) {
+ wt_session = &async->worker_sessions[i]->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ async->worker_sessions[i] = NULL;
+ }
+ /* Free any op key/value buffers. */
+ for (i = 0; i < conn->async_size; i++) {
+ op = (WT_ASYNC_OP *)&async->async_ops[i];
+ if (op->c.key.data != NULL)
+ __wt_buf_free(session, &op->c.key);
+ if (op->c.value.data != NULL)
+ __wt_buf_free(session, &op->c.value);
+ }
+
+ /* Free format resources */
+ af = STAILQ_FIRST(&async->formatqh);
+ while (af != NULL) {
+ afnext = STAILQ_NEXT(af, q);
+ __wt_free(session, af->uri);
+ __wt_free(session, af->config);
+ __wt_free(session, af->key_format);
+ __wt_free(session, af->value_format);
+ __wt_free(session, af);
+ af = afnext;
+ }
+ __wt_free(session, async->async_queue);
+ __wt_free(session, async->async_ops);
+ __wt_spin_destroy(session, &async->ops_lock);
+ __wt_free(session, conn->async);
+
+ return (ret);
+}
+
+/*
+ * __wt_async_flush --
+ * Implementation of the WT_CONN->async_flush method.
+ */
+int
+__wt_async_flush(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ if (!conn->async_cfg)
+ return (0);
+
+ async = conn->async;
+ WT_STAT_FAST_CONN_INCR(session, async_flush);
+ /*
+ * We have to do several things. First, prevent other callers from
+ * racing with us so that only one flush is happening at a time.
+ * Next, wait for the worker threads to notice the flush and
+ * indicate that the flush is complete on their side. Then we
+ * clear the flush flags and return.
+ */
+retry:
+ while (async->flush_state != WT_ASYNC_FLUSH_NONE)
+ /*
+ * We're racing an in-progress flush. We need to wait
+ * our turn to start our own. We need to convoy the
+ * racing calls because a later call may be waiting for
+ * specific enqueued ops to be complete before this returns.
+ */
+ __wt_sleep(0, 100000);
+
+ if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE,
+ WT_ASYNC_FLUSH_IN_PROGRESS))
+ goto retry;
+ /*
+ * We're the owner of this flush operation. Set the
+ * WT_ASYNC_FLUSH_IN_PROGRESS to block other callers.
+ * We're also preventing all worker threads from taking
+ * things off the work queue with the lock.
+ */
+ async->flush_count = 0;
+ (void)WT_ATOMIC_ADD8(async->flush_gen, 1);
+ WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE);
+ async->flush_op.state = WT_ASYNCOP_READY;
+ WT_ERR(__wt_async_op_enqueue(session, &async->flush_op));
+ while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE)
+ WT_ERR(__wt_cond_wait(NULL, async->flush_cond, 100000));
+ /*
+ * Flush is done. Clear the flags.
+ */
+ async->flush_op.state = WT_ASYNCOP_FREE;
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSH_NONE);
+err:
+ return (ret);
+}
+
+/*
+ * __async_runtime_config --
+ * Configure runtime fields at allocation.
+ */
+static int
+__async_runtime_config(WT_ASYNC_OP_IMPL *op, const char *cfg[])
+{
+ WT_ASYNC_OP *asyncop;
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = O2S(op);
+ asyncop = (WT_ASYNC_OP *)op;
+ WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_APPEND);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_APPEND);
+ WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_OVERWRITE);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_OVERWRITE);
+ WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_RAW);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_RAW);
+ return (0);
+}
+
+/*
+ * __wt_async_new_op --
+ * Implementation of the WT_CONN->async_new_op method.
+ */
+int
+__wt_async_new_op(WT_SESSION_IMPL *session, const char *uri,
+ const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb,
+ WT_ASYNC_OP_IMPL **opp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ *opp = NULL;
+
+ conn = S2C(session);
+ if (!conn->async_cfg)
+ return (ENOTSUP);
+
+ op = NULL;
+ WT_ERR(__async_new_op_alloc(session, uri, config, &op));
+ WT_ERR(__async_runtime_config(op, cfg));
+ op->cb = cb;
+ *opp = op;
+ return (0);
+
+err:
+ /*
+ * If we get an error after allocating op, set its state to free.
+ */
+ if (op != NULL)
+ op->state = WT_ASYNCOP_FREE;
+ return (ret);
+}
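
Note on the allocation scheme: __async_new_op_alloc above hands out op handles without a lock by scanning the array from the last allocation point and claiming a slot with a compare-and-swap from FREE to READY. The sketch below shows the same claim pattern in isolation, using C11 atomics in place of WiredTiger's WT_ATOMIC_CAS4; the slot_pool type, POOL_SIZE and slot_claim are hypothetical names for illustration, and unlike the real code (which restarts its scan from the published index when it loses a race) this version simply moves on to the next slot.

#include <stdatomic.h>

#define	POOL_SIZE 16			/* hypothetical pool size */
enum { SLOT_FREE, SLOT_READY };

struct slot_pool {
	_Atomic int state[POOL_SIZE];
	atomic_uint next;		/* hint: where the next search starts */
};

/* Claim a free slot; return its index, or -1 when full (maps to EBUSY). */
static int
slot_claim(struct slot_pool *p)
{
	unsigned i, n, start;
	int expected;

	start = atomic_load(&p->next);
	for (n = 0; n < POOL_SIZE; n++) {
		i = (start + n) % POOL_SIZE;
		expected = SLOT_FREE;
		/* The CAS both tests and claims the slot atomically. */
		if (atomic_compare_exchange_strong(
		    &p->state[i], &expected, SLOT_READY)) {
			atomic_store(&p->next, (i + 1) % POOL_SIZE);
			return ((int)i);
		}
	}
	return (-1);
}

int
main(void)
{
	static struct slot_pool pool;	/* static: zero-initialized, all FREE */

	return (slot_claim(&pool) == 0 ? 0 : 1);
}

Because the CAS is what transfers ownership of the slot, the winning thread can fill in the op's format and id fields without holding a lock, which is consistent with the code above taking ops_lock only around the shared format list.
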
diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c
new file mode 100644
index 00000000000..9dba2b2b5f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_op.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_get_key --
+ * WT_ASYNC_OP->get_key implementation for op handles.
+ */
+static int
+__async_get_key(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, asyncop);
+ ret = __wt_cursor_get_keyv(&asyncop->c, asyncop->c.flags, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __async_set_key --
+ * WT_ASYNC_OP->set_key implementation for op handles.
+ */
+static void
+__async_set_key(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ va_list ap;
+
+ c = &asyncop->c;
+ va_start(ap, asyncop);
+ __wt_cursor_set_keyv(c, c->flags, ap);
+ if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c))
+ WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop), &c->key,
+ c->key.data, c->key.size));
+ va_end(ap);
+ if (0)
+err: c->saved_err = ret;
+}
+
+/*
+ * __async_get_value --
+ * WT_ASYNC_OP->get_value implementation for op handles.
+ */
+static int
+__async_get_value(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, asyncop);
+ ret = __wt_cursor_get_valuev(&asyncop->c, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __async_set_value --
+ * WT_ASYNC_OP->set_value implementation for op handles.
+ */
+static void
+__async_set_value(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ va_list ap;
+
+ c = &asyncop->c;
+ va_start(ap, asyncop);
+ __wt_cursor_set_valuev(c, ap);
+ /* Copy the data, if it is pointing at data elsewhere. */
+ if (!WT_DATA_IN_ITEM(&c->value))
+ WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop),
+ &c->value, c->value.data, c->value.size));
+ va_end(ap);
+ if (0)
+err: c->saved_err = ret;
+}
+
+/*
+ * __async_op_wrap --
+ * Common wrapper for all async operations.
+ */
+static int
+__async_op_wrap(WT_ASYNC_OP_IMPL *op, WT_ASYNC_OPTYPE type)
+{
+ op->optype = type;
+ return (__wt_async_op_enqueue(O2S(op), op));
+}
+
+/*
+ * __async_search --
+ * WT_ASYNC_OP->search implementation for op handles.
+ */
+static int
+__async_search(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, search);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_search);
+ WT_ERR(__async_op_wrap(op, WT_AOP_SEARCH));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_insert --
+ * WT_ASYNC_OP->insert implementation for op handles.
+ */
+static int
+__async_insert(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, insert);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_insert);
+ WT_ERR(__async_op_wrap(op, WT_AOP_INSERT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_update --
+ * WT_ASYNC_OP->update implementation for op handles.
+ */
+static int
+__async_update(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, update);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_update);
+ WT_ERR(__async_op_wrap(op, WT_AOP_UPDATE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_remove --
+ * WT_ASYNC_OP->remove implementation for op handles.
+ */
+static int
+__async_remove(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, remove);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_remove);
+ WT_ERR(__async_op_wrap(op, WT_AOP_REMOVE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_compact --
+ * WT_ASYNC_OP->compact implementation for op handles.
+ */
+static int
+__async_compact(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, compact);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_compact);
+ WT_ERR(__async_op_wrap(op, WT_AOP_COMPACT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_get_id --
+ * WT_ASYNC_OP->get_id implementation for op handles.
+ */
+static uint64_t
+__async_get_id(WT_ASYNC_OP *asyncop)
+{
+ return (((WT_ASYNC_OP_IMPL *)asyncop)->unique_id);
+}
+
+/*
+ * __async_get_type --
+ * WT_ASYNC_OP->get_type implementation for op handles.
+ */
+static WT_ASYNC_OPTYPE
+__async_get_type(WT_ASYNC_OP *asyncop)
+{
+ return (((WT_ASYNC_OP_IMPL *)asyncop)->optype);
+}
+
+/*
+ * __async_op_init --
+ * Initialize all the op handle fields.
+ */
+static int
+__async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
+{
+ WT_ASYNC_OP *asyncop;
+
+ asyncop = (WT_ASYNC_OP *)op;
+ asyncop->connection = (WT_CONNECTION *)conn;
+ asyncop->key_format = asyncop->value_format = NULL;
+ asyncop->c.key_format = asyncop->c.value_format = NULL;
+ asyncop->get_key = __async_get_key;
+ asyncop->get_value = __async_get_value;
+ asyncop->set_key = __async_set_key;
+ asyncop->set_value = __async_set_value;
+ asyncop->search = __async_search;
+ asyncop->insert = __async_insert;
+ asyncop->update = __async_update;
+ asyncop->remove = __async_remove;
+ asyncop->compact = __async_compact;
+ asyncop->get_id = __async_get_id;
+ asyncop->get_type = __async_get_type;
+ /*
+ * The cursor needs to have the get/set key/value functions initialized.
+ * It also needs the key/value related fields set up.
+ */
+ asyncop->c.get_key = __wt_cursor_get_key;
+ asyncop->c.set_key = __wt_cursor_set_key;
+ asyncop->c.get_value = __wt_cursor_get_value;
+ asyncop->c.set_value = __wt_cursor_set_value;
+ asyncop->c.recno = 0;
+ memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf));
+ memset(&asyncop->c.key, 0, sizeof(asyncop->c.key));
+ memset(&asyncop->c.value, 0, sizeof(asyncop->c.value));
+ asyncop->c.session = (WT_SESSION *)conn->default_session;
+ asyncop->c.saved_err = 0;
+ asyncop->c.flags = 0;
+
+ op->internal_id = id;
+ op->state = WT_ASYNCOP_FREE;
+ return (0);
+}
+
+/*
+ * __wt_async_op_enqueue --
+ * Enqueue an operation onto the work queue.
+ */
+int
+__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint64_t cur_head, cur_tail, my_alloc, my_slot;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ASYNC_OP_IMPL *my_op;
+#endif
+
+ conn = S2C(session);
+ async = conn->async;
+ /*
+ * Enqueue op at the tail of the work queue.
+ */
+ WT_ASSERT(session, op->state == WT_ASYNCOP_READY);
+ /*
+ * We get our slot in the ring buffer to use.
+ */
+ my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
+ my_slot = my_alloc % async->async_qsize;
+
+ /*
+ * Make sure we haven't wrapped around the queue.
+ * If so, wait for the tail to advance off this slot.
+ */
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail == my_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
+ if (my_op != NULL)
+ return (__wt_panic(session));
+#endif
+ WT_PUBLISH(async->async_queue[my_slot], op);
+ op->state = WT_ASYNCOP_ENQUEUED;
+ if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
+ WT_PUBLISH(async->max_queue, async->cur_queue);
+ /*
+ * Multiple threads may be adding ops to the queue. We need to wait
+ * our turn to make our slot visible to workers.
+ */
+ WT_ORDERED_READ(cur_head, async->head);
+ while (cur_head != (my_alloc - 1)) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_head, async->head);
+ }
+ WT_PUBLISH(async->head, my_alloc);
+ return (ret);
+}
+
+/*
+ * __wt_async_op_init --
+ * Initialize all the op handles.
+ */
+int
+__wt_async_op_init(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+
+ /*
+ * Initialize the flush op structure.
+ */
+ WT_RET(__async_op_init(conn, &async->flush_op, OPS_INVALID_INDEX));
+
+ /*
+ * Allocate and initialize the work queue. This is sized so that
+ * the ring buffer is known to be big enough such that the head
+ * can never overlap the tail. Include extra for the flush op.
+ */
+ async->async_qsize = conn->async_size + 2;
+ WT_RET(__wt_calloc_def(
+ session, async->async_qsize, &async->async_queue));
+ /*
+ * Allocate and initialize all the user ops.
+ */
+ WT_ERR(__wt_calloc_def(session, conn->async_size, &async->async_ops));
+ for (i = 0; i < conn->async_size; i++) {
+ op = &async->async_ops[i];
+ WT_ERR(__async_op_init(conn, op, i));
+ }
+ return (0);
+err:
+ if (async->async_ops != NULL) {
+ __wt_free(session, async->async_ops);
+ async->async_ops = NULL;
+ }
+ if (async->async_queue != NULL) {
+ __wt_free(session, async->async_queue);
+ async->async_queue = NULL;
+ }
+ return (ret);
+}
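
Note on the queue protocol: __wt_async_op_enqueue above implements a multi-producer ring with ordered publication. A producer reserves a slot with an atomic increment of alloc_head, waits if the ring has wrapped onto the consumers' tail_slot, stores its op, and then waits its turn to advance the shared head, so slots become visible to workers strictly in allocation order. The following compilable sketch shows that protocol with C11 atomics standing in for WT_ATOMIC_ADD8, WT_ORDERED_READ and WT_PUBLISH; QSIZE and the ring type are hypothetical (the real queue is sized at async_size + 2 so the head can never overlap the tail).

#include <sched.h>
#include <stdatomic.h>

#define	QSIZE 64			/* hypothetical ring size */

struct ring {
	void *_Atomic slot[QSIZE];
	atomic_ullong alloc_head;	/* slots reserved by producers */
	atomic_ullong head;		/* slots visible to consumers */
	atomic_ullong tail_slot;	/* last slot consumers vacated */
};

static void
ring_enqueue(struct ring *r, void *item)
{
	unsigned long long my_alloc, my_slot;

	/* Reserve a slot; the fetch-and-add serializes producers. */
	my_alloc = atomic_fetch_add(&r->alloc_head, 1) + 1;
	my_slot = my_alloc % QSIZE;

	/* If we wrapped onto the tail, wait for consumers to move it. */
	while (atomic_load(&r->tail_slot) == my_slot)
		sched_yield();

	atomic_store(&r->slot[my_slot], item);

	/*
	 * Publish in allocation order: wait until every earlier producer
	 * has advanced head, then advance it past our own slot.
	 */
	while (atomic_load(&r->head) != my_alloc - 1)
		sched_yield();
	atomic_store(&r->head, my_alloc);
}

int
main(void)
{
	static struct ring r;		/* static: zero-initialized */
	int v = 42;

	ring_enqueue(&r, &v);
	return (atomic_load(&r.head) == 1 ? 0 : 1);
}

The ordered advance of head is what lets the dequeue side treat everything up to head as safely readable with a single ordered load.
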
diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c
new file mode 100644
index 00000000000..74ee2dd2f86
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_worker.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_op_dequeue --
+ * Wait for work to be available. Then atomically take it off
+ * the work queue.
+ */
+static int
+__async_op_dequeue(WT_CONNECTION_IMPL *conn, WT_SESSION_IMPL *session,
+ WT_ASYNC_OP_IMPL **op)
+{
+ WT_ASYNC *async;
+ long sleep_usec;
+ uint64_t cur_tail, last_consume, my_consume, my_slot, prev_slot;
+ uint32_t tries;
+
+ async = conn->async;
+ *op = NULL;
+ /*
+ * Wait for work to do. Work is available when async->head moves.
+ * Then grab the slot containing the work. If we lose, try again.
+ */
+retry:
+ tries = 0;
+ sleep_usec = 100;
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ /*
+ * We stay in this loop until there is work to do.
+ */
+ while (last_consume == async->head &&
+ async->flush_state != WT_ASYNC_FLUSHING) {
+ WT_STAT_FAST_CONN_INCR(session, async_nowork);
+ if (++tries < MAX_ASYNC_YIELD)
+ /*
+ * Initially when we find no work, allow other
+ * threads to run.
+ */
+ __wt_yield();
+ else {
+ /*
+ * If we haven't found work in a while, start sleeping
+ * to wait for work to arrive instead of spinning.
+ */
+ __wt_sleep(0, sleep_usec);
+ sleep_usec = WT_MIN(sleep_usec * 2,
+ MAX_ASYNC_SLEEP_USECS);
+ }
+ if (!F_ISSET(session, WT_SESSION_SERVER_ASYNC))
+ return (0);
+ if (!F_ISSET(conn, WT_CONN_SERVER_ASYNC))
+ return (0);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ return (__wt_panic(session));
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ }
+ if (async->flush_state == WT_ASYNC_FLUSHING)
+ return (0);
+ /*
+ * Try to increment the tail to claim this slot. If we lose
+ * a race, try again.
+ */
+ my_consume = last_consume + 1;
+ if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume))
+ goto retry;
+ /*
+ * This item of work is ours to process. Clear it out of the
+ * queue and return.
+ */
+ my_slot = my_consume % async->async_qsize;
+ prev_slot = last_consume % async->async_qsize;
+ *op = WT_ATOMIC_STORE8(async->async_queue[my_slot], NULL);
+
+ WT_ASSERT(session, async->cur_queue > 0);
+ WT_ASSERT(session, *op != NULL);
+ WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED);
+ (void)WT_ATOMIC_SUB4(async->cur_queue, 1);
+ (*op)->state = WT_ASYNCOP_WORKING;
+
+ if (*op == &async->flush_op)
+ /*
+ * We're the worker to take the flush op off the queue.
+ */
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSHING);
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail != prev_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+ WT_PUBLISH(async->tail_slot, my_slot);
+ return (0);
+}
+
+/*
+ * __async_flush_wait --
+ * Wait for the final worker to finish flushing.
+ */
+static int
+__async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
+{
+ WT_DECL_RET;
+
+ while (async->flush_state == WT_ASYNC_FLUSHING &&
+ async->flush_gen == my_gen)
+ WT_ERR(__wt_cond_wait(session, async->flush_cond, 10000));
+err: return (ret);
+}
+
+/*
+ * __async_worker_cursor --
+ * Return a cursor for the worker thread to use for its op.
+ * The worker thread caches cursors. So first search for one
+ * with the same config/uri signature. Otherwise open a new
+ * cursor and cache it.
+ */
+static int
+__async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_ASYNC_WORKER_STATE *worker, WT_CURSOR **cursorp)
+{
+ WT_ASYNC_CURSOR *ac;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)session;
+ *cursorp = NULL;
+ /*
+ * Compact doesn't need a cursor.
+ */
+ if (op->optype == WT_AOP_COMPACT)
+ return (0);
+ WT_ASSERT(session, op->format != NULL);
+ STAILQ_FOREACH(ac, &worker->cursorqh, q) {
+ if (op->format->cfg_hash == ac->cfg_hash &&
+ op->format->uri_hash == ac->uri_hash) {
+ /*
+ * If one of our cached cursors has a matching
+ * signature, use it and we're done.
+ */
+ *cursorp = ac->c;
+ return (0);
+ }
+ }
+ /*
+ * We didn't find one in our cache. Open one and cache it.
+ * Insert it at the head expecting LRU usage.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &ac));
+ WT_ERR(wt_session->open_cursor(
+ wt_session, op->format->uri, NULL, op->format->config, &c));
+ ac->cfg_hash = op->format->cfg_hash;
+ ac->uri_hash = op->format->uri_hash;
+ ac->c = c;
+ STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
+ worker->num_cursors++;
+ *cursorp = c;
+ return (0);
+
+err: __wt_free(session, ac);
+ return (ret);
+}
+
+/*
+ * __async_worker_execop --
+ * A worker thread executes an individual op with a cursor.
+ */
+static int
+__async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_CURSOR *cursor)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_ITEM val;
+ WT_SESSION *wt_session;
+
+ asyncop = (WT_ASYNC_OP *)op;
+ /*
+ * Set the key of our local cursor from the async op handle.
+ * If needed, also set the value.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ WT_RET(__wt_cursor_get_raw_key(&asyncop->c, &val));
+ __wt_cursor_set_raw_key(cursor, &val);
+ if (op->optype == WT_AOP_INSERT ||
+ op->optype == WT_AOP_UPDATE) {
+ WT_RET(__wt_cursor_get_raw_value(&asyncop->c, &val));
+ __wt_cursor_set_raw_value(cursor, &val);
+ }
+ }
+ switch (op->optype) {
+ case WT_AOP_COMPACT:
+ wt_session = &session->iface;
+ WT_RET(wt_session->compact(wt_session,
+ op->format->uri, op->format->config));
+ break;
+ case WT_AOP_INSERT:
+ WT_RET(cursor->insert(cursor));
+ break;
+ case WT_AOP_UPDATE:
+ WT_RET(cursor->update(cursor));
+ break;
+ case WT_AOP_REMOVE:
+ WT_RET(cursor->remove(cursor));
+ break;
+ case WT_AOP_SEARCH:
+ WT_RET(cursor->search(cursor));
+ /*
+ * Get the value from the cursor and put it into
+ * the op for op->get_value.
+ */
+ WT_RET(__wt_cursor_get_raw_value(cursor, &val));
+ __wt_cursor_set_raw_value(&asyncop->c, &val);
+ break;
+ case WT_AOP_NONE:
+ default:
+ WT_RET_MSG(session, EINVAL, "Unknown async optype %d\n",
+ op->optype);
+ }
+ return (0);
+}
+
+/*
+ * __async_worker_op --
+ * A worker thread handles an individual op.
+ */
+static int
+__async_worker_op(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_ASYNC_WORKER_STATE *worker)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cb_ret;
+
+ asyncop = (WT_ASYNC_OP *)op;
+
+ cb_ret = 0;
+
+ wt_session = &session->iface;
+ if (op->optype != WT_AOP_COMPACT)
+ WT_RET(wt_session->begin_transaction(wt_session, NULL));
+ WT_ASSERT(session, op->state == WT_ASYNCOP_WORKING);
+ WT_RET(__async_worker_cursor(session, op, worker, &cursor));
+ /*
+ * Perform op and invoke the callback.
+ */
+ ret = __async_worker_execop(session, op, cursor);
+ if (op->cb != NULL && op->cb->notify != NULL)
+ cb_ret = op->cb->notify(op->cb, asyncop, ret, 0);
+
+ /*
+ * If the operation succeeded and the user callback returned
+ * zero then commit. Otherwise rollback.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ if ((ret == 0 || ret == WT_NOTFOUND) && cb_ret == 0)
+ WT_TRET(wt_session->commit_transaction(
+ wt_session, NULL));
+ else
+ WT_TRET(wt_session->rollback_transaction(
+ wt_session, NULL));
+ F_CLR(&asyncop->c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_TRET(cursor->reset(cursor));
+ }
+ /*
+ * After the callback returns and the transaction is resolved,
+ * release the op back to the free pool. We do this regardless of
+ * success or failure.
+ */
+ WT_PUBLISH(op->state, WT_ASYNCOP_FREE);
+ return (ret);
+}
+
+/*
+ * __wt_async_worker --
+ * The async worker threads.
+ */
+void *
+__wt_async_worker(void *arg)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_CURSOR *ac, *acnext;
+ WT_ASYNC_OP_IMPL *op;
+ WT_ASYNC_WORKER_STATE worker;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t flush_gen;
+
+ session = arg;
+ conn = S2C(session);
+ async = conn->async;
+
+ worker.num_cursors = 0;
+ STAILQ_INIT(&worker.cursorqh);
+ while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
+ F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
+ WT_ERR(__async_op_dequeue(conn, session, &op));
+ if (op != NULL && op != &async->flush_op) {
+ /*
+ * If an operation fails, we want the worker thread to
+ * keep running, unless there is a panic.
+ */
+ (void)__async_worker_op(session, op, &worker);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ WT_ERR(__wt_panic(session));
+ } else if (async->flush_state == WT_ASYNC_FLUSHING) {
+ /*
+ * A flush is in progress. The last worker to arrive must clear
+ * the FLUSHING flag and signal the condition variable. While a
+ * flush is in progress, we take nothing off the queue.
+ */
+ WT_ORDERED_READ(flush_gen, async->flush_gen);
+ if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+ conn->async_workers) {
+ /*
+ * We're last. All workers accounted for so
+ * signal the condition and clear the FLUSHING
+ * flag to release the other worker threads.
+ * Set the FLUSH_COMPLETE flag so that the
+ * caller can return to the application.
+ */
+ WT_PUBLISH(async->flush_state,
+ WT_ASYNC_FLUSH_COMPLETE);
+ WT_ERR(__wt_cond_signal(session,
+ async->flush_cond));
+ } else
+ /*
+ * We need to wait for the last worker to
+ * signal the condition.
+ */
+ WT_ERR(__async_flush_wait(
+ session, async, flush_gen));
+ }
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "async worker error");
+ }
+ /*
+ * Worker thread cleanup, close our cached cursors and
+ * free all the WT_ASYNC_CURSOR structures.
+ */
+ ac = STAILQ_FIRST(&worker.cursorqh);
+ while (ac != NULL) {
+ acnext = STAILQ_NEXT(ac, q);
+ WT_TRET(ac->c->close(ac->c));
+ __wt_free(session, ac);
+ ac = acnext;
+ }
+ return (NULL);
+}
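
Note on the idle-wait strategy: __async_op_dequeue above spins with __wt_yield for the first MAX_ASYNC_YIELD attempts and then sleeps with exponential backoff capped at MAX_ASYNC_SLEEP_USECS, so a briefly idle worker picks up new work with minimal latency while a long-idle worker stops burning CPU. A minimal sketch of that backoff loop, with POSIX sched_yield and usleep standing in for __wt_yield and __wt_sleep, and with hypothetical bounds (SPIN_TRIES, MAX_SLEEP_USECS):

#include <sched.h>
#include <unistd.h>

#define	SPIN_TRIES	100		/* hypothetical spin bound */
#define	MAX_SLEEP_USECS	100000		/* cap the backoff at 100ms */

/* Wait until have_work() reports work: spin first, then back off. */
static void
wait_for_work(int (*have_work)(void))
{
	unsigned tries;
	useconds_t sleep_usec;

	tries = 0;
	sleep_usec = 100;
	while (!have_work()) {
		if (++tries < SPIN_TRIES)
			sched_yield();	/* cheap: let other threads run */
		else {
			usleep(sleep_usec);
			sleep_usec = sleep_usec * 2 > MAX_SLEEP_USECS ?
			    MAX_SLEEP_USECS : sleep_usec * 2;
		}
	}
}

static int
always_ready(void)
{
	return (1);
}

int
main(void)
{
	wait_for_work(always_ready);
	return (0);
}

The real loop also re-checks the server and panic flags on every iteration so a worker being shut down, or a panicked connection, exits the wait promptly; any production version of this sketch would need an equivalent escape hatch.
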
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
new file mode 100644
index 00000000000..bbd52359157
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __block_buffer_to_addr --
+ * Convert a filesystem address cookie into its components, UPDATING the
+ * caller's buffer reference so it can be called repeatedly to load a buffer.
+ */
+static int
+__block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ uint64_t o, s, c;
+
+ WT_RET(__wt_vunpack_uint(pp, 0, &o));
+ WT_RET(__wt_vunpack_uint(pp, 0, &s));
+ WT_RET(__wt_vunpack_uint(pp, 0, &c));
+
+ /*
+ * To avoid storing large offsets, we minimize the value by subtracting
+ * a block for description information, then storing a count of block
+ * allocation units. That implies there is no such thing as an
+ * "invalid" offset though, they could all be valid (other than very
+ * large numbers), which is what we didn't want to store in the first
+ * place. Use the size: writing a block of size 0 makes no sense, so
+ * that's the out-of-band value. Once we're out of this function and
+ * are working with a real file offset, size and checksum triplet, there
+ * can be invalid offsets, that's simpler than testing sizes of 0 all
+ * over the place.
+ */
+ if (s == 0) {
+ *offsetp = 0;
+ *sizep = *cksump = 0;
+ } else {
+ *offsetp = (wt_off_t)(o + 1) * block->allocsize;
+ *sizep = (uint32_t)s * block->allocsize;
+ *cksump = (uint32_t)c;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_addr_to_buffer --
+ * Convert the filesystem components into an address cookie.
+ */
+int
+__wt_block_addr_to_buffer(WT_BLOCK *block,
+ uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ uint64_t o, s, c;
+
+ /* See the comment above: this is the reverse operation. */
+ if (size == 0) {
+ o = WT_BLOCK_INVALID_OFFSET;
+ s = c = 0;
+ } else {
+ o = (uint64_t)offset / block->allocsize - 1;
+ s = size / block->allocsize;
+ c = cksum;
+ }
+ WT_RET(__wt_vpack_uint(pp, 0, o));
+ WT_RET(__wt_vpack_uint(pp, 0, s));
+ WT_RET(__wt_vpack_uint(pp, 0, c));
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_addr --
+ * Convert a filesystem address cookie into its components NOT UPDATING
+ * the caller's buffer reference.
+ */
+int
+__wt_block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+}
+
+/*
+ * __wt_block_addr_valid --
+ * Return if an address cookie is valid.
+ */
+int
+__wt_block_addr_valid(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(session);
+ WT_UNUSED(addr_size);
+ WT_UNUSED(live);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the address isn't on the available list,
+ * or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "addr-valid", offset, size, live));
+#endif
+
+ /* Check if it's past the end of the file. */
+ return (offset + size > block->fh->size ? 0 : 1);
+}
+
+/*
+ * __wt_block_addr_string --
+ * Return a printable string representation of an address cookie.
+ */
+int
+__wt_block_addr_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Printable representation. */
+ WT_RET(__wt_buf_fmt(session, buf,
+ "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)offset, (uintmax_t)offset + size, size, cksum));
+
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_ckpt --
+ * Convert a checkpoint cookie into its components.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+ const uint8_t **pp;
+
+ ci->version = *p++;
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ pp = &p;
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->file_size = (wt_off_t)a;
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->ckpt_size = a;
+
+ return (0);
+}
+
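+/*
+ * As the unpack sequence above implies, a checkpoint cookie is laid out
+ * as a 1-byte version followed by packed (offset, size, checksum)
+ * address triplets for the root page and the alloc, avail and discard
+ * extent lists, then the packed file size and checkpoint size:
+ *
+ * [version] [root] [alloc] [avail] [discard] [file size] [ckpt size]
+ */
+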
+/*
+ * __wt_block_ckpt_to_buffer --
+ * Convert the components into its checkpoint cookie.
+ */
+int
+__wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ (*pp)[0] = ci->version;
+ (*pp)++;
+
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->alloc.offset, ci->alloc.size, ci->alloc.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->avail.offset, ci->avail.size, ci->avail.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->discard.offset, ci->discard.size, ci->discard.cksum));
+ a = (uint64_t)ci->file_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+ a = (uint64_t)ci->ckpt_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
new file mode 100644
index 00000000000..83c3a40e8e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -0,0 +1,842 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+ WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, int);
+
+/*
+ * __wt_block_ckpt_init --
+ * Initialize a checkpoint structure.
+ */
+int
+__wt_block_ckpt_init(
+ WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+{
+ WT_CLEAR(*ci);
+
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", 0));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", 1));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->discard, name, "discard", 0));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, name, "ckpt_avail", 1));
+
+ return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ * Load a checkpoint.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+ WT_UNUSED(addr_size);
+ ci = NULL;
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint,
+ * or the checkpoint was empty). In that case we return an empty root
+ * address; set that up now.
+ */
+ *root_addr_sizep = 0;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (addr != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, addr, tmp));
+ }
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: load-checkpoint: %s", block->name,
+ addr == NULL ? "[Empty]" : (const char *)tmp->data));
+ }
+
+ /*
+ * There's a single checkpoint in the file that can be written; all of
+ * the others are read-only. We use the same initialization calls for
+ * read-only checkpoints, but the information doesn't persist.
+ */
+ if (checkpoint) {
+ ci = &_ci;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+ } else {
+ /*
+ * We depend on the btree level for locking: things will go
+ * bad fast should we open the live system in two handles, or
+ * if we create, salvage, truncate or verify the live/running
+ * file, for that matter.
+ */
+ ci = &block->live;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+ }
+
+ /*
+ * If the checkpoint has an on-disk root page, load it. Otherwise, size
+ * the file past the description information.
+ */
+ if (addr == NULL || addr_size == 0)
+ ci->file_size = block->allocsize;
+ else {
+ /* Crack the checkpoint cookie. */
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* Verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Read any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ endp = root_addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the
+ * blocks from which we can allocate.
+ */
+ if (!checkpoint)
+ WT_ERR(__wt_block_extlist_read_avail(
+ session, block, &ci->avail, ci->file_size));
+ }
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting; truncate the file. Don't
+ * bother checking the avail list for a block at the end of the file,
+ * that was done when the checkpoint was first written (re-writing the
+ * checkpoint might possibly make it relevant here, but it's unlikely
+ * enough I don't bother).
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
+ WT_ERR_BUSY_OK(
+ __wt_ftruncate(session, block->fh, ci->file_size));
+ }
+
+ if (0) {
+err: /*
+ * Don't call checkpoint-unload: unload does real work including
+ * file truncation. If we fail early enough that the checkpoint
+ * information isn't correct, bad things would happen. The only
+ * allocated memory was in the service of verify, clean that up.
+ */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+ }
+
+ /* Checkpoints don't need the original information, discard it. */
+ if (checkpoint && ci != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_unload --
+ * Unload a checkpoint.
+ */
+int
+__wt_block_checkpoint_unload(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint)
+{
+ WT_DECL_RET;
+
+ /* Verify cleanup. */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+
+ /*
+ * If it's the live system, truncate to discard any extended blocks and
+ * discard the active extent lists. Hold the lock even though we're
+ * unloading the live checkpoint, there could be readers active in
+ * other checkpoints.
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_TRET_BUSY_OK(
+ __wt_ftruncate(session, block->fh, block->fh->size));
+
+ __wt_spin_lock(session, &block->live_lock);
+ __wt_block_ckpt_destroy(session, &block->live);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_block_ckpt_destroy --
+ * Clear a checkpoint structure.
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+ /* Discard the extent lists. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+}
+
+/*
+ * __wt_block_checkpoint --
+ * Create a new checkpoint.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * !!!
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_cksum = 0;
+ } else
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum,
+ data_cksum, 0));
+
+ /*
+ * Checkpoints are potentially reading/writing/merging lots of blocks,
+ * pre-allocate structures for this thread's use.
+ */
+ WT_RET(__wt_block_ext_prealloc(session, 250));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ ret = __ckpt_process(session, block, ckptbase);
+
+ /* Discard any excessive memory we've allocated. */
+ WT_TRET(__wt_block_ext_discard(session, 250));
+
+ return (ret);
+}
+
+/*
+ * __ckpt_extlist_read --
+ * Read a checkpoint's extent lists into a newly allocated checkpoint
+ * structure.
+ */
+static int
+__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci;
+
+ /*
+ * Allocate a checkpoint structure, crack the cookie and read the
+ * checkpoint's extent lists.
+ *
+ * Ignore the avail list: checkpoint avail lists are only useful if we
+ * are rolling forward from the particular checkpoint and they represent
+ * our best understanding of what blocks can be allocated. If we are
+ * not operating on the live checkpoint, subsequent checkpoints might
+ * have allocated those blocks, and the avail list is useless. We don't
+ * discard it, because it is useful as part of verification, but we
+ * don't re-write it either.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+ ci = ckpt->bpriv;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->alloc, ci->file_size));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->discard, ci->file_size));
+
+ return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ * If a checkpoint's extent list is going away, free its blocks.
+ */
+static int
+__ckpt_extlist_fblocks(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ /*
+ * Free blocks used to write checkpoint extents into the live system's
+ * checkpoint avail list (they were never on any alloc list). Do not
+ * use the live system's avail list because that list is used to decide
+ * if the file can be truncated, and we can't truncate any part of the
+ * file that contains a previous checkpoint's extents.
+ */
+ return (__wt_block_insert_ext(
+ session, &block->live.ckpt_avail, el->offset, el->size));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __ckpt_verify --
+ * Diagnostic code, confirm we get what we expect in the checkpoint array.
+ */
+static int
+__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+
+ /*
+ * Fast check that we're seeing what we expect to see: some number of
+ * checkpoints to add, delete or ignore, terminated by a new checkpoint.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ switch (ckpt->flags) {
+ case 0:
+ case WT_CKPT_DELETE:
+ case WT_CKPT_DELETE | WT_CKPT_FAKE:
+ case WT_CKPT_FAKE:
+ break;
+ case WT_CKPT_ADD:
+ if (ckpt[1].name == NULL)
+ break;
+ /* FALLTHROUGH */
+ default:
+ return (
+ __wt_illegal_value(session, "checkpoint array"));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __ckpt_process --
+ * Process the list of checkpoints.
+ */
+static int
+__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_BLOCK_CKPT *a, *b, *ci;
+ WT_CKPT *ckpt, *next_ckpt;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint64_t ckpt_size;
+ int deleting, locked;
+
+ ci = &block->live;
+ locked = 0;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__ckpt_verify(session, ckptbase));
+#endif
+
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ *
+ * This function is the first step, the second step is in the resolve
+ * function.
+ *
+ * If we're called to checkpoint the same file twice, without the second
+ * resolution step, it's an error at an upper level and our choices are
+ * all bad: either leak blocks or risk crashing with our caller not
+ * having saved the checkpoint information to stable storage. Leaked
+ * blocks are a safer choice, but that means file verify will fail for
+ * the rest of "forever", and the chance of us allocating a block and
+ * then crashing such that it matters is reasonably low: don't leak the
+ * blocks.
+ */
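+ /*
+ * A minimal sketch of the expected calling sequence (hypothetical
+ * caller, error handling omitted):
+ *
+ *	__wt_block_checkpoint(session, block, buf, ckptbase, data_cksum);
+ *	...save the returned checkpoint cookies to stable storage...
+ *	__wt_block_checkpoint_resolve(session, block);
+ */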
+ if (block->ckpt_inprogress) {
+ __wt_errx(session,
+ "%s: checkpointed without the checkpoint being resolved",
+ block->name);
+
+ WT_RET(__wt_block_checkpoint_resolve(session, block));
+ }
+
+ /*
+ * Extents newly available as a result of deleting previous checkpoints
+ * are added to a list of extents. The list should be empty, but as
+ * described above, there is no "free the checkpoint information" call
+ * into the block manager; if there was an error in an upper level that
+ * resulted in some previous checkpoint never being resolved, the list
+ * may not be empty. We should have caught that with the "checkpoint
+ * in progress" test, but it doesn't cost us anything to be cautious.
+ *
+ * We free the checkpoint's allocation and discard extent lists as part
+ * of the resolution step, not because they're needed at that time, but
+ * because it's potentially a lot of work, and waiting allows the btree
+ * layer to continue eviction sooner. As for the checkpoint-available
+ * list, make sure they get cleaned out.
+ */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, "live", "ckpt_avail", 1));
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ /*
+ * To delete a checkpoint, we'll need checkpoint information for it and
+ * the subsequent checkpoint into which it gets rolled; read them from
+ * disk before we lock things down.
+ */
+ deleting = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ deleting = 1;
+
+ /*
+ * Read the checkpoint and next checkpoint extent lists if we
+ * haven't already read them (we may have already read these
+ * extent blocks if there is more than one deleted checkpoint).
+ */
+ if (ckpt->bpriv == NULL)
+ WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * The "next" checkpoint may be the live tree which has no
+ * extent blocks to read.
+ */
+ if (next_ckpt->bpriv == NULL &&
+ !F_ISSET(next_ckpt, WT_CKPT_ADD))
+ WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
+ }
+
+ /*
+ * Hold a lock so the live extent lists and the file size can't change
+ * underneath us. I suspect we'll tighten this if checkpoints take too
+ * much time away from real work: we read the historic checkpoint
+ * information without a lock, but we could also merge and re-write the
+ * deleted and merged checkpoint information without a lock, except for
+ * the final merge of ranges into the live tree.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ locked = 1;
+
+ /*
+ * We've allocated our last page; update the checkpoint size. We need
+ * to calculate the live system's checkpoint size before merging
+ * checkpoint allocation and discard information from the checkpoints
+ * we're deleting, because those operations change the underlying byte
+ * counts.
+ */
+ ckpt_size = ci->ckpt_size;
+ ckpt_size += ci->alloc.bytes;
+ ckpt_size -= ci->discard.bytes;
+
+ /* Skip the additional processing if we aren't deleting checkpoints. */
+ if (!deleting)
+ goto live_update;
+
+ /*
+ * Delete any no-longer-needed checkpoints: we do this first as it frees
+ * blocks to the live lists, and the freed blocks will then be included
+ * when writing the live extent lists.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (tmp == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(
+ session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: delete-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+ /*
+ * Find the checkpoint into which we'll roll this checkpoint's
+ * blocks: it's the next real checkpoint in the list, and it
+ * better have been read in (if it's not the add slot).
+ */
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * Set the from/to checkpoint structures, where the "to" value
+ * may be the live tree.
+ */
+ a = ckpt->bpriv;
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ b = &block->live;
+ else
+ b = next_ckpt->bpriv;
+
+ /*
+ * Free the root page: there's nothing special about this free,
+ * the root page is allocated using normal rules, that is, it
+ * may have been taken from the avail list, and was entered on
+ * the live system's alloc list at that time. We free it into
+ * the checkpoint's discard list, however, not the live system's
+ * list because it appears on the checkpoint's alloc list and so
+ * must be paired in the checkpoint.
+ */
+ if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_ERR(__wt_block_insert_ext(session,
+ &a->discard, a->root_offset, a->root_size));
+
+ /*
+ * Free the blocks used to hold the "from" checkpoint's extent
+ * lists, including the avail list.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
+
+ /*
+ * Roll the "from" alloc and discard extent lists into the "to"
+ * checkpoint's lists.
+ */
+ if (a->alloc.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->alloc, &b->alloc));
+ if (a->discard.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->discard, &b->discard));
+
+ /*
+ * If the "to" checkpoint is also being deleted, we're done with
+ * it, it's merged into some other checkpoint in the next loop.
+ * This means the extent lists may aggregate over a number of
+ * checkpoints, but that's OK, they're disjoint sets of ranges.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * Find blocks for re-use: wherever the "to" checkpoint's
+ * allocate and discard lists overlap, move the range to
+ * the live system's checkpoint available list.
+ */
+ WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+ /*
+ * If we're updating the live system's information, we're done.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ continue;
+
+ /*
+ * We have to write the "to" checkpoint's extent lists out in
+ * new blocks, and update its cookie.
+ *
+ * Free the blocks used to hold the "to" checkpoint's extent
+ * lists; don't include the avail list, it's not changing.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
+
+ F_SET(next_ckpt, WT_CKPT_UPDATE);
+ }
+
+ /* Update checkpoints marked for update. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_UPDATE))
+ WT_ERR(__ckpt_update(
+ session, block, ckpt, ckpt->bpriv, 0));
+
+live_update:
+ /* Truncate the file if that's possible. */
+ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
+
+ /* Update the final, added checkpoint based on the live system. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ /*
+ * Set the checkpoint size for the live system.
+ *
+ * !!!
+ * Our caller wants the final checkpoint size. Setting
+ * the size here violates layering, but the alternative
+ * is a call for the btree layer to crack the checkpoint
+ * cookie into its components, and that's a fair amount
+ * of work.
+ */
+ ckpt->ckpt_size = ci->ckpt_size = ckpt_size;
+
+ WT_ERR(__ckpt_update(session, block, ckpt, ci, 1));
+ }
+
+ /*
+ * Reset the live system's alloc and discard extent lists, leave the
+ * avail list alone. This includes freeing a lot of extents, so do it
+ * outside of the system's lock by copying and resetting the original,
+ * then doing the work later.
+ */
+ ci->ckpt_alloc = ci->alloc;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->alloc, "live", "alloc", 0));
+ ci->ckpt_discard = ci->discard;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->discard, "live", "discard", 0));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * The first checkpoint in the system should always have an empty
+ * discard list. If we've read that checkpoint and/or created it,
+ * check.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ break;
+ if ((a = ckpt->bpriv) == NULL)
+ a = &block->live;
+ if (a->discard.entries != 0) {
+ __wt_errx(session,
+ "first checkpoint incorrectly has blocks on the discard "
+ "list");
+ WT_ERR(WT_ERROR);
+ }
+#endif
+
+ block->ckpt_inprogress = 1;
+
+err: if (locked)
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard any checkpoint information we loaded. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if ((ci = ckpt->bpriv) != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __ckpt_update --
+ * Update a checkpoint.
+ */
+static int
+__ckpt_update(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, int is_live)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Check the extent list combinations for overlaps. */
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
+#endif
+ /*
+ * Write the checkpoint's alloc and discard extent lists. After each
+ * write, remove any allocated blocks from the system's allocation
+ * list, checkpoint extent blocks don't appear on any extent lists.
+ */
+ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
+ WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
+
+ /*
+ * We only write an avail list for the live system; other checkpoints'
+ * avail lists are static and never change.
+ *
+ * Write the avail list last so it reflects changes due to allocating
+ * blocks for the alloc and discard lists. Second, when we write the
+ * live system's avail list, it's two lists: the current avail list
+ * plus the list of blocks to be made available when the new checkpoint
+ * completes. We can't merge that second list into the real list yet,
+ * it's not truly available until the new checkpoint locations have been
+ * saved to the metadata.
+ */
+ if (is_live)
+ WT_RET(__wt_block_extlist_write(
+ session, block, &ci->avail, &ci->ckpt_avail));
+
+ /*
+ * Set the file size for the live system.
+ *
+ * !!!
+ * We do NOT set the file size when re-writing checkpoints because we
+ * want to test the checkpoint's blocks against a reasonable maximum
+ * file size during verification. This is bad: imagine a checkpoint
+ * appearing early in the file, re-written, and then the checkpoint
+ * requires blocks at the end of the file, blocks after the listed file
+ * size. If the application opens that checkpoint for writing
+ * (discarding subsequent checkpoints), we would truncate the file to
+ * the early chunk, discarding the re-written checkpoint information.
+ * The alternative, updating the file size, has its own problems: in
+ * that case we'd work correctly, but we'd lose all of the blocks
+ * between the original checkpoint and the re-written checkpoint.
+ * Currently, there's no API to roll forward intermediate checkpoints;
+ * if there ever is, this will need to be fixed.
+ */
+ if (is_live)
+ ci->file_size = block->fh->size;
+
+ /*
+ * Copy the checkpoint information into the checkpoint array's address
+ * cookie.
+ */
+ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
+ endp = ckpt->raw.mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
+ ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: create-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_resolve --
+ * Resolve a checkpoint.
+ */
+int
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Resolve the checkpoint after our caller has written the checkpoint
+ * information to stable storage.
+ */
+ if (!block->ckpt_inprogress)
+ WT_RET_MSG(session, WT_ERROR,
+ "%s: checkpoint resolved, but no checkpoint in progress",
+ block->name);
+ block->ckpt_inprogress = 0;
+
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard the lists remaining after the checkpoint call. */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_string --
+ * Return a printable string representation of a checkpoint address cookie.
+ */
+static int
+__ckpt_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+
+ /* Initialize the checkpoint, crack the cookie. */
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, "string"));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ WT_RET(__wt_buf_fmt(session, buf,
+ "version=%d",
+ ci->version));
+ if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", root=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->root_offset,
+ (uintmax_t)(ci->root_offset + ci->root_size),
+ ci->root_size, ci->root_cksum));
+ if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", alloc=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->alloc.offset,
+ (uintmax_t)(ci->alloc.offset + ci->alloc.size),
+ ci->alloc.size, ci->alloc.cksum));
+ if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", avail=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->avail.offset,
+ (uintmax_t)(ci->avail.offset + ci->avail.size),
+ ci->avail.size, ci->avail.cksum));
+ if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", discard=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->discard.offset,
+ (uintmax_t)(ci->discard.offset + ci->discard.size),
+ ci->discard.size, ci->discard.cksum));
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
+
+ __wt_block_ckpt_destroy(session, ci);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
new file mode 100644
index 00000000000..007c77f3291
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_compact_start --
+ * Start compaction of a file.
+ */
+int
+__wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /*
+ * Save the current allocation plan, switch to first-fit allocation.
+ * We don't need the lock, but it's not a performance question and
+ * might avoid bugs in the future.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ block->allocfirst_save = block->allocfirst;
+ block->allocfirst = 1;
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (0);
+}
+
+/*
+ * __wt_block_compact_end --
+ * End compaction of a file.
+ */
+int
+__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /*
+ * Restore the previous allocation plan.
+ * We don't need the lock, but it's not a performance question and
+ * might avoid bugs in the future.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ block->allocfirst = block->allocfirst_save;
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (0);
+}
+
+/*
+ * __wt_block_compact_skip --
+ * Return if compaction will shrink the file.
+ */
+int
+__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
+{
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ wt_off_t avail, ninety;
+
+ *skipp = 1; /* Return a default skip. */
+
+ fh = block->fh;
+
+ /*
+ * We do compaction by copying blocks from the end of the file to the
+ * beginning of the file, and we need some metrics to decide if it's
+ * worth doing. Ignore small files, and files where we are unlikely
+ * to recover 10% of the file.
+ */
+ if (fh->size <= 10 * 1024)
+ return (0);
+
+ __wt_spin_lock(session, &block->live_lock);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
+ WT_ERR(__block_dump_avail(session, block));
+
+ /* Sum the number of available bytes in the first 90% of the file. */
+ avail = 0;
+ ninety = fh->size - fh->size / 10;
+
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < ninety)
+ avail += ext->size;
+
+ /*
+ * If at least 10% of the total file is available and in the first 90%
+ * of the file, we'll try compaction.
+ */
+ if (avail >= fh->size / 10)
+ *skipp = 0;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
+ ") to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
+err: __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
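+/*
+ * To make the heuristic above concrete (illustrative numbers only): for
+ * a 100MB file, "ninety" is the 90MB mark; if the avail-list extents
+ * starting below that mark sum to at least 10MB (a tenth of the file),
+ * compaction proceeds, otherwise it's skipped.
+ */
+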
+/*
+ * __wt_block_compact_page_skip --
+ * Return if writing a particular page will shrink the file.
+ */
+int
+__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp)
+{
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ wt_off_t ninety, offset;
+ uint32_t size, cksum;
+
+ WT_UNUSED(addr_size);
+ *skipp = 1; /* Return a default skip. */
+
+ fh = block->fh;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ __wt_spin_lock(session, &block->live_lock);
+
+ /*
+ * If this block is in the last 10% of the file and there's a block on
+ * the available list that's in the first 90% of the file, rewrite the
+ * block. Checking the available list is necessary (otherwise writing
+ * the block would extend the file), but there's an obvious race if the
+ * file is sufficiently busy.
+ */
+ ninety = fh->size - fh->size / 10;
+ if (offset > ninety) {
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < ninety && ext->size >= size) {
+ *skipp = 0;
+ break;
+ }
+ }
+
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __block_dump_avail --
+ * Dump out the avail list so we can see what compaction will look like.
+ */
+static int
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_EXTLIST *el;
+ WT_EXT *ext;
+ wt_off_t decile[10], percentile[100], size, v;
+ u_int i;
+
+ el = &block->live.avail;
+ size = block->fh->size;
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
+ "%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
+ (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
+ ((uintmax_t)el->bytes * 100) / (uintmax_t)size,
+ (uintmax_t)el->bytes / WT_MEGABYTE, (uintmax_t)el->bytes));
+
+ if (el->entries == 0)
+ return (0);
+
+ /*
+ * Bucket the available memory into file deciles/percentiles. Large
+ * pieces of memory will cross over multiple buckets; assign them to
+ * the decile/percentile buckets in 512B chunks.
+ */
+ memset(decile, 0, sizeof(decile));
+ memset(percentile, 0, sizeof(percentile));
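+ /*
+ * For example (illustrative numbers): a 2KB extent contributes four
+ * 512B chunks, each binned by its own offset, so an extent spanning
+ * a decile boundary splits its chunks across the adjacent buckets.
+ */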
+ WT_EXT_FOREACH(ext, el->off)
+ for (i = 0; i < ext->size / 512; ++i) {
+ ++decile[((ext->off + i * 512) * 10) / size];
+ ++percentile[((ext->off + i * 512) * 100) / size];
+ }
+
+#ifdef __VERBOSE_OUTPUT_PERCENTILE
+ for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
+ v = percentile[i] * 512;
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+ PRIuMAX "%%)",
+ i, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+ (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+ }
+#endif
+ for (i = 0; i < WT_ELEMENTS(decile); ++i) {
+ v = decile[i] * 512;
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+ PRIuMAX "%%)",
+ i * 10, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+ (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
new file mode 100644
index 00000000000..d500f93817a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -0,0 +1,1437 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+static int __block_ext_overlap(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
+static int __block_extlist_dump(
+ WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
+static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+
+/*
+ * __block_off_srch_last --
+ * Return the last element in the list, along with a stack for appending.
+ */
+static inline WT_EXT *
+__block_off_srch_last(WT_EXT **head, WT_EXT ***stack)
+{
+ WT_EXT **extp, *last;
+ int i;
+
+ last = NULL; /* The list may be empty */
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL) {
+ last = *extp;
+ extp = &(*extp)->next[i];
+ } else
+ stack[i--] = extp--;
+ return (last);
+}
+
+/*
+ * __block_off_srch --
+ * Search a by-offset skiplist (either the primary by-offset list, or the
+ * by-offset list referenced by a size entry), for the specified offset.
+ */
+static inline void
+__block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
+{
+ WT_EXT **extp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ *
+ * The WT_EXT structure contains two skiplists, the primary one and the
+ * per-size bucket one: if the skip_off flag is set, offset the skiplist
+ * array by the depth specified in this particular structure.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL && (*extp)->off < off)
+ extp =
+ &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
+ else
+ stack[i--] = extp--;
+}
+
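+/*
+ * To illustrate the skip_off flag: a WT_EXT structure of depth d
+ * carries 2 * d forward pointers, next[0..d-1] for the primary
+ * by-offset skiplist and next[d..2*d-1] for the by-offset skiplist
+ * hanging off the entry's size bucket; skip_off makes the search step
+ * through that second set of pointers.
+ */
+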
+/*
+ * __block_first_srch --
+ * Search the skiplist for the first available slot.
+ */
+static inline int
+__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
+{
+ WT_EXT *ext;
+
+ /*
+ * Linear walk of the available chunks in offset order; take the first
+ * one that's large enough.
+ */
+ WT_EXT_FOREACH(ext, head)
+ if (ext->size >= size)
+ break;
+ if (ext == NULL)
+ return (0);
+
+ /* Build a stack for the offset we want. */
+ __block_off_srch(head, ext->off, stack, 0);
+ return (1);
+}
+
+/*
+ * __block_size_srch --
+ * Search the by-size skiplist for the specified size.
+ */
+static inline void
+__block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
+{
+ WT_SIZE **szp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;)
+ if (*szp != NULL && (*szp)->size < size)
+ szp = &(*szp)->next[i];
+ else
+ stack[i--] = szp--;
+}
+
+/*
+ * __block_off_srch_pair --
+ * Search a by-offset skiplist for before/after records of the specified
+ * offset.
+ */
+static inline void
+__block_off_srch_pair(
+ WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp)
+{
+ WT_EXT **head, **extp;
+ int i;
+
+ *beforep = *afterp = NULL;
+
+ head = el->off;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ if (*extp == NULL) {
+ --i;
+ --extp;
+ continue;
+ }
+
+ if ((*extp)->off < off) { /* Keep going at this level */
+ *beforep = *extp;
+ extp = &(*extp)->next[i];
+ } else { /* Drop down a level */
+ *afterp = *extp;
+ --i;
+ --extp;
+ }
+ }
+}
+
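+/*
+ * For illustration (hypothetical list): searching for offset 300 in a
+ * list with entries at offsets 100, 200 and 500 returns the entry at
+ * 200 as "before" and the entry at 500 as "after"; an exact match at
+ * 300 would be returned as "after".
+ */
+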
+/*
+ * __block_ext_insert --
+ * Insert an extent into an extent list.
+ */
+static int
+__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
+{
+ WT_EXT **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /*
+ * If we are inserting a new size onto the size skiplist, we'll need a
+ * new WT_SIZE structure for that skiplist.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size) {
+ WT_RET(__wt_block_size_alloc(session, &szp));
+ szp->size = ext->size;
+ szp->depth = ext->depth;
+ for (i = 0; i < ext->depth; ++i) {
+ szp->next[i] = *sstack[i];
+ *sstack[i] = szp;
+ }
+ }
+
+ /*
+ * Insert the new WT_EXT structure into the size element's
+ * offset skiplist.
+ */
+ __block_off_srch(szp->off, ext->off, astack, 1);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i + ext->depth] = *astack[i];
+ *astack[i] = ext;
+ }
+ }
+#ifdef HAVE_DIAGNOSTIC
+ if (!el->track_size)
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i + ext->depth] = NULL;
+#endif
+
+ /* Insert the new WT_EXT structure into the offset skiplist. */
+ __block_off_srch(el->off, ext->off, astack, 0);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i] = *astack[i];
+ *astack[i] = ext;
+ }
+
+ ++el->entries;
+ el->bytes += (uint64_t)ext->size;
+
+ /* Update the cached end-of-list. */
+ if (ext->next[0] == NULL)
+ el->last = ext;
+
+ return (0);
+}
+
+/*
+ * __block_off_insert --
+ * Insert a file range into an extent list.
+ */
+static int
+__block_off_insert(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext;
+
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ return (__block_ext_insert(session, el, ext));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __block_off_match --
+ * Return if any part of a specified range appears on a specified extent
+ * list.
+ */
+static int
+__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *before, *after;
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /* If "before" or "after" overlaps, we have a winner. */
+ if (before != NULL && before->off + before->size > off)
+ return (1);
+ if (after != NULL && off + size > after->off)
+ return (1);
+ return (0);
+}
+
+/*
+ * __wt_block_misplaced --
+ * Complain if a block appears on the available or discard lists.
+ */
+int
+__wt_block_misplaced(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live)
+{
+ const char *name;
+
+ name = NULL;
+
+ /*
+ * Don't check during the salvage read phase, we might be reading an
+ * already freed overflow page.
+ */
+ if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ return (0);
+
+ /*
+ * Verify a block the btree engine thinks it "owns" doesn't appear on
+ * the available or discard lists (it might reasonably be on the alloc
+ * list, if it was allocated since the last checkpoint). The engine
+ * "owns" a block if it's trying to read or free the block, and those
+ * functions make this check.
+ *
+ * Any block being read or freed should not be "available".
+ *
+ * Any block being read or freed in the live system should not be on the
+ * discard list. (A checkpoint handle might be reading a block which is
+ * on the live system's discard list; any attempt to free a block from a
+ * checkpoint handle has already failed.)
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ if (__block_off_match(&block->live.avail, offset, size))
+ name = "available";
+ else if (live && __block_off_match(&block->live.discard, offset, size))
+ name = "discard";
+ __wt_spin_unlock(session, &block->live_lock);
+ if (name != NULL) {
+ __wt_errx(session,
+ "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list",
+ tag, (uintmax_t)offset, size, name);
+ return (__wt_panic(session));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __block_off_remove --
+ * Remove a record from an extent list.
+ */
+static int
+__block_off_remove(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Find and remove the record from the by-offset skiplist. */
+ __block_off_srch(el->off, off, astack, 0);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i];
+
+ /*
+ * Find and remove the record from the size's offset skiplist; if that
+ * empties the by-size skiplist entry, remove it as well.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size)
+ return (EINVAL);
+ __block_off_srch(szp->off, off, astack, 1);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i + ext->depth];
+ if (szp->off[0] == NULL) {
+ for (i = 0; i < szp->depth; ++i)
+ *sstack[i] = szp->next[i];
+ __wt_block_size_free(session, szp);
+ }
+ }
+#ifdef HAVE_DIAGNOSTIC
+ if (!el->track_size) {
+ int not_null;
+ for (i = 0, not_null = 0; i < ext->depth; ++i)
+ if (ext->next[i + ext->depth] != NULL)
+ not_null = 1;
+ WT_ASSERT(session, not_null == 0);
+ }
+#endif
+
+ --el->entries;
+ el->bytes -= (uint64_t)ext->size;
+
+ /* Return the record if our caller wants it, otherwise free it. */
+ if (extp == NULL)
+ __wt_block_ext_free(session, ext);
+ else
+ *extp = ext;
+
+ /* Update the cached end-of-list. */
+ if (el->last == ext)
+ el->last = NULL;
+
+ return (0);
+
+corrupt:
+ WT_PANIC_RET(session, EINVAL,
+ "attempt to remove non-existent offset from an extent list");
+}
+
+/*
+ * __wt_block_off_remove_overlap --
+ * Remove a range from an extent list, where the range may be part of an
+ * overlapping entry.
+ */
+int
+__wt_block_off_remove_overlap(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *before, *after, *ext;
+ wt_off_t a_off, a_size, b_off, b_size;
+
+ WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET);
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /* If "before" or "after" overlaps, retrieve the overlapping entry. */
+ if (before != NULL && before->off + before->size > off) {
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ /* Calculate overlapping extents. */
+ a_off = ext->off;
+ a_size = off - ext->off;
+ b_off = off + size;
+ b_size = ext->size - (a_size + size);
+ } else if (after != NULL && off + size > after->off) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ /*
+ * Calculate overlapping extents. There's no initial overlap
+ * since the after extent presumably cannot begin before "off".
+ */
+ a_off = WT_BLOCK_INVALID_OFFSET;
+ a_size = 0;
+ b_off = off + size;
+ b_size = ext->size - (b_off - ext->off);
+ } else
+ return (WT_NOTFOUND);
+
+ /*
+ * If there are overlaps, insert the item; re-use the extent structure
+ * and save the allocation (we know there's no need to merge).
+ */
+ if (a_size != 0) {
+ ext->off = a_off;
+ ext->size = a_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ if (b_size != 0) {
+ if (ext == NULL)
+ WT_RET(__block_off_insert(session, el, b_off, b_size));
+ else {
+ ext->off = b_off;
+ ext->size = b_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ }
+ if (ext != NULL)
+ __wt_block_ext_free(session, ext);
+ return (0);
+}
+
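+/*
+ * A worked example of the split above (illustrative numbers): removing
+ * the range [200, 300) from an entry spanning [100, 400) leaves a
+ * leading extent [100, 200) and a trailing extent [300, 400), both
+ * re-inserted; if the removed range starts at the entry's offset, only
+ * the trailing extent survives.
+ */
+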
+/*
+ * __block_extend --
+ * Extend the file to allocate space.
+ */
+static inline int
+__block_extend(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_FH *fh;
+
+ fh = block->fh;
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to extend the file.
+ *
+ * We should never be allocating from an empty file.
+ */
+ if (fh->size < block->allocsize)
+ WT_RET_MSG(session, EINVAL,
+ "file has no description information");
+
+ /*
+ * Make sure we don't allocate past the maximum file size. There's no
+ * easy way to know the maximum wt_off_t on a system; limit growth to
+ * what fits in a signed 8-byte value (we currently check that wt_off_t
+ * is 8 bytes in verify_build.h). I don't think we're likely to see
+ * anything bigger for a while.
+ */
+ if (fh->size > (wt_off_t)INT64_MAX - size)
+ WT_RET_MSG(session, WT_ERROR,
+ "block allocation failed, file cannot grow further");
+
+ *offp = fh->size;
+ fh->size += size;
+
+ WT_STAT_FAST_DATA_INCR(session, block_extension);
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "file extend %" PRIdMAX "B @ %" PRIdMAX,
+ (intmax_t)size, (intmax_t)*offp));
+
+ return (0);
+}
+
+/*
+ * __wt_block_alloc --
+ * Alloc a chunk of space from the underlying file.
+ */
+int
+__wt_block_alloc(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+
+ /* Assert we're maintaining the by-size skiplist. */
+ WT_ASSERT(session, block->live.avail.track_size != 0);
+
+ WT_STAT_FAST_DATA_INCR(session, block_alloc);
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "cannot allocate a block size %" PRIdMAX " that is not "
+ "a multiple of the allocation size %" PRIu32,
+ (intmax_t)size, block->allocsize);
+
+ /*
+ * Allocation is either first-fit (lowest offset), or best-fit (best
+ * size). If it's first-fit, walk the offset list linearly until we
+ * find an entry that will work.
+ *
+ * If it's best-fit by size, search the by-size skiplist for the size
+ * and take the first entry on the by-size offset list. This means we
+ * prefer best-fit over lower offset, but within a size we'll prefer an
+ * offset appearing earlier in the file.
+ *
+ * If we don't have anything big enough, extend the file.
+ */
+ if (block->live.avail.bytes < (uint64_t)size)
+ goto append;
+ if (block->allocfirst) {
+ if (!__block_first_srch(block->live.avail.off, size, estack))
+ goto append;
+ ext = *estack[0];
+ } else {
+ __block_size_srch(block->live.avail.sz, size, sstack);
+ if ((szp = *sstack[0]) == NULL) {
+append: WT_RET(__block_extend(session, block, offp, size));
+ WT_RET(__block_append(session,
+ &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+ }
+
+ /* Take the first record. */
+ ext = szp->off[0];
+ }
+
+ /* Remove the record, and set the returned offset. */
+ WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext));
+ *offp = ext->off;
+
+ /* If doing a partial allocation, adjust the record and put it back. */
+ if (ext->size > size) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate %" PRIdMAX " from range %" PRIdMAX "-%"
+ PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)size,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)(ext->off + size),
+ (intmax_t)(ext->off + size + ext->size - size)));
+
+ ext->off += size;
+ ext->size -= size;
+ WT_RET(__block_ext_insert(session, &block->live.avail, ext));
+ } else {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate range %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)));
+
+ __wt_block_ext_free(session, ext);
+ }
+
+ /* Add the newly allocated extent to the list of allocations. */
+ WT_RET(__block_merge(
+ session, &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+}
+
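+/*
+ * To illustrate the best-fit path above (hypothetical sizes): an 8KB
+ * allocation searches the by-size skiplist for 8KB, takes the
+ * lowest-offset extent from that size bucket (or the next larger one),
+ * and if that extent is, say, 12KB, returns its first 8KB and
+ * re-inserts the trailing 4KB on the avail list.
+ */
+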
+/*
+ * __wt_block_free --
+ * Free a cookie-referenced chunk of space to the underlying file.
+ */
+int
+__wt_block_free(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ WT_STAT_FAST_DATA_INCR(session, block_free);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size));
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_block_misplaced(session, block, "free", offset, size, 1));
+#endif
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_off_free --
+ * Free a file range to the underlying file.
+ */
+int
+__wt_block_off_free(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+ WT_DECL_RET;
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent lists.
+ *
+ * We can reuse this extent immediately if it was allocated during this
+ * checkpoint, merge it into the avail list (which slows file growth in
+ * workloads including repeated overflow record modification). If this
+ * extent is referenced in a previous checkpoint, merge into the discard
+ * list.
+ */
+ if ((ret = __wt_block_off_remove_overlap(
+ session, &block->live.alloc, offset, size)) == 0)
+ ret = __block_merge(
+ session, &block->live.avail, offset, (wt_off_t)size);
+ else if (ret == WT_NOTFOUND)
+ ret = __block_merge(
+ session, &block->live.discard, offset, (wt_off_t)size);
+ return (ret);
+}
+
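+/*
+ * In other words (illustrative): freeing a block allocated since the
+ * last checkpoint finds it on the live alloc list and merges it into
+ * avail for immediate reuse; freeing a block referenced by an earlier
+ * checkpoint misses the alloc list (WT_NOTFOUND) and merges into
+ * discard instead, deferring reuse until no checkpoint needs it.
+ */
+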
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_extlist_check --
+ * Return if the extent lists overlap.
+ */
+int
+__wt_block_extlist_check(
+ WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
+{
+ WT_EXT *a, *b;
+
+ a = al->off[0];
+ b = bl->off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (a != NULL && b != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the
+ * next entry in its list.
+ */
+ if (a->off + a->size <= b->off) {
+ a = a->next[0];
+ continue;
+ }
+ if (b->off + b->size <= a->off) {
+ b = b->next[0];
+ continue;
+ }
+ WT_PANIC_RET(session, EINVAL,
+ "checkpoint merge check: %s list overlaps the %s list",
+ al->name, bl->name);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __wt_block_extlist_overlap --
+ * Review a checkpoint's alloc/discard extent lists, move overlaps into the
+ * live system's checkpoint-avail list.
+ */
+int
+__wt_block_extlist_overlap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+ WT_EXT *alloc, *discard;
+
+ alloc = ci->alloc.off[0];
+ discard = ci->discard.off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (alloc != NULL && discard != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the
+ * next entry in its list.
+ */
+ if (alloc->off + alloc->size <= discard->off) {
+ alloc = alloc->next[0];
+ continue;
+ }
+ if (discard->off + discard->size <= alloc->off) {
+ discard = discard->next[0];
+ continue;
+ }
+
+ /* Reconcile the overlap. */
+ WT_RET(__block_ext_overlap(session, block,
+ &ci->alloc, &alloc, &ci->discard, &discard));
+ }
+ return (0);
+}
+
+/*
+ * __block_ext_overlap --
+ * Reconcile two overlapping ranges.
+ */
+static int
+__block_ext_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+{
+ WT_EXT *a, *b, **ext;
+ WT_EXTLIST *avail, *el;
+ wt_off_t off, size;
+
+ avail = &block->live.ckpt_avail;
+
+ /*
+ * The ranges overlap, choose the range we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ *
+ * By swapping the arguments so "A" is always the lower range, we can
+ * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #9 AAAAA A is a prefix of B
+ */
+ a = *ap;
+ b = *bp;
+ if (a->off > b->off) { /* Swap */
+ b = *ap;
+ a = *bp;
+ ext = ap; ap = bp; bp = ext;
+ el = ael; ael = bel; bel = el;
+ }
+
+ if (a->off == b->off) { /* Case #1, #4, #9 */
+ if (a->size == b->size) { /* Case #1 */
+ /*
+ * Move caller's A and B to the next element
+ * Add that A and B range to the avail list
+ * Delete A and B
+ */
+ *ap = (*ap)->next[0];
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+ else if (a->size > b->size) { /* Case #4 */
+ /*
+ * Remove A from its list
+ * Increment/Decrement A's offset/size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->off += b->size;
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else { /* Case #9 */
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the size of A
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += a->size;
+ b->size -= a->size;
+ WT_RET(__block_ext_insert(session, bel, b));
+
+ /*
+ * Move caller's A to the next element
+ * Add A's range to the avail list
+ * Delete A
+ */
+ *ap = (*ap)->next[0];
+ WT_RET(__block_merge(session, avail, a->off, a->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ }
+ } else if (a->off + a->size == b->off + b->size) { /* Case #6 */
+ /*
+ * Remove A from its list
+ * Decrement A's size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else if (a->off + a->size < b->off + b->size) { /* Case #3, #7 */
+ /*
+ * Add overlap to the avail list
+ */
+ off = b->off;
+ size = (a->off + a->size) - b->off;
+ WT_RET(__block_merge(session, avail, off, size));
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by the overlap
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the overlap
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += size;
+ b->size -= size;
+ WT_RET(__block_ext_insert(session, bel, b));
+ } else { /* Case #5 */
+ /* Calculate the offset/size of the trailing part of A. */
+ off = b->off + b->size;
+ size = (a->off + a->size) - off;
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by trailing part of A plus B's size
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size = b->off - a->off;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /* Add trailing part of A to A's list as a new element. */
+ WT_RET(__block_merge(session, ael, off, size));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+
+ return (0);
+}
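+
+/*
+ * A worked example of case #5 (B in the middle of A), assuming 4KB
+ * allocation units: if the alloc list holds A = {4096, 16384} and the
+ * discard list holds B = {8192, 4096}, the code above shrinks A to
+ * {4096, 4096}, re-inserts the trailing piece {12288, 8192} into A's
+ * list, and moves the doubly-referenced range {8192, 4096} onto the
+ * live system's checkpoint-avail list.
+ */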
+
+/*
+ * __wt_block_extlist_merge --
+ * Merge one extent list into another.
+ */
+int
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+{
+ WT_EXT *ext;
+ WT_EXTLIST tmp;
+ u_int i;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name));
+
+ /*
+ * Sometimes the list we are merging is much bigger than the other: if
+ * so, swap the lists around to reduce the amount of work we need to do
+ * during the merge. The size lists have to match as well, so this is
+ * only possible if both lists track sizes, or neither does.
+ */
+ if (a->track_size == b->track_size && a->entries > b->entries) {
+ tmp = *a;
+ a->bytes = b->bytes;
+ b->bytes = tmp.bytes;
+ a->entries = b->entries;
+ b->entries = tmp.entries;
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ a->off[i] = b->off[i];
+ b->off[i] = tmp.off[i];
+ a->sz[i] = b->sz[i];
+ b->sz[i] = tmp.sz[i];
+ }
+ }
+
+ WT_EXT_FOREACH(ext, a->off)
+ WT_RET(__block_merge(session, b, ext->off, ext->size));
+
+ return (0);
+}
+
+/*
+ * __block_append --
+ * Append a new entry to the allocation list.
+ */
+static int
+__block_append(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ WT_ASSERT(session, el->track_size == 0);
+
+ /*
+ * Identical to __block_merge, when we know the file is being extended,
+ * that is, the information is either going to be used to extend the
+ * last object on the list, or become a new object ending the list.
+ *
+ * The terminating element of the list is cached, check it; otherwise,
+ * get a stack for the last object in the skiplist, check for a simple
+ * extension, and otherwise append a new structure.
+ */
+ if ((ext = el->last) != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ ext = __block_off_srch_last(el->off, astack);
+ if (ext != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext;
+ ++el->entries;
+ }
+
+ /* Update the cached end-of-list */
+ el->last = ext;
+ }
+ el->bytes += (uint64_t)size;
+
+ return (0);
+}
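+
+/*
+ * For illustration, assuming 4KB blocks: if the cached end-of-list entry
+ * is {8192, 4096} and the file grows by a block at offset 12288, the fast
+ * path above extends the entry to {8192, 8192} without a skiplist search
+ * or a new WT_EXT allocation.
+ */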
+
+/*
+ * __wt_block_insert_ext --
+ * Insert an extent into an extent list, merging if possible.
+ */
+int
+__wt_block_insert_ext(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ /*
+ * There are currently two copies of this function (this code is a one-
+ * liner that calls the internal version of the function, which means
+ * the compiler should compress out the function call). It's that way
+ * because the interface is still fluid, I'm not convinced there won't
+ * be a need for a functional split between the internal and external
+ * versions in the future.
+ *
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent list.
+ */
+ return (__block_merge(session, el, off, size));
+}
+
+/*
+ * __block_merge --
+ * Insert an extent into an extent list, merging if possible (internal
+ * version).
+ */
+static int
+__block_merge(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, *after, *before;
+
+ /*
+ * Retrieve the records preceding/following the offset. If the records
+ * are contiguous with the free'd offset, combine records.
+ */
+ __block_off_srch_pair(el, off, &before, &after);
+ if (before != NULL) {
+ if (before->off + before->size > off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: existing range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)before->off,
+ (intmax_t)(before->off + before->size),
+ (intmax_t)off, (intmax_t)(off + size));
+ if (before->off + before->size != off)
+ before = NULL;
+ }
+ if (after != NULL) {
+ if (off + size > after->off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: merge range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with existing range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size),
+ (intmax_t)after->off,
+ (intmax_t)(after->off + after->size));
+ if (off + size != after->off)
+ after = NULL;
+ }
+ if (before == NULL && after == NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: insert range %" PRIdMAX "-%" PRIdMAX,
+ el->name, (intmax_t)off, (intmax_t)(off + size)));
+
+ return (__block_off_insert(session, el, off, size));
+ }
+
+ /*
+ * If the "before" offset range abuts, we'll use it as our new record;
+ * if the "after" offset range also abuts, include its size and remove
+ * it from the system. Else, only the "after" offset range abuts, use
+ * the "after" offset range as our new record. In either case, remove
+ * the record we're going to use, adjust it and re-insert it.
+ */
+ if (before == NULL) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)off, (intmax_t)(off + ext->size + size)));
+
+ ext->off = off;
+ ext->size += size;
+ } else {
+ if (after != NULL) {
+ size += after->size;
+ WT_RET(
+ __block_off_remove(session, el, after->off, NULL));
+ }
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)ext->off,
+ (intmax_t)(ext->off + ext->size + size)));
+
+ ext->size += size;
+ }
+ return (__block_ext_insert(session, el, ext));
+}
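+
+/*
+ * A merge example, assuming 4KB blocks: with {8192, 4096} and
+ * {16384, 4096} already on the list, inserting {12288, 4096} finds both
+ * a "before" and an "after" neighbor, removes the "after" entry and
+ * folds all three ranges into the single extent {8192, 12288}.
+ */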
+
+/*
+ * __wt_block_extlist_read_avail --
+ *	Read an avail extent list; includes minor special handling.
+ */
+int
+__wt_block_extlist_read_avail(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+ WT_DECL_RET;
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, reads are checked against the available and
+ * discard lists (a block being read should never appear on either).
+ * Checkpoint threads may be running in the file, don't race with
+ * them.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+#endif
+
+ WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));
+
+ /*
+ * Extent blocks are allocated from the available list: if reading the
+ * avail list, the extent blocks might be included, remove them.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_block_off_remove_overlap(session, el, el->offset, el->size));
+
+err:
+#ifdef HAVE_DIAGNOSTIC
+ __wt_spin_unlock(session, &block->live_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * __wt_block_extlist_read --
+ * Read an extent list.
+ */
+int
+__wt_block_extlist_read(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ wt_off_t off, size;
+ int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+ const uint8_t *p;
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ WT_RET(__wt_scr_alloc(session, el->size, &tmp));
+ WT_ERR(__wt_block_read_off(
+ session, block, tmp, el->offset, el->size, el->cksum));
+
+#define WT_EXTLIST_READ(p, v) do { \
+ uint64_t _v; \
+ WT_ERR(__wt_vunpack_uint(&(p), 0, &_v)); \
+ (v) = (wt_off_t)_v; \
+} while (0)
+
+ p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_EXTLIST_READ(p, off);
+ WT_EXTLIST_READ(p, size);
+ if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
+ goto corrupted;
+
+ /*
+ * If we're not creating both offset and size skiplists, use the simpler
+ * append API, otherwise do a full merge. There are two reasons for the
+ * test: first, checkpoint "available" lists are NOT sorted (checkpoints
+ * write two separate lists, both of which are sorted but they're not
+ * merged). Second, the "available" list is sorted by size as well as
+ * by offset, and the fast-path append code doesn't support that; it's
+ * limited to offset. The test of "track size" is short-hand for "are
+ * we reading the available list?".
+ */
+ func = el->track_size == 0 ? __block_append : __block_merge;
+ for (;;) {
+ WT_EXTLIST_READ(p, off);
+ WT_EXTLIST_READ(p, size);
+ if (off == WT_BLOCK_INVALID_OFFSET)
+ break;
+
+ /*
+ * We check the offset/size pairs represent valid file ranges,
+ * then insert them into the list. We don't necessarily have
+ * to check for offsets past the end of the checkpoint, but it's
+ * a cheap test to do here and we'd have to do the check as part
+ * of file verification, regardless.
+ */
+ if (off < block->allocsize ||
+ off % block->allocsize != 0 ||
+ size % block->allocsize != 0 ||
+ off + size > ckpt_size)
+corrupted: WT_PANIC_RET(session, WT_ERROR,
+ "file contains a corrupted %s extent list, range %"
+ PRIdMAX "-%" PRIdMAX " misaligned or past end-of-file",
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size));
+
+ WT_ERR(func(session, el, off, size));
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+ WT_ERR(__block_extlist_dump(session, "read extlist", el, 0));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_extlist_write --
+ * Write an extent list at the tail of the file.
+ */
+int
+__wt_block_extlist_write(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint32_t entries;
+ uint8_t *p;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+ WT_RET(__block_extlist_dump(session, "write extlist", el, 0));
+
+ /*
+ * Figure out how many entries we're writing -- if there aren't any
+ * entries, we're done.
+ */
+ entries = el->entries + (additional == NULL ? 0 : additional->entries);
+ if (entries == 0) {
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->cksum = el->size = 0;
+ return (0);
+ }
+
+ /*
+ * Get a scratch buffer, clear the page's header and data, initialize
+ * the header.
+ *
+ * Allocate memory for the extent list entries plus two additional
+ * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
+ * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+ */
+ size = (entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_block_write_size(session, block, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
+ dsk->type = WT_PAGE_BLOCK_MANAGER;
+
+#define WT_EXTLIST_WRITE(p, v) \
+ WT_ERR(__wt_vpack_uint(&(p), 0, (uint64_t)(v)))
+
+ /* Fill the page's data. */
+ p = WT_BLOCK_HEADER_BYTE(dsk);
+ WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC); /* Initial value */
+ WT_EXTLIST_WRITE(p, 0);
+ WT_EXT_FOREACH(ext, el->off) { /* Free ranges */
+ WT_EXTLIST_WRITE(p, ext->off);
+ WT_EXTLIST_WRITE(p, ext->size);
+ }
+ if (additional != NULL)
+ WT_EXT_FOREACH(ext, additional->off) { /* Free ranges */
+ WT_EXTLIST_WRITE(p, ext->off);
+ WT_EXTLIST_WRITE(p, ext->size);
+ }
+ WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */
+ WT_EXTLIST_WRITE(p, 0);
+
+ dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
+ tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * The extent list is written as a valid btree page because the salvage
+ * functionality might move into the btree layer some day; besides, we
+ * don't need another format, and this way the page format can be easily
+ * verified.
+ */
+ WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
+#endif
+
+ /* Write the extent list to disk. */
+ WT_ERR(__wt_block_write_off(
+ session, block, tmp, &el->offset, &el->size, &el->cksum, 1, 1));
+
+ /*
+ * Remove the allocated blocks from the system's allocation list;
+ * extent blocks never appear on any allocation list.
+ */
+ WT_TRET(__wt_block_off_remove_overlap(
+ session, &block->live.alloc, el->offset, el->size));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s written %" PRIdMAX "/%" PRIu32,
+ el->name, (intmax_t)el->offset, el->size));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
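+
+/*
+ * For reference, the extent list page written above carries a sequence of
+ * variable-length packed pairs (see the WT_EXTLIST_WRITE calls):
+ *
+ *	{WT_BLOCK_EXTLIST_MAGIC, 0}	initial pair
+ *	{off, size} ...			one pair per extent
+ *	{WT_BLOCK_INVALID_OFFSET, 0}	terminating pair
+ *
+ * which is exactly the layout __wt_block_extlist_read unpacks.
+ */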
+
+/*
+ * __wt_block_extlist_truncate --
+ * Truncate the file based on the last available extent in the list.
+ */
+int
+__wt_block_extlist_truncate(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_FH *fh;
+ wt_off_t orig, size;
+
+ fh = block->fh;
+
+ /*
+ * Check if the last available extent is at the end of the file, and if
+ * so, truncate the file and discard the extent.
+ */
+ if ((ext = __block_off_srch_last(el->off, astack)) == NULL)
+ return (0);
+ WT_ASSERT(session, ext->off + ext->size <= fh->size);
+ if (ext->off + ext->size < fh->size)
+ return (0);
+
+ /*
+ * Remove the extent list entry. (Save the value, we need it to reset
+ * the cached file size, and that can't happen until after the extent
+ * list removal succeeds.)
+ */
+ orig = fh->size;
+ size = ext->off;
+ WT_RET(__block_off_remove(session, el, size, NULL));
+ fh->size = size;
+
+ /*
+ * Truncate the file. The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file); that's OK, we'll ignore
+ * those blocks.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "truncate file from %" PRIdMAX " to %" PRIdMAX,
+ (intmax_t)orig, (intmax_t)size));
+ WT_RET_BUSY_OK(__wt_ftruncate(session, block->fh, size));
+
+ return (0);
+}
+
+/*
+ * __wt_block_extlist_init --
+ * Initialize an extent list.
+ */
+int
+__wt_block_extlist_init(WT_SESSION_IMPL *session,
+ WT_EXTLIST *el, const char *name, const char *extname, int track_size)
+{
+ size_t size;
+
+ WT_CLEAR(*el);
+
+ /* Size the buffer for "name", ".", "extname" and a terminating nul. */
+ size = (name == NULL ? 0 : strlen(name)) +
+ strlen(".") + (extname == NULL ? 0 : strlen(extname)) + 1;
+ WT_RET(__wt_calloc_def(session, size, &el->name));
+ (void)snprintf(el->name, size, "%s.%s",
+ name == NULL ? "" : name, extname == NULL ? "" : extname);
+
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->track_size = track_size;
+ return (0);
+}
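+
+/*
+ * A usage sketch (the names are illustrative): the live checkpoint's
+ * lists might be set up as
+ *
+ *	__wt_block_extlist_init(session, &ci->alloc, "live", "alloc", 0);
+ *	__wt_block_extlist_init(session, &ci->avail, "live", "avail", 1);
+ *
+ * naming the lists "live.alloc" and "live.avail"; only the avail list
+ * tracks sizes, because block allocation searches by size.
+ */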
+
+/*
+ * __wt_block_extlist_free --
+ * Discard an extent list.
+ */
+void
+__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
+{
+ WT_EXT *ext, *next;
+ WT_SIZE *szp, *nszp;
+
+ __wt_free(session, el->name);
+
+ for (ext = el->off[0]; ext != NULL; ext = next) {
+ next = ext->next[0];
+ __wt_free(session, ext);
+ }
+ for (szp = el->sz[0]; szp != NULL; szp = nszp) {
+ nszp = szp->next[0];
+ __wt_free(session, szp);
+ }
+
+ /* Extent lists are re-used, clear them. */
+ WT_CLEAR(*el);
+}
+
+/*
+ * __block_extlist_dump --
+ * Dump an extent list as verbose messages.
+ */
+static int
+__block_extlist_dump(
+ WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
+{
+ WT_EXT *ext;
+ WT_SIZE *szp;
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: %s: %" PRIu64 " bytes, by offset:%s",
+ tag, el->name, el->bytes, el->entries == 0 ? " [Empty]" : ""));
+ if (el->entries == 0)
+ return (0);
+
+ WT_EXT_FOREACH(ext, el->off)
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t{%" PRIuMAX "/%" PRIuMAX "}",
+ (uintmax_t)ext->off, (uintmax_t)ext->size));
+
+ if (!show_size)
+ return (0);
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: %s: by size:%s",
+ tag, el->name, el->entries == 0 ? " [Empty]" : ""));
+ if (el->entries == 0)
+ return (0);
+
+ WT_EXT_FOREACH(szp, el->sz) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t{%" PRIuMAX "}", (uintmax_t)szp->size));
+ WT_EXT_FOREACH_OFF(ext, szp->off)
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t\t{%" PRIuMAX "/%" PRIuMAX "}",
+ (uintmax_t)ext->off, (uintmax_t)ext->size));
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_map.c b/src/third_party/wiredtiger/src/block/block_map.c
new file mode 100644
index 00000000000..68fb75179d9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_map.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_map --
+ * Map a segment of the file in, if possible.
+ */
+int
+__wt_block_map(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp,
+ void **mappingcookie)
+{
+ *(void **)mapp = NULL;
+ *maplenp = 0;
+
+ /*
+ * Turn off mapping when verifying the file, because we can't perform
+ * checksum validation of mapped segments, and verify has to checksum
+ * pages.
+ */
+ if (block->verify)
+ return (0);
+
+ /*
+ * Turn off mapping when direct I/O is configured for the file, the
+ * Linux open(2) documentation says applications should avoid mixing
+ * mmap(2) of files with direct I/O to the same files.
+ */
+ if (block->fh->direct_io)
+ return (0);
+
+ /*
+ * Turn off mapping if the application configured a cache size maximum;
+ * we can't control how much of the cache we use in that case.
+ */
+ if (block->os_cache_max != 0)
+ return (0);
+
+ /*
+ * Map the file into memory. Ignore errors: we'll read the file through
+ * the cache if the mapping fails.
+ */
+ (void)__wt_mmap(session, block->fh, mapp, maplenp, mappingcookie);
+
+ return (0);
+}
+
+/*
+ * __wt_block_unmap --
+ * Unmap any mapped-in segment of the file.
+ */
+int
+__wt_block_unmap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen,
+ void **mappingcookie)
+{
+ /* Unmap the file from memory. */
+ return (__wt_munmap(session, block->fh, map, maplen, mappingcookie));
+}
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
new file mode 100644
index 00000000000..4f7f2898de5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -0,0 +1,433 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __bm_method_set(WT_BM *, int);
+
+/*
+ * __bm_readonly --
+ * General-purpose "writes not supported on this handle" function.
+ */
+static int
+__bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, ENOTSUP,
+ "%s: write operation on read-only checkpoint handle",
+ bm->block->name);
+}
+
+/*
+ * __bm_addr_string --
+ * Return a printable string representation of an address cookie.
+ */
+static int
+__bm_addr_string(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ return (
+ __wt_block_addr_string(session, bm->block, buf, addr, addr_size));
+}
+
+/*
+ * __bm_addr_valid --
+ * Return if an address cookie is valid.
+ */
+static int
+__bm_addr_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_addr_valid(
+ session, bm->block, addr, addr_size, bm->is_live));
+}
+
+/*
+ * __bm_block_header --
+ * Return the size of the block header.
+ */
+static u_int
+__bm_block_header(WT_BM *bm)
+{
+ return (__wt_block_header(bm->block));
+}
+
+/*
+ * __bm_checkpoint --
+ * Write a buffer into a block, creating a checkpoint.
+ */
+static int
+__bm_checkpoint(WT_BM *bm,
+ WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ return (__wt_block_checkpoint(
+ session, bm->block, buf, ckptbase, data_cksum));
+}
+
+/*
+ * __bm_sync --
+ * Flush a file to disk.
+ */
+static int
+__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, int async)
+{
+ return (async ?
+ __wt_fsync_async(session, bm->block->fh) :
+ __wt_fsync(session, bm->block->fh));
+}
+
+/*
+ * __bm_checkpoint_load --
+ * Load a checkpoint.
+ */
+static int
+__bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* If not opening a checkpoint, we're opening the live system. */
+ bm->is_live = !checkpoint;
+ WT_RET(__wt_block_checkpoint_load(session, bm->block,
+ addr, addr_size, root_addr, root_addr_sizep, checkpoint));
+
+ if (checkpoint) {
+ /*
+ * Read-only objects are optionally mapped into memory instead
+ * of being read into cache buffers.
+ */
+ if (conn->mmap)
+ WT_RET(__wt_block_map(session, bm->block,
+ &bm->map, &bm->maplen, &bm->mappingcookie));
+
+ /*
+ * If this handle is for a checkpoint, that is, read-only, there
+ * isn't a lot you can do with it. Although the btree layer
+ * prevents attempts to write a checkpoint reference, paranoia
+ * is healthy.
+ */
+ __bm_method_set(bm, 1);
+ }
+
+ return (0);
+}
+
+/*
+ * __bm_checkpoint_resolve --
+ * Resolve the checkpoint.
+ */
+static int
+__bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_checkpoint_resolve(session, bm->block));
+}
+
+/*
+ * __bm_checkpoint_unload --
+ *	Unload a checkpoint.
+ */
+static int
+__bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /* Unmap any mapped segment. */
+ if (bm->map != NULL)
+ WT_TRET(__wt_block_unmap(session,
+ bm->block, bm->map, bm->maplen, &bm->mappingcookie));
+
+ /* Unload the checkpoint. */
+ WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
+
+ return (ret);
+}
+
+/*
+ * __bm_close --
+ * Close a file.
+ */
+static int
+__bm_close(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ if (bm == NULL) /* Safety check */
+ return (0);
+
+ ret = __wt_block_close(session, bm->block);
+
+ __wt_overwrite_and_free(session, bm);
+ return (ret);
+}
+
+/*
+ * __bm_compact_start --
+ * Start a block manager compaction.
+ */
+static int
+__bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_start(session, bm->block));
+}
+
+/*
+ * __bm_compact_page_skip --
+ * Return if a page is useful for compaction.
+ */
+static int
+__bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, int *skipp)
+{
+ return (__wt_block_compact_page_skip(
+ session, bm->block, addr, addr_size, skipp));
+}
+
+/*
+ * __bm_compact_skip --
+ * Return if a file can be compacted.
+ */
+static int
+__bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, int *skipp)
+{
+ return (__wt_block_compact_skip(session, bm->block, skipp));
+}
+
+/*
+ * __bm_compact_end --
+ * End a block manager compaction.
+ */
+static int
+__bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_end(session, bm->block));
+}
+
+/*
+ * __bm_free --
+ * Free a block of space to the underlying file.
+ */
+static int
+__bm_free(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_free(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_stat --
+ * Block-manager statistics.
+ */
+static int
+__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
+{
+ __wt_block_stat(session, bm->block, stats);
+ return (0);
+}
+
+/*
+ * __bm_write --
+ * Write a buffer into a block, returning the block's address cookie.
+ */
+static int
+__bm_write(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+ return (__wt_block_write(
+ session, bm->block, buf, addr, addr_sizep, data_cksum));
+}
+
+/*
+ * __bm_write_size --
+ * Return the buffer size required to write a block.
+ */
+static int
+__bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep)
+{
+ return (__wt_block_write_size(session, bm->block, sizep));
+}
+
+/*
+ * __bm_salvage_start --
+ * Start a block manager salvage.
+ */
+static int
+__bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_start(session, bm->block));
+}
+
+/*
+ * __bm_salvage_valid --
+ * Inform salvage a block is valid.
+ */
+static int
+__bm_salvage_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, int valid)
+{
+ return (__wt_block_salvage_valid(
+ session, bm->block, addr, addr_size, valid));
+}
+
+/*
+ * __bm_salvage_next --
+ * Return the next block from the file.
+ */
+static int
+__bm_salvage_next(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ return (__wt_block_salvage_next(
+ session, bm->block, addr, addr_sizep, eofp));
+}
+
+/*
+ * __bm_salvage_end --
+ * End a block manager salvage.
+ */
+static int
+__bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_end(session, bm->block));
+}
+
+/*
+ * __bm_verify_start --
+ * Start a block manager verify.
+ */
+static int
+__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ return (__wt_block_verify_start(session, bm->block, ckptbase));
+}
+
+/*
+ * __bm_verify_addr --
+ * Verify an address.
+ */
+static int
+__bm_verify_addr(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_verify_addr(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_verify_end --
+ * End a block manager verify.
+ */
+static int
+__bm_verify_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_verify_end(session, bm->block));
+}
+
+/*
+ * __bm_method_set --
+ * Set up the legal methods.
+ */
+static void
+__bm_method_set(WT_BM *bm, int readonly)
+{
+ if (readonly) {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int))__bm_readonly;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, int *))__bm_readonly;
+ bm->compact_skip = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, int *))__bm_readonly;
+ bm->compact_start =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->free = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ uint8_t *, size_t *, int *))__bm_readonly;
+ bm->salvage_start = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_valid = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, uint8_t *, size_t, int))__bm_readonly;
+ bm->stat = __bm_stat;
+ bm->sync =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *, int))__bm_readonly;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ WT_ITEM *, uint8_t *, size_t *, int))__bm_readonly;
+ bm->write_size = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly;
+ } else {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = __bm_checkpoint;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve = __bm_checkpoint_resolve;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end = __bm_compact_end;
+ bm->compact_page_skip = __bm_compact_page_skip;
+ bm->compact_skip = __bm_compact_skip;
+ bm->compact_start = __bm_compact_start;
+ bm->free = __bm_free;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = __bm_salvage_end;
+ bm->salvage_next = __bm_salvage_next;
+ bm->salvage_start = __bm_salvage_start;
+ bm->salvage_valid = __bm_salvage_valid;
+ bm->stat = __bm_stat;
+ bm->sync = __bm_sync;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = __bm_write;
+ bm->write_size = __bm_write_size;
+ }
+}
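+
+/*
+ * For example, once __bm_method_set(bm, 1) has run for a checkpoint
+ * handle, a call such as bm->write(bm, session, buf, addr, &addr_size, 0)
+ * resolves to __bm_readonly and fails with ENOTSUP instead of modifying
+ * the checkpoint; the casts above let that one function stand in for
+ * every method with a write side-effect.
+ */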
+
+/*
+ * __wt_block_manager_open --
+ * Open a file.
+ */
+int
+__wt_block_manager_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[],
+ int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+
+ *bmp = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &bm));
+ __bm_method_set(bm, 0);
+
+ WT_ERR(__wt_block_open(session, filename, cfg,
+ forced_salvage, readonly, allocsize, &bm->block));
+
+ *bmp = bm;
+ return (0);
+
+err: WT_TRET(bm->close(bm, session));
+ return (ret);
+}
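+
+/*
+ * A caller-side sketch (the file name and configuration are hypothetical):
+ *
+ *	WT_BM *bm;
+ *	WT_RET(__wt_block_manager_open(
+ *	    session, "test.wt", cfg, 0, 0, 4096, &bm));
+ *	...
+ *	WT_RET(bm->close(bm, session));
+ *
+ * After the open, block operations go through the bm method table rather
+ * than through direct calls to the underlying __wt_block_* functions.
+ */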
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
new file mode 100644
index 00000000000..2fbaa0fe331
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_manager_truncate --
+ * Truncate a file.
+ */
+int
+__wt_block_manager_truncate(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+ WT_DECL_RET;
+ WT_FH *fh;
+
+ /* Open the underlying file handle. */
+ WT_RET(__wt_open(session, filename, 0, 0, WT_FILE_TYPE_DATA, &fh));
+
+ /* Truncate the file. */
+ WT_ERR(__wt_ftruncate(session, fh, (wt_off_t)0));
+
+ /* Write out the file's meta-data. */
+ ret = __wt_desc_init(session, fh, allocsize);
+
+ /* Close the file handle. */
+err: WT_TRET(__wt_close(session, fh));
+
+ return (ret);
+}
+
+/*
+ * __wt_block_manager_create --
+ * Create a file.
+ */
+int
+__wt_block_manager_create(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+ WT_DECL_RET;
+ WT_FH *fh;
+
+ /* Create the underlying file and open a handle. */
+ WT_RET(__wt_open(session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh));
+
+ /* Write out the file's meta-data. */
+ ret = __wt_desc_init(session, fh, allocsize);
+
+ /* Close the file handle. */
+ WT_TRET(__wt_close(session, fh));
+
+ /* Undo any create on error. */
+ if (ret != 0)
+ WT_TRET(__wt_remove(session, filename));
+
+ return (ret);
+}
+
+/*
+ * __block_destroy --
+ * Destroy a block handle.
+ */
+static int
+__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ TAILQ_REMOVE(&conn->blockqh, block, q);
+
+ if (block->name != NULL)
+ __wt_free(session, block->name);
+
+ if (block->fh != NULL)
+ WT_TRET(__wt_close(session, block->fh));
+
+ __wt_spin_destroy(session, &block->live_lock);
+
+ __wt_overwrite_and_free(session, block);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_open --
+ * Open a block handle.
+ */
+int
+__wt_block_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[],
+ int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp)
+{
+ WT_BLOCK *block;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename));
+
+ conn = S2C(session);
+ *blockp = NULL;
+
+ __wt_spin_lock(session, &conn->block_lock);
+ TAILQ_FOREACH(block, &conn->blockqh, q)
+ if (strcmp(filename, block->name) == 0) {
+ ++block->ref;
+ *blockp = block;
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (0);
+ }
+
+ /* Basic structure allocation, initialization. */
+ WT_ERR(__wt_calloc_def(session, 1, &block));
+ block->ref = 1;
+ TAILQ_INSERT_HEAD(&conn->blockqh, block, q);
+
+ WT_ERR(__wt_strdup(session, filename, &block->name));
+ block->allocsize = allocsize;
+
+ WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
+ block->allocfirst =
+ WT_STRING_MATCH("first", cval.str, cval.len) ? 1 : 0;
+
+ /* Configuration: optional OS buffer cache maximum size. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
+ block->os_cache_max = (size_t)cval.val;
+#ifdef HAVE_POSIX_FADVISE
+ if (conn->direct_io && block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported in combination with direct_io");
+#else
+ if (block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported if posix_fadvise not "
+ "available");
+#endif
+
+ /* Configuration: optional immediate write scheduling flag. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
+ block->os_cache_dirty_max = (size_t)cval.val;
+#ifdef HAVE_SYNC_FILE_RANGE
+ if (conn->direct_io && block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported in combination with "
+ "direct_io");
+#else
+ if (block->os_cache_dirty_max) {
+ /*
+ * Ignore any setting if it is not supported.
+ */
+ block->os_cache_dirty_max = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "os_cache_dirty_max ignored when sync_file_range not "
+ "available"));
+ }
+#endif
+
+ /* Open the underlying file handle. */
+ WT_ERR(__wt_open(session, filename, 0, 0,
+ readonly ? WT_FILE_TYPE_CHECKPOINT : WT_FILE_TYPE_DATA,
+ &block->fh));
+
+ /* Initialize the live checkpoint's lock. */
+ WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager"));
+
+ /*
+ * Read the description information from the first block.
+ *
+ * Salvage is a special case: if we're forcing the salvage, we don't
+ * look at anything, including the description information.
+ */
+ if (!forced_salvage)
+ WT_ERR(__desc_read(session, block));
+
+ *blockp = block;
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (0);
+
+err: WT_TRET(__block_destroy(session, block));
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (ret);
+}
+
+/*
+ * __wt_block_close --
+ * Close a block handle.
+ */
+int
+__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ if (block == NULL) /* Safety check */
+ return (0);
+
+ conn = S2C(session);
+
+ WT_TRET(__wt_verbose(session, WT_VERB_BLOCK,
+ "close: %s", block->name == NULL ? "" : block->name ));
+
+ __wt_spin_lock(session, &conn->block_lock);
+
+ /* Reference count is initialized to 1. */
+ if (block->ref == 0 || --block->ref == 0)
+ WT_TRET(__block_destroy(session, block));
+
+ __wt_spin_unlock(session, &conn->block_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_desc_init --
+ * Write a file's initial descriptor structure.
+ */
+int
+__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
+{
+ WT_BLOCK_DESC *desc;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ /* Use a scratch buffer to get correct alignment for direct I/O. */
+ WT_RET(__wt_scr_alloc(session, allocsize, &buf));
+ memset(buf->mem, 0, allocsize);
+
+ desc = buf->mem;
+ desc->magic = WT_BLOCK_MAGIC;
+ desc->majorv = WT_BLOCK_MAJOR_VERSION;
+ desc->minorv = WT_BLOCK_MINOR_VERSION;
+
+ /* Update the checksum. */
+ desc->cksum = 0;
+ desc->cksum = __wt_cksum(desc, allocsize);
+
+ ret = __wt_write(session, fh, (wt_off_t)0, (size_t)allocsize, desc);
+
+ __wt_scr_free(&buf);
+ return (ret);
+}
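+
+/*
+ * For illustration: with a 4KB allocation size the file begins with a
+ * zeroed 4KB block holding only the WT_BLOCK_DESC fields set above (the
+ * magic number, the major/minor version and a checksum computed with the
+ * cksum field itself zeroed), which is what __desc_read verifies on open.
+ */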
+
+/*
+ * __desc_read --
+ * Read and verify the file's metadata.
+ */
+static int
+__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_DESC *desc;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint32_t cksum;
+
+ /* Use a scratch buffer to get correct alignment for direct I/O. */
+ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf));
+
+ /* Read the first allocation-sized block and verify the file format. */
+ WT_ERR(__wt_read(session,
+ block->fh, (wt_off_t)0, (size_t)block->allocsize, buf->mem));
+
+ desc = buf->mem;
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: magic %" PRIu32
+ ", major/minor: %" PRIu32 "/%" PRIu32
+ ", checksum %#" PRIx32,
+ block->name, desc->magic,
+ desc->majorv, desc->minorv,
+ desc->cksum));
+
+ /*
+ * We fail the open if the checksum fails, the magic number is wrong, or
+ * the major/minor numbers are unsupported by this version. This
+ * test is done even if the caller is verifying or salvaging the file:
+ * it makes sense for verify, and for salvage we don't overwrite files
+ * without some reason to believe they are WiredTiger files. The user
+ * may have entered the wrong file name, and is now frantically pounding
+ * their interrupt key.
+ */
+ cksum = desc->cksum;
+ desc->cksum = 0;
+ if (desc->magic != WT_BLOCK_MAGIC ||
+ cksum != __wt_cksum(desc, block->allocsize))
+ WT_ERR_MSG(session, WT_ERROR,
+ "%s does not appear to be a WiredTiger file", block->name);
+
+ if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
+ (desc->majorv == WT_BLOCK_MAJOR_VERSION &&
+ desc->minorv > WT_BLOCK_MINOR_VERSION))
+ WT_ERR_MSG(session, WT_ERROR,
+ "unsupported WiredTiger file version: this build only "
+ "supports major/minor versions up to %d/%d, and the file "
+ "is version %d/%d",
+ WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
+ desc->majorv, desc->minorv);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_block_stat --
+ * Block statistics
+ */
+void
+__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
+{
+ /*
+ * We're looking inside the live system's structure, which normally
+ * requires locking: the chances of a corrupted read are probably
+ * non-existent, and it's only statistics information regardless, but
+ * this isn't a common function for applications to call, so take the
+ * lock.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ WT_STAT_SET(stats, allocation_size, block->allocsize);
+ WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
+ WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
+ WT_STAT_SET(stats, block_size, block->fh->size);
+ __wt_spin_unlock(session, &block->live_lock);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
new file mode 100644
index 00000000000..c528ee4a6aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bm_preload --
+ * Pre-load a page.
+ */
+int
+__wt_bm_preload(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+ int mapped;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+ ret = EINVAL; /* Play games due to conditional compilation */
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Check for a mapped block. */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped)
+ WT_RET(__wt_mmap_preload(
+ session, (uint8_t *)bm->map + offset, size));
+ else {
+#ifdef HAVE_POSIX_FADVISE
+ ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED);
+#endif
+ if (ret != 0) {
+ WT_DECL_ITEM(tmp);
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ ret = __wt_block_read_off(
+ session, block, tmp, offset, size, cksum);
+ __wt_scr_free(&tmp);
+ WT_RET(ret);
+ }
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_preload);
+
+ return (0);
+}
+
+/*
+ * __wt_bm_read --
+ *	Map or read the block referenced by an address cookie into a buffer.
+ */
+int
+__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ int mapped;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /*
+ * Map the block if it's possible.
+ */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped) {
+ buf->data = (uint8_t *)bm->map + offset;
+ buf->size = size;
+ WT_RET(__wt_mmap_preload(session, buf->data, buf->size));
+
+ WT_STAT_FAST_CONN_INCR(session, block_map_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
+ return (0);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the block we're about to read isn't on
+ * the available list, or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "read", offset, size, bm->is_live));
+#endif
+ /* Read the block. */
+ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
+
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system's buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += size) > block->os_cache_max) {
+ WT_DECL_RET;
+
+ block->os_cache = 0;
+ /* Ignore EINVAL - some file systems don't support the flag. */
+ if ((ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
+ ret != EINVAL)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ return (0);
+}
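+
+/*
+ * An accounting example for the cache-discard code above, assuming
+ * os_cache_max is configured as 1MB: block->os_cache accumulates the
+ * bytes read; the first read that pushes the running total past 1MB
+ * resets the counter and asks the kernel, via POSIX_FADV_DONTNEED, to
+ * drop this file's pages from the buffer cache.
+ */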
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_read_off_blind --
+ * Read the block at an offset, try to figure out what it looks like,
+ * debugging only.
+ */
+int
+__wt_block_read_off_blind(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
+{
+ WT_BLOCK_HEADER *blk;
+ uint32_t cksum, size;
+
+ /*
+ * Make sure the buffer is large enough for the header and read the
+ * first allocation-size block.
+ */
+ WT_RET(__wt_buf_init(session, buf, block->allocsize));
+ WT_RET(__wt_read(
+ session, block->fh, offset, (size_t)block->allocsize, buf->mem));
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+
+ /*
+ * Copy out the size and checksum (we're about to re-use the buffer),
+ * and if the size isn't insane, read the rest of the block.
+ */
+ size = blk->disk_size;
+ cksum = blk->cksum;
+ if (__wt_block_offset_invalid(block, offset, size))
+ WT_RET_MSG(session, EINVAL,
+ "block at offset %" PRIuMAX " cannot be a valid block, no "
+ "read attempted",
+ (uintmax_t)offset);
+ return (__wt_block_read_off(session, block, buf, offset, size, cksum));
+}
+#endif
+
+/*
+ * __wt_block_read_off --
+ * Read an addr/size pair referenced block into a buffer.
+ */
+int
+__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ WT_BLOCK_HEADER *blk;
+ size_t bufsize;
+ uint32_t page_cksum;
+
+ WT_RET(__wt_verbose(session, WT_VERB_READ,
+ "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
+ (uintmax_t)offset, size, cksum));
+
+ /*
+ * Grow the buffer as necessary and read the block. Buffers should be
+ * aligned for reading, but there are lots of buffers (for example, file
+ * cursors have two buffers each, key and value), and it's difficult to
+ * be sure we've found all of them. If the buffer isn't aligned, it's
+ * an easy fix: set the flag and guarantee we reallocate it. (Most of
+ * the time on reads, the buffer memory has not yet been allocated, so
+ * we're not adding any additional processing time.)
+ */
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ bufsize = size;
+ else {
+ F_SET(buf, WT_ITEM_ALIGNED);
+ bufsize = WT_MAX(size, buf->memsize + 10);
+ }
+ WT_RET(__wt_buf_init(session, buf, bufsize));
+ WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
+ buf->size = size;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ blk->cksum = 0;
+ page_cksum = __wt_cksum(buf->mem,
+ F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
+ if (cksum != page_cksum) {
+ if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ __wt_errx(session,
+ "read checksum error [%"
+ PRIu32 "B @ %" PRIuMAX ", %"
+ PRIu32 " != %" PRIu32 "]",
+ size, (uintmax_t)offset, cksum, page_cksum);
+
+ /* Panic if a checksum fails during an ordinary read. */
+ return (block->verify ||
+ F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ WT_ERROR :
+ __wt_illegal_value(session, block->name));
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_session.c b/src/third_party/wiredtiger/src/block/block_session.c
new file mode 100644
index 00000000000..fa56b72f49b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_session.c
@@ -0,0 +1,305 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Per session handle cached block manager information.
+ */
+typedef struct {
+ WT_EXT *ext_cache; /* List of WT_EXT handles */
+ u_int ext_cache_cnt; /* Count */
+
+ WT_SIZE *sz_cache; /* List of WT_SIZE handles */
+ u_int sz_cache_cnt; /* Count */
+} WT_BLOCK_MGR_SESSION;
+
+/*
+ * __block_ext_alloc --
+ * Allocate a new WT_EXT structure.
+ */
+static int
+__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+ WT_EXT *ext;
+
+ u_int skipdepth;
+
+ skipdepth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
+ ext->depth = (uint8_t)skipdepth;
+ (*extp) = ext;
+
+ return (0);
+}
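+
+/*
+ * The "2 *" in the allocation above reflects that a WT_EXT node lives on
+ * two skiplists at once: next[0] through next[depth - 1] are the
+ * by-offset forward pointers and next[depth] through next[2 * depth - 1]
+ * are the by-size forward pointers, which is why the recycling code in
+ * __wt_block_ext_alloc clears both next[i] and next[i + ext->depth].
+ */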
+
+/*
+ * __wt_block_ext_alloc --
+ * Return a WT_EXT structure for use.
+ */
+int
+__wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+ WT_EXT *ext;
+ WT_BLOCK_MGR_SESSION *bms;
+ u_int i;
+
+ bms = session->block_manager;
+
+ /* Return a WT_EXT structure for use from a cached list. */
+ if (bms != NULL && bms->ext_cache != NULL) {
+ ext = bms->ext_cache;
+ bms->ext_cache = ext->next[0];
+
+ /* Clear any left-over references. */
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i] = ext->next[i + ext->depth] = NULL;
+
+ /*
+ * The count is advisory to minimize our exposure to bugs, but
+ * don't let it go negative.
+ */
+ if (bms->ext_cache_cnt > 0)
+ --bms->ext_cache_cnt;
+
+ *extp = ext;
+ return (0);
+ }
+
+ return (__block_ext_alloc(session, extp));
+}
+
+/*
+ * __block_ext_prealloc --
+ * Pre-allocate WT_EXT structures.
+ */
+static int
+__block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_EXT *ext;
+
+ bms = session->block_manager;
+
+ for (; bms->ext_cache_cnt < max; ++bms->ext_cache_cnt) {
+ WT_RET(__block_ext_alloc(session, &ext));
+
+ ext->next[0] = bms->ext_cache;
+ bms->ext_cache = ext;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_ext_free --
+ * Add a WT_EXT structure to the cached list.
+ */
+void
+__wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ if ((bms = session->block_manager) == NULL)
+ __wt_free(session, ext);
+ else {
+ ext->next[0] = bms->ext_cache;
+ bms->ext_cache = ext;
+
+ ++bms->ext_cache_cnt;
+ }
+}
+
+/*
+ * __block_ext_discard --
+ * Discard some or all of the WT_EXT structure cache.
+ */
+static int
+__block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_EXT *ext, *next;
+
+ bms = session->block_manager;
+ if (max != 0 && bms->ext_cache_cnt <= max)
+ return (0);
+
+ for (ext = bms->ext_cache; ext != NULL;) {
+ next = ext->next[0];
+ __wt_free(session, ext);
+ ext = next;
+
+ --bms->ext_cache_cnt;
+ if (max != 0 && bms->ext_cache_cnt <= max)
+ break;
+ }
+ bms->ext_cache = ext;
+
+ if (max == 0 && bms->ext_cache_cnt != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "incorrect count in session handle's block manager cache");
+ return (0);
+}
+
+/*
+ * __block_size_alloc --
+ * Allocate a new WT_SIZE structure.
+ */
+static int
+__block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+ return (__wt_calloc(session, 1, sizeof(WT_SIZE), szp));
+}
+
+/*
+ * __wt_block_size_alloc --
+ * Return a WT_SIZE structure for use.
+ */
+int
+__wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ bms = session->block_manager;
+
+ /* Return a WT_SIZE structure for use from a cached list. */
+ if (bms != NULL && bms->sz_cache != NULL) {
+ (*szp) = bms->sz_cache;
+ bms->sz_cache = bms->sz_cache->next[0];
+
+ /*
+ * The count is advisory to minimize our exposure to bugs, but
+ * don't let it go negative.
+ */
+ if (bms->sz_cache_cnt > 0)
+ --bms->sz_cache_cnt;
+ return (0);
+ }
+
+ return (__block_size_alloc(session, szp));
+}
+
+/*
+ * __block_size_prealloc --
+ * Pre-allocate WT_SIZE structures.
+ */
+static int
+__block_size_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_SIZE *sz;
+
+ bms = session->block_manager;
+
+ for (; bms->sz_cache_cnt < max; ++bms->sz_cache_cnt) {
+ WT_RET(__block_size_alloc(session, &sz));
+
+ sz->next[0] = bms->sz_cache;
+ bms->sz_cache = sz;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_size_free --
+ * Add a WT_SIZE structure to the cached list.
+ */
+void
+__wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ if ((bms = session->block_manager) == NULL)
+ __wt_free(session, sz);
+ else {
+ sz->next[0] = bms->sz_cache;
+ bms->sz_cache = sz;
+
+ ++bms->sz_cache_cnt;
+ }
+}
+
+/*
+ * __block_size_discard --
+ * Discard some or all of the WT_SIZE structure cache.
+ */
+static int
+__block_size_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_SIZE *sz, *nsz;
+
+ bms = session->block_manager;
+ if (max != 0 && bms->sz_cache_cnt <= max)
+ return (0);
+
+ for (sz = bms->sz_cache; sz != NULL;) {
+ nsz = sz->next[0];
+ __wt_free(session, sz);
+ sz = nsz;
+
+ --bms->sz_cache_cnt;
+ if (max != 0 && bms->sz_cache_cnt <= max)
+ break;
+ }
+ bms->sz_cache = sz;
+
+ if (max == 0 && bms->sz_cache_cnt != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "incorrect count in session handle's block manager cache");
+ return (0);
+}
+
+/*
+ * __block_manager_session_cleanup --
+ * Clean up the session handle's block manager information.
+ */
+static int
+__block_manager_session_cleanup(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ if (session->block_manager == NULL)
+ return (0);
+
+ WT_TRET(__block_ext_discard(session, 0));
+ WT_TRET(__block_size_discard(session, 0));
+
+ __wt_free(session, session->block_manager);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_ext_prealloc --
+ * Pre-allocate WT_EXT and WT_SIZE structures.
+ */
+int
+__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ if (session->block_manager == NULL) {
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
+ session->block_manager_cleanup =
+ __block_manager_session_cleanup;
+ }
+ WT_RET(__block_ext_prealloc(session, max));
+ WT_RET(__block_size_prealloc(session, max));
+ return (0);
+}
+
+/*
+ * __wt_block_ext_discard --
+ * Discard WT_EXT and WT_SIZE structures after checkpoint runs.
+ */
+int
+__wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_RET(__block_ext_discard(session, max));
+ WT_RET(__block_size_discard(session, max));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
new file mode 100644
index 00000000000..349daa620f5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_salvage_start --
+ * Start a file salvage.
+ */
+int
+__wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ wt_off_t len;
+ uint32_t allocsize;
+
+ allocsize = block->allocsize;
+
+ /* Reset the description information in the first block. */
+ WT_RET(__wt_desc_init(session, block->fh, allocsize));
+
+ /*
+ * Salvage creates a new checkpoint when it's finished, set up for
+ * rolling an empty file forward.
+ */
+ WT_RET(__wt_block_ckpt_init(session, &block->live, "live"));
+
+ /*
+ * Truncate the file to an allocation-size multiple of blocks (bytes
+ * trailing the last block must be garbage, by definition).
+ */
+ if (block->fh->size > allocsize) {
+ len = (block->fh->size / allocsize) * allocsize;
+ if (len != block->fh->size)
+ WT_RET(__wt_ftruncate(session, block->fh, len));
+ } else
+ len = allocsize;
+ block->live.file_size = len;
+
+ /*
+ * The file's first allocation-sized block is description information,
+ * skip it when reading through the file.
+ */
+ block->slvg_off = allocsize;
+
+ /*
+ * The only checkpoint extent we care about is the allocation list.
+ * Start with the entire file on the allocation list, we'll "free"
+ * any blocks we don't want as we process the file.
+ */
+ WT_RET(__wt_block_insert_ext(
+ session, &block->live.alloc, allocsize, len - allocsize));
+
+ return (0);
+}
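+
+/*
+ * For example, with a 4KB allocation size and a 10000B file: the file is
+ * truncated to 8192B (two whole allocation units), the descriptor block
+ * at offset 0 is skipped, and the single extent {4096, 4096} starts life
+ * on the live allocation list.
+ */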
+
+/*
+ * __wt_block_salvage_end --
+ * End a file salvage.
+ */
+int
+__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /* Discard the checkpoint. */
+ return (__wt_block_checkpoint_unload(session, block, 0));
+}
+
+/*
+ * __wt_block_offset_invalid --
+ * Return if the block offset is insane.
+ */
+int
+__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
+{
+ if (size == 0) /* < minimum page size */
+ return (1);
+ if (size % block->allocsize != 0) /* not allocation-size units */
+ return (1);
+ if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */
+ return (1);
+ /* past end-of-file */
+ if (offset + (wt_off_t)size > block->fh->size)
+ return (1);
+ return (0);
+}
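+
+/*
+ * A quick example, assuming a 4KB allocation size and a 20KB file:
+ * offset 8192/size 12288 is valid (8192 + 12288 is exactly end-of-file),
+ * offset 8192/size 16384 fails the end-of-file test, and any size that
+ * isn't a multiple of 4096 fails the allocation-size test.
+ */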
+
+/*
+ * __wt_block_salvage_next --
+ * Return the address for the next potential block from the file.
+ */
+int
+__wt_block_salvage_next(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_FH *fh;
+ wt_off_t max, offset;
+ uint32_t allocsize, cksum, size;
+ uint8_t *endp;
+
+ *eofp = 0;
+
+ fh = block->fh;
+ allocsize = block->allocsize;
+ WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));
+
+ /* Read through the file, looking for pages. */
+ for (max = fh->size;;) {
+ offset = block->slvg_off;
+ if (offset >= max) { /* Check eof. */
+ *eofp = 1;
+ goto done;
+ }
+
+ /*
+ * Read the start of a possible page (an allocation-size block),
+ * and get a page length from it. Once we move past an allocation-size
+ * boundary, we never consider it again.
+ */
+ WT_ERR(__wt_read(
+ session, fh, offset, (size_t)allocsize, tmp->mem));
+ blk = WT_BLOCK_HEADER_REF(tmp->mem);
+ size = blk->disk_size;
+ cksum = blk->cksum;
+
+ /*
+ * Check the block size: if it's not insane, read the block.
+ * Reading the block validates any checksum; if reading the
+ * block succeeds, return its address as a possible page,
+ * otherwise, move past it.
+ */
+ if (!__wt_block_offset_invalid(block, offset, size) &&
+ __wt_block_read_off(
+ session, block, tmp, offset, size, cksum) == 0)
+ break;
+
+ /* Free the allocation-size block. */
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "skipping %" PRIu32 "B at file offset %" PRIuMAX,
+ allocsize, (uintmax_t)offset));
+ WT_ERR(__wt_block_off_free(
+ session, block, offset, (wt_off_t)allocsize));
+ block->slvg_off += allocsize;
+ }
+
+ /* Re-create the address cookie that should reference this block. */
+ endp = addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_salvage_valid --
+ * Let salvage know if a block is valid.
+ */
+int
+__wt_block_salvage_valid(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid)
+{
+ wt_off_t offset;
+ uint32_t size, cksum;
+
+ WT_UNUSED(addr_size);
+
+ /*
+ * Crack the cookie.
+ * If the upper layer took the block, move past it; if the upper layer
+ * rejected the block, move past an allocation size chunk and free it.
+ */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+ if (valid)
+ block->slvg_off = offset + size;
+ else {
+ WT_RET(__wt_block_off_free(
+ session, block, offset, (wt_off_t)block->allocsize));
+ block->slvg_off = offset + block->allocsize;
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
new file mode 100644
index 00000000000..148b4fa9743
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __verify_ckptfrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, wt_off_t, wt_off_t);
+static int __verify_ckptfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_filefrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, const char *, wt_off_t, wt_off_t, int);
+static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+
+/* The bit list ignores the first block: convert to/from a frag/offset. */
+#define WT_OFF_TO_FRAG(block, off) \
+ ((off) / (block)->allocsize - 1)
+#define WT_FRAG_TO_OFF(block, frag) \
+ (((wt_off_t)((frag) + 1)) * (block)->allocsize)
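+
+/*
+ * Illustrative example (editorial note): with a 512B allocation size,
+ * file offset 512 -- the first block after the description block --
+ * maps to frag 0, and frag 0 maps back to offset 512; a 1MB file
+ * yields 2047 frags.
+ */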
+
+/*
+ * __wt_block_verify_start --
+ * Start file verification.
+ */
+int
+__wt_block_verify_start(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+ wt_off_t size;
+
+ /*
+ * Find the last checkpoint in the list: if there are none, or the only
+ * checkpoint we have is fake, there's no work to do. Don't complain,
+ * that's not our problem to solve.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ for (;; --ckpt) {
+ if (ckpt->name != NULL && !F_ISSET(ckpt, WT_CKPT_FAKE))
+ break;
+ if (ckpt == ckptbase)
+ return (0);
+ }
+
+ /* Truncate the file to the size of the last checkpoint. */
+ WT_RET(__verify_last_truncate(session, block, ckpt));
+
+ /*
+ * We're done if the file has no data pages (this happens if we verify
+ * a file immediately after creation or the checkpoint doesn't reflect
+ * any of the data pages).
+ */
+ size = block->fh->size;
+ if (size <= block->allocsize)
+ return (0);
+
+ /* The file size should be a multiple of the allocation size. */
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the file size is not a multiple of the allocation size");
+
+ /*
+ * Allocate a bit array, where each bit represents a single allocation
+ * size piece of the file (this is how we track the parts of the file
+ * we've verified, and check for multiply referenced or unreferenced
+ * blocks). Storing this on the heap seems reasonable; verifying a 1TB
+ * file with a 512B allocation size would require a 256MB bit array:
+ *
+ * (((1 * 2^40) / 512) / 8) = 256 * 2^20
+ *
+ * To verify larger files than we can handle in this way, we'd have to
+ * write parts of the bit array into a disk file.
+ *
+ * Alternatively, we could switch to maintaining ranges of the file as
+ * we do with the extents, but that has its own failure mode, where we
+ * verify many non-contiguous blocks creating too many entries on the
+ * list to fit into memory.
+ */
+ block->frags = (uint64_t)WT_OFF_TO_FRAG(block, size);
+ WT_RET(__bit_alloc(session, block->frags, &block->fragfile));
+
+ /*
+ * We maintain an allocation list that is rolled forward through the
+ * set of checkpoints.
+ */
+ WT_RET(__wt_block_extlist_init(
+ session, &block->verify_alloc, "verify", "alloc", 0));
+
+ /*
+ * The only checkpoint avail list we care about is the last one written;
+ * get it now and initialize the list of file fragments.
+ */
+ WT_RET(__verify_last_avail(session, block, ckpt));
+
+ block->verify = 1;
+ return (0);
+}
+
+/*
+ * __verify_last_avail --
+ * Get the last checkpoint's avail list and load it into the list of file
+ * fragments.
+ */
+static int
+__verify_last_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+
+ el = &ci->avail;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_ERR(__wt_block_extlist_read_avail(
+ session, block, el, ci->file_size));
+ WT_EXT_FOREACH(ext, el->off)
+ if ((ret = __verify_filefrag_add(session, block,
+ "avail-list chunk", ext->off, ext->size, 1)) != 0)
+ break;
+ }
+
+err: __wt_block_ckpt_destroy(session, ci);
+ return (ret);
+}
+
+/*
+ * __verify_last_truncate --
+ * Truncate the file to the last checkpoint's size.
+ */
+static int
+__verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_RET;
+
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
+
+err: __wt_block_ckpt_destroy(session, ci);
+ return (ret);
+}
+
+/*
+ * __wt_block_verify_end --
+ * End file verification.
+ */
+int
+__wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_DECL_RET;
+
+ /* Confirm we verified every file block. */
+ ret = __verify_filefrag_chk(session, block);
+
+ /* Discard the accumulated allocation list. */
+ __wt_block_extlist_free(session, &block->verify_alloc);
+
+ /* Discard the fragment tracking lists. */
+ __wt_free(session, block->fragfile);
+ __wt_free(session, block->fragckpt);
+
+ block->verify = 0;
+ return (ret);
+}
+
+/*
+ * __wt_verify_ckpt_load --
+ * Verify work done when a checkpoint is loaded.
+ */
+int
+__wt_verify_ckpt_load(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+ WT_EXTLIST *el;
+ WT_EXT *ext;
+ uint64_t frag, frags;
+
+ /* Set the maximum file size for this checkpoint. */
+ block->verify_size = ci->file_size;
+
+ /*
+ * Add the root page and disk blocks used to store the extent lists to
+ * the list of blocks we've "seen" from the file.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "checkpoint",
+ ci->root_offset, (wt_off_t)ci->root_size, 1));
+ if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "alloc list",
+ ci->alloc.offset, (wt_off_t)ci->alloc.size, 1));
+ if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "avail list",
+ ci->avail.offset, (wt_off_t)ci->avail.size, 1));
+ if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "discard list",
+ ci->discard.offset, (wt_off_t)ci->discard.size, 1));
+
+ /*
+ * Checkpoint verification is similar to deleting checkpoints. As we
+ * read each new checkpoint, we merge the allocation lists (accumulating
+ * all allocated pages as we move through the system), and then remove
+ * any pages found in the discard list. The result should be a
+ * one-to-one mapping to the pages we find in this specific checkpoint.
+ */
+ el = &ci->alloc;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_RET(__wt_block_extlist_merge(
+ session, el, &block->verify_alloc));
+ __wt_block_extlist_free(session, el);
+ }
+ el = &ci->discard;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_EXT_FOREACH(ext, el->off)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ext->off, ext->size));
+ __wt_block_extlist_free(session, el);
+ }
+
+ /*
+ * The root page of the checkpoint appears on the alloc list, but not,
+ * at least until the checkpoint is deleted, on a discard list. To
+ * handle this case, remove the root page from the accumulated list of
+ * checkpoint pages, so it doesn't add a new requirement for subsequent
+ * checkpoints.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ci->root_offset, ci->root_size));
+
+ /*
+ * Allocate the per-checkpoint bit map. The per-checkpoint bit map is
+ * the opposite of the per-file bit map, that is, we set all the bits
+ * that we expect to be set based on the checkpoint's allocation and
+ * discard lists, then clear bits as we verify blocks. When finished
+ * verifying the checkpoint, the bit list should be empty.
+ */
+ WT_RET(__bit_alloc(session, block->frags, &block->fragckpt));
+ el = &block->verify_alloc;
+ WT_EXT_FOREACH(ext, el->off) {
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, ext->off);
+ frags = (uint64_t)(ext->size / block->allocsize);
+ __bit_nset(block->fragckpt, frag, frag + (frags - 1));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_ckpt_unload --
+ * Verify work done when a checkpoint is unloaded.
+ */
+int
+__wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_DECL_RET;
+
+ /* Confirm we verified every checkpoint block. */
+ ret = __verify_ckptfrag_chk(session, block);
+
+ /* Discard the per-checkpoint fragment list. */
+ __wt_free(session, block->fragckpt);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_verify_addr --
+ * Update an address in a checkpoint as verified.
+ */
+int
+__wt_block_verify_addr(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Add to the per-file list. */
+ WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, 0));
+
+ /*
+ * It's tempting to try and flag a page as "verified" when we read it.
+ * That doesn't work because we may visit a page multiple times when
+ * verifying a single checkpoint (for example, when verifying the
+ * physical image of a row-store leaf page with overflow keys, the
+ * overflow keys are read when checking for key sort issues, and read
+ * again when more general overflow item checking is done). This
+ * function is called by the btree verification code, once per logical
+ * visit in a checkpoint, so we can detect if a page is referenced
+ * multiple times within a single checkpoint. This doesn't apply to
+ * the per-file list, because it is expected for the same btree blocks
+ * to appear in multiple checkpoints.
+ *
+ * Add the block to the per-checkpoint list.
+ */
+ WT_RET(__verify_ckptfrag_add(session, block, offset, size));
+
+ return (0);
+}
+
+/*
+ * __verify_filefrag_add --
+ * Add the fragments to the per-file fragment list, optionally complain if
+ * we've already verified this chunk of the file.
+ */
+static int
+__verify_filefrag_add(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ const char *type, wt_off_t offset, wt_off_t size, int nodup)
+{
+ uint64_t f, frag, frags, i;
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+ "add file block%s%s%s at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+ type == NULL ? "" : " (",
+ type == NULL ? "" : type,
+ type == NULL ? "" : ")",
+ (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+ /* Check each chunk against the total file size. */
+ if (offset + size > block->fh->size)
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment %" PRIuMAX "-%" PRIuMAX " references "
+ "non-existent file blocks",
+ (uintmax_t)offset, (uintmax_t)(offset + size));
+
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, offset);
+ frags = (uint64_t)(size / block->allocsize);
+
+ /* It may be illegal to reference a particular chunk more than once. */
+ if (nodup)
+ for (f = frag, i = 0; i < frags; ++f, ++i)
+ if (__bit_test(block->fragfile, f))
+ WT_RET_MSG(session, WT_ERROR,
+ "file fragment at %" PRIuMAX " referenced "
+ "multiple times",
+ (uintmax_t)offset);
+
+ /* Add fragments to the file's fragment list. */
+ __bit_nset(block->fragfile, frag, frag + (frags - 1));
+
+ return (0);
+}
+
+/*
+ * __verify_filefrag_chk --
+ * Verify we've checked all the fragments in the file.
+ */
+static int
+__verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /* If there's nothing to verify, it was a fast run. */
+ if (block->frags == 0)
+ return (0);
+
+ /*
+ * It's OK if we have not verified blocks at the end of the file: that
+ * happens if the file is truncated during a checkpoint or load or was
+ * extended after writing a checkpoint. We should never see unverified
+ * blocks anywhere else, though.
+ *
+ * I'm deliberately testing for a last fragment of 0: it makes no sense
+ * for no fragments at all to have been verified, so complain if even
+ * the first fragment in the file wasn't verified.
+ */
+ for (last = block->frags - 1; last != 0; --last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ /*
+ * Check for any other file fragments we haven't verified -- every time
+ * we find a bit that's clear, complain. We re-start the search each
+ * time after setting the clear bit(s) we found: it's simpler and this
+ * isn't supposed to happen a lot.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffc(block->fragfile, block->frags, &first) != 0)
+ break;
+ __bit_set(block->fragfile, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "file range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session, "file ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
+
+/*
+ * __verify_ckptfrag_add --
+ * Clear the fragments in the per-checkpoint fragment list, and complain if
+ * we've already verified this chunk of the checkpoint.
+ */
+static int
+__verify_ckptfrag_add(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+ uint64_t f, frag, frags, i;
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+ "add checkpoint block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+ (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+ /*
+ * Check each chunk against the checkpoint's size; a checkpoint should
+ * never reference a block outside of its stored size.
+ */
+ if (offset + size > block->verify_size)
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment %" PRIuMAX "-%" PRIuMAX " references "
+ "file blocks outside the checkpoint",
+ (uintmax_t)offset, (uintmax_t)(offset + size));
+
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, offset);
+ frags = (uint64_t)(size / block->allocsize);
+
+ /* It is illegal to reference a particular chunk more than once. */
+ for (f = frag, i = 0; i < frags; ++f, ++i)
+ if (!__bit_test(block->fragckpt, f))
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment at %" PRIuMAX " referenced multiple "
+ "times in a single checkpoint or found in the "
+ "checkpoint but not listed in the checkpoint's "
+ "allocation list",
+ (uintmax_t)offset);
+
+ /* Remove fragments from the checkpoint's allocation list. */
+ __bit_nclr(block->fragckpt, frag, frag + (frags - 1));
+
+ return (0);
+}
+
+/*
+ * __verify_ckptfrag_chk --
+ * Verify we've checked all the fragments in the checkpoint.
+ */
+static int
+__verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /*
+ * The checkpoint fragment memory is only allocated as a checkpoint
+ * is successfully loaded; don't check if there's nothing there.
+ */
+ if (block->fragckpt == NULL)
+ return (0);
+
+ /*
+ * Check for checkpoint fragments we haven't verified -- every time we
+ * find a bit that's set, complain. We re-start the search each time
+ * after clearing the set bit(s) we found: it's simpler and this isn't
+ * supposed to happen a lot.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffs(block->fragckpt, block->frags, &first) != 0)
+ break;
+ __bit_clear(block->fragckpt, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (!__bit_test(block->fragckpt, last))
+ break;
+ __bit_clear(block->fragckpt, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "checkpoint range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session,
+ "checkpoint ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
new file mode 100644
index 00000000000..0da6380e61f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -0,0 +1,269 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_header --
+ * Return the size of the block-specific header.
+ */
+u_int
+__wt_block_header(WT_BLOCK *block)
+{
+ WT_UNUSED(block);
+
+ return ((u_int)WT_BLOCK_HEADER_SIZE);
+}
+
+/*
+ * __wt_block_write_size --
+ * Return the buffer size required to write a block.
+ */
+int
+__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
+{
+ WT_UNUSED(session);
+
+ /*
+ * We write the page size, in bytes, into the block's header as a 4B
+ * unsigned value, and it's possible for the engine to accept an item
+ * we can't write. For example, a huge key/value where the allocation
+ * size has been set to something large will overflow 4B when it tries
+ * to align the write. We could make this work (for example, writing
+ * the page size in units of allocation size or something else), but
+ * it's not worth the effort, writing 4GB objects into a btree makes
+ * no sense. Limit the writes to (4GB - 1KB), it gives us potential
+ * mode bits, and I'm not interested in debugging corner cases anyway.
+ */
+ *sizep = (size_t)
+ WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
+ return (*sizep > UINT32_MAX - 1024 ? EINVAL : 0);
+}
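+
+/*
+ * Illustrative arithmetic (editorial note): with a 4KB allocation size,
+ * a 10,000 byte page image plus the block header skip is aligned up to
+ * 12,288 bytes (three allocation units), the size actually written to
+ * the file; the function fails only if the aligned size approaches 4GB.
+ */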
+
+/*
+ * __wt_block_write --
+ * Write a buffer into a block, returning the block's address cookie.
+ */
+int
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+ wt_off_t offset;
+ uint32_t size, cksum;
+ uint8_t *endp;
+
+ WT_RET(__wt_block_write_off(
+ session, block, buf, &offset, &size, &cksum, data_cksum, 0));
+
+ endp = addr;
+ WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+ return (0);
+}
+
+/*
+ * __wt_block_write_off --
+ * Write a buffer into a block, returning the block's offset, size and
+ * checksum.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
+ int data_cksum, int caller_locked)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t align_size;
+ wt_off_t offset;
+ int local_locked;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ fh = block->fh;
+ local_locked = 0;
+
+ /* Buffers should be aligned for writing. */
+ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
+ WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
+ WT_RET_MSG(session, EINVAL,
+ "direct I/O check: write buffer incorrectly allocated");
+ }
+
+ /*
+ * Align the size to an allocation unit.
+ *
+ * The buffer must be big enough for us to zero to the next allocsize
+ * boundary; this is one of the reasons the btree layer must find out
+ * from the block-manager layer the maximum size of the eventual write.
+ */
+ align_size = WT_ALIGN(buf->size, block->allocsize);
+ if (align_size > buf->memsize) {
+ WT_ASSERT(session, align_size <= buf->memsize);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer incorrectly allocated");
+ }
+ if (align_size > UINT32_MAX) {
+ WT_ASSERT(session, align_size <= UINT32_MAX);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer too large to write");
+ }
+
+ /* Zero out any unused bytes at the end of the buffer. */
+ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);
+
+ /*
+ * Set the disk size so we don't have to incrementally read blocks
+ * during salvage.
+ */
+ blk->disk_size = WT_STORE_SIZE(align_size);
+
+ /*
+ * Update the block's checksum: if our caller specifies, checksum the
+ * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
+ * bytes. The assumption is applications with good compression support
+ * turn off checksums and assume corrupted blocks won't decompress
+ * correctly. However, if compression failed to shrink the block, the
+ * block wasn't compressed, in which case our caller will tell us to
+ * checksum the data to detect corruption. If compression succeeded,
+ * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
+ * because they're not compressed, both to give salvage a quick test
+ * of whether a block is useful and to give us a test so we don't lose
+ * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
+ */
+ blk->flags = 0;
+ if (data_cksum)
+ F_SET(blk, WT_BLOCK_DATA_CKSUM);
+ blk->cksum = 0;
+ blk->cksum = __wt_cksum(
+ buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+
+ if (!caller_locked) {
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
+
+ /*
+ * Extend the file in chunks. We want to limit the number of threads
+ * extending the file at the same time, so choose the one thread that's
+ * crossing the extended boundary. We don't extend newly created files,
+ * and it's theoretically possible we might wait so long that our
+ * extension of the file is passed by another thread writing single
+ * blocks; that's
+ * why there's a check in case the extended file size becomes too small:
+ * if the file size catches up, every thread tries to extend it.
+ *
+ * File extension may require locking: some variants of the system call
+ * used to extend the file initialize the extended space. If a writing
+ * thread races with the extending thread, the extending thread might
+ * overwrite already written data, and that would be very, very bad.
+ *
+ * Some variants of the system call to extend the file fail at run-time
+ * based on the filesystem type, fall back to ftruncate in that case,
+ * and remember that ftruncate requires locking.
+ */
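+ /*
+ * Illustrative numbers (editorial note): with a 10MB extend_len,
+ * the single thread whose block straddles the point 10MB below
+ * extend_size bumps extend_size to its offset plus 20MB and
+ * performs the extension; concurrent writers fail the test and
+ * skip the work.
+ */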
+ if (ret == 0 &&
+ fh->extend_len != 0 &&
+ (fh->extend_size <= fh->size ||
+ (offset + fh->extend_len <= fh->extend_size &&
+ offset +
+ fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
+ fh->extend_size = offset + fh->extend_len * 2;
+ if (fh->fallocate_available) {
+ /*
+ * Release any locally acquired lock if it's not needed
+ * to extend the file, extending the file might require
+ * updating file metadata, which can be slow. (It may be
+ * a bad idea to configure for file extension on systems
+ * that require locking over the extend call.)
+ */
+ if (!fh->fallocate_requires_locking && local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+
+ /* Extend the file. */
+ if ((ret = __wt_fallocate(session,
+ fh, offset, fh->extend_len * 2)) == ENOTSUP) {
+ ret = 0;
+ goto extend_truncate;
+ }
+ } else {
+extend_truncate: /*
+ * We may have a caller lock or a locally acquired lock,
+ * but we need a lock to call ftruncate.
+ */
+ if (!caller_locked && local_locked == 0) {
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ /*
+ * The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file), that's
+ * OK.
+ */
+ if ((ret = __wt_ftruncate(
+ session, fh, offset + fh->extend_len * 2)) == EBUSY)
+ ret = 0;
+ }
+ }
+ /* Release any locally acquired lock. */
+ if (local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+ WT_RET(ret);
+
+ /* Write the block. */
+ if ((ret =
+ __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
+ if (!caller_locked)
+ __wt_spin_lock(session, &block->live_lock);
+ WT_TRET(__wt_block_off_free(
+ session, block, offset, (wt_off_t)align_size));
+ if (!caller_locked)
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(ret);
+ }
+
+#ifdef HAVE_SYNC_FILE_RANGE
+ /*
+ * Optionally schedule writes for dirty pages in the system buffer
+ * cache, but only if the current session can wait.
+ */
+ if (block->os_cache_dirty_max != 0 &&
+ (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
+ __wt_session_can_wait(session)) {
+ block->os_cache_dirty = 0;
+ WT_RET(__wt_fsync_async(session, fh));
+ }
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += align_size) > block->os_cache_max) {
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ WT_STAT_FAST_CONN_INCR(session, block_write);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+
+ WT_RET(__wt_verbose(session, WT_VERB_WRITE,
+ "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
+ (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));
+
+ *offsetp = offset;
+ *sizep = WT_STORE_SIZE(align_size);
+ *cksump = blk->cksum;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
new file mode 100644
index 00000000000..8c8c8bc723e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_BLOOM_TABLE_CONFIG "key_format=r,value_format=1t,exclusive=true"
+
+/*
+ * __bloom_init --
+ * Allocate a WT_BLOOM handle.
+ */
+static int
+__bloom_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *config, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+ size_t len;
+
+ *bloomp = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &bloom));
+
+ WT_ERR(__wt_strdup(session, uri, &bloom->uri));
+ len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
+ if (config != NULL)
+ len += strlen(config);
+ WT_ERR(__wt_calloc_def(session, len, &bloom->config));
+ /* Add the standard config at the end, so it overrides user settings. */
+ (void)snprintf(bloom->config, len,
+ "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG);
+
+ bloom->session = session;
+
+ *bloomp = bloom;
+ return (0);
+
+err: __wt_free(session, bloom->uri);
+ __wt_free(session, bloom->config);
+ __wt_free(session, bloom->bitstring);
+ __wt_free(session, bloom);
+ return (ret);
+}
+
+/*
+ * __bloom_setup --
+ * Populate the bloom structure.
+ *
+ * Setup is passed either the expected count of items (n) or the length
+ * of the bitstring (m), depending on whether the function is called via
+ * create or open.
+ */
+static int
+__bloom_setup(
+ WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k)
+{
+ if (k < 2)
+ return (EINVAL);
+
+ bloom->k = k;
+ bloom->factor = factor;
+ if (n != 0) {
+ bloom->n = n;
+ bloom->m = bloom->n * bloom->factor;
+ } else {
+ bloom->m = m;
+ bloom->n = bloom->m / bloom->factor;
+ }
+ return (0);
+}
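+
+/*
+ * Illustrative math (editorial note, the standard Bloom filter
+ * approximation, not computed by this code): with factor=8 bits per
+ * item and k=4 hashes, m = 8n and the expected false-positive rate is
+ * roughly (1 - e^(-k/factor))^k = (1 - e^(-0.5))^4, about 2.4%.
+ */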
+
+/*
+ * __wt_bloom_create --
+ *
+ * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to
+ * use while populating the bloom filter.
+ *
+ * count - is the expected number of inserted items
+ * factor - is the number of bits to use per inserted item
+ * k - is the number of hash values to set or test per item
+ */
+int
+__wt_bloom_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config,
+ uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+
+ WT_RET(__bloom_init(session, uri, config, &bloom));
+ WT_ERR(__bloom_setup(bloom, count, 0, factor, k));
+
+ WT_ERR(__bit_alloc(session, bloom->m, &bloom->bitstring));
+
+ *bloomp = bloom;
+ return (0);
+
+err: (void)__wt_bloom_close(bloom);
+ return (ret);
+}
+
+/*
+ * __bloom_open_cursor --
+ * Open a cursor to read from a Bloom filter.
+ */
+static int
+__bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner)
+{
+ WT_CURSOR *c;
+ WT_SESSION_IMPL *session;
+ const char *cfg[3];
+
+ if ((c = bloom->c) != NULL)
+ return (0);
+
+ session = bloom->session;
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = bloom->config;
+ cfg[2] = NULL;
+ c = NULL;
+ WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c));
+
+ /* XXX Layering violation: bump the cache priority for Bloom filters. */
+ ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW;
+
+ bloom->c = c;
+ return (0);
+}
+
+/*
+ * __wt_bloom_open --
+ * Open a Bloom filter object for use by a single session. The filter must
+ * have been created and finalized.
+ */
+int
+__wt_bloom_open(WT_SESSION_IMPL *session,
+ const char *uri, uint32_t factor, uint32_t k,
+ WT_CURSOR *owner, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ uint64_t size;
+
+ WT_RET(__bloom_init(session, uri, NULL, &bloom));
+ WT_ERR(__bloom_open_cursor(bloom, owner));
+ c = bloom->c;
+
+ /* Find the largest key, to get the size of the filter. */
+ WT_ERR(c->prev(c));
+ WT_ERR(c->get_key(c, &size));
+ WT_ERR(c->reset(c));
+
+ WT_ERR(__bloom_setup(bloom, 0, size, factor, k));
+
+ *bloomp = bloom;
+ return (0);
+
+err: (void)__wt_bloom_close(bloom);
+ return (ret);
+}
+
+/*
+ * __wt_bloom_insert --
+ * Adds the given key to the Bloom filter.
+ */
+int
+__wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
+{
+ uint64_t h1, h2;
+ uint32_t i;
+
+ h1 = __wt_hash_fnv64(key->data, key->size);
+ h2 = __wt_hash_city64(key->data, key->size);
+ for (i = 0; i < bloom->k; i++, h1 += h2)
+ __bit_set(bloom->bitstring, h1 % bloom->m);
+ return (0);
+}
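+
+/*
+ * Editorial note: the insert loop derives the i-th probe as h1 + i * h2
+ * (mod m), the standard double-hashing construction; two base hashes
+ * stand in for k independent hash functions without rehashing the key k
+ * times. The read path in __wt_bloom_hash_get reproduces the same probe
+ * sequence.
+ */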
+
+/*
+ * __wt_bloom_finalize --
+ * Writes the Bloom filter to stable storage. After calling finalize, only
+ * read operations can be performed on the bloom filter.
+ */
+int
+__wt_bloom_finalize(WT_BLOOM *bloom)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_ITEM values;
+ WT_SESSION *wt_session;
+ uint64_t i;
+
+ wt_session = (WT_SESSION *)bloom->session;
+ WT_CLEAR(values);
+
+ /*
+ * Create a bit table to store the bloom filter in.
+ * TODO: should this call __wt_schema_create directly?
+ */
+ WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
+ WT_RET(wt_session->open_cursor(
+ wt_session, bloom->uri, NULL, "bulk=bitmap", &c));
+
+ /* Add the entries from the array into the table. */
+ for (i = 0; i < bloom->m; i += values.size) {
+ /* Adjust bits to bytes for the string offset. */
+ values.data = bloom->bitstring + (i >> 3);
+ /*
+ * Shave off some bytes for pure paranoia, in case WiredTiger
+ * reserves some special sizes. Choose a value so that if
+ * we do multiple inserts, it will be on a byte boundary.
+ */
+ values.size = (uint32_t)WT_MIN(bloom->m - i, UINT32_MAX - 127);
+ c->set_value(c, &values);
+ WT_ERR(c->insert(c));
+ }
+
+err: WT_TRET(c->close(c));
+ __wt_free(bloom->session, bloom->bitstring);
+ bloom->bitstring = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_bloom_hash --
+ * Calculate the hash values for a given key.
+ */
+int
+__wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash)
+{
+ WT_UNUSED(bloom);
+
+ bhash->h1 = __wt_hash_fnv64(key->data, key->size);
+ bhash->h2 = __wt_hash_city64(key->data, key->size);
+
+ return (0);
+}
+
+/*
+ * __wt_bloom_hash_get --
+ * Tests whether the key (as given by its hash signature) is in the Bloom
+ * filter. Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ int result;
+ uint32_t i;
+ uint64_t h1, h2;
+ uint8_t bit;
+
+ /* Get operations are only supported by finalized bloom filters. */
+ WT_ASSERT(bloom->session, bloom->bitstring == NULL);
+
+ /* Create a cursor on the first time through. */
+ WT_ERR(__bloom_open_cursor(bloom, NULL));
+ c = bloom->c;
+
+ h1 = bhash->h1;
+ h2 = bhash->h2;
+
+ result = 0;
+ for (i = 0; i < bloom->k; i++, h1 += h2) {
+ /*
+ * Add 1 to the hash because WiredTiger tables are 1 based and
+ * the original bitstring array was 0 based.
+ */
+ c->set_key(c, (h1 % bloom->m) + 1);
+ WT_ERR(c->search(c));
+ WT_ERR(c->get_value(c, &bit));
+
+ if (bit == 0) {
+ result = WT_NOTFOUND;
+ break;
+ }
+ }
+ WT_ERR(c->reset(c));
+ return (result);
+
+err: /* Don't return WT_NOTFOUND from a failed search. */
+ if (ret == WT_NOTFOUND)
+ ret = WT_ERROR;
+ __wt_err(bloom->session, ret, "Failed lookup in bloom filter.");
+ return (ret);
+}
+
+/*
+ * __wt_bloom_get --
+ * Tests whether the given key is in the Bloom filter.
+ * Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
+{
+ WT_BLOOM_HASH bhash;
+
+ WT_RET(__wt_bloom_hash(bloom, key, &bhash));
+ return (__wt_bloom_hash_get(bloom, &bhash));
+}
+
+/*
+ * __wt_bloom_close --
+ * Close the Bloom filter, release any resources.
+ */
+int
+__wt_bloom_close(WT_BLOOM *bloom)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = bloom->session;
+
+ if (bloom->c != NULL)
+ ret = bloom->c->close(bloom->c);
+ __wt_free(session, bloom->uri);
+ __wt_free(session, bloom->config);
+ __wt_free(session, bloom->bitstring);
+ __wt_free(session, bloom);
+
+ return (ret);
+}
+
+/*
+ * __wt_bloom_drop --
+ * Drop a Bloom filter, release any resources.
+ */
+int
+__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)bloom->session;
+ if (bloom->c != NULL) {
+ ret = bloom->c->close(bloom->c);
+ bloom->c = NULL;
+ }
+ WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
+ WT_TRET(__wt_bloom_close(bloom));
+
+ return (ret);
+}
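+
+/*
+ * Illustrative usage sketch (editorial addition, compiled out): the
+ * typical lifecycle of the functions above, assuming a valid session
+ * and URI; the guard macro is hypothetical.
+ */
+#ifdef WT_BLOOM_USAGE_EXAMPLE
+static int
+__bloom_usage_example(WT_SESSION_IMPL *session)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+ WT_ITEM key;
+
+ /* Expect 1000 items, 8 bits per item, 4 hash functions. */
+ WT_RET(__wt_bloom_create(
+ session, "file:bloom.example", NULL, 1000, 8, 4, &bloom));
+
+ WT_CLEAR(key);
+ key.data = "key";
+ key.size = 3;
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+
+ /* Write the filter; after this, only reads are allowed. */
+ WT_ERR(__wt_bloom_finalize(bloom));
+
+ /* Returns 0 if probably present, WT_NOTFOUND if definitely not. */
+ ret = __wt_bloom_get(bloom, &key);
+
+err: WT_TRET(__wt_bloom_close(bloom));
+ return (ret);
+}
+#endif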
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
new file mode 100644
index 00000000000..e81c951e9f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __compact_rewrite --
+ * Return if a page needs to be re-written.
+ */
+static int
+__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ *skipp = 1; /* Default skip. */
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+ mod = page->modify;
+
+ /*
+ * Ignore the root: it may not have a replacement address, and besides,
+ * if anything else gets written, so will it.
+ */
+ if (__wt_ref_is_root(ref))
+ return (0);
+
+ /* Ignore currently dirty pages; they will be written regardless. */
+ if (__wt_page_is_modified(page))
+ return (0);
+
+ /*
+ * If the page is clean, test the original addresses.
+ * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * Ignore empty pages, they get merged into the parent.
+ */
+ if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL)
+ return (0);
+ WT_RET(
+ bm->compact_page_skip(bm, session, addr, addr_size, skipp));
+ } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
+ /*
+ * The page's modification information can change underfoot if
+ * the page is being reconciled, lock the page down.
+ */
+ WT_PAGE_LOCK(session, page);
+ ret = bm->compact_page_skip(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ WT_PAGE_UNLOCK(session, page);
+ WT_RET(ret);
+ }
+ return (0);
+}
+
+/*
+ * __wt_compact --
+ * Compact a file.
+ */
+int
+__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_REF *ref;
+ int block_manager_begin, skip;
+
+ WT_UNUSED(cfg);
+
+ conn = S2C(session);
+ btree = S2BT(session);
+ bm = btree->bm;
+ ref = NULL;
+ block_manager_begin = 0;
+
+ WT_STAT_FAST_DATA_INCR(session, session_compact);
+
+ /*
+ * Check if compaction might be useful -- the API layer will quit trying
+ * to compact the data source if we make no progress; the block layer
+ * sets a flag if it thinks compaction is possible.
+ */
+ WT_RET(bm->compact_skip(bm, session, &skip));
+ if (skip)
+ return (0);
+
+ /*
+ * Reviewing in-memory pages requires looking at page reconciliation
+ * results, because we care about where the page is stored now, not
+ * where the page was stored when we first read it into the cache.
+ * We need to ensure we don't race with page reconciliation as it's
+ * writing the page modify information.
+ *
+ * There are three ways we call reconciliation: checkpoints, threads
+ * writing leaf pages (usually in preparation for a checkpoint), and
+ * eviction.
+ *
+ * We're holding the schema lock which serializes with checkpoints.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /*
+ * Get the tree handle's flush lock which blocks threads writing leaf
+ * pages.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * That leaves eviction; we don't want to block eviction. Set a flag
+ * so reconciliation knows compaction is running. If reconciliation
+ * sees the flag it locks the page it's writing, we acquire the same
+ * lock when reading the page's modify information, serializing access.
+ * The same page lock blocks work on the page, but compaction is an
+ * uncommon, heavy-weight operation. If it's ever a problem, there's
+ * no reason we couldn't use an entirely separate lock than the page
+ * lock.
+ *
+ * We also need to ensure we don't race with an on-going reconciliation.
+ * After we set the flag, wait for eviction of this file to drain, and
+ * then let eviction continue.
+ */
+ conn->compact_in_memory_pass = 1;
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+
+ /* Start compaction. */
+ WT_ERR(bm->compact_start(bm, session));
+ block_manager_begin = 1;
+
+ /* Walk the tree reviewing pages to see if they should be re-written. */
+ session->compaction = 1;
+ for (;;) {
+ /*
+ * Pages read for compaction aren't "useful"; don't update the
+ * read generation of pages already in memory, and if a page is
+ * read, set its generation to a low value so it is evicted
+ * quickly.
+ */
+ WT_ERR(__wt_tree_walk(session, &ref,
+ WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
+ if (ref == NULL)
+ break;
+
+ WT_ERR(__compact_rewrite(session, ref, &skip));
+ if (skip)
+ continue;
+
+ /* Rewrite the page: mark the page and tree dirty. */
+ WT_ERR(__wt_page_modify_init(session, ref->page));
+ __wt_page_modify_set(session, ref->page);
+
+ WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
+ }
+
+err: if (ref != NULL)
+ WT_TRET(__wt_page_release(session, ref, 0));
+
+ if (block_manager_begin)
+ WT_TRET(bm->compact_end(bm, session));
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ conn->compact_in_memory_pass = 0;
+ WT_FULL_BARRIER();
+
+ return (ret);
+}
+
+/*
+ * __wt_compact_page_skip --
+ * Return if compaction requires we read this page.
+ */
+int
+__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_BM *bm;
+ size_t addr_size;
+ u_int type;
+ const uint8_t *addr;
+
+ *skipp = 0; /* Default to reading. */
+ type = 0; /* Keep compiler quiet. */
+
+ bm = S2BT(session)->bm;
+
+ /*
+ * We aren't holding a hazard pointer, so we can't look at the page
+ * itself, all we can look at is the WT_REF information. If there's no
+ * address, the page isn't on disk, but we have to read internal pages
+ * to walk the tree regardless; throw up our hands and read it.
+ */
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
+ if (addr == NULL)
+ return (0);
+
+ /*
+ * Internal pages must be read to walk the tree; ask the block-manager
+ * if it's useful to rewrite leaf pages; don't do the I/O if a rewrite
+ * won't help.
+ */
+ return (type == WT_CELL_ADDR_INT ? 0 :
+ bm->compact_page_skip(bm, session, addr, addr_size, skipp));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
new file mode 100644
index 00000000000..0cc79776634
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_fix_append_next --
+ * Return the next entry on the append list.
+ */
+static inline int
+__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else
+ if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) &&
+ (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
+ return (WT_NOTFOUND);
+
+ /*
+ * This code looks different from the cursor-previous code. The append
+ * list appears on the last page of the tree, but it may be preceded by
+ * other rows, which means the cursor's recno will be set to a value and
+ * we simply want to increment it. If the cursor's recno is NOT set,
+ * we're starting our iteration in a tree that has only appended items.
+ * In that case, recno will be 0 and happily enough the increment will
+ * set it to 1, which is correct.
+ */
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+ * created records written by reconciliation are deleted and so can
+ * never be seen by a read.
+ *
+ * The problem is that we don't know at this point whether there may be
+ * multiple uncommitted changes at the end of the data, and it would be
+ * expensive to check every time we hit an aborted update. If an
+ * insert is aborted, we simply return zero (empty), regardless of
+ * whether we are at the end of the data.
+ */
+ if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0;
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_fix_next --
+ * Move to the next, fixed-length column-store item.
+ */
+static inline int
+__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_fix_recno);
+ goto new_page;
+ }
+
+ /* Move to the next entry and return the item. */
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL;
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_next --
+ * Return the next variable-length entry on the append list.
+ */
+static inline int
+__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_page;
+ }
+
+ for (;;) {
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_next --
+ * Move to the next, variable-length column-store item.
+ */
+static inline int
+__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_var_recno);
+ goto new_page;
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page: /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip;
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_next --
+ * Move to the next row-store item.
+ */
+static inline int
+__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ *
+ * New page configuration.
+ */
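+ /*
+ * For example (editorial note): a page with three rows maps to
+ * slots 1-7; slot 1 is the insert list before the first key,
+ * slots 2/4/6 are WT_ROW[0..2], and slots 3/5/7 are the insert
+ * lists following each of those rows.
+ */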
+ if (newpage) {
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ cbt->row_iteration_slot = 1;
+ goto new_insert;
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list; maintain the insert list
+ * head reference and entry count in case we switch to a cursor
+ * previous movement.
+ */
+ if (cbt->ins != NULL)
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the end of the page. */
+ if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
+ return (WT_NOTFOUND);
+ ++cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1;
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_iterate_setup --
+ * Initialize a cursor for iteration, usually based on a search.
+ */
+void
+__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
+{
+ WT_PAGE *page;
+
+ WT_UNUSED(next);
+
+ /*
+ * We don't currently have to do any setup when we switch between next
+ * and prev calls, but I'm sure we will someday -- I'm leaving support
+ * here for both flags for that reason.
+ */
+ F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
+
+ /*
+ * If we don't have a search page, then we're done; we're starting at
+ * the beginning or end of the tree, not as a result of a search.
+ */
+ if (cbt->ref == NULL)
+ return;
+ page = cbt->ref->page;
+
+ if (page->type == WT_PAGE_ROW_LEAF) {
+ /*
+ * For row-store pages, we need a single item that tells us the
+ * part of the page we're walking (otherwise switching from next
+ * to prev and vice-versa is just too complicated), so we map
+ * the WT_ROW and WT_INSERT_HEAD insert array slots into a
+ * single name space: slot 1 is the "smallest key insert list",
+ * slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on.
+ * This means WT_INSERT lists are odd-numbered slots, and WT_ROW
+ * array slots are even-numbered slots.
+ */
+ cbt->row_iteration_slot = (cbt->slot + 1) * 2;
+ if (cbt->ins_head != NULL) {
+ if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
+ cbt->row_iteration_slot = 1;
+ else
+ cbt->row_iteration_slot += 1;
+ }
+ } else {
+ /*
+ * For column-store pages, calculate the largest record on the
+ * page.
+ */
+ cbt->last_standard_recno = page->type == WT_PAGE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page);
+
+ /* If we're traversing the append list, set the reference. */
+ if (cbt->ins_head != NULL &&
+ cbt->ins_head == WT_COL_APPEND(page))
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+}
+
+/*
+ * __wt_btcur_next --
+ * Move to the next record in the tree.
+ */
+int
+__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ flags = WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
+ __wt_btcur_iterate_setup(cbt, 1);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the next page, until we reach the end of the
+ * file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ } else if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_next(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND)
+ break;
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ continue;
+ }
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
new file mode 100644
index 00000000000..8de784d1f1d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Walking backwards through skip lists.
+ *
+ * The skip list stack is an array of pointers set up by a search. It points
+ * to the position a node should go in the skip list. In other words, the skip
+ * list search stack always points *after* the search item (that is, into the
+ * search item's next array).
+ *
+ * Helper macros to go from a stack pointer at level i, pointing into a next
+ * array, back to the insert node containing that next array.
+ */
+#undef PREV_ITEM
+#define PREV_ITEM(ins_head, insp, i) \
+ (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : \
+ (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))
+
+#undef PREV_INS
+#define PREV_INS(cbt, i) \
+ PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))
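+
+/*
+ * Illustrative pointer arithmetic (editorial note): if insp points at
+ * node->next[2], then insp - 2 points at node->next[0], and subtracting
+ * offsetof(WT_INSERT, next) from that address recovers node itself; the
+ * head-array test in PREV_ITEM catches stack entries that still point
+ * into the list head, where there is no previous node.
+ */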
+
+/*
+ * __cursor_skip_prev --
+ * Move back one position in a skip list stack (aka "finger").
+ */
+static inline int
+__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *current, *ins;
+ WT_ITEM key;
+ WT_SESSION_IMPL *session;
+ int i;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+restart:
+ /*
+ * If the search stack does not point at the current item, fill it in
+ * with a search.
+ */
+ while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
+ if (cbt->btree->type == BTREE_ROW) {
+ key.data = WT_INSERT_KEY(current);
+ key.size = WT_INSERT_KEY_SIZE(current);
+ WT_RET(__wt_search_insert(session, cbt, &key));
+ } else
+ cbt->ins = __col_insert_search(cbt->ins_head,
+ cbt->ins_stack, cbt->next_stack,
+ WT_INSERT_RECNO(current));
+ }
+
+ /*
+ * Find the first node up the search stack that does not move.
+ *
+ * The depth of the current item must be at least this level, since we
+ * see it in that many levels of the stack.
+ *
+ * !!! Watch these loops carefully: they all rely on the value of i,
+ * and the exit conditions to end up with the right values are
+ * non-trivial.
+ */
+ ins = NULL; /* -Wconditional-uninitialized */
+ for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
+ if ((ins = PREV_INS(cbt, i + 1)) != current)
+ break;
+
+ /*
+ * Find a starting point for the new search. That is either at the
+ * non-moving node if we found a valid node, or the beginning of the
+ * next list down that is not the current node.
+ *
+	 * Since it is the beginning of a list, and we know the current node
+	 * has a skip depth at least this high, any node we find must sort
+ * before the current node.
+ */
+ if (ins == NULL || ins == current)
+ for (; i >= 0; i--) {
+ cbt->ins_stack[i] = NULL;
+ cbt->next_stack[i] = NULL;
+ ins = cbt->ins_head->head[i];
+ if (ins != NULL && ins != current)
+ break;
+ }
+
+ /* Walk any remaining levels until just before the current node. */
+ while (i >= 0) {
+ /*
+ * If we get to the end of a list without finding the current
+ * item, we must have raced with an insert. Restart the search.
+ */
+ if (ins == NULL) {
+ cbt->ins_stack[0] = NULL;
+ cbt->next_stack[0] = NULL;
+ goto restart;
+ }
+ if (ins->next[i] != current) /* Stay at this level */
+ ins = ins->next[i];
+ else { /* Drop down a level */
+ cbt->ins_stack[i] = &ins->next[i];
+ cbt->next_stack[i] = ins->next[i];
+ --i;
+ }
+ }
+
+ /* If we found a previous node, the next one must be current. */
+ if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
+ goto restart;
+
+ cbt->ins = PREV_INS(cbt, 0);
+ return (0);
+}
+
+/*
+ * __cursor_fix_append_prev --
+ * Return the previous fixed-length entry on the append list.
+ */
+static inline int
+__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else {
+ /*
+ * Handle the special case of leading implicit records, that is,
+ * there aren't any records in the tree not on the append list,
+ * and the first record on the append list isn't record 1.
+ *
+ * The "right" place to handle this is probably in our caller.
+ * The high-level cursor-previous routine would:
+ * -- call this routine to walk the append list
+ * -- call the routine to walk the standard page items
+ * -- call the tree walk routine looking for a previous page
+ * Each of them returns WT_NOTFOUND, at which point our caller
+ * checks the cursor record number, and if it's larger than 1,
+ * returns the implicit records. Instead, I'm trying to detect
+ * the case here, mostly because I don't want to put that code
+ * into our caller. Anyway, if this code breaks for any reason,
+ * that's the way I'd go.
+ *
+ * If we're not pointing to a WT_INSERT entry, or we can't find
+ * a WT_INSERT record that precedes our record name-space, check
+ * if there are any records on the page. If there aren't, then
+ * we're in the magic zone, keep going until we get to a record
+ * number of 1.
+ */
+ if (cbt->ins != NULL &&
+ cbt->recno <= WT_INSERT_RECNO(cbt->ins))
+ WT_RET(__cursor_skip_prev(cbt));
+ if (cbt->ins == NULL &&
+ (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * This code looks different from the cursor-next code. The append
+ * list appears on the last page of the tree and contains the last
+ * records in the tree. If we're iterating through the tree, starting
+ * at the last record in the tree, by definition we're starting a new
+ * iteration and we set the record number to the last record found in
+ * the tree. Otherwise, decrement the record.
+ */
+ if (newpage)
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ else
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+	 * created records written by reconciliation are deleted and so can
+	 * never be seen by a read.
+ */
+ if (cbt->ins == NULL ||
+ cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0;
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_fix_prev --
+ * Move to the previous, fixed-length column-store item.
+ */
+static inline int
+__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ btree = S2BT(session);
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ if (cbt->recno == page->pg_fix_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL;
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_prev --
+ * Return the previous variable-length entry on the append list.
+ */
+static inline int
+__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_page;
+ }
+
+ for (;;) {
+ WT_RET(__cursor_skip_prev(cbt));
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_prev --
+ * Move to the previous, variable-length column-store item.
+ */
+static inline int
+__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page: if (cbt->recno < page->pg_var_recno)
+ return (WT_NOTFOUND);
+
+ /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip;
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_prev --
+ * Move to the previous row-store item.
+ */
+static inline int
+__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
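+	 *
+	 * As a worked example (illustration only): on a page with three
+	 * WT_ROW entries, the slots map as
+	 *	slot 1	WT_ROW_INSERT_SMALLEST(page)
+	 *	slot 2	WT_ROW[0]	slot 3	WT_INSERT_HEAD[0]
+	 *	slot 4	WT_ROW[1]	slot 5	WT_INSERT_HEAD[1]
+	 *	slot 6	WT_ROW[2]	slot 7	WT_INSERT_HEAD[2]
+	 * and a newly opened page starts iterating at slot
+	 * 2 * entries + 1, here slot 7.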
+ *
+ * New page configuration.
+ */
+ if (newpage) {
+ /*
+		 * If we haven't instantiated keys on this page, do so now:
+		 * otherwise, the traversal is very, very slow.
+ */
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ WT_RET(__wt_row_leaf_keys(session, page));
+
+ if (page->pg_row_entries == 0)
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ else
+ cbt->ins_head =
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
+ goto new_insert;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list. Maintain the reference
+ * to the current insert element in case we switch to a cursor
+ * next movement.
+ */
+ if (cbt->ins != NULL)
+ WT_RET(__cursor_skip_prev(cbt));
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the beginning of the page. */
+ if (cbt->row_iteration_slot == 1)
+ return (WT_NOTFOUND);
+ --cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = cbt->row_iteration_slot == 1 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1;
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_prev --
+ * Move to the previous record in the tree.
+ */
+int
+__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
+ __wt_btcur_iterate_setup(cbt, 0);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the previous page, until we reach the start
+ * of the file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ newpage = 1;
+ }
+ if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_prev(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND)
+ break;
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
new file mode 100644
index 00000000000..5b2d9b055b5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -0,0 +1,1025 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_size_chk --
+ * Return if an inserted item is too large.
+ */
+static inline int
+__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ size_t size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ if (btree->type == BTREE_COL_FIX) {
+ /* Fixed-size column-stores take a single byte. */
+ if (kv->size != 1)
+ WT_RET_MSG(session, EINVAL,
+ "item size of %" WT_SIZET_FMT " does not match "
+ "fixed-length file requirement of 1 byte",
+ kv->size);
+ return (0);
+ }
+
+ /* Don't waste effort, 1GB is always cool. */
+ if (kv->size <= WT_GIGABYTE)
+ return (0);
+
+ /*
+ * There are two checks: what we are willing to store in the tree, and
+ * what the block manager can actually write.
+ */
+ if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
+ ret = EINVAL;
+ else {
+ size = kv->size;
+ ret = bm->write_size(bm, session, &size);
+ }
+ if (ret != 0)
+ WT_RET_MSG(session, ret,
+ "item size of %" WT_SIZET_FMT " exceeds the maximum "
+ "supported size",
+ kv->size);
+ return (0);
+}
+
+/*
+ * __cursor_fix_implicit --
+ * Return if search went past the end of the tree.
+ */
+static inline int
+__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
+{
+ return (btree->type == BTREE_COL_FIX &&
+ !F_ISSET(cbt, WT_CBT_MAX_RECORD) ? 1 : 0);
+}
+
+/*
+ * __cursor_valid --
+ *	Return if the cursor references a valid key/value pair.
+ */
+static inline int
+__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ page = cbt->ref->page;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ if (updp != NULL)
+ *updp = NULL;
+
+ /*
+ * We may be pointing to an insert object, and we may have a page with
+ * existing entries. Insert objects always have associated update
+ * objects (the value). Any update object may be deleted, or invisible
+ * to us. In the case of an on-page entry, there is by definition a
+ * value that is visible to us, the original page cell.
+ *
+ * If we find a visible update structure, return our caller a reference
+ * to it because we don't want to repeatedly search for the update, it
+ * might suddenly become invisible (imagine a read-uncommitted session
+ * with another session's aborted insert), and we don't want to handle
+ * that potential error every time we look at the value.
+ *
+ * Unfortunately, the objects we might have and their relationships are
+ * different for the underlying page types.
+ *
+ * In the case of row-store, an insert object implies ignoring any page
+ * objects, no insert object can have the same key as an on-page object.
+ * For row-store:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else
+ * no exact match
+ * else
+ * use the on-page object (which may have an associated
+ * update object that may or may not be visible to us).
+ *
+ * Column-store is more complicated because an insert object can have
+ * the same key as an on-page object: updates to column-store rows
+ * are insert/object pairs, and an invisible update isn't the end as
+ * there may be an on-page object that is visible. This changes the
+ * logic to:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else if the on-page object's key matches the insert key
+ * use the on-page object
+ *	else
+ *		use the on-page object
+ *
+ * First, check for an insert object with a visible update (a visible
+ * update that's been deleted is not a valid key/value pair).
+ */
+ if (cbt->ins != NULL &&
+ (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ return (1);
+ }
+
+ /*
+ * If we don't have an insert object, or in the case of column-store,
+ * there's an insert object but no update was visible to us and the key
+ * on the page is the same as the insert object's key, and the slot as
+ * set by the search function is valid, we can use the original page
+ * information.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Fixed-length
+ * column-store pages don't have slots, but map one-to-one to
+ * keys, check for retrieval past the end of the page.
+ */
+ if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; no further checks to do.
+ */
+ break;
+ case BTREE_COL_VAR:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Variable-length
+ * column-store pages don't map one-to-one to keys, but have
+ * "slots", check if search returned a valid slot.
+ */
+ if (cbt->slot >= page->pg_var_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; however, variable-length
+ * column store deletes are written into the backing store,
+ * check the cell for a record already deleted when read.
+ */
+ cip = &page->pg_var_d[cbt->slot];
+ if ((cell = WT_COL_PTR(page, cip)) == NULL ||
+ __wt_cell_type(cell) == WT_CELL_DEL)
+ return (0);
+ break;
+ case BTREE_ROW:
+ /*
+ * See above: for row-store, no insert object can have the same
+ * key as an on-page object, we're done.
+ */
+ if (cbt->ins != NULL)
+ return (0);
+
+ /*
+		 * Check if search returned a valid slot (the failure mode is
+ * an empty page, the search function doesn't check, and so the
+ * more exact test is "page->pg_row_entries == 0", but this test
+ * mirrors the column-store test).
+ */
+ if (cbt->slot >= page->pg_row_entries)
+ return (0);
+
+ /* Updates are stored on the page, check for a delete. */
+ if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
+ session, page->pg_row_upd[cbt->slot])) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ }
+ break;
+ }
+ return (1);
+}
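+
+/*
+ * The common caller pattern (as in __wt_btcur_search below) pairs the
+ * search result with this check:
+ *
+ *	if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
+ *		ret = __wt_kv_return(session, cbt, upd);
+ */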
+
+/*
+ * __cursor_col_search --
+ * Column-store search from an application cursor.
+ */
+static inline int
+__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_col_search(session, cbt->iface.recno, NULL, cbt));
+ return (ret);
+}
+
+/*
+ * __cursor_row_search --
+ * Row-store search from an application cursor.
+ */
+static inline int
+__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int insert)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_row_search(session, &cbt->iface.key, NULL, cbt, insert));
+ return (ret);
+}
+
+/*
+ * __cursor_col_modify --
+ * Column-store delete, insert, and update from an application cursor.
+ */
+static inline int
+__cursor_col_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_col_modify(session,
+ cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
+ * __cursor_row_modify --
+ * Row-store insert, update and delete from an application cursor.
+ */
+static inline int
+__cursor_row_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_row_modify(session,
+ cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
+ * __wt_btcur_reset --
+ * Invalidate the cursor position.
+ */
+int
+__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+ WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+ return (__cursor_reset(cbt));
+}
+
+/*
+ * __wt_btcur_search --
+ * Search for a matching record in the tree.
+ */
+int
+__wt_btcur_search(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 0) :
+ __cursor_col_search(session, cbt));
+ if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
+ ret = __wt_kv_return(session, cbt, upd);
+ else if (__cursor_fix_implicit(btree, cbt)) {
+ /*
+ * Creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ */
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ } else
+ ret = WT_NOTFOUND;
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_search_near --
+ * Search for a record in the tree.
+ */
+int
+__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ int exact;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+ exact = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ /*
+ * Set the "insert" flag for the btree row-store search; we may intend
+ * to position our cursor at the end of the tree, rather than match an
+ * existing record.
+ */
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+
+ /*
+	 * If we find a valid key, return it.
+ *
+ * Else, creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records. In this
+ * case, we instantiate the empty record, it's an exact match.
+ *
+ * Else, move to the next key in the tree (bias for prefix searches).
+ * Cursor next skips invalid rows, so we don't have to test for them
+ * again.
+ *
+ * Else, redo the search and move to the previous key in the tree.
+ * Cursor previous skips invalid rows, so we don't have to test for
+ * them again.
+ *
+ * If that fails, quit, there's no record to return.
+ */
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if (__cursor_fix_implicit(btree, cbt)) {
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ exact = 0;
+ } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
+ exact = 1;
+ else {
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
+ exact = -1;
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
+ *exactp = exact;
+ return (ret);
+}
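+
+/*
+ * At the application level, the exact-match contract above surfaces through
+ * the standard WT_CURSOR API; a hedged sketch (the handle_* functions are
+ * hypothetical):
+ *
+ *	cursor->set_key(cursor, "key42");
+ *	if ((ret = cursor->search_near(cursor, &exact)) == 0) {
+ *		if (exact == 0)
+ *			handle_exact_match(cursor);
+ *		else if (exact > 0)
+ *			handle_nearest_larger_key(cursor);
+ *		else
+ *			handle_nearest_smaller_key(cursor);
+ *	}
+ */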
+
+/*
+ * __wt_btcur_insert --
+ * Insert a record into the tree.
+ */
+int
+__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCRV(session,
+ cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * If WT_CURSTD_APPEND is set, insert a new record (ignoring
+ * the application's record number). First we search for the
+ * maximum possible record number so the search ends on the
+ * last page. The real record number is assigned by the
+ * serialized append operation.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = UINT64_MAX;
+
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = 0;
+
+ /*
+ * If not overwriting, fail if the key exists. Creating a
+ * record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ * Fail in that case, the record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
+ (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ WT_ERR(__cursor_col_modify(session, cbt, 0));
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = cbt->recno;
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key exists, else insert the
+ * key/value pair.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ cbt->compare == 0 && __cursor_valid(cbt, NULL))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /* Insert doesn't maintain a position across calls, clear resources. */
+ if (ret == 0)
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
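+
+/*
+ * The duplicate-key behavior above is visible through the standard cursor
+ * API; a hedged sketch (URI, key and value are hypothetical):
+ *
+ *	WT_CURSOR *c;
+ *
+ *	ret = wt_session->open_cursor(
+ *	    wt_session, "table:example", NULL, "overwrite=false", &c);
+ *	c->set_key(c, "key");
+ *	c->set_value(c, "value");
+ *	ret = c->insert(c);
+ *
+ * where wt_session is an application-level WT_SESSION handle and the insert
+ * returns WT_DUPLICATE_KEY if "key" already exists.
+ */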
+
+/*
+ * __wt_btcur_update_check --
+ * Check whether an update would conflict.
+ *
+ * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
+ * they only check for conflicts without updating the tree. It is used to
+ * maintain snapshot isolation for transactions that span multiple chunks
+ * in an LSM tree.
+ */
+int
+__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = &cbt->iface;
+ btree = cbt->btree;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+
+ /*
+ * We are only interested in checking for conflicts.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL)
+ ret = __wt_txn_update_check(session, cbt->ins->upd);
+ break;
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_remove --
+ * Remove a record from the tree.
+ */
+int
+__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /* Remove the record if it exists. */
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
+ if (!__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ /*
+ * Creating a record past the end of the tree in a
+ * fixed-length column-store implicitly fills the
+ * gap with empty records. Return success in that
+ * case, the record was deleted successfully.
+ *
+ * Correct the btree cursor's location: the search
+ * will have pointed us at the previous/next item,
+ * and that's not correct.
+ */
+ cbt->recno = cursor->recno;
+ } else
+ ret = __cursor_col_modify(session, cbt, 1);
+ break;
+ case BTREE_ROW:
+ /* Remove the record if it exists. */
+ WT_ERR(__cursor_row_search(session, cbt, 0));
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ WT_ERR(WT_NOTFOUND);
+
+ ret = __cursor_row_modify(session, cbt, 1);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /*
+ * If the cursor is configured to overwrite and the record is not
+ * found, that is exactly what we want.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+
+ return (ret);
+}
+
+/*
+ * __wt_btcur_update --
+ * Update a record in the tree.
+ */
+int
+__wt_btcur_update(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCRV(
+ session, cursor_update_bytes, cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /*
+ * If not overwriting, fail if the key doesn't exist. Update
+ * the record if it exists. Creating a record past the end of
+ * the tree in a fixed-length column-store implicitly fills the
+ * gap with empty records. Update the record in that case, the
+ * record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
+ !__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_col_modify(session, cbt, 0);
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key does not exist.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+
+ /*
+ * If successful, point the cursor at internal copies of the data. We
+ * could shuffle memory in the cursor so the key/value pair are in local
+ * buffer memory, but that's a data copy. We don't want to do another
+ * search (and we might get a different update structure if we race).
+ * To make this work, we add a field to the btree cursor to pass back a
+ * pointer to the modify function's allocated update structure.
+ */
+ if (ret == 0)
+ WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+	 * Only supports row-store: applications could trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET(ENOTSUP);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(__wt_row_random(session, cbt));
+ if (__cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else
+ WT_ERR(__wt_btcur_search_near(cbt, 0));
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_compare --
+ * Return a comparison between two cursors.
+ */
+int
+__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *a, *b;
+ WT_SESSION_IMPL *session;
+
+ a = (WT_CURSOR *)a_arg;
+ b = (WT_CURSOR *)b_arg;
+ btree = a_arg->btree;
+ session = (WT_SESSION_IMPL *)a->session;
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * Compare the interface's cursor record, not the underlying
+ * cursor reference: the interface's cursor reference is the
+ * one being returned to the application.
+ */
+ if (a->recno < b->recno)
+ *cmpp = -1;
+ else if (a->recno == b->recno)
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ break;
+ case BTREE_ROW:
+ WT_RET(__wt_compare(
+ session, btree->collator, &a->key, &b->key, cmpp));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __cursor_equals --
+ * Return if two cursors reference the same row.
+ */
+static int
+__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
+{
+ switch (a->btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * Compare the interface's cursor record, not the underlying
+ * cursor reference: the interface's cursor reference is the
+ * one being returned to the application.
+ */
+ if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
+ return (1);
+ break;
+ case BTREE_ROW:
+ if (a->ref != b->ref)
+ return (0);
+ if (a->ins != NULL || b->ins != NULL) {
+ if (a->ins == b->ins)
+ return (1);
+ break;
+ }
+ if (a->slot == b->slot)
+ return (1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_truncate --
+ * Discard a cursor range from row-store or variable-width column-store
+ * tree.
+ */
+static int
+__cursor_truncate(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+
+ /*
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ *
+ * If this is a row-store, we delete leaf pages having no overflow items
+ * without reading them; for that to work, we have to ensure we read the
+ * page referenced by the ending cursor, since we may be deleting only a
+ * partial page at the end of the truncation. Our caller already fully
+ * instantiated the end cursor, so we know that page is pinned in memory
+ * and we can proceed without concern.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __cursor_truncate_fix --
+ * Discard a cursor range from fixed-width column-store tree.
+ */
+static int
+__cursor_truncate_fix(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+ uint8_t *value;
+
+ /*
+ * Handle fixed-length column-store objects separately: for row-store
+ * and variable-length column-store objects we have "deleted" values
+ * and so returned objects actually exist: fixed-length column-store
+ * objects are filled-in if they don't exist, that is, if you create
+ * record 37, records 1-36 magically appear. Those records can't be
+ * deleted, which means we have to ignore already "deleted" records.
+ *
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ value = (uint8_t *)stop->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ value = (uint8_t *)start->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __wt_btcur_range_truncate --
+ * Discard a cursor range from the tree.
+ */
+int
+__wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (start != NULL) ? start : stop;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * For recovery, we log the start and stop keys for a truncate
+ * operation, not the individual records removed. On the other hand,
+ * for rollback we need to keep track of all the in-memory operations.
+ *
+ * We deal with this here by logging the truncate range first, then (in
+ * the logging code) disabling writing of the in-memory remove records
+ * to disk.
+ */
+ if (S2C(session)->logging)
+ WT_RET(__wt_txn_truncate_log(session, start, stop));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ WT_ERR(__cursor_truncate_fix(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_ROW:
+ /*
+ * The underlying cursor comparison routine requires cursors be
+ * fully instantiated when truncating row-store objects because
+ * it's comparing page and/or skiplist positions, not keys. (Key
+ * comparison would work, it's only that a key comparison would
+ * be relatively expensive. Column-store objects have record
+ * number keys, so the key comparison is cheap.) Cursors may
+ * have only had their keys set, so we must ensure the cursors
+ * are positioned in the tree.
+ */
+ if (start != NULL)
+ WT_ERR(__wt_btcur_search(start));
+ if (stop != NULL)
+ WT_ERR(__wt_btcur_search(stop));
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_row_modify));
+ break;
+ }
+
+err: if (S2C(session)->logging)
+ WT_TRET(__wt_txn_truncate_end(session));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_close --
+ * Close a btree cursor.
+ */
+int
+__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ ret = __curfile_leave(cbt);
+ __wt_buf_free(session, &cbt->search_key);
+ __wt_buf_free(session, &cbt->tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
new file mode 100644
index 00000000000..ebbb335d3a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -0,0 +1,1104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * We pass around a session handle and output information; group them together.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session; /* Enclosing session */
+
+ /*
+ * When using the standard event handlers, the debugging output has to
+ * do its own message handling because its output isn't line-oriented.
+ */
+ FILE *fp; /* Output file stream */
+ WT_ITEM *msg; /* Buffered message */
+
+ WT_ITEM *tmp; /* Temporary space */
+} WT_DBG;
+
+static const /* Output separator */
+ char * const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+
+static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *);
+static int __debug_cell_data(
+ WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *);
+static void __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
+static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
+static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_item(WT_DBG *, const char *, const void *, size_t);
+static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
+static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
+static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
+static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
+static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
+static int __debug_ref(WT_DBG *, WT_REF *);
+static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
+static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t);
+static void __debug_update(WT_DBG *, WT_UPDATE *, int);
+static void __dmsg(WT_DBG *, const char *, ...)
+ WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+static void __dmsg_wrapup(WT_DBG *);
+
+/*
+ * __wt_debug_set_verbose --
+ * Set verbose flags from the debugger.
+ */
+int
+__wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
+{
+ const char *cfg[2] = { NULL, NULL };
+ char buf[256];
+
+ snprintf(buf, sizeof(buf), "verbose=[%s]", v);
+ cfg[0] = buf;
+ return (__wt_verbose_config(session, cfg));
+}
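+
+/*
+ * A hedged usage sketch: this entry point exists to be called from a
+ * debugger, for example
+ *
+ *	(gdb) call __wt_debug_set_verbose(session, "evict")
+ *
+ * which is equivalent to configuring "verbose=[evict]" (assuming "evict"
+ * is a verbose category the build supports).
+ */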
+
+/*
+ * __debug_hex_byte --
+ * Output a single byte in hex.
+ */
+static inline void
+__debug_hex_byte(WT_DBG *ds, uint8_t v)
+{
+ static const char hex[] = "0123456789abcdef";
+
+ __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
+}
+
+/*
+ * __debug_config --
+ * Configure debugging output.
+ */
+static int
+__debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
+{
+ memset(ds, 0, sizeof(WT_DBG));
+
+ ds->session = session;
+
+ WT_RET(__wt_scr_alloc(session, 512, &ds->tmp));
+
+ /*
+ * If we weren't given a file, we use the default event handler, and
+ * we'll have to buffer messages.
+ */
+ if (ofile == NULL)
+ return (__wt_scr_alloc(session, 512, &ds->msg));
+
+ /* If we're using a file, flush on each line. */
+ if ((ds->fp = fopen(ofile, "w")) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "%s", ofile);
+
+ (void)setvbuf(ds->fp, NULL, _IOLBF, 0);
+ return (0);
+}
+
+/*
+ * __dmsg_wrapup --
+ * Flush any remaining output, release resources.
+ */
+static void
+__dmsg_wrapup(WT_DBG *ds)
+{
+ WT_SESSION_IMPL *session;
+ WT_ITEM *msg;
+
+ session = ds->session;
+ msg = ds->msg;
+
+ __wt_scr_free(&ds->tmp);
+
+ /*
+ * Discard the buffer -- it shouldn't have anything in it, but might
+ * as well be cautious.
+ */
+ if (msg != NULL) {
+ if (msg->size != 0)
+ (void)__wt_msg(session, "%s", (char *)msg->mem);
+ __wt_scr_free(&ds->msg);
+ }
+
+ /* Close any file we opened. */
+ if (ds->fp != NULL)
+ (void)fclose(ds->fp);
+}
+
+/*
+ * __dmsg --
+ * Debug message.
+ */
+static void
+__dmsg(WT_DBG *ds, const char *fmt, ...)
+{
+ va_list ap;
+ WT_ITEM *msg;
+ WT_SESSION_IMPL *session;
+ size_t len, space;
+ char *p;
+
+ session = ds->session;
+
+ /*
+ * Debug output chunks are not necessarily terminated with a newline
+ * character. It's easy if we're dumping to a stream, but if we're
+ * dumping to an event handler, which is line-oriented, we must buffer
+ * the output chunk, and pass it to the event handler once we see a
+ * terminating newline.
+ */
+ if (ds->fp == NULL) {
+ msg = ds->msg;
+ for (;;) {
+ p = (char *)msg->mem + msg->size;
+ space = msg->memsize - msg->size;
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(p, space, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < space) {
+ msg->size += len;
+ break;
+ }
+
+ /*
+ * There's not much to do on error without checking for
+ * an error return on every single printf. Anyway, it's
+ * pretty unlikely and this is debugging output, I'm not
+ * going to worry about it.
+ */
+ if (__wt_buf_grow(
+ session, msg, msg->memsize + len + 128) != 0)
+ return;
+ }
+ if (((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
+ ((uint8_t *)msg->mem)[msg->size - 1] = '\0';
+ (void)__wt_msg(session, "%s", (char *)msg->mem);
+ msg->size = 0;
+ }
+ } else {
+ va_start(ap, fmt);
+ (void)vfprintf(ds->fp, fmt, ap);
+ va_end(ap);
+ }
+}
+
+/*
+ * __wt_debug_addr_print --
+ * Print out an address.
+ */
+int
+__wt_debug_addr_print(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_ITEM(buf);
+
+ WT_RET(__wt_scr_alloc(session, 128, &buf));
+ fprintf(stderr, "%s\n",
+ __wt_addr_string(session, addr, addr_size, buf));
+ __wt_scr_free(&buf);
+
+ return (0);
+}
+
+/*
+ * __wt_debug_addr --
+ * Read and dump a disk page in debugging mode, using an addr/size pair.
+ */
+int
+__wt_debug_addr(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, const char *ofile)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ bm = S2BT(session)->bm;
+
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(bm->read(bm, session, buf, addr, addr_size));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset_blind --
+ * Read and dump a disk page in debugging mode, using a file offset.
+ */
+int
+__wt_debug_offset_blind(
+ WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only. Other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ */
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(__wt_block_read_off_blind(
+ session, S2BT(session)->bm->block, buf, offset));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset --
+ * Read and dump a disk page in debugging mode, using a file
+ * offset/size/checksum triplet.
+ */
+int
+__wt_debug_offset(WT_SESSION_IMPL *session,
+ wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only: other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ *
+ * Convert the triplet into an address structure.
+ */
+ endp = addr;
+ WT_RET(__wt_block_addr_to_buffer(
+ S2BT(session)->bm->block, &endp, offset, size, cksum));
+
+ /*
+ * Read the address through the btree I/O functions (so the block is
+ * decompressed as necessary).
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_bt_read(session, buf, addr, WT_PTRDIFF(endp, addr)));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_disk --
+ * Dump a disk page in debugging mode.
+ */
+int
+__wt_debug_disk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ __dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ __dmsg(ds, ", recno %" PRIu64, dsk->recno);
+ /* FALLTHROUGH */
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
+ break;
+ case WT_PAGE_OVFL:
+ __dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ __debug_dsk_col_fix(ds, dsk);
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ret = __debug_dsk_cell(ds, dsk);
+ break;
+ default:
+ break;
+ }
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_dsk_col_fix --
+ * Dump a WT_PAGE_COL_FIX page.
+ */
+static void
+__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ uint32_t i;
+ uint8_t v;
+
+ btree = S2BT(ds->session);
+
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t{");
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+ }
+}
+
+/*
+ * __debug_dsk_cell --
+ * Dump a page of WT_CELL's.
+ */
+static int
+__debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i;
+
+ btree = S2BT(ds->session);
+ unpack = &_unpack;
+
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_RET(__debug_cell(ds, dsk, unpack));
+ }
+ return (0);
+}
+
+/*
+ * __debug_tree_shape_info --
+ * Pretty-print information about a page.
+ */
+static char *
+__debug_tree_shape_info(WT_PAGE *page)
+{
+ uint64_t v;
+ static char buf[32];
+
+ v = page->memory_footprint;
+ if (v >= WT_GIGABYTE)
+ snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE);
+ else if (v >= WT_MEGABYTE)
+ snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE);
+ else
+ snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v);
+ return (buf);
+}
+
+/*
+ * __debug_tree_shape_worker --
+ * Dump information about the current page and descend.
+ */
+static void
+__debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) {
+ __dmsg(ds, "%*s" "I" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM)
+ __debug_tree_shape_worker(
+ ds, ref->page, level + 3);
+ } WT_INTL_FOREACH_END;
+ } else
+ __dmsg(ds, "%*s" "L" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+}
+
+/*
+ * __wt_debug_tree_shape --
+ * Dump the shape of the in-memory tree.
+ */
+int
+__wt_debug_tree_shape(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ __debug_tree_shape_worker(ds, page, 0);
+
+ __dmsg_wrapup(ds);
+ return (0);
+}
+
+#define WT_DEBUG_TREE_LEAF 0x01 /* Debug leaf pages */
+#define WT_DEBUG_TREE_WALK 0x02 /* Descend the tree */
+
+/*
+ * __wt_debug_tree_all --
+ * Dump the in-memory information for a tree, including leaf pages.
+ */
+int
+__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ return (__debug_tree(
+ session, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
+}
+
+/*
+ * __wt_debug_tree --
+ * Dump the in-memory information for a tree, not including leaf pages.
+ */
+int
+__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ return (__debug_tree(session, page, ofile, WT_DEBUG_TREE_WALK));
+}
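+
+/*
+ * Like __wt_debug_set_verbose, these are most useful from a debugger; a
+ * hedged sketch (the output path is hypothetical, a NULL page starts at
+ * the root):
+ *
+ *	(gdb) call __wt_debug_tree_all(session, (WT_PAGE *)0, "/tmp/tree.out")
+ *
+ * Dumping to a file avoids buffering output through the event handler.
+ */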
+
+/*
+ * __wt_debug_page --
+ * Dump the in-memory information for a page.
+ */
+int
+__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF);
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_tree --
+ * Dump the in-memory information for a tree.
+ */
+static int
+__debug_tree(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ ret = __debug_page(ds, page, flags);
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_page --
+ * Dump the in-memory information for an in-memory page.
+ */
+static int
+__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ /* Dump the page metadata. */
+ WT_RET(__debug_page_metadata(ds, page));
+
+ /* Dump the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ __debug_page_col_fix(ds, page);
+ break;
+ case WT_PAGE_COL_INT:
+ WT_RET(__debug_page_col_int(ds, page, flags));
+ break;
+ case WT_PAGE_COL_VAR:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_col_var(ds, page));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__debug_page_row_int(ds, page, flags));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_row_leaf(ds, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __debug_page_metadata --
+ * Dump an in-memory page's metadata.
+ */
+static int
+__debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_SESSION_IMPL *session;
+ uint32_t entries;
+
+ session = ds->session;
+ mod = page->modify;
+
+ __dmsg(ds, "%p", page);
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno);
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_COL_FIX:
+ __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno);
+ entries = page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ __dmsg(ds, " recno %" PRIu64, page->pg_var_recno);
+ entries = page->pg_var_entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ entries = page->pg_row_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ __dmsg(ds, ": %s\n", __wt_page_type_string(page->type));
+ __dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries);
+ __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ __dmsg(ds, ", keys-built");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __dmsg(ds, ", disk-alloc");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ __dmsg(ds, ", disk-mapped");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ __dmsg(ds, ", evict-lru");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
+ __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING))
+ __dmsg(ds, ", splitting");
+
+ if (mod != NULL)
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ __dmsg(ds, ", empty");
+ break;
+ case WT_PM_REC_MULTIBLOCK:
+ __dmsg(ds, ", multiblock");
+ break;
+ case WT_PM_REC_REPLACE:
+ __dmsg(ds, ", replaced");
+ break;
+ case 0:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ if (mod != NULL)
+ __dmsg(ds, ", write generation=%" PRIu32, mod->write_gen);
+ __dmsg(ds, "\n");
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_fix --
+ * Dump an in-memory WT_PAGE_COL_FIX page.
+ */
+static void
+__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ const WT_PAGE_HEADER *dsk;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ uint32_t i;
+ uint8_t v;
+
+ session = ds->session;
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_fix_recno;
+
+ if (dsk != NULL) {
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page));
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t%" PRIu64 "\t{", recno);
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+
+ /* Check for a match on the update list. */
+ if (ins != NULL && WT_INSERT_RECNO(ins) == recno) {
+ __dmsg(ds,
+ "\tupdate %" PRIu64 "\n",
+ WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, 1);
+ ins = WT_SKIP_NEXT(ins);
+ }
+ ++recno;
+ }
+ }
+
+ if (WT_COL_UPDATE_SINGLE(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", 1);
+ }
+ if (WT_COL_APPEND(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_APPEND(page), "append", 1);
+ }
+}
+
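/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * WT_FIX_FOREACH above iterates a bitmap of fixed-width column values. One
 * common layout packs each w-bit value MSB-first; the bit order and helper
 * name here are assumptions made for illustration only:
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t
bit_getv(const uint8_t *bitf, uint32_t entry, uint8_t width)
{
	uint32_t bit = entry * width;	/* starting bit of this entry */
	uint8_t v = 0;

	for (uint8_t i = 0; i < width; ++i, ++bit)
		v = (uint8_t)((v << 1) |
		    ((bitf[bit >> 3] >> (7 - (bit & 7))) & 1));
	return (v);
}

int
main(void)
{
	/* Two 4-bit values, 0x3 and 0xA, packed into a single byte. */
	uint8_t bitmap[] = { 0x3A };

	printf("%x %x\n", bit_getv(bitmap, 0, 4), bit_getv(bitmap, 1, 4));
	return (0);
}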
+/*
+ * __debug_page_col_int --
+ * Dump an in-memory WT_PAGE_COL_INT page.
+ */
+static int
+__debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_var --
+ * Dump an in-memory WT_PAGE_COL_VAR page.
+ */
+static int
+__debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_INSERT_HEAD *update;
+ uint64_t recno, rle;
+ uint32_t i;
+ char tag[64];
+
+ unpack = &_unpack;
+ recno = page->pg_var_recno;
+
+ WT_COL_FOREACH(page, cip, i) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ unpack = NULL;
+ rle = 1;
+ } else {
+ /* Reset unpack: a deleted cell in a prior iteration cleared it. */
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ rle = __wt_cell_rle(unpack);
+ }
+ snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle);
+ WT_RET(
+ __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack));
+
+ if ((update = WT_COL_UPDATE(page, cip)) != NULL)
+ __debug_col_skip(ds, update, "update", 0);
+ recno += rle;
+ }
+
+ if (WT_COL_APPEND(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_APPEND(page), "append", 0);
+ }
+
+ return (0);
+}
+
+/*
+ * __debug_page_row_int --
+ * Dump an in-memory WT_PAGE_ROW_INT page.
+ */
+static int
+__debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+ size_t len;
+ uint8_t *p;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &p, &len);
+ __debug_item(ds, "K", p, len);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+}
+
+/*
+ * __debug_page_row_leaf --
+ * Dump an in-memory WT_PAGE_ROW_LEAF page.
+ */
+static int
+__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_INSERT_HEAD *insert;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ uint32_t i;
+
+ session = ds->session;
+ unpack = &_unpack;
+ WT_RET(__wt_scr_alloc(session, 256, &key));
+
+ /*
+ * Dump any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+ __debug_row_skip(ds, insert);
+
+ /* Dump the page's K/V pairs. */
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ __debug_item(ds, "K", key->data, key->size);
+
+ if ((cell = __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ __dmsg(ds, "\tV {}\n");
+ else {
+ __wt_cell_unpack(cell, unpack);
+ WT_ERR(__debug_cell_data(
+ ds, page, WT_PAGE_ROW_LEAF, "V", unpack));
+ }
+
+ if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
+ __debug_update(ds, upd, 0);
+
+ if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+ __debug_row_skip(ds, insert);
+ }
+
+err: __wt_scr_free(&key);
+ return (ret);
+}
+
+/*
+ * __debug_col_skip --
+ * Dump a column-store skiplist.
+ */
+static void
+__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, int hexbyte)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __dmsg(ds,
+ "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, hexbyte);
+ }
+}
+
+/*
+ * __debug_row_skip --
+ * Dump an insert list.
+ */
+static void
+__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __debug_item(ds,
+ "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins));
+ __debug_update(ds, ins->upd, 0);
+ }
+}
+
+/*
+ * __debug_update --
+ * Dump an update list.
+ */
+static void
+__debug_update(WT_DBG *ds, WT_UPDATE *upd, int hexbyte)
+{
+ for (; upd != NULL; upd = upd->next)
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ __dmsg(ds, "\tvalue {deleted}\n");
+ else if (hexbyte) {
+ __dmsg(ds, "\t{");
+ __debug_hex_byte(ds,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ __dmsg(ds, "}\n");
+ } else
+ __debug_item(ds,
+ "value", WT_UPDATE_DATA(upd), upd->size);
+}
+
+/*
+ * __debug_ref --
+ * Dump a WT_REF structure.
+ */
+static int
+__debug_ref(WT_DBG *ds, WT_REF *ref)
+{
+ WT_SESSION_IMPL *session;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t");
+ switch (ref->state) {
+ case WT_REF_DISK:
+ __dmsg(ds, "disk");
+ break;
+ case WT_REF_DELETED:
+ __dmsg(ds, "deleted");
+ break;
+ case WT_REF_LOCKED:
+ __dmsg(ds, "locked %p", ref->page);
+ break;
+ case WT_REF_MEM:
+ __dmsg(ds, "memory %p", ref->page);
+ break;
+ case WT_REF_READING:
+ __dmsg(ds, "reading");
+ break;
+ case WT_REF_SPLIT:
+ __dmsg(ds, "split");
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __dmsg(ds, " %s\n",
+ __wt_addr_string(session, addr, addr_size, ds->tmp));
+
+ return (0);
+}
+
+/*
+ * __debug_cell --
+ * Dump a single unpacked WT_CELL.
+ */
+static int
+__debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *type;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t%s: len %" PRIu32,
+ __wt_cell_type_string(unpack->raw), unpack->size);
+
+ /* Dump cell's per-disk page type information. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ switch (unpack->type) {
+ case WT_CELL_VALUE:
+ __dmsg(ds, ", recno: %" PRIu64, unpack->v);
+ break;
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ switch (unpack->type) {
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ __dmsg(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack));
+ break;
+ }
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ __dmsg(ds, ", pfx: %" PRIu8, unpack->prefix);
+ break;
+ }
+ break;
+ }
+
+ /* Dump addresses. */
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ type = "addr/del";
+ goto addr;
+ case WT_CELL_ADDR_INT:
+ type = "addr/int";
+ goto addr;
+ case WT_CELL_ADDR_LEAF:
+ type = "addr/leaf";
+ goto addr;
+ case WT_CELL_ADDR_LEAF_NO:
+ type = "addr/leaf-no";
+ goto addr;
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ type = "ovfl";
+addr: WT_RET(__wt_scr_alloc(session, 128, &buf));
+ __dmsg(ds, ", %s %s", type,
+ __wt_addr_string(session, unpack->data, unpack->size, buf));
+ __wt_scr_free(&buf);
+ WT_RET(ret);
+ break;
+ }
+ __dmsg(ds, "\n");
+
+ return (__debug_cell_data(ds, NULL, dsk->type, NULL, unpack));
+}
+
+/*
+ * __debug_cell_data --
+ * Dump a single cell's data in debugging mode.
+ */
+static int
+__debug_cell_data(WT_DBG *ds,
+ WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *p;
+
+ session = ds->session;
+
+ /*
+ * Column-store references to deleted cells return a NULL cell
+ * reference.
+ */
+ if (unpack == NULL) {
+ __debug_item(ds, tag, "deleted", strlen("deleted"));
+ return (0);
+ }
+
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL_RM:
+ p = __wt_cell_type_string(unpack->raw);
+ __debug_item(ds, tag, p, strlen(p));
+ break;
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ WT_RET(__wt_scr_alloc(session, 256, &buf));
+ ret = page == NULL ?
+ __wt_dsk_cell_data_ref(session, page_type, unpack, buf) :
+ __wt_page_cell_data_ref(session, page, unpack, buf);
+ if (ret == 0)
+ __debug_item(ds, tag, buf->data, buf->size);
+ __wt_scr_free(&buf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (ret);
+}
+
+/*
+ * __debug_item --
+ * Dump a single data/size pair, with an optional tag.
+ */
+static void
+__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size)
+{
+ size_t i;
+ int ch;
+ const uint8_t *data;
+
+ __dmsg(ds, "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ");
+ for (data = data_arg, i = 0; i < size; ++i, ++data) {
+ ch = data[0];
+ if (isprint(ch))
+ __dmsg(ds, "%c", ch);
+ else
+ __debug_hex_byte(ds, data[0]);
+ }
+ __dmsg(ds, "}\n");
+}
+#endif
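/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * __debug_item above prints each key/value byte as-is when printable and as
 * hex otherwise. The same pattern, stand-alone:
 */
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>

static void
item_dump(const char *tag, const void *data_arg, size_t size)
{
	const unsigned char *data = data_arg;

	printf("\t%s {", tag);
	for (size_t i = 0; i < size; ++i)
		if (isprint(data[i]))
			printf("%c", data[i]);
		else
			printf("%#x", data[i]);	/* hex for unprintable bytes */
	printf("}\n");
}

int
main(void)
{
	item_dump("K", "key\x01", 4);	/* prints:	K {key0x1} */
	return (0);
}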
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
new file mode 100644
index 00000000000..2fc1b0d5460
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -0,0 +1,339 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Fast-delete support.
+ *
+ * This file contains most of the code that allows WiredTiger to delete pages
+ * of data without reading them into the cache. (This feature is currently
+ * only available for row-store objects.)
+ *
+ * Cursor truncate of a row-store object works by explicitly reading the
+ * first and last pages of the truncate range, then walking the tree with a
+ * flag so the cursor walk code marks as deleted any page within the range
+ * that hasn't yet been read and that has no overflow items, by changing the
+ * WT_REF state to WT_REF_DELETED. Pages already in the cache, or pages with
+ * overflow items, have their rows updated/deleted individually. The
+ * transaction for the delete operation is stored in memory referenced by
+ * the WT_REF.page_del field.
+ *
+ * Future cursor walks of the tree will skip the deleted page based on the
+ * transaction stored for the delete, but it gets more complicated if a read is
+ * done using a random key, or a cursor walk is done with a transaction where
+ * the delete is not visible. In those cases, we read the original contents of
+ * the page. The page-read code notices a deleted page is being read, and as
+ * part of the read instantiates the contents of the page, creating a WT_UPDATE
+ * with a deleted operation, in the same transaction as deleted the page. In
+ * other words, the read process makes it appear as if the page was read and
+ * each individual row deleted, exactly as would have happened if the page had
+ * been in the cache all along.
+ *
+ * There's an additional complication to support rollback of the page delete.
+ * When the page was marked deleted, a pointer to the WT_REF was saved in the
+ * deleting session's transaction list and the delete is unrolled by resetting
+ * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been
+ * instantiated by some reading thread, that's not enough: each individual row
+ * on the page must have the delete operation reset. If the page split, the
+ * WT_UPDATE lists might have been saved/restored during reconciliation and
+ * appear on multiple pages, and the WT_REF stored in the deleting session's
+ * transaction list is no longer useful. For this reason, when the page is
+ * instantiated by a read, a list of the WT_UPDATE structures on the page is
+ * stored in the WT_REF.page_del field, along with the transaction ID, so the
+ * session unrolling the delete can find all of the WT_UPDATE structures that
+ * must be aborted.
+ *
+ * One final note: pages can also be marked deleted if emptied and evicted. In
+ * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
+ * be any associated WT_REF.page_del field. These pages are always skipped
+ * during cursor traversal (the page could not have been evicted if there were
+ * updates that weren't globally visible), and if a read is forced to
+ * instantiate such a page, it simply creates an empty page from scratch.
+ */
+
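/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The fast-delete path below is a small state machine on WT_REF.state, driven
 * by compare-and-swap so only one thread can move a page from "on disk" to
 * "locked" before publishing the "deleted" state. Modeled here with C11
 * atomics; the enum and function names are invented for illustration:
 */
#include <stdatomic.h>
#include <stdio.h>

enum ref_state { REF_DISK, REF_LOCKED, REF_DELETED };

static _Atomic enum ref_state state = REF_DISK;

static int
fast_delete(void)
{
	enum ref_state expected = REF_DISK;

	/* Lock the reference; fail if another thread got there first. */
	if (!atomic_compare_exchange_strong(&state, &expected, REF_LOCKED))
		return (0);			/* no fast delete */

	/* ... checks that can fail would restore REF_DISK here ... */

	atomic_store(&state, REF_DELETED);	/* publish the delete */
	return (1);
}

int
main(void)
{
	printf("fast delete %s\n", fast_delete() ? "succeeded" : "skipped");
	return (0);
}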
+/*
+ * __wt_delete_page --
+ * If deleting a range, try to delete the page without instantiating it.
+ */
+int
+__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ *skipp = 0;
+
+ /*
+ * Atomically switch the page's state to lock it. If the page is not
+ * on-disk, other threads may be using it, no fast delete.
+ *
+ * Possible optimization: if the page is already deleted and the delete
+ * is visible to us (the delete has been committed), we could skip the
+ * page instead of instantiating it and figuring out there are no rows
+ * in the page. While that's a huge amount of work to no purpose, it's
+ * unclear optimizing for overlapping range deletes is worth the effort.
+ */
+ if (ref->state != WT_REF_DISK ||
+ !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ return (0);
+
+ /*
+ * We cannot fast-delete pages that have overflow key/value items as
+ * the overflow blocks have to be discarded. The way we figure that
+ * out is to check the on-page cell type for the page, cells for leaf
+ * pages that have no overflow items are special.
+ *
+ * In some cases, the reference address may not reference an on-page
+ * cell (for example, some combination of page splits), in which case
+ * we can't check the original cell value and we fail.
+ *
+ * To look at an on-page cell, we need to look at the parent page, and
+ * that's dangerous: our parent page could change without warning if
+ * it were to split, deepening the tree. It's still safe, because the
+ * page's reference will always point to some valid page, and if we find
+ * any problems we simply fail the fast-delete optimization.
+ *
+ * !!!
+ * I doubt it's worth the effort, but we could copy the cell's type into
+ * the reference structure, and then we wouldn't need an on-page cell.
+ */
+ parent = ref->home;
+ if (__wt_off_page(parent, ref->addr) ||
+ __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
+ goto err;
+
+ /*
+ * This action dirties the parent page: mark it dirty now, there's no
+ * future reconciliation of the child leaf page that will dirty it as
+ * we write the tree.
+ */
+ WT_ERR(__wt_page_parent_modify_set(session, ref, 0));
+
+ /*
+ * Record the change in the transaction structure and set the change's
+ * transaction ID.
+ */
+ WT_ERR(__wt_calloc_def(session, 1, &ref->page_del));
+ ref->page_del->txnid = session->txn.id;
+
+ WT_ERR(__wt_txn_modify_ref(session, ref));
+
+ *skipp = 1;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+
+err: __wt_free(session, ref->page_del);
+
+ /*
+ * Restore the page to on-disk status, we'll have to instantiate it.
+ */
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ return (ret);
+}
+
+/*
+ * __wt_delete_page_rollback --
+ * Abort pages that were deleted without being instantiated.
+ */
+void
+__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_UPDATE **upd;
+
+ /*
+ * If the page is still "deleted", it's as we left it, reset the state
+ * to on-disk and we're done. Otherwise, we expect the page is either
+ * instantiated or being instantiated. Loop because it's possible for
+ * the page to return to the deleted state if instantiation fails.
+ */
+ for (;; __wt_yield())
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_READING:
+ WT_ASSERT(session, 0); /* Impossible, assert */
+ break;
+ case WT_REF_DELETED:
+ /*
+ * If the page is still "deleted", it's as we left it,
+ * reset the state.
+ */
+ if (WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_DISK))
+ return;
+ break;
+ case WT_REF_LOCKED:
+ /*
+ * A possible state, the page is being instantiated.
+ */
+ break;
+ case WT_REF_MEM:
+ case WT_REF_SPLIT:
+ /*
+ * We can't use the normal read path to get a copy of
+ * the page because the session may have closed the
+ * cursor, we no longer have the reference to the tree
+ * required for a hazard pointer. We're safe because
+ * with unresolved transactions, the page isn't going
+ * anywhere.
+ *
+ * The page is in an in-memory state, walk the list of
+ * update structures and abort them.
+ */
+ for (upd =
+ ref->page_del->update_list; *upd != NULL; ++upd)
+ (*upd)->txnid = WT_TXN_ABORTED;
+
+ /*
+ * Discard the memory, the transaction can't abort
+ * twice.
+ */
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ return;
+ }
+}
+
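/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Once a fast-deleted page has been instantiated, rollback walks the
 * NULL-terminated update array saved in page_del and marks every update
 * aborted. A stand-alone model; the types and the WT_TXN_ABORTED-style
 * sentinel below are invented for illustration:
 */
#include <stdint.h>
#include <stdlib.h>

#define TXN_ABORTED UINT64_MAX		/* stand-in for WT_TXN_ABORTED */

struct update { uint64_t txnid; };

static void
rollback_updates(struct update **update_list)
{
	/* The list is NULL-terminated, like page_del->update_list. */
	for (struct update **upd = update_list; *upd != NULL; ++upd)
		(*upd)->txnid = TXN_ABORTED;

	free(update_list);		/* a transaction can't abort twice */
}

int
main(void)
{
	struct update u = { 42 };
	struct update **list = calloc(2, sizeof(struct update *));

	if (list == NULL)
		return (1);
	list[0] = &u;			/* list[1] stays NULL */
	rollback_updates(list);
	return (u.txnid == TXN_ABORTED ? 0 : 1);
}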
+/*
+ * __wt_delete_page_skip --
+ * If iterating a cursor, skip deleted pages that are visible to us.
+ */
+int
+__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ int skip;
+
+ /*
+ * A deleted page comes from one of two sources: either it was deleted by
+ * a fast-delete as described above, or it was emptied by other operations
+ * and eviction deleted it.
+ *
+ * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
+ * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
+ * the transaction ID of the transaction that deleted the page, and the
+ * page is visible if that transaction ID is visible. In the case of an
+ * empty page, there will be no WT_PAGE_DELETED structure and the delete
+ * is by definition visible, eviction could not have deleted the page if
+ * there were changes on it that were not globally visible.
+ *
+ * We're here because we found a WT_REF state set to WT_REF_DELETED. It
+ * is possible the page is being read into memory right now, though, and
+ * the page could switch to an in-memory state at any time. Lock down
+ * the structure, just to be safe.
+ */
+ if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (0);
+
+ skip = ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (skip);
+}
+
+/*
+ * __wt_delete_page_instantiate --
+ * Instantiate an entirely deleted row-store leaf page.
+ */
+int
+__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_DELETED *page_del;
+ WT_UPDATE **upd_array, *upd;
+ uint32_t i;
+
+ btree = S2BT(session);
+ page = ref->page;
+ page_del = ref->page_del;
+
+ /*
+ * Give the page a modify structure.
+ *
+ * If the tree is already dirty and so will be written, mark the page
+ * dirty. (We'd like to free the deleted pages, but if the handle is
+ * read-only or if the application never modifies the tree, we're not
+ * able to do so.)
+ */
+ if (btree->modified) {
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+
+ /*
+ * An operation is accessing a "deleted" page, and we're building an
+ * in-memory version of the page (making it look like all entries in
+ * the page were individually updated by a remove operation). There
+ * are two cases where we end up here:
+ *
+ * First, a running transaction used a truncate call to delete the page
+ * without reading it, in which case the page reference includes a
+ * structure with a transaction ID; the page we're building might split
+ * in the future, so we update that structure to include references to
+ * all of the update structures we create, so the transaction can abort.
+ *
+ * Second, a truncate call deleted a page and the truncate committed,
+ * but an older transaction in the system forced us to keep the old
+ * version of the page around, then we crashed and recovered, and now
+ * we're being forced to read that page.
+ *
+ * In the first case, we have a page reference structure; in the second
+ * case, we don't.
+ *
+ * Allocate the per-reference update array; in the case of instantiating
+ * a page, deleted by a running transaction that might eventually abort,
+ * we need a list of the update structures so we can do that abort. The
+ * hard case is if a page splits: the update structures might be moved
+ * to different pages, and we still have to find them all for an abort.
+ */
+
+ if (page_del != NULL)
+ WT_RET(__wt_calloc_def(
+ session, page->pg_row_entries + 1, &page_del->update_list));
+
+ /* Allocate the per-page update array. */
+ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
+ page->pg_row_upd = upd_array;
+
+ /*
+ * Fill in the per-reference update array with references to update
+ * structures, and fill in the per-page update array with references to
+ * deleted items.
+ */
+ for (i = 0; i < page->pg_row_entries; ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, &upd));
+ WT_UPDATE_DELETED_SET(upd);
+
+ if (page_del == NULL)
+ upd->txnid = WT_TXN_NONE; /* Globally visible */
+ else {
+ upd->txnid = page_del->txnid;
+ page_del->update_list[i] = upd;
+ }
+
+ upd->next = upd_array[i];
+ upd_array[i] = upd;
+ }
+
+ __wt_cache_page_inmem_incr(session, page,
+ page->pg_row_entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE)));
+
+ return (0);
+
+err: /*
+ * There's no need to free the page update structures on error, our
+ * caller will discard the page and do that work for us. We could
+ * similarly leave the per-reference update array alone because it
+ * won't ever be used by any page that's not in-memory, but cleaning
+ * it up makes sense, especially if we come back in to this function
+ * attempting to instantiate this page again.
+ */
+ if (page_del != NULL)
+ __wt_free(session, page_del->update_list);
+ return (ret);
+}
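/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Instantiation above makes a truncated page look as if each row had been
 * removed individually, allocating one "deleted" update per row and, when the
 * truncate is still unresolved, recording each update in the reference's
 * NULL-terminated list so a later abort can find them all. A stand-alone
 * model; all types and names are invented (error paths leak, it's a sketch):
 */
#include <stdint.h>
#include <stdlib.h>

struct update { uint64_t txnid; int deleted; };

static int
instantiate_deleted(uint32_t entries, uint64_t txnid,
    struct update ***upd_arrayp, struct update ***update_listp)
{
	/* Per-page update array, one slot per row. */
	struct update **upd_array = calloc(entries, sizeof(*upd_array));
	/* Per-reference list, NULL-terminated, for a possible abort. */
	struct update **update_list =
	    calloc(entries + 1, sizeof(*update_list));

	if (upd_array == NULL || update_list == NULL)
		return (-1);

	for (uint32_t i = 0; i < entries; ++i) {
		struct update *upd = calloc(1, sizeof(*upd));
		if (upd == NULL)
			return (-1);
		upd->deleted = 1;	/* every row looks removed */
		upd->txnid = txnid;
		update_list[i] = upd;	/* findable by a future abort */
		upd_array[i] = upd;
	}

	*upd_arrayp = upd_array;
	*update_listp = update_list;
	return (0);
}

int
main(void)
{
	struct update **a, **l;

	return (instantiate_deleted(3, 7, &a, &l));
}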
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
new file mode 100644
index 00000000000..a162e2dc841
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
+static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
+static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
+static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
+
+/*
+ * __wt_ref_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ */
+void
+__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /*
+ * A version of the page-out function that allows us to make additional
+ * diagnostic checks.
+ */
+ WT_ASSERT(session, S2BT(session)->evict_ref != ref);
+
+ __wt_page_out(session, &ref->page);
+}
+
+/*
+ * __wt_page_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ */
+void
+__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+
+ /*
+ * Kill our caller's reference, do our best to catch races.
+ */
+ page = *pagep;
+ *pagep = NULL;
+
+ /*
+ * We should never discard a dirty page, the file's current eviction
+ * point or a page queued for LRU eviction.
+ */
+ WT_ASSERT(session, !__wt_page_is_modified(page));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING));
+
+#ifdef HAVE_DIAGNOSTIC
+ {
+ WT_HAZARD *hp;
+ int i;
+ /*
+ * Make sure no other thread has a hazard pointer on the page we are
+ * about to discard. This is complicated by the fact that readers
+ * publish their hazard pointer before re-checking the page state, so
+ * our check can race with readers without indicating a real problem.
+ * Wait for up to a second for hazard pointers to be cleared.
+ */
+ for (hp = NULL, i = 0; i < 100; i++) {
+ if ((hp = __wt_page_hazard_check(session, page)) == NULL)
+ break;
+ __wt_sleep(0, 10000);
+ }
+ if (hp != NULL)
+ __wt_errx(session,
+ "discarded page has hazard pointer: (%p: %s, line %d)",
+ hp->page, hp->file, hp->line);
+ WT_ASSERT(session, hp == NULL);
+ }
+#endif
+
+ /*
+ * If a root page split, there may be one or more pages linked from the
+ * page; walk the list, discarding pages.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ mod = page->modify;
+ if (mod != NULL && mod->mod_root_split != NULL)
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ /* Update the cache's information. */
+ __wt_cache_page_evict(session, page);
+
+ /*
+ * If discarding the page as part of process exit, the application may
+ * configure to leak the memory rather than do the work.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
+ return;
+
+ /* Free the page modification information. */
+ if (page->modify != NULL)
+ __free_page_modify(session, page);
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ __free_page_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ __free_page_col_var(session, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ __free_page_row_leaf(session, page);
+ break;
+ }
+
+ /* Discard any disk image. */
+ dsk = (WT_PAGE_HEADER *)page->dsk;
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ (void)__wt_mmap_discard(session, dsk, dsk->mem_size);
+
+ __wt_overwrite_and_free(session, page);
+}
+
+/*
+ * __free_page_modify --
+ * Discard the page's associated modification structures.
+ */
+static void
+__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_INSERT_HEAD *append;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_MULTIBLOCK:
+ /* Free list of replacement blocks. */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->addr.addr);
+ }
+ __wt_free(session, mod->mod_multi);
+ break;
+ case WT_PM_REC_REPLACE:
+ /*
+ * Discard any replacement address: this memory is usually moved
+ * into the parent's WT_REF, but at the root that can't happen.
+ */
+ __wt_free(session, mod->mod_replace.addr);
+ break;
+ }
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Free the append array. */
+ if ((append = WT_COL_APPEND(page)) != NULL) {
+ __free_skip_list(session, WT_SKIP_FIRST(append));
+ __wt_free(session, append);
+ __wt_free(session, mod->mod_append);
+ }
+
+ /* Free the insert/update array. */
+ if (mod->mod_update != NULL)
+ __free_skip_array(session, mod->mod_update,
+ page->type ==
+ WT_PAGE_COL_FIX ? 1 : page->pg_var_entries);
+ break;
+ }
+
+ /* Free the overflow on-page, reuse and transaction-cache skiplists. */
+ __wt_ovfl_reuse_free(session, page);
+ __wt_ovfl_txnc_free(session, page);
+ __wt_ovfl_discard_free(session, page);
+
+ __wt_free(session, page->modify->ovfl_track);
+
+ __wt_free(session, page->modify);
+}
+
+/*
+ * __free_page_int --
+ * Discard a WT_PAGE_COL_INT or WT_PAGE_ROW_INT page.
+ */
+static void
+__free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0);
+}
+
+/*
+ * __wt_free_ref --
+ * Discard the contents of a WT_REF structure (optionally including the
+ * pages it references).
+ */
+void
+__wt_free_ref(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages)
+{
+ WT_IKEY *ikey;
+
+ if (ref == NULL)
+ return;
+
+ /*
+ * Optionally free the referenced pages. (The path that frees referenced
+ * pages is used only for error cleanup; no instantiated and then discarded
+ * page should have WT_REF entries with real pages. The page may have been
+ * marked dirty as well; page discard checks for that, so we mark it clean
+ * explicitly.)
+ */
+ if (free_pages && ref->page != NULL) {
+ if (ref->page->modify != NULL) {
+ ref->page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, ref->page);
+ }
+ __wt_page_out(session, &ref->page);
+ }
+
+ /* Free any key allocation. */
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ __wt_free(session, ikey);
+ break;
+ }
+
+ /* Free any address allocation. */
+ if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /* Free any page-deleted information. */
+ if (ref->page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ __wt_overwrite_and_free(session, ref);
+}
+
+/*
+ * __wt_free_ref_index --
+ * Discard a page index and its references.
+ */
+void
+__wt_free_ref_index(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages)
+{
+ uint32_t i;
+
+ if (pindex == NULL)
+ return;
+
+ for (i = 0; i < pindex->entries; ++i)
+ __wt_free_ref(session, page, pindex->index[i], free_pages);
+ __wt_free(session, pindex);
+}
+
+/*
+ * __free_page_col_var --
+ * Discard a WT_PAGE_COL_VAR page.
+ */
+static void
+__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /* Free the RLE lookup array. */
+ __wt_free(session, page->pg_var_repeats);
+}
+
+/*
+ * __free_page_row_leaf --
+ * Discard a WT_PAGE_ROW_LEAF page.
+ */
+static void
+__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_IKEY *ikey;
+ WT_ROW *rip;
+ uint32_t i;
+ void *copy;
+
+ /*
+ * Free the in-memory index array.
+ *
+ * For each entry, see if the key was an allocation (that is, if it
+ * points somewhere other than the original page), and if so, free
+ * the memory.
+ */
+ WT_ROW_FOREACH(page, rip, i) {
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, NULL, NULL, NULL);
+ if (ikey != NULL)
+ __wt_free(session, ikey);
+ }
+
+ /*
+ * Free the insert array.
+ *
+ * Row-store tables have one additional slot in the insert array: it
+ * holds keys that sort before any key found on the original page.
+ */
+ if (page->pg_row_ins != NULL)
+ __free_skip_array(
+ session, page->pg_row_ins, page->pg_row_entries + 1);
+
+ /* Free the update array. */
+ if (page->pg_row_upd != NULL)
+ __free_update(session, page->pg_row_upd, page->pg_row_entries);
+}
+
+/*
+ * __free_skip_array --
+ * Discard an array of skip list headers.
+ */
+static void
+__free_skip_array(
+ WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
+{
+ WT_INSERT_HEAD **head;
+
+ /*
+ * For each non-NULL slot in the page's array of inserts, free the
+ * linked list anchored in that slot.
+ */
+ for (head = head_arg; entries > 0; --entries, ++head)
+ if (*head != NULL) {
+ __free_skip_list(session, WT_SKIP_FIRST(*head));
+ __wt_free(session, *head);
+ }
+
+ /* Free the header array. */
+ __wt_free(session, head_arg);
+}
+
+/*
+ * __free_skip_list --
+ * Walk a WT_INSERT forward-linked list and free the per-thread combination
+ * of a WT_INSERT structure and its associated chain of WT_UPDATE structures.
+ */
+static void
+__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
+{
+ WT_INSERT *next;
+
+ for (; ins != NULL; ins = next) {
+ __free_update_list(session, ins->upd);
+ next = WT_SKIP_NEXT(ins);
+ __wt_free(session, ins);
+ }
+}
+
+/*
+ * __free_update --
+ * Discard the update array.
+ */
+static void
+__free_update(
+ WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries)
+{
+ WT_UPDATE **updp;
+
+ /*
+ * For each non-NULL slot in the page's array of updates, free the
+ * linked list anchored in that slot.
+ */
+ for (updp = update_head; entries > 0; --entries, ++updp)
+ if (*updp != NULL)
+ __free_update_list(session, *updp);
+
+ /* Free the update array. */
+ __wt_free(session, update_head);
+}
+
+/*
+ * __free_update_list --
+ * Walk a WT_UPDATE forward-linked list and free the per-thread combination
+ * of a WT_UPDATE structure and its associated data.
+ */
+static void
+__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_UPDATE *next;
+
+ for (; upd != NULL; upd = next) {
+ /* Everything we free should be visible to everyone. */
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_DISCARD_FORCE) ||
+ upd->txnid == WT_TXN_ABORTED ||
+ __wt_txn_visible_all(session, upd->txnid));
+
+ next = upd->next;
+ __wt_free(session, upd);
+ }
+}
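/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Every list teardown in this file follows the same pattern: read the "next"
 * pointer before freeing the current node, since the node can't be touched
 * once freed. Stand-alone, with an invented node type:
 */
#include <stdlib.h>

struct upd { struct upd *next; };

static void
free_list(struct upd *upd)
{
	struct upd *next;

	for (; upd != NULL; upd = next) {
		next = upd->next;	/* save before the memory goes away */
		free(upd);
	}
}

int
main(void)
{
	struct upd *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (a == NULL || b == NULL)
		return (1);
	a->next = b;
	free_list(a);
	return (0);
}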
diff --git a/src/third_party/wiredtiger/src/btree/bt_evict.c b/src/third_party/wiredtiger/src/btree/bt_evict.c
new file mode 100644
index 00000000000..ff049553c7f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_evict.c
@@ -0,0 +1,1297 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __evict_clear_walks(WT_SESSION_IMPL *);
+static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
+static int __evict_lru(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_cmp(const void *, const void *);
+static int __evict_lru_pages(WT_SESSION_IMPL *, int);
+static int __evict_pass(WT_SESSION_IMPL *);
+static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, uint32_t);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static void *__evict_worker(void *);
+
+/*
+ * __evict_read_gen --
+ * Get the adjusted read generation for an eviction entry.
+ */
+static inline uint64_t
+__evict_read_gen(const WT_EVICT_ENTRY *entry)
+{
+ WT_PAGE *page;
+ uint64_t read_gen;
+
+ /* Never prioritize empty slots. */
+ if (entry->ref == NULL)
+ return (UINT64_MAX);
+
+ page = entry->ref->page;
+ read_gen = page->read_gen + entry->btree->evict_priority;
+
+ /*
+ * Skew the read generation for internal pages, we prefer to evict leaf
+ * pages.
+ */
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT)
+ read_gen += WT_EVICT_INT_SKEW;
+
+ return (read_gen);
+}
+
+/*
+ * __evict_lru_cmp --
+ * Qsort function: sort the eviction array.
+ */
+static int
+__evict_lru_cmp(const void *a, const void *b)
+{
+ uint64_t a_lru, b_lru;
+
+ a_lru = __evict_read_gen(a);
+ b_lru = __evict_read_gen(b);
+
+ return ((a_lru < b_lru) ? -1 : (a_lru == b_lru) ? 0 : 1);
+}
+
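/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The comparator above sorts the eviction array by adjusted read generation,
 * with empty slots pushed to the end by reporting UINT64_MAX, and it compares
 * rather than subtracts because a uint64_t difference overflows an int.
 * Stand-alone, with an invented entry type:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { int used; uint64_t read_gen; };

static uint64_t
entry_gen(const struct entry *e)
{
	return (e->used ? e->read_gen : UINT64_MAX);	/* empty slots last */
}

static int
gen_cmp(const void *a, const void *b)
{
	uint64_t ga = entry_gen(a), gb = entry_gen(b);

	return (ga < gb ? -1 : ga == gb ? 0 : 1);
}

int
main(void)
{
	struct entry list[] = { {1, 9}, {0, 0}, {1, 3} };

	qsort(list, 3, sizeof(list[0]), gen_cmp);
	for (int i = 0; i < 3; ++i)	/* prints gen 3, 9, then the empty slot */
		printf("%d: used=%d gen=%llu\n", i, list[i].used,
		    (unsigned long long)list[i].read_gen);
	return (0);
}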
+/*
+ * __evict_list_clear --
+ * Clear an entry in the LRU eviction list.
+ */
+static inline void
+__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
+{
+ if (e->ref != NULL) {
+ WT_ASSERT(session,
+ F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
+ F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
+ }
+ e->ref = NULL;
+ e->btree = WT_DEBUG_POINT;
+}
+
+/*
+ * __wt_evict_list_clear_page --
+ * Make sure a page is not in the LRU eviction list. This is called from
+ * the page eviction code to make sure there is no attempt to evict a child
+ * page multiple times.
+ */
+void
+__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t i, elem;
+
+ WT_ASSERT(session,
+ __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);
+
+ /* Fast path: if the page isn't on the queue, don't bother searching. */
+ if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
+ return;
+
+ cache = S2C(session)->cache;
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->ref == ref) {
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+
+ __wt_spin_unlock(session, &cache->evict_lock);
+}
+
+/*
+ * __wt_evict_server_wake --
+ * Wake the eviction server thread.
+ */
+int
+__wt_evict_server_wake(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
+ uint64_t bytes_inuse, bytes_max;
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = conn->cache_size;
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "waking, bytes inuse %s max (%" PRIu64
+ "MB %s %" PRIu64 "MB)",
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_inuse / WT_MEGABYTE,
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_max / WT_MEGABYTE));
+ }
+
+ return (__wt_cond_signal(session, cache->evict_cond));
+}
+
+/*
+ * __evict_server --
+ * Thread to evict pages from the cache.
+ */
+static void *
+__evict_server(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
+ /* Evict pages from the cache as needed. */
+ WT_ERR(__evict_pass(session));
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ break;
+
+ /*
+ * If we have caught up and there are more than the minimum
+ * number of eviction workers running, shut one down.
+ */
+ if (conn->evict_workers > conn->evict_workers_min) {
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Stopping evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[--conn->evict_workers];
+ F_CLR(worker, WT_EVICT_WORKER_RUN);
+ WT_TRET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ WT_TRET(__wt_thread_join(session, worker->tid));
+ /*
+ * Flag errors here with a message, but don't shut down
+ * the eviction server; that would be fatal.
+ */
+ WT_ASSERT(session, ret == 0);
+ if (ret != 0) {
+ (void)__wt_msg(session,
+ "Error stopping eviction worker: %d", ret);
+ ret = 0;
+ }
+ }
+ F_CLR(cache, WT_EVICT_ACTIVE);
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
+ /* Don't rely on signals: check periodically. */
+ WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking"));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "exiting"));
+
+err:
+ if (ret != 0) {
+ WT_PANIC_MSG(session, ret, "eviction server error");
+ return (NULL);
+ }
+
+ if (cache->pages_inmem != cache->pages_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " pages in "
+ "memory and %" PRIu64 " pages evicted",
+ cache->pages_inmem, cache->pages_evict);
+ if (cache->bytes_inmem != cache->bytes_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " bytes in "
+ "memory and %" PRIu64 " bytes evicted",
+ cache->bytes_inmem, cache->bytes_evict);
+ if (cache->bytes_dirty != 0 || cache->pages_dirty != 0)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64
+ " bytes dirty and %" PRIu64 " pages dirty",
+ cache->bytes_dirty, cache->pages_dirty);
+
+ return (NULL);
+}
+
+/*
+ * __wt_evict_create --
+ * Start the eviction server thread.
+ */
+int
+__wt_evict_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *workers;
+ u_int i;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_EVICTION_RUN);
+
+ /* We need a session handle because we're reading/writing pages. */
+ WT_RET(__wt_open_internal_session(
+ conn, "eviction-server", 0, 0, &conn->evict_session));
+ session = conn->evict_session;
+
+ /*
+ * If there's only a single eviction thread, it may be called upon to
+ * perform slow operations for the block manager. (The flag is not
+ * reset if reconfigured later, but I doubt that's a problem.)
+ */
+ if (conn->evict_workers_max == 0)
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ if (conn->evict_workers_max > 0) {
+ WT_RET(__wt_calloc_def(
+ session, conn->evict_workers_max, &workers));
+ conn->evict_workctx = workers;
+
+ for (i = 0; i < conn->evict_workers_max; i++) {
+ WT_RET(__wt_open_internal_session(conn,
+ "eviction-worker", 0, 0, &workers[i].session));
+ workers[i].id = i;
+ F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ if (i < conn->evict_workers_min) {
+ ++conn->evict_workers;
+ F_SET(&workers[i], WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(
+ workers[i].session, &workers[i].tid,
+ __evict_worker, &workers[i]));
+ }
+ }
+ }
+
+ /*
+ * Start the primary eviction server thread after the worker threads
+ * have started to avoid it starting additional worker threads before
+ * the worker's sessions are created.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->evict_tid, __evict_server, session));
+ conn->evict_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_evict_destroy --
+ * Destroy the eviction server thread.
+ */
+int
+__wt_evict_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *workers;
+ WT_SESSION *wt_session;
+ u_int i;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ workers = conn->evict_workctx;
+
+ F_CLR(conn, WT_CONN_EVICTION_RUN);
+
+ WT_TRET(__wt_verbose(
+ session, WT_VERB_EVICTSERVER, "waiting for helper threads"));
+ for (i = 0; i < conn->evict_workers; i++) {
+ WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond));
+ WT_TRET(__wt_thread_join(session, workers[i].tid));
+ }
+ /* Handle shutdown when cleaning up after a failed open */
+ if (conn->evict_workctx != NULL) {
+ for (i = 0; i < conn->evict_workers_max; i++) {
+ wt_session = &conn->evict_workctx[i].session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+ __wt_free(session, conn->evict_workctx);
+ }
+
+ if (conn->evict_tid_set) {
+ WT_TRET(__wt_evict_server_wake(session));
+ WT_TRET(__wt_thread_join(session, conn->evict_tid));
+ conn->evict_tid_set = 0;
+ }
+
+ if (conn->evict_session != NULL) {
+ wt_session = &conn->evict_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ conn->evict_session = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __evict_worker --
+ * Thread to help evict pages from the cache.
+ */
+static void *
+__evict_worker(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+
+ worker = arg;
+ session = worker->session;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(worker, WT_EVICT_WORKER_RUN)) {
+ /* Don't spin in a busy loop if there is no work to do */
+ WT_ERR(__evict_has_work(session, &flags));
+ if (flags == 0)
+ WT_ERR(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 10000));
+ else
+ WT_ERR(__evict_lru_pages(session, 1));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "cache eviction helper error");
+ }
+
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, "helper exiting"));
+
+ return (NULL);
+}
+
+/*
+ * __evict_has_work --
+ * Find out if there is eviction work to be done.
+ */
+static int
+__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t flags;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ flags = 0;
+ *flagsp = 0;
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ return (0);
+
+ /*
+ * Figure out whether the cache usage exceeds either the eviction
+ * target or the dirty target.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size;
+
+ /* Check to see if the eviction server should run. */
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
+ LF_SET(WT_EVICT_PASS_ALL);
+ else if (dirty_inuse >
+ (cache->eviction_dirty_target * bytes_max) / 100)
+ /* Ignore clean pages unless the cache is too large */
+ LF_SET(WT_EVICT_PASS_DIRTY);
+
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ *flagsp = flags;
+ return (0);
+}
+
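/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The checks above are plain percentage arithmetic: eviction of all page
 * types runs when cache use crosses the eviction target, and dirty-only
 * eviction when dirty bytes cross the dirty target. The sizes below are
 * made-up inputs:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cache_size = 100u << 20;	/* 100MB cache */
	uint64_t bytes_inuse = 85u << 20;	/* 85MB in use */
	uint64_t dirty_inuse = 70u << 20;	/* 70MB dirty */
	uint32_t eviction_target = 80, dirty_target = 75;

	if (bytes_inuse > (eviction_target * cache_size) / 100)
		printf("evict all page types\n");	/* 85 > 80: this */
	else if (dirty_inuse > (dirty_target * cache_size) / 100)
		printf("evict dirty pages only\n");
	else
		printf("no eviction work\n");
	return (0);
}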
+/*
+ * __evict_pass --
+ * Evict pages from memory.
+ */
+static int
+__evict_pass(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *worker;
+ int loop;
+ uint32_t flags;
+ uint64_t bytes_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /* Evict pages from the cache. */
+ for (loop = 0;; loop++) {
+ /*
+ * If there is a request to clear eviction walks, do that now,
+ * before checking if the cache is full.
+ */
+ if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) {
+ F_CLR(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__evict_clear_walks(session));
+ WT_RET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ }
+
+ WT_RET(__evict_has_work(session, &flags));
+ if (flags == 0)
+ break;
+
+ if (loop > 10)
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ /*
+ * When the cache is full, track whether pages are being
+ * evicted. This will be cleared by the next thread to
+ * successfully evict a page.
+ */
+ if (bytes_inuse > conn->cache_size)
+ F_SET(cache, WT_EVICT_NO_PROGRESS);
+ else
+ F_CLR(cache, WT_EVICT_NO_PROGRESS);
+
+ /* Start a worker if we have capacity and the cache is full. */
+ if (bytes_inuse > conn->cache_size &&
+ conn->evict_workers < conn->evict_workers_max) {
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Starting evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[conn->evict_workers++];
+ F_SET(worker, WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(session,
+ &worker->tid, __evict_worker, worker));
+ }
+
+ F_SET(cache, WT_EVICT_ACTIVE);
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Eviction pass with: Max: %" PRIu64
+ " In use: %" PRIu64 " Dirty: %" PRIu64,
+ conn->cache_size, bytes_inuse, cache->bytes_dirty));
+
+ WT_RET(__evict_lru(session, flags));
+
+ /*
+ * If we're making progress, keep going; if we're not making
+ * any progress at all, mark the cache "stuck" and go back to
+ * sleep, it's not something we can fix.
+ */
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) {
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ break;
+ if (loop == 100) {
+ F_SET(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_slow);
+ WT_RET(__wt_verbose(
+ session, WT_VERB_EVICTSERVER,
+ "unable to reach eviction goal"));
+ break;
+ }
+ } else
+ loop = 0;
+ }
+ return (0);
+}
+
+/*
+ * __evict_clear_walks --
+ * Clear the eviction walk points for all files.
+ */
+static int
+__evict_clear_walks(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cache->evict_file_next = NULL;
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ session->dhandle = dhandle;
+ if ((ref = btree->evict_ref) != NULL) {
+ /*
+ * Clear evict_ref first, in case releasing it forces
+ * eviction (we assert that we never try to evict the
+ * current eviction walk point).
+ */
+ btree->evict_ref = NULL;
+ WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ session->dhandle = NULL;
+ }
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ return (ret);
+}
+
+/*
+ * __evict_tree_walk_clear --
+ * Clear the tree's current eviction point, acquiring the eviction lock.
+ */
+static int
+__evict_tree_walk_clear(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ while (btree->evict_ref != NULL) {
+ F_SET(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 100000));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_page --
+ * Evict a given page.
+ */
+int
+__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_iso;
+
+ /*
+ * We have to take care when evicting pages not to write a change that:
+ * (a) is not yet committed; or
+ * (b) is committed more recently than an in-progress checkpoint.
+ *
+ * We handle both of these cases by setting up the transaction context
+ * before evicting, using a special "eviction" isolation level, where
+ * only globally visible updates can be evicted.
+ */
+ __wt_txn_update_oldest(session);
+ txn = &session->txn;
+ saved_iso = txn->isolation;
+ txn->isolation = TXN_ISO_EVICTION;
+
+ /*
+ * Sanity check: if a transaction has updates, its updates should not
+ * be visible to eviction.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(txn, TXN_HAS_ID) || !__wt_txn_visible(session, txn->id));
+
+ ret = __wt_rec_evict(session, ref, 0);
+ txn->isolation = saved_iso;
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_file_exclusive_on --
+ * Get exclusive eviction access to a file and discard any of the file's
+ * blocks queued for eviction.
+ */
+int
+__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ u_int i, elem;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ /*
+ * Hold the walk lock to set the "no eviction" flag: no new pages from
+ * the file will be queued for eviction after this point.
+ */
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /* Clear any existing LRU eviction walk for the file. */
+ WT_RET(__evict_tree_walk_clear(session));
+
+ /* Hold the evict lock to remove any queued pages from this file. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ /*
+ * The eviction candidate list might reference pages from the file;
+ * clear it.
+ */
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->btree == btree)
+ __evict_list_clear(session, evict);
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * We have disabled further eviction: wait for concurrent LRU eviction
+ * activity to drain.
+ */
+ while (btree->evict_busy > 0)
+ __wt_yield();
+
+ return (0);
+}
+
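/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Exclusive access above is a two-step handshake: set a "no eviction" flag so
 * no new pages from the file are queued, then wait for the in-flight ("busy")
 * eviction count to drain. Modeled with C11 atomics and POSIX sched_yield();
 * all names are invented for illustration:
 */
#include <sched.h>
#include <stdatomic.h>

static atomic_int no_eviction;		/* WT_BTREE_NO_EVICTION stand-in */
static atomic_int evict_busy;		/* concurrent evictions in flight */

static void
evict_exclusive_on(void)
{
	atomic_store(&no_eviction, 1);	/* stop new eviction of this file */

	/* Wait for eviction already under way to drain. */
	while (atomic_load(&evict_busy) > 0)
		sched_yield();
}

static void
evict_exclusive_off(void)
{
	atomic_store(&no_eviction, 0);
}

int
main(void)
{
	evict_exclusive_on();
	evict_exclusive_off();
	return (0);
}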
+/*
+ * __wt_evict_file_exclusive_off --
+ * Release exclusive eviction access to a file.
+ */
+void
+__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ WT_ASSERT(session, btree->evict_ref == NULL);
+
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __evict_lru_pages --
+ * Get pages from the LRU queue to evict.
+ */
+static int
+__evict_lru_pages(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_DECL_RET;
+
+ /*
+ * Reconcile and discard some pages: EBUSY is returned if a page fails
+ * eviction because it's unavailable, continue in that case.
+ */
+ while ((ret = __wt_evict_lru_page(session, is_app)) == 0 ||
+ ret == EBUSY)
+ ;
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __evict_lru --
+ * Evict pages from the cache based on their read generation.
+ */
+static int
+__evict_lru(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint64_t cutoff;
+ uint32_t candidates, entries, i;
+
+ cache = S2C(session)->cache;
+
+ /* Get some more pages to consider for eviction. */
+ WT_RET(__evict_walk(session, &entries, flags));
+
+ /* Sort the list into LRU order and restart. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ qsort(cache->evict,
+ entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
+
+ while (entries > 0 && cache->evict[entries - 1].ref == NULL)
+ --entries;
+
+ cache->evict_entries = entries;
+
+ if (entries == 0) {
+ /*
+ * If there are no entries, there cannot be any candidates.
+ * Make sure application threads don't read past the end of the
+ * candidate list, or they may race with the next walk.
+ */
+ cache->evict_candidates = 0;
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+ return (0);
+ }
+
+ WT_ASSERT(session, cache->evict[0].ref != NULL);
+
+ /* Find the bottom 25% of read generations. */
+ cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
+ __evict_read_gen(&cache->evict[entries - 1])) / 4;
+
+ /*
+ * Don't take less than 10% or more than 50% of entries, regardless.
+ * That said, if there is only one entry, which is normal when
+ * populating an empty file, don't exclude it.
+ */
+ for (candidates = 1 + entries / 10;
+ candidates < entries / 2;
+ candidates++)
+ if (__evict_read_gen(&cache->evict[candidates]) > cutoff)
+ break;
+ cache->evict_candidates = candidates;
+
+ /* If we have more than the minimum number of entries, clear them. */
+ if (cache->evict_entries > WT_EVICT_WALK_BASE) {
+ for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
+ i < cache->evict_entries;
+ i++, evict++)
+ __evict_list_clear(session, evict);
+ cache->evict_entries = WT_EVICT_WALK_BASE;
+ }
+
+ cache->evict_current = cache->evict;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * The eviction server thread doesn't do any actual eviction if there
+ * are multiple eviction workers running.
+ */
+ WT_RET(__wt_cond_signal(session, cache->evict_waiter_cond));
+
+ if (S2C(session)->evict_workers > 1) {
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_server_not_evicting);
+ /*
+ * If there are candidates queued, give other threads a chance
+ * to access them before gathering more.
+ */
+ if (candidates > 10 && cache->evict_current != NULL)
+ __wt_yield();
+ } else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_evicting);
+ WT_RET(__evict_lru_pages(session, 0));
+ }
+
+ return (0);
+}
+
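/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The candidate count above comes from a weighted cutoff, a quarter of the
 * way from the oldest to the newest read generation, clamped between roughly
 * 10% and 50% of the entries. The generations below are made-up inputs:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t gen[] = { 4, 8, 20, 40, 60, 80, 100, 120, 160, 200 };
	uint32_t entries = 10, candidates;

	/* Bottom 25% of the read-generation range. */
	uint64_t cutoff = (3 * gen[0] + gen[entries - 1]) / 4;

	/* At least 10% of entries (plus one), at most 50%. */
	for (candidates = 1 + entries / 10;
	    candidates < entries / 2; candidates++)
		if (gen[candidates] > cutoff)
			break;

	/* prints: cutoff 53, 4 of 10 entries are candidates */
	printf("cutoff %llu, %u of %u entries are candidates\n",
	    (unsigned long long)cutoff, candidates, entries);
	return (0);
}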
+/*
+ * __evict_walk --
+ * Fill in the array by walking the next set of pages.
+ */
+static int
+__evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int max_entries, old_slot, retries, slot;
+
+ conn = S2C(session);
+ cache = S2C(session)->cache;
+ retries = 0;
+
+ /* Increment the shared read generation. */
+ __wt_cache_read_gen_incr(session);
+
+ /*
+ * Update the oldest ID: we use it to decide whether pages are
+ * candidates for eviction. Without this, if all threads are blocked
+ * after a long-running transaction (such as a checkpoint) completes,
+ * we may never start evicting again.
+ */
+ __wt_txn_update_oldest(session);
+
+ /*
+ * Set the starting slot in the queue and the maximum pages added
+ * per walk.
+ */
+ slot = cache->evict_entries;
+ max_entries = slot + WT_EVICT_WALK_INCR;
+ if (cache->evict_current == NULL)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
+ else
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ /*
+ * Each time we reenter this function, start at the next handle
+ * on the list.
+ */
+ if (cache->evict_file_next != NULL &&
+ cache->evict_file_next != dhandle)
+ continue;
+ cache->evict_file_next = NULL;
+
+ /* Skip files that don't allow eviction. */
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ /*
+ * Also skip files that are configured to stick in cache until
+ * we get aggressive.
+ */
+ if (btree->evict_priority != 0 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ continue;
+
+ /*
+ * If we are filling the queue, skip files that haven't been
+ * useful in the past.
+ */
+ if (btree->evict_walk_period != 0 &&
+ cache->evict_entries >= WT_EVICT_WALK_INCR &&
+ btree->evict_walk_skips++ < btree->evict_walk_period)
+ continue;
+ btree->evict_walk_skips = 0;
+ old_slot = slot;
+
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+
+ /*
+ * Re-check the "no eviction" flag -- it is used to enforce
+ * exclusive access when a handle is being closed.
+ */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ WT_WITH_BTREE(session, btree,
+ ret = __evict_walk_file(session, &slot, flags));
+
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /*
+ * If we found enough candidates in the file (or filled the queue),
+ * keep visiting it; otherwise, back off exponentially and skip it
+ * for the next few walks.
+ */
+ if (slot >= old_slot + WT_EVICT_WALK_PER_FILE ||
+ slot >= max_entries)
+ btree->evict_walk_period = 0;
+ else
+ btree->evict_walk_period = WT_MIN(
+ WT_MAX(1, 2 * btree->evict_walk_period), 1000);
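+
+ /*
+ * Illustration: an unproductive file is skipped for an exponentially
+ * growing number of walks (1, 2, 4, ... capped at 1000); a single
+ * productive walk resets the period to 0.
+ */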
+
+ if (ret != 0 || slot >= max_entries)
+ break;
+ }
+
+ /* Walk the list of files a few times if we don't find enough pages. */
+ if (ret == 0 && slot < max_entries && ++retries < 10)
+ goto retry;
+
+ /* Remember the file to visit first on the next walk. */
+ if (dhandle != NULL)
+ dhandle = SLIST_NEXT(dhandle, l);
+ cache->evict_file_next = dhandle;
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ *entriesp = slot;
+ return (ret);
+}
+
+/*
+ * __evict_init_candidate --
+ * Initialize a WT_EVICT_ENTRY structure with a given page.
+ */
+static void
+__evict_init_candidate(
+ WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ u_int slot;
+
+ cache = S2C(session)->cache;
+
+ /* Keep track of the maximum slot we are using. */
+ slot = (u_int)(evict - cache->evict);
+ if (slot >= cache->evict_max)
+ cache->evict_max = slot + 1;
+
+ if (evict->ref != NULL)
+ __evict_list_clear(session, evict);
+ evict->ref = ref;
+ evict->btree = S2BT(session);
+
+ /* Mark the page as being on the eviction list. */
+ F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU);
+}
+
+/*
+ * __evict_walk_file --
+ * Get a few page eviction candidates from a single underlying file.
+ */
+static int
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_EVICT_ENTRY *end, *evict, *start;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ uint64_t pages_walked;
+ uint32_t walk_flags;
+ int internal_pages, modified, restarts;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ start = cache->evict + *slotp;
+ end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
+ cache->evict + cache->evict_slots);
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ /*
+ * Get some more eviction candidate pages.
+ */
+ for (evict = start, pages_walked = 0, internal_pages = restarts = 0;
+ evict < end && (ret == 0 || ret == WT_NOTFOUND);
+ ret = __wt_tree_walk(session, &btree->evict_ref, walk_flags),
+ ++pages_walked) {
+ if (btree->evict_ref == NULL) {
+ /*
+ * Take care with terminating this loop.
+ *
+ * Don't make an extra call to __wt_tree_walk: that will
+ * leave a page pinned, which may prevent any work from
+ * being done.
+ */
+ if (++restarts == 2)
+ break;
+ continue;
+ }
+
+ /* Ignore root pages entirely. */
+ if (__wt_ref_is_root(btree->evict_ref))
+ continue;
+ page = btree->evict_ref->page;
+
+ /*
+ * Use the EVICT_LRU flag to avoid putting pages onto the list
+ * multiple times.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ continue;
+
+ /* Limit internal pages to 50% unless we get aggressive. */
+ if ((page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT) &&
+ ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ break;
+
+ /*
+ * If this page has never been considered for eviction, set its read
+ * generation to a little bit in the future and move on, giving
+ * readers a chance to start updating the read generation.
+ */
+ if (page->read_gen == WT_READGEN_NOTSET) {
+ page->read_gen = __wt_cache_read_gen_set(session);
+ continue;
+ }
+
+ /*
+ * If the file is being checkpointed, there's a period of time
+ * where we can't discard dirty pages because of possible races
+ * with the checkpointing thread.
+ */
+ modified = __wt_page_is_modified(page);
+ if (modified && btree->checkpointing)
+ continue;
+
+ /* Optionally ignore clean pages. */
+ if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ continue;
+
+ /*
+ * If the page is clean but has modifications that appear too
+ * new to evict, skip it.
+ */
+ mod = page->modify;
+ if (!modified && mod != NULL &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely that
+ * we can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ *
+ * That said, if eviction is stuck, or the file is
+ * being checkpointed, try anyway: maybe a transaction
+ * that was running last time we wrote the page has
+ * since rolled back, or we can help get the checkpoint
+ * completed sooner.
+ */
+ if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !btree->checkpointing &&
+ (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+
+ WT_ASSERT(session, evict->ref == NULL);
+ __evict_init_candidate(session, evict, btree->evict_ref);
+ ++evict;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "select: %p, size %" PRIu64, page, page->memory_footprint));
+ }
+
+ /* If the walk was interrupted by a locked page, that's okay. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ *slotp += (u_int)(evict - start);
+ WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
+ return (ret);
+}
+
+/*
+ * __evict_get_ref --
+ * Get a page for eviction.
+ */
+static int
+__evict_get_ref(
+ WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_REF **refp)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t candidates;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ cache = S2C(session)->cache;
+ *btreep = NULL;
+ *refp = NULL;
+
+ /*
+ * A pathological case: if we're the oldest transaction in the system
+ * and the eviction server is stuck trying to find space, abort the
+ * transaction to give up all hazard pointers before trying again.
+ */
+ if (is_app && F_ISSET(cache, WT_EVICT_STUCK) &&
+ __wt_txn_am_oldest(session)) {
+ F_CLR(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(session, txn_fail_cache);
+ return (WT_ROLLBACK);
+ }
+
+ /*
+ * Avoid the LRU lock if no pages are available. If there are pages
+ * available, spin until we get the lock. If this function returns
+ * without getting a page to evict, application threads assume there
+ * are no more pages available and will attempt to wake the eviction
+ * server.
+ */
+ for (;;) {
+ if (cache->evict_current == NULL)
+ return (WT_NOTFOUND);
+ if (__wt_spin_trylock(session, &cache->evict_lock, &id) == 0)
+ break;
+ __wt_yield();
+ }
+
+ /*
+ * The eviction server only tries to evict half of the pages before
+ * looking for more.
+ */
+ candidates = cache->evict_candidates;
+ if (!is_app && candidates > 1)
+ candidates /= 2;
+
+ /* Get the next page queued for eviction. */
+ while ((evict = cache->evict_current) != NULL &&
+ evict < cache->evict + candidates && evict->ref != NULL) {
+ WT_ASSERT(session, evict->btree != NULL);
+
+ /* Move to the next item. */
+ ++cache->evict_current;
+
+ /*
+ * Lock the page while holding the eviction mutex to prevent
+ * multiple attempts to evict it. For pages that are already
+ * being evicted, this operation will fail and we will move on.
+ */
+ if (!WT_ATOMIC_CAS4(
+ evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ __evict_list_clear(session, evict);
+ continue;
+ }
+
+ /*
+ * Increment the busy count in the btree handle to prevent it
+ * from being closed under us.
+ */
+ (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1);
+
+ *btreep = evict->btree;
+ *refp = evict->ref;
+
+ /*
+ * Remove the entry so we never try to reconcile the same page
+ * on reconciliation error.
+ */
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ /* Clear the current pointer if there are no more candidates. */
+ if (evict >= cache->evict + cache->evict_candidates)
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ return ((*refp == NULL) ? WT_NOTFOUND : 0);
+}
+
+/*
+ * __wt_evict_lru_page --
+ * Called by both eviction and application threads to evict a page.
+ */
+int
+__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *ref;
+
+ WT_RET(__evict_get_ref(session, is_app, &btree, &ref));
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ /*
+ * In case something goes wrong, don't pick the same set of pages every
+ * time.
+ *
+ * We used to bump the page's read generation only if eviction failed,
+ * but that isn't safe: at that point, eviction has already unlocked
+ * the page and some other thread may have evicted it by the time we
+ * look at it.
+ */
+ page = ref->page;
+ if (page->read_gen != WT_READGEN_OLDEST)
+ page->read_gen = __wt_cache_read_gen_set(session);
+
+ WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ WT_RET(ret);
+
+ cache = S2C(session)->cache;
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK))
+ F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK);
+
+ return (ret);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_cache_dump --
+ * Dump debugging information to stdout about the size of the files in the
+ * cache.
+ *
+ * NOTE: this function is not called anywhere; it is intended to be called
+ * from a debugger.
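+ *
+ * For example, from a gdb prompt, assuming a WT_SESSION_IMPL pointer
+ * named "session" is in scope: call __wt_cache_dump(session)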
+ */
+void
+__wt_cache_dump(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_REF *next_walk;
+ WT_PAGE *page;
+ uint64_t file_intl_pages, file_leaf_pages;
+ uint64_t file_bytes, file_dirty, total_bytes;
+
+ conn = S2C(session);
+ total_bytes = 0;
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
+ next_walk = NULL;
+ session->dhandle = dhandle;
+ while (__wt_tree_walk(session,
+ &next_walk, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ if (page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT)
+ ++file_intl_pages;
+ else
+ ++file_leaf_pages;
+ file_bytes += page->memory_footprint;
+ if (__wt_page_is_modified(page))
+ file_dirty += page->memory_footprint;
+ }
+ session->dhandle = NULL;
+
+ printf("cache dump: %s [%s]:"
+ " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ dhandle->name, dhandle->checkpoint,
+ file_intl_pages, file_leaf_pages,
+ file_bytes >> 20, file_dirty >> 20);
+
+ total_bytes += file_bytes;
+ }
+ printf("cache dump: total found = %" PRIu64 "MB"
+ " vs tracked inuse %" PRIu64 "MB\n",
+ total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
+ fflush(stdout);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
new file mode 100644
index 00000000000..a21d6d277d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -0,0 +1,770 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
+static int __btree_get_last_recno(WT_SESSION_IMPL *);
+static int __btree_page_sizes(WT_SESSION_IMPL *);
+static int __btree_preload(WT_SESSION_IMPL *);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
+
+static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
+static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
+
+/*
+ * __wt_btree_open --
+ * Open a Btree.
+ */
+int
+__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT ckpt;
+ WT_CONFIG_ITEM cval;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int creation, forced_salvage, readonly;
+ const char *filename;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ /* Checkpoint files are readonly. */
+ readonly = dhandle->checkpoint == NULL ? 0 : 1;
+
+ /* Get the checkpoint information for this name/checkpoint pair. */
+ WT_CLEAR(ckpt);
+ WT_RET(__wt_meta_checkpoint(
+ session, dhandle->name, dhandle->checkpoint, &ckpt));
+
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ */
+ creation = ckpt.raw.size == 0;
+ if (!creation && F_ISSET(btree, WT_BTREE_BULK))
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load is only supported on newly created objects");
+
+ /* Handle salvage configuration. */
+ forced_salvage = 0;
+ if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+ WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
+ forced_salvage = (cval.val != 0);
+ }
+
+ /* Initialize and configure the WT_BTREE structure. */
+ WT_ERR(__btree_conf(session, &ckpt));
+
+ /* Connect to the underlying block manager. */
+ filename = dhandle->name;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
+
+ WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
+ forced_salvage, readonly, btree->allocsize, &btree->bm));
+ bm = btree->bm;
+
+ /*
+ * !!!
+ * As part of block-manager configuration, we need to return the maximum
+ * sized address cookie that a block manager will ever return. There's
+ * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
+ * a Btree with 512B internal pages. The default block manager packs
+ * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
+ * now, but when we create a block manager extension API, we need some
+ * way to consider the block manager's maximum cookie size versus the
+ * minimum Btree internal node size.
+ */
+ btree->block_header = bm->block_header(bm);
+
+ /*
+ * Open the specified checkpoint unless it's a special command (special
+ * commands are responsible for loading their own checkpoints, if any).
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ /*
+ * There are two reasons to load an empty tree rather than a
+ * checkpoint: either there is no checkpoint (the file is
+ * being created), or the load call returns no root page (the
+ * checkpoint is for an empty file).
+ */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt.raw.data, ckpt.raw.size,
+ root_addr, &root_addr_size, readonly));
+ if (creation || root_addr_size == 0)
+ WT_ERR(__btree_tree_open_empty(
+ session, creation, readonly));
+ else {
+ WT_ERR(__wt_btree_tree_open(
+ session, root_addr, root_addr_size));
+
+ /* Warm the cache, if possible. */
+ WT_ERR(__btree_preload(session));
+
+ /* Get the last record number in a column-store file. */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
+ }
+
+ if (0) {
+err: WT_TRET(__wt_btree_close(session));
+ }
+ __wt_meta_checkpoint_free(session, &ckpt);
+
+ return (ret);
+}
+
+/*
+ * __wt_btree_close --
+ * Close a Btree.
+ */
+int
+__wt_btree_close(WT_SESSION_IMPL *session)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if ((bm = btree->bm) != NULL) {
+ /* Unload the checkpoint, unless it's a special command. */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_TRET(bm->checkpoint_unload(bm, session));
+
+ /* Close the underlying block manager reference. */
+ WT_TRET(bm->close(bm, session));
+
+ btree->bm = NULL;
+ }
+
+ /* Close the Huffman tree. */
+ __wt_btree_huffman_close(session);
+
+ /* Destroy locks. */
+ WT_TRET(__wt_rwlock_destroy(session, &btree->ovfl_lock));
+ __wt_spin_destroy(session, &btree->flush_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, btree->key_format);
+ __wt_free(session, btree->value_format);
+
+ if (btree->collator_owned) {
+ if (btree->collator->terminate != NULL)
+ WT_TRET(btree->collator->terminate(
+ btree->collator, &session->iface));
+ btree->collator_owned = 0;
+ }
+ btree->collator = NULL;
+
+ btree->bulk_load_ok = 0;
+
+ return (ret);
+}
+
+/*
+ * __btree_conf --
+ * Configure a WT_BTREE structure.
+ */
+static int
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_NAMED_COMPRESSOR *ncomp;
+ int64_t maj_version, min_version;
+ uint32_t bitcnt;
+ int fixed;
+ const char **cfg;
+
+ btree = S2BT(session);
+ conn = S2C(session);
+ cfg = btree->dhandle->cfg;
+
+ /* Dump out format information. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
+ WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
+ maj_version = cval.val;
+ WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
+ min_version = cval.val;
+ WT_RET(__wt_verbose(session, WT_VERB_VERSION,
+ "%" PRIu64 ".%" PRIu64, maj_version, min_version));
+ }
+
+ /* Get the file ID. */
+ WT_RET(__wt_config_gets(session, cfg, "id", &cval));
+ btree->id = (uint32_t)cval.val;
+
+ /* Validate file types and check the data format plan. */
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ btree->type = BTREE_COL_VAR;
+ else
+ btree->type = BTREE_ROW;
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
+
+ WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
+
+ /* Row-store key comparison and key gap for prefix compression. */
+ if (btree->type == BTREE_ROW) {
+ WT_RET(__wt_collator_config(
+ session, cfg, &btree->collator, &btree->collator_owned));
+
+ WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
+ btree->key_gap = (uint32_t)cval.val;
+ }
+
+ /* Column-store: check for fixed-size data. */
+ if (btree->type == BTREE_COL_VAR) {
+ WT_RET(__wt_struct_check(
+ session, cval.str, cval.len, &fixed, &bitcnt));
+ if (fixed) {
+ if (bitcnt == 0 || bitcnt > 8)
+ WT_RET_MSG(session, EINVAL,
+ "fixed-width field sizes must be greater "
+ "than 0 and less than or equal to 8");
+ btree->bitcnt = (uint8_t)bitcnt;
+ btree->type = BTREE_COL_FIX;
+ }
+ }
+
+ /* Page sizes */
+ WT_RET(__btree_page_sizes(session));
+
+ /* Eviction; the metadata file is never evicted. */
+ if (WT_IS_METADATA(btree->dhandle))
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+ else {
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val)
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+ else
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+ }
+
+ /* Checksums */
+ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
+ if (WT_STRING_MATCH("on", cval.str, cval.len))
+ btree->checksum = CKSUM_ON;
+ else if (WT_STRING_MATCH("off", cval.str, cval.len))
+ btree->checksum = CKSUM_OFF;
+ else
+ btree->checksum = CKSUM_UNCOMPRESSED;
+
+ /* Huffman encoding */
+ WT_RET(__wt_btree_huffman_open(session));
+
+ /*
+ * Reconciliation configuration:
+ * Block compression (all)
+ * Dictionary compression (variable-length column-store, row-store)
+ * Page-split percentage
+ * Prefix compression (row-store)
+ * Suffix compression (row-store)
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ break;
+ case BTREE_ROW:
+ WT_RET(__wt_config_gets(
+ session, cfg, "internal_key_truncate", &cval));
+ btree->internal_key_truncate = cval.val == 0 ? 0 : 1;
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "prefix_compression", &cval));
+ btree->prefix_compression = cval.val == 0 ? 0 : 1;
+ WT_RET(__wt_config_gets(
+ session, cfg, "prefix_compression_min", &cval));
+ btree->prefix_compression_min = (u_int)cval.val;
+ /* FALLTHROUGH */
+ case BTREE_COL_VAR:
+ WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
+ btree->dictionary = (u_int)cval.val;
+ break;
+ }
+
+ WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval));
+ if (cval.len > 0) {
+ TAILQ_FOREACH(ncomp, &conn->compqh, q)
+ if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) {
+ btree->compressor = ncomp->compressor;
+ break;
+ }
+ if (btree->compressor == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown block compressor '%.*s'",
+ (int)cval.len, cval.str);
+ }
+
+ /* Initialize locks. */
+ WT_RET(__wt_rwlock_alloc(
+ session, &btree->ovfl_lock, "btree overflow lock"));
+ WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
+
+ __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
+
+ btree->write_gen = ckpt->write_gen; /* Write generation */
+ btree->modified = 0; /* Clean */
+
+ return (0);
+}
+
+/*
+ * __wt_root_ref_init --
+ * Initialize a tree root reference, and link in the root page.
+ */
+void
+__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
+{
+ memset(root_ref, 0, sizeof(*root_ref));
+
+ root_ref->page = root;
+ root_ref->state = WT_REF_MEM;
+
+ root_ref->key.recno = is_recno ? 1 : 0;
+
+ root->pg_intl_parent_ref = root_ref;
+}
+
+/*
+ * __wt_btree_tree_open --
+ * Read in a tree from disk.
+ */
+int
+__wt_btree_tree_open(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_ITEM dsk;
+ WT_PAGE *page;
+
+ btree = S2BT(session);
+
+ /*
+ * A buffer into which we read a root page; don't use a scratch buffer:
+ * the buffer's allocated memory becomes the persistent in-memory page.
+ */
+ WT_CLEAR(dsk);
+
+ /* Read the page, then build the in-memory version of the page. */
+ WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
+ WT_DATA_IN_ITEM(&dsk) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
+
+ if (0) {
+err: __wt_buf_free(session, &dsk);
+ }
+ return (ret);
+}
+
+/*
+ * __btree_tree_open_empty --
+ * Create an empty in-memory tree.
+ */
+static int
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *root, *leaf;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref;
+
+ btree = S2BT(session);
+ root = leaf = NULL;
+
+ /*
+ * Newly created objects can be used for cursor inserts or for bulk
+ * loads; set a flag that's cleared when a row is inserted into the
+ * tree. Objects being bulk-loaded cannot be evicted; we set that
+ * globally because there's no point in searching empty trees for
+ * eviction.
+ */
+ if (creation) {
+ btree->bulk_load_ok = 1;
+ __wt_btree_evictable(session, 0);
+ }
+
+ /*
+ * A note about empty trees: the initial tree is a root page and a leaf
+ * page. We need a pair of pages instead of just a single page because
+ * we can reconcile the leaf page while the root stays pinned in memory.
+ * If the pair is evicted without being modified, that's OK, nothing is
+ * ever written.
+ *
+ * Create the root and leaf pages.
+ *
+ * !!!
+ * Be cautious about changing the order of updates in this code: to call
+ * __wt_page_out on error, we require a correct page setup at each point
+ * where we might fail.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ ref->key.recno = 1;
+ break;
+ case BTREE_ROW:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ WT_ERR(__wt_row_ikey_incr(
+ session, root, 0, "", 1, &ref->key.ikey));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Mark the leaf page dirty: we didn't create an entirely valid root
+ * page (specifically, the root page's disk address isn't set, and it's
+ * the act of reconciling the leaf page that makes it work; we don't
+ * try to use the original disk address of modified pages). We could
+ * get around that by leaving the leaf page clean and building a better
+ * root page, but then we get into trouble because a checkpoint marks
+ * the root page dirty to force a write, and without reconciling the
+ * leaf page we won't realize there are no records to write: we'll write
+ * a root page, which isn't correct for an empty tree.
+ *
+ * Earlier versions of this code kept the leaf page clean, but with the
+ * "empty" flag set in the leaf page's modification structure; in that
+ * case, checkpoints worked (forced reconciliation of a root with a
+ * single "empty" page wouldn't write any blocks). That version had
+ * memory leaks because the eviction code didn't correctly handle pages
+ * that were "clean" (and so never reconciled), yet "modified" with an
+ * "empty" flag. The goal of this code is to mimic a real tree that
+ * simply has no records, for whatever reason, and trust reconciliation
+ * to figure out it's empty and not write any blocks.
+ *
+ * We do not set the tree's modified flag because the checkpoint code
+ * skips unmodified files in closing checkpoints (checkpoints that
+ * don't require a write unless the file is actually dirty). There's
+ * no need to reconcile this file unless the application does a real
+ * checkpoint or it's actually modified.
+ *
+ * Only do this for a live tree, not for checkpoints. If we open an
+ * empty checkpoint, the leaf page cannot be dirty or eviction may try
+ * to write it, which will fail because checkpoints are read-only.
+ */
+ if (!readonly) {
+ WT_ERR(__wt_page_modify_init(session, leaf));
+ __wt_page_only_modify_set(session, leaf);
+ }
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
+
+ return (0);
+
+err: if (leaf != NULL)
+ __wt_page_out(session, &leaf);
+ if (root != NULL)
+ __wt_page_out(session, &root);
+ return (ret);
+}
+
+/*
+ * __wt_btree_new_leaf_page --
+ * Create an empty leaf page and link it into a reference in its parent.
+ */
+int
+__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 1, pagep));
+ break;
+ case BTREE_COL_VAR:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 1, pagep));
+ break;
+ case BTREE_ROW:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 1, pagep));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __wt_btree_evictable --
+ * Setup or release a cache-resident tree.
+ */
+void
+__wt_btree_evictable(WT_SESSION_IMPL *session, int on)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* The metadata file is never evicted. */
+ if (on && !WT_IS_METADATA(btree->dhandle))
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+ else
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __btree_preload --
+ * Pre-load internal pages.
+ */
+static int
+__btree_preload(WT_SESSION_IMPL *session)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /* Pre-load the second-level internal pages. */
+ WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr != NULL)
+ WT_RET(bm->preload(bm, session, addr, addr_size));
+ } WT_INTL_FOREACH_END;
+ return (0);
+}
+
+/*
+ * __btree_get_last_recno --
+ * Set the last record number for a column-store.
+ */
+static int
+__btree_get_last_recno(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_REF *next_walk;
+
+ btree = S2BT(session);
+
+ next_walk = NULL;
+ WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
+ if (next_walk == NULL)
+ return (WT_NOTFOUND);
+
+ page = next_walk->page;
+ btree->last_recno = page->type == WT_PAGE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page);
+
+ return (__wt_page_release(session, next_walk, 0));
+}
+
+/*
+ * __btree_page_sizes --
+ * Verify the page sizes. Some of these sizes are automatically checked
+ * using limits defined in the API; don't duplicate the logic here.
+ */
+static int
+__btree_page_sizes(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ uint64_t cache_size;
+ uint32_t intl_split_size, leaf_split_size;
+ const char **cfg;
+
+ btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
+
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "allocation_size", &btree->allocsize));
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "internal_page_max", &btree->maxintlpage));
+ WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
+ btree->maxintlitem = (uint32_t)cval.val;
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "leaf_page_max", &btree->maxleafpage));
+ WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
+ btree->maxleafitem = (uint32_t)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+ btree->split_pct = (int)cval.val;
+
+ /*
+ * When a page is forced to split, we want at least 50 entries on its
+ * parent.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
+ btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
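+
+ /*
+ * For illustration (made-up size, not a default): with a 32KB
+ * leaf_page_max, the in-memory page maximum is floored at
+ * 50 * 32KB = 1600KB, even if memory_page_max is configured smaller.
+ */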
+
+ /*
+ * Don't let pages grow to more than half the cache size. Otherwise,
+ * with very small caches, we can end up in a situation where nothing
+ * can be evicted. Take care getting the cache size: with a shared
+ * cache, it may not have been set.
+ */
+ cache_size = S2C(session)->cache_size;
+ if (cache_size > 0)
+ btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
+
+ /* Allocation sizes must be a power-of-two, nothing else makes sense. */
+ if (!__wt_ispo2(btree->allocsize))
+ WT_RET_MSG(session,
+ EINVAL, "the allocation size must be a power of two");
+
+ /* All page sizes must be in units of the allocation size. */
+ if (btree->maxintlpage < btree->allocsize ||
+ btree->maxintlpage % btree->allocsize != 0 ||
+ btree->maxleafpage < btree->allocsize ||
+ btree->maxleafpage % btree->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "page sizes must be a multiple of the page allocation "
+ "size (%" PRIu32 "B)", btree->allocsize);
+
+ /*
+ * Set the split percentage: reconciliation splits to a smaller-than-
+ * maximum page size so we don't split every time a new entry is added.
+ */
+ intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
+ leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
+
+ /*
+ * Default values for internal and leaf page items: make sure at least
+ * 8 items fit on split pages.
+ */
+ if (btree->maxintlitem == 0)
+ btree->maxintlitem = intl_split_size / 8;
+ if (btree->maxleafitem == 0)
+ btree->maxleafitem = leaf_split_size / 8;
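+
+ /*
+ * For illustration (made-up size): a 24KB leaf split size yields a
+ * default leaf_item_max of 24KB / 8 = 3KB.
+ */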
+
+ /*
+ * If raw compression is configured, the application owns page layout,
+ * it's not our problem. Hopefully the application chose well.
+ */
+ if (btree->compressor != NULL &&
+ btree->compressor->compress_raw != NULL)
+ return (0);
+
+ /* Check we can fit at least 2 items on a page. */
+ if (btree->maxintlitem > btree->maxintlpage / 2)
+ return (pse1(session, "internal",
+ btree->maxintlpage, btree->maxintlitem));
+ if (btree->maxleafitem > btree->maxleafpage / 2)
+ return (pse1(session, "leaf",
+ btree->maxleafpage, btree->maxleafitem));
+
+ /*
+ * Take into account the size of a split page:
+ *
+ * Make it a separate error message so it's clear what went wrong.
+ */
+ if (btree->maxintlitem > intl_split_size / 2)
+ return (pse2(session, "internal",
+ btree->maxintlpage, btree->maxintlitem, btree->split_pct));
+ if (btree->maxleafitem > leaf_split_size / 2)
+ return (pse2(session, "leaf",
+ btree->maxleafpage, btree->maxleafitem, btree->split_pct));
+
+ return (0);
+}
+
+/*
+ * __wt_split_page_size --
+ * Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+ uintmax_t a;
+ uint32_t split_size;
+
+ /*
+ * Ideally, the split page size is some percentage of the maximum page
+ * size rounded to an allocation unit (round to an allocation unit so
+ * we don't waste space when we write).
+ */
+ a = maxpagesize; /* Don't overflow. */
+ split_size = (uint32_t)
+ WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
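+
+ /*
+ * For illustration (made-up sizes, not defaults): a 32KB maximum page
+ * size with a split percentage of 75 and a 4KB allocation size gives
+ * WT_ALIGN((32768 * 75) / 100, 4096), that is, 24KB split pages.
+ */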
+
+ /*
+ * If the result of that calculation is the same as the allocation unit
+ * (which happens if the maximum page size is the same size as an
+ * allocation unit), use a percentage of the maximum page size.
+ */
+ if (split_size == btree->allocsize)
+ split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
+
+ return (split_size);
+}
+
+/*
+ * pse1 --
+ * Page size error message 1.
+ */
+static int
+pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
+{
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B); the page must be able to hold at least 2 items",
+ type, max, ovfl);
+}
+
+/*
+ * pse2 --
+ * Page size error message 2.
+ */
+static int
+pse2(WT_SESSION_IMPL *session,
+ const char *type, uint32_t max, uint32_t ovfl, int pct)
+{
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B), because of the split percentage (%d %%); a split "
+ "page must be able to hold at least 2 items",
+ type, max, ovfl, pct);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
new file mode 100644
index 00000000000..aa6e7c36451
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * 7-bit ASCII, with English language frequencies.
+ *
+ * Based on "Case-sensitive letter and bigram frequency counts from large-scale
+ * English corpora"
+ * Michael N. Jones and D.J.K. Mewhort
+ * Queen's University, Kingston, Ontario, Canada
+ * Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
+ *
+ * Additionally supports space and tab characters; space is the most common
+ * character in text where it occurs, and tab appears about as frequently as
+ * 'a' and 'n' in text where it occurs.
+ */
+struct __wt_huffman_table {
+ uint32_t symbol;
+ uint32_t frequency;
+};
+static const struct __wt_huffman_table __wt_huffman_nytenglish[] = {
+ /* nul */ { 0x00, 0 }, /* For an escape character. */
+ /* ht */ { 0x09, 5263779 },
+ /* sp */ { 0x20, 8000000 },
+ /* ! */ { 0x21, 2178 },
+ /* " */ { 0x22, 284671 },
+ /* # */ { 0x23, 10 },
+ /* $ */ { 0x24, 51572 },
+ /* % */ { 0x25, 1993 },
+ /* & */ { 0x26, 6523 },
+ /* ' */ { 0x27, 204497 },
+ /* ( */ { 0x28, 53398 },
+ /* ) */ { 0x29, 53735 },
+ /* * */ { 0x2a, 20716 },
+ /* + */ { 0x2b, 309 },
+ /* , */ { 0x2c, 984969 },
+ /* - */ { 0x2d, 252302 },
+ /* . */ { 0x2e, 946136 },
+ /* / */ { 0x2f, 8161 },
+ /* 0 */ { 0x30, 546233 },
+ /* 1 */ { 0x31, 460946 },
+ /* 2 */ { 0x32, 333499 },
+ /* 3 */ { 0x33, 187606 },
+ /* 4 */ { 0x34, 192528 },
+ /* 5 */ { 0x35, 374413 },
+ /* 6 */ { 0x36, 153865 },
+ /* 7 */ { 0x37, 120094 },
+ /* 8 */ { 0x38, 182627 },
+ /* 9 */ { 0x39, 282364 },
+ /* : */ { 0x3a, 54036 },
+ /* ; */ { 0x3b, 36727 },
+ /* < */ { 0x3c, 82 },
+ /* = */ { 0x3d, 22 },
+ /* > */ { 0x3e, 83 },
+ /* ? */ { 0x3f, 12357 },
+ /* @ */ { 0x40, 1 },
+ /* A */ { 0x41, 280937 },
+ /* B */ { 0x42, 169474 },
+ /* C */ { 0x43, 229363 },
+ /* D */ { 0x44, 129632 },
+ /* E */ { 0x45, 138443 },
+ /* F */ { 0x46, 100751 },
+ /* G */ { 0x47, 93212 },
+ /* H */ { 0x48, 123632 },
+ /* I */ { 0x49, 223312 },
+ /* J */ { 0x4a, 78706 },
+ /* K */ { 0x4b, 46580 },
+ /* L */ { 0x4c, 106984 },
+ /* M */ { 0x4d, 259474 },
+ /* N */ { 0x4e, 205409 },
+ /* O */ { 0x4f, 105700 },
+ /* P */ { 0x50, 144239 },
+ /* Q */ { 0x51, 11659 },
+ /* R */ { 0x52, 146448 },
+ /* S */ { 0x53, 304971 },
+ /* T */ { 0x54, 325462 },
+ /* U */ { 0x55, 57488 },
+ /* V */ { 0x56, 31053 },
+ /* W */ { 0x57, 107195 },
+ /* X */ { 0x58, 7578 },
+ /* Y */ { 0x59, 94297 },
+ /* Z */ { 0x5a, 5610 },
+ /* [ */ { 0x5b, 1 },
+ /* \ */ { 0x5c, 1 },
+ /* ] */ { 0x5d, 1 },
+ /* ^ */ { 0x5e, 1 },
+ /* _ */ { 0x5f, 1 },
+ /* ` */ { 0x60, 1 },
+ /* a */ { 0x61, 5263779 },
+ /* b */ { 0x62, 866156 },
+ /* c */ { 0x63, 1960412 },
+ /* d */ { 0x64, 2369820 },
+ /* e */ { 0x65, 7741842 },
+ /* f */ { 0x66, 1296925 },
+ /* g */ { 0x67, 1206747 },
+ /* h */ { 0x68, 2955858 },
+ /* i */ { 0x69, 4527332 },
+ /* j */ { 0x6a, 65856 },
+ /* k */ { 0x6b, 460788 },
+ /* l */ { 0x6c, 2553152 },
+ /* m */ { 0x6d, 1467376 },
+ /* n */ { 0x6e, 4535545 },
+ /* o */ { 0x6f, 4729266 },
+ /* p */ { 0x70, 1255579 },
+ /* q */ { 0x71, 54221 },
+ /* r */ { 0x72, 4137949 },
+ /* s */ { 0x73, 4186210 },
+ /* t */ { 0x74, 5507692 },
+ /* u */ { 0x75, 1613323 },
+ /* v */ { 0x76, 653370 },
+ /* w */ { 0x77, 1015656 },
+ /* x */ { 0x78, 123577 },
+ /* y */ { 0x79, 1062040 },
+ /* z */ { 0x7a, 66423 },
+ /* { */ { 0x7b, 1 },
+ /* | */ { 0x7c, 1 },
+ /* } */ { 0x7d, 1 },
+ /* ~ */ { 0x7e, 1 }
+};
+
+static int __wt_huffman_read(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, struct __wt_huffman_table **, u_int *, u_int *);
+
+/*
+ * __wt_btree_huffman_open --
+ * Configure Huffman encoding for the tree.
+ */
+int
+__wt_btree_huffman_open(WT_SESSION_IMPL *session)
+{
+ struct __wt_huffman_table *table;
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM key_conf, value_conf;
+ WT_DECL_RET;
+ const char **cfg;
+ u_int entries, numbytes;
+
+ btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
+
+ WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf));
+ WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf));
+ if (key_conf.len == 0 && value_conf.len == 0)
+ return (0);
+
+ switch (btree->type) { /* Check file type compatibility. */
+ case BTREE_COL_FIX:
+ WT_RET_MSG(session, EINVAL,
+ "fixed-size column-store files may not be Huffman encoded");
+ case BTREE_COL_VAR:
+ if (key_conf.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "the keys of variable-length column-store files "
+ "may not be Huffman encoded");
+ break;
+ case BTREE_ROW:
+ break;
+ }
+
+ if (strncasecmp(key_conf.str, "english", key_conf.len) == 0) {
+ struct __wt_huffman_table
+ copy[WT_ELEMENTS(__wt_huffman_nytenglish)];
+
+ memcpy(copy,
+ __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish));
+ WT_RET(__wt_huffman_open(session, copy,
+ WT_ELEMENTS(__wt_huffman_nytenglish),
+ 1, &btree->huffman_key));
+
+ /* Check for a shared key/value table. */
+ if (strncasecmp(
+ value_conf.str, "english", value_conf.len) == 0) {
+ btree->huffman_value = btree->huffman_key;
+ return (0);
+ }
+ } else {
+ WT_RET(__wt_huffman_read(
+ session, &key_conf, &table, &entries, &numbytes));
+ ret = __wt_huffman_open(session, table,
+ entries, numbytes, &btree->huffman_key);
+ __wt_free(session, table);
+ if (ret != 0)
+ return (ret);
+
+ /* Check for a shared key/value table. */
+ if (value_conf.len != 0 && key_conf.len == value_conf.len &&
+ memcmp(key_conf.str, value_conf.str, key_conf.len) == 0) {
+ btree->huffman_value = btree->huffman_key;
+ return (0);
+ }
+ }
+ if (strncasecmp(value_conf.str, "english", value_conf.len) == 0) {
+ struct __wt_huffman_table
+ copy[WT_ELEMENTS(__wt_huffman_nytenglish)];
+
+ memcpy(copy,
+ __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish));
+ WT_RET(__wt_huffman_open(session, copy,
+ WT_ELEMENTS(__wt_huffman_nytenglish),
+ 1, &btree->huffman_value));
+ } else {
+ WT_RET(__wt_huffman_read(
+ session, &value_conf, &table, &entries, &numbytes));
+ ret = __wt_huffman_open(session, table,
+ entries, numbytes, &btree->huffman_value);
+ __wt_free(session, table);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_huffman_read --
+ * Read a Huffman table from a file.
+ */
+static int
+__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
+ struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp)
+{
+ struct __wt_huffman_table *table, *tp;
+ FILE *fp;
+ WT_DECL_RET;
+ uint64_t symbol, frequency;
+ u_int entries, lineno;
+ char *file;
+
+ *tablep = NULL;
+ *entriesp = *numbytesp = 0;
+
+ fp = NULL;
+ file = NULL;
+ table = NULL;
+
+ /*
+ * A UTF-8 table has single-byte symbols, with a range of 0-255.
+ * A UTF-16 table has 2-byte symbols, with a range of 0-65535
+ * (2 * 65536 bytes, or 128KB, of symbols).
+ */
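+ /*
+ * The table file itself is plain text: one "symbol frequency" pair of
+ * unsigned decimal integers per line, parsed by the fscanf call below.
+ * For example, a UTF-8 table file might begin:
+ *
+ * 97 5263779
+ * 101 7741842
+ */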
+ if (strncasecmp(ip->str, "utf8", 4) == 0) {
+ entries = UINT8_MAX;
+ *numbytesp = 1;
+ WT_ERR(__wt_calloc_def(session, entries, &table));
+
+ if (ip->len == 4)
+ WT_ERR_MSG(session, EINVAL,
+ "no Huffman table file name specified");
+ WT_ERR(__wt_calloc_def(session, ip->len, &file));
+ memcpy(file, ip->str + 4, ip->len - 4);
+ } else if (strncasecmp(ip->str, "utf16", 5) == 0) {
+ entries = UINT16_MAX;
+ *numbytesp = 2;
+ WT_ERR(__wt_calloc_def(session, entries, &table));
+
+ if (ip->len == 5)
+ WT_ERR_MSG(session, EINVAL,
+ "no Huffman table file name specified");
+ WT_ERR(__wt_calloc_def(session, ip->len, &file));
+ memcpy(file, ip->str + 5, ip->len - 5);
+ } else {
+ WT_ERR_MSG(session, EINVAL,
+ "unknown Huffman configuration value %.*s",
+ (int)ip->len, ip->str);
+ }
+
+ if ((fp = fopen(file, "r")) == NULL)
+ WT_ERR_MSG(session, __wt_errno(),
+ "unable to read Huffman table file %.*s",
+ (int)ip->len, ip->str);
+
+ for (tp = table, lineno = 1; (ret =
+ fscanf(fp, "%" SCNu64 " %" SCNu64, &symbol, &frequency)) != EOF;
+ ++tp, ++lineno) {
+ if (lineno > entries)
+ WT_ERR_MSG(session, EINVAL,
+ "Huffman table file %.*s is corrupted, "
+ "more than %" PRIu32 " entries",
+ (int)ip->len, ip->str, entries);
+ if (ret != 2)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "expected two unsigned integral values",
+ lineno, (int)ip->len, ip->str);
+ if (symbol > entries)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "symbol larger than maximum value of %u",
+ lineno, (int)ip->len, ip->str, entries);
+ if (frequency > UINT32_MAX)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "frequency larger than maximum value of %" PRIu32,
+ lineno, (int)ip->len, ip->str, UINT32_MAX);
+
+ tp->symbol = (uint32_t)symbol;
+ tp->frequency = (uint32_t)frequency;
+ }
+
+ *entriesp = lineno - 1;
+ *tablep = table;
+
+ if (0) {
+err: __wt_free(session, table);
+ }
+ if (fp != NULL)
+ (void)fclose(fp);
+ __wt_free(session, file);
+ return (ret);
+}
+
+/*
+ * __wt_btree_huffman_close --
+ * Close the Huffman tables.
+ */
+void
+__wt_btree_huffman_close(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ if (btree->huffman_key != NULL) {
+ /* Key and data may use the same table, only close it once. */
+ if (btree->huffman_value == btree->huffman_key)
+ btree->huffman_value = NULL;
+
+ __wt_huffman_close(session, btree->huffman_key);
+ btree->huffman_key = NULL;
+ }
+ if (btree->huffman_value != NULL) {
+ __wt_huffman_close(session, btree->huffman_value);
+ btree->huffman_value = NULL;
+ }
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
new file mode 100644
index 00000000000..ccc67c994dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_read --
+ * Read a cookie referenced block into a buffer.
+ */
+int
+__wt_bt_read(WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * If anticipating a compressed block, read into a scratch buffer and
+ * decompress into the caller's buffer. Else, read directly into the
+ * caller's buffer.
+ */
+ if (btree->compressor == NULL) {
+ WT_RET(bm->read(bm, session, buf, addr, addr_size));
+ dsk = buf->data;
+ } else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
+ dsk = tmp->data;
+ }
+
+ /*
+ * If the block is compressed, copy the skipped bytes of the original
+ * image into place, then decompress.
+ */
+ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
+ if (btree->compressor == NULL ||
+ btree->compressor->decompress == NULL)
+ WT_ERR_MSG(session, WT_ERROR,
+ "read compressed block where no compression engine "
+ "configured");
+
+ /*
+ * We're allocating the exact number of bytes we're expecting
+ * from decompression.
+ */
+ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));
+
+ /*
+ * Note the source length is NOT the number of compressed bytes;
+ * it's the length of the block we just read (minus the skipped
+ * bytes). We don't store the number of compressed bytes: some
+ * compression engines need that length stored externally, they
+ * don't have markers in the stream to signal the end of the
+ * compressed bytes. Those engines must store the compressed
+ * byte length somehow, see the snappy compression extension for
+ * an example.
+ */
+ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
+ ret = btree->compressor->decompress(
+ btree->compressor, &session->iface,
+ (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+ tmp->size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);
+
+ /*
+ * If checksums were turned off because we're depending on the
+ * decompression to fail on any corrupted data, we'll end up
+ * here after corruption happens. If we're salvaging the file,
+ * it's OK, otherwise it's really, really bad.
+ */
+ if (ret != 0 ||
+ result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(
+ F_ISSET(btree, WT_BTREE_VERIFY) ||
+ F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ WT_ERROR :
+ __wt_illegal_value(session, btree->dhandle->name));
+ } else
+ if (btree->compressor == NULL)
+ buf->size = dsk->mem_size;
+ else
+ /*
+ * We guessed wrong: there was a compressor, but this
+ * block was not compressed, and now the page is in the
+ * wrong buffer and the buffer may be of the wrong size.
+ * This should be rare, but happens with small blocks
+ * that aren't worth compressing.
+ */
+ WT_ERR(__wt_buf_set(
+ session, buf, tmp->data, dsk->mem_size));
+
+ /* If the handle is a verify handle, verify the physical page. */
+ if (F_ISSET(btree, WT_BTREE_VERIFY)) {
+ if (tmp == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+ WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, cache_read);
+ WT_STAT_FAST_DATA_INCR(session, cache_read);
+ if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
+ WT_STAT_FAST_DATA_INCR(session, compress_read);
+ WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
+ WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_bt_write --
+ * Write a buffer into a block, returning the block's addr/size and
+ * checksum.
+ */
+int
+__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
+ uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_ITEM *ip;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ size_t len, src_len, dst_len, result_len, size;
+ int data_cksum, compression_failed;
+ uint8_t *src, *dst;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /* Checkpoint calls are different than standard calls. */
+ WT_ASSERT(session,
+ (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
+ (checkpoint == 1 && addr == NULL && addr_sizep == NULL));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We're passed a table's disk image. Decompress if necessary and
+ * verify the image. Always check the in-memory length for accuracy.
+ */
+ dsk = buf->mem;
+ WT_ASSERT(session, dsk->u.entries != 0);
+ if (compressed) {
+ WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+
+ memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(
+ btree->compressor, &session->iface,
+ (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
+ buf->size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+ tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ WT_ASSERT(session,
+ dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
+ ip = tmp;
+ } else {
+ WT_ASSERT(session, dsk->mem_size == buf->size);
+ ip = buf;
+ }
+ WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
+ __wt_scr_free(&tmp);
+#endif
+
+ /*
+ * Optionally stream-compress the data, but don't compress blocks that
+ * are already as small as they're going to get.
+ */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress == NULL || compressed)
+ ip = buf;
+ else if (buf->size <= btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
+ } else {
+ /* Skip the header bytes of the source data. */
+ src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
+ src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * Compute the size needed for the destination buffer. By default we
+ * only allocate enough memory for a copy of the original: if any
+ * compressed version is bigger than the original, we won't use it.
+ * However, some compression engines (snappy is one example) may need
+ * more memory, because they don't stop just because there's no more
+ * memory into which to compress.
+ */
+ if (btree->compressor->pre_size == NULL)
+ len = src_len;
+ else
+ WT_ERR(btree->compressor->pre_size(btree->compressor,
+ &session->iface, src, src_len, &len));
+
+ size = len + WT_BLOCK_COMPRESS_SKIP;
+ WT_ERR(bm->write_size(bm, session, &size));
+ WT_ERR(__wt_scr_alloc(session, size, &tmp));
+
+ /* Skip the header bytes of the destination data. */
+ dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
+ dst_len = len;
+
+ compression_failed = 0;
+ WT_ERR(btree->compressor->compress(btree->compressor,
+ &session->iface,
+ src, src_len,
+ dst, dst_len,
+ &result_len, &compression_failed));
+ result_len += WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * If compression fails, or doesn't gain us at least one unit of
+ * allocation, fall back to the original version. This isn't
+ * unexpected: if compression doesn't work for some chunk of data for
+ * some reason (for example, because of the additional format/header
+ * information compressed output requires), it just means the
+ * uncompressed version is as good as it gets, and that's what we use.
+ */
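+ /*
+ * Illustration (made-up sizes): with a 4KB allocation unit,
+ * compressing a 10000-byte image to 8500 bytes still occupies two
+ * allocation units, so the uncompressed image is written.
+ */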
+ if (compression_failed ||
+ buf->size / btree->allocsize ==
+ result_len / btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
+ } else {
+ compressed = 1;
+ WT_STAT_FAST_DATA_INCR(session, compress_write);
+
+ /*
+ * Copy in the skipped header bytes, set the final data
+ * size.
+ */
+ memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = result_len;
+ ip = tmp;
+ }
+ }
+ dsk = ip->mem;
+
+ /* If the buffer is compressed, set the flag. */
+ if (compressed)
+ F_SET(dsk, WT_PAGE_COMPRESSED);
+
+ /*
+ * We increment the block's write generation so it's easy to identify
+ * newer versions of blocks during salvage. (It's common in WiredTiger,
+ * at least for the default block manager, for multiple blocks to be
+ * internally consistent with identical first and last keys, so we need
+ * a way to know the most recent state of the block. We could check
+ * which leaf is referenced by a valid internal page, but that implies
+ * salvaging internal pages, which I don't want to do, and it's not
+ * as good anyway, because the internal page may not have been written
+ * after the leaf page was updated. So, write generations it is.)
+ *
+ * Nothing is locked at this point, but two versions of a page with the
+ * same generation are pretty unlikely, and if that happens, they're
+ * going to be roughly identical for the purposes of salvage, anyway.
+ */
+ dsk->write_gen = ++btree->write_gen;
+
+ /*
+ * Checksum the data if the buffer isn't compressed or checksums are
+ * configured.
+ */
+ switch (btree->checksum) {
+ case CKSUM_ON:
+ data_cksum = 1;
+ break;
+ case CKSUM_OFF:
+ data_cksum = 0;
+ break;
+ case CKSUM_UNCOMPRESSED:
+ default:
+ data_cksum = !compressed;
+ break;
+ }
+
+ /* Call the block manager to write the block. */
+ WT_ERR(checkpoint ?
+ bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
+ bm->write(bm, session, ip, addr, addr_sizep, data_cksum));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_write);
+ WT_STAT_FAST_DATA_INCR(session, cache_write);
+ WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size);
+ WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c
new file mode 100644
index 00000000000..cba1c0c61aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_misc.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_type_string --
+ * Return a string representing the page type.
+ */
+const char *
+__wt_page_type_string(u_int type)
+{
+ switch (type) {
+ case WT_PAGE_INVALID:
+ return ("invalid");
+ case WT_PAGE_BLOCK_MANAGER:
+ return ("block manager");
+ case WT_PAGE_COL_FIX:
+ return ("column-store fixed-length leaf");
+ case WT_PAGE_COL_INT:
+ return ("column-store internal");
+ case WT_PAGE_COL_VAR:
+ return ("column-store variable-length leaf");
+ case WT_PAGE_OVFL:
+ return ("overflow");
+ case WT_PAGE_ROW_INT:
+ return ("row-store internal");
+ case WT_PAGE_ROW_LEAF:
+ return ("row-store leaf");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
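+
+/*
+ * For example, the verbose read-tracing code later in this change prints the
+ * type alongside the page address:
+ *
+ *	__wt_verbose(session, WT_VERB_READ,
+ *	    "page %p: %s", page, __wt_page_type_string(page->type));
+ */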
+
+/*
+ * __wt_cell_type_string --
+ * Return a string representing the cell type.
+ */
+const char *
+__wt_cell_type_string(uint8_t type)
+{
+ switch (type) {
+ case WT_CELL_ADDR_DEL:
+ return ("addr/del");
+ case WT_CELL_ADDR_INT:
+ return ("addr/int");
+ case WT_CELL_ADDR_LEAF:
+ return ("addr/leaf");
+ case WT_CELL_ADDR_LEAF_NO:
+ return ("addr/leaf-no");
+ case WT_CELL_DEL:
+ return ("deleted");
+ case WT_CELL_KEY:
+ return ("key");
+ case WT_CELL_KEY_PFX:
+ return ("key/pfx");
+ case WT_CELL_KEY_OVFL:
+ return ("key/ovfl");
+ case WT_CELL_KEY_SHORT:
+ return ("key/short");
+ case WT_CELL_KEY_SHORT_PFX:
+ return ("key/short,pfx");
+ case WT_CELL_KEY_OVFL_RM:
+ return ("key/ovfl,rm");
+ case WT_CELL_VALUE:
+ return ("value");
+ case WT_CELL_VALUE_COPY:
+ return ("value/copy");
+ case WT_CELL_VALUE_OVFL:
+ return ("value/ovfl");
+ case WT_CELL_VALUE_OVFL_RM:
+ return ("value/ovfl,rm");
+ case WT_CELL_VALUE_SHORT:
+ return ("value/short");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_page_addr_string --
+ * Figure out a page's "address" and load a buffer with a printable,
+ * nul-terminated representation of that address.
+ */
+const char *
+__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
+{
+ size_t addr_size;
+ const uint8_t *addr;
+
+ if (__wt_ref_is_root(ref)) {
+ buf->data = "[Root]";
+ buf->size = strlen("[Root]");
+ return (buf->data);
+ }
+
+ (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
+ return (__wt_addr_string(session, addr, addr_size, buf));
+}
+
+/*
+ * __wt_addr_string --
+ * Load a buffer with a printable, nul-terminated representation of an
+ * address.
+ */
+const char *
+__wt_addr_string(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_ITEM *buf)
+{
+ WT_BM *bm;
+
+ bm = S2BT(session)->bm;
+
+ if (addr == NULL) {
+ buf->data = "[NoAddr]";
+ buf->size = strlen("[NoAddr]");
+ } else if (bm->addr_string(bm, session, buf, addr, addr_size) != 0) {
+ buf->data = "[Error]";
+ buf->size = strlen("[Error]");
+ }
+ return (buf->data);
+}
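+
+/*
+ * For example, the salvage code later in this change builds its diagnostic
+ * messages with these helpers:
+ *
+ *	__wt_verbose(session, WT_VERB_SALVAGE,
+ *	    "%s records %" PRIu64 "-%" PRIu64,
+ *	    __wt_addr_string(
+ *	    session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ *	    trk->col_start, trk->col_stop);
+ */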
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
new file mode 100644
index 00000000000..4cd317f1e8f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __ovfl_read --
+ * Read an overflow item from the disk.
+ */
+static int
+__ovfl_read(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+ WT_BTREE *btree;
+ const WT_PAGE_HEADER *dsk;
+
+ btree = S2BT(session);
+
+ /*
+ * Read the overflow item from the block manager, then reference the
+ * start of the data and set the data's length.
+ *
+ * Overflow reads are synchronous. That may bite me at some point, but
+ * because WiredTiger supports large page sizes, overflow items should
+ * be rare.
+ */
+ WT_RET(__wt_bt_read(session, store, addr, addr_size));
+ dsk = store->data;
+ store->data = WT_PAGE_HEADER_BYTE(btree, dsk);
+ store->size = dsk->u.datalen;
+
+ WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_read --
+ * Bring an overflow item into memory.
+ */
+int
+__wt_ovfl_read(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_DECL_RET;
+
+ /*
+ * If no page is specified, there's no need to lock and no cache to
+ * search; we don't care about WT_CELL_VALUE_OVFL_RM cells.
+ */
+ if (page == NULL)
+ return (
+ __ovfl_read(session, unpack->data, unpack->size, store));
+
+ /*
+ * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
+ * value, but there was still a reader in the system that might need it,
+ * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
+ * and we will be passed a page so we can look-aside into the cache of
+ * such values.
+ *
+ * Acquire the overflow lock, and retest the on-page cell's value inside
+ * the lock.
+ */
+ WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock));
+ ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ?
+ __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) :
+ __ovfl_read(session, unpack->data, unpack->size, store);
+ WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock));
+
+ return (ret);
+}
+
+/*
+ * __ovfl_cache_col_visible --
+ * column-store: check for a globally visible update.
+ */
+static int
+__ovfl_cache_col_visible(
+ WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
+{
+ /*
+ * Column-store is harder than row-store: we're here because there's a
+ * reader in the system that might read the original version of an
+ * overflow record, which might match a number of records. For example,
+ * the original overflow value was for records 100-200, we've replaced
+ * each of those records individually, but there exists a reader that
+ * might read any one of those records, and all of those records have
+ * different update entries with different transaction IDs. Since it's
+ * infeasible to determine if there's a globally visible update for each
+ * reader for each record, we test the simple case where a single record
+ * has a single, globally visible update. If that's not the case, cache
+ * the value.
+ */
+ if (__wt_cell_rle(unpack) == 1 &&
+ upd != NULL && /* Sanity: upd should always be set. */
+ __wt_txn_visible_all(session, upd->txnid))
+ return (1);
+ return (0);
+}
+
+/*
+ * __ovfl_cache_row_visible --
+ * row-store: check for a globally visible update.
+ */
+static int
+__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
+{
+ WT_UPDATE *upd;
+
+ /* Check to see if there's a globally visible update. */
+ for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
+ if (__wt_txn_visible_all(session, upd->txnid))
+ return (1);
+
+ return (0);
+}
+
+/*
+ * __ovfl_cache --
+ * Cache a deleted overflow value.
+ */
+static int
+__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ addr = unpack->data;
+ addr_size = unpack->size;
+
+ WT_RET(__wt_scr_alloc(session, 1024, &tmp));
+
+ /* Enter the value into the overflow cache. */
+ WT_ERR(__ovfl_read(session, addr, addr_size, tmp));
+ WT_ERR(__wt_ovfl_txnc_add(
+ session, page, addr, addr_size, tmp->data, tmp->size));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_ovfl_cache --
+ * Handle deletion of an overflow value.
+ */
+int
+__wt_ovfl_cache(WT_SESSION_IMPL *session,
+ WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
+{
+ int visible;
+
+ /*
+ * This function solves a problem in reconciliation. The scenario is:
+ * - reconciling a leaf page that references an overflow item
+ * - the item is updated and the update committed
+ * - a checkpoint runs, freeing the backing overflow blocks
+ * - a snapshot transaction wants the original version of the item
+ *
+ * In summary, we may need the original version of an overflow item for
+ * a snapshot transaction after the item was deleted from a page that's
+ * subsequently been checkpointed, where the checkpoint must know about
+ * the freed blocks. We don't have any way to delay a free of the
+ * underlying blocks until a particular set of transactions exit (and
+ * this shouldn't be a common scenario), so cache the overflow value in
+ * memory.
+ *
+ * This gets hard because the snapshot transaction reader might:
+ * - search the WT_UPDATE list and not find a useful entry
+ * - read the overflow value's address from the on-page cell
+ * - go to sleep
+ * - checkpoint runs, caches the overflow value, frees the blocks
+ * - another thread allocates and overwrites the blocks
+ * - the reader wakes up and reads the wrong value
+ *
+ * Use a read/write lock and the on-page cell to fix the problem: hold
+ * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
+ * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
+ * item.
+ *
+ * The read/write lock is per btree, but it could be per page or even
+ * per overflow item. We don't do any of that because overflow values
+ * are supposed to be rare and we shouldn't see contention for the lock.
+ *
+ * Check for a globally visible update. If there is a globally visible
+ * update, we don't need to cache the item because it's not possible for
+ * a running thread to have moved past it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ visible = __ovfl_cache_col_visible(session, cookie, vpack);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ visible = __ovfl_cache_row_visible(session, page, cookie);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * If there's no globally visible update, there's a reader in the system
+ * that might try and read the old value, cache it.
+ */
+ if (!visible) {
+ WT_RET(__ovfl_cache(session, page, vpack));
+ WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
+ }
+
+ /*
+ * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
+ * underlying overflow value's blocks to be freed when reconciliation
+ * completes.
+ */
+ return (__wt_ovfl_discard_add(session, page, vpack->cell));
+}
+
+/*
+ * __wt_ovfl_discard --
+ * Discard an on-page overflow value, and reset the page's cell.
+ */
+int
+__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+
+ __wt_cell_unpack(cell, unpack);
+
+ /*
+ * Remove overflow key/value objects; this is called when
+ * reconciliation finishes after successfully writing a page.
+ *
+ * Keys must have already been instantiated and value objects must have
+ * already been cached (if they might potentially still be read by any
+ * running transaction).
+ *
+ * Acquire the overflow lock to avoid racing with a thread reading the
+ * backing overflow blocks.
+ */
+ WT_RET(__wt_writelock(session, btree->ovfl_lock));
+
+ switch (unpack->raw) {
+ case WT_CELL_KEY_OVFL:
+ __wt_cell_type_reset(session,
+ unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
+ break;
+ case WT_CELL_VALUE_OVFL:
+ __wt_cell_type_reset(session,
+ unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));
+
+ /* Free the backing disk blocks. */
+ WT_TRET(bm->free(bm, session, unpack->data, unpack->size));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
new file mode 100644
index 00000000000..c5f24c06286
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -0,0 +1,734 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
+static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_row_leaf_entries(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
+
+/*
+ * __evict_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (page->type != WT_PAGE_COL_FIX &&
+ page->type != WT_PAGE_COL_VAR &&
+ page->type != WT_PAGE_ROW_LEAF)
+ return (0);
+
+ /* Eviction may be turned off, although that's rare. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ /* Trigger eviction on the next page release. */
+ page->read_gen = WT_READGEN_OLDEST;
+
+ return (1);
+}
+
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int busy, force_attempts, oldgen;
+
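+ /*
+ * Loop until we get a hazard pointer on an in-memory version of the
+ * page or give up; each pass through the loop handles one page state.
+ */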
+ for (force_attempts = oldgen = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, attempt to read it.
+ * Make sure there is space in the cache.
+ */
+ WT_RET(__wt_cache_full_check(session));
+ WT_RET(__wt_cache_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ /* FALLTHROUGH */
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+ /* The page is busy -- wait. */
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory: get a hazard pointer, update
+ * the page's LRU and return. The expected reason we
+ * can't get a hazard pointer is because the page is
+ * being evicted; yield and try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy)
+ break;
+
+ page = ref->page;
+ WT_ASSERT(session, page != NULL);
+
+ /* Forcibly evict pages that are too big. */
+ if (!LF_ISSET(WT_READ_NO_EVICT) &&
+ force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ WT_RET(__wt_page_release(session, ref, flags));
+ break;
+ }
+
+ /* Check if we need an autocommit transaction. */
+ if ((ret = __wt_txn_autocommit_check(session)) != 0) {
+ WT_TRET(__wt_hazard_clear(session, page));
+ return (ret);
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ page->read_gen = WT_READGEN_OLDEST;
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_set(session);
+
+ return (0);
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* We failed to get the page -- yield before retrying. */
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_page_alloc --
+ * Create or read a page into the cache.
+ */
+int
+__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
+ uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
+{
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ size_t size;
+ uint32_t i;
+ void *p;
+
+ *pagep = NULL;
+
+ cache = S2C(session)->cache;
+ page = NULL;
+
+ size = sizeof(WT_PAGE);
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Variable-length column-store leaf page: allocate memory to
+ * describe the page's contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_COL);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store leaf page: allocate memory to describe the page's
+ * contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_ROW);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
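+
+ /*
+ * Note the leaf cases add the entry array to the same allocation:
+ * the WT_COL/WT_ROW array is laid out immediately after the WT_PAGE
+ * structure, which is what the pointer arithmetic below relies on.
+ */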
+
+ WT_RET(__wt_calloc(session, 1, size, &page));
+
+ page->type = type;
+ page->read_gen = WT_READGEN_NOTSET;
+
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ page->pg_fix_recno = recno;
+ page->pg_fix_entries = alloc_entries;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_recno = recno;
+
+ /*
+ * Internal pages have an array of references to objects so they
+ * can split. Allocate the array of references and, optionally,
+ * the objects to which they point.
+ */
+ WT_ERR(__wt_calloc(session, 1,
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
+ &p));
+ size +=
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
+ pindex = p;
+ pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
+ pindex->entries = alloc_entries;
+ WT_INTL_INDEX_SET(page, pindex);
+ if (alloc_refs)
+ for (i = 0; i < pindex->entries; ++i) {
+ WT_ERR(__wt_calloc_def(
+ session, 1, &pindex->index[i]));
+ size += sizeof(WT_REF);
+ }
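+ /*
+ * Error unwinding uses the common WiredTiger "if (0) { err: ... }"
+ * idiom: the block below is only reachable through the err label.
+ */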
+ if (0) {
+err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
+ for (i = 0; i < pindex->entries; ++i)
+ __wt_free(session, pindex->index[i]);
+ __wt_free(session, pindex);
+ }
+ __wt_free(session, page);
+ return (ret);
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ page->pg_var_recno = recno;
+ page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_var_entries = alloc_entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_row_entries = alloc_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Increment the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+ (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);
+
+ *pagep = page;
+ return (0);
+}
+
+/*
+ * __wt_page_inmem --
+ * Build in-memory page information.
+ */
+int
+__wt_page_inmem(WT_SESSION_IMPL *session,
+ WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t alloc_entries;
+ size_t size;
+
+ *pagep = NULL;
+
+ dsk = image;
+ alloc_entries = 0;
+
+ /*
+ * Figure out how many underlying objects the page references so we can
+ * allocate them along with the page.
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store leaf page entries map one-to-one to the number
+ * of physical entries on the page (each physical entry is a
+ * value item).
+ *
+ * Column-store internal page entries map one-to-one to the
+ * number of physical entries on the page (each entry is a
+ * location cookie).
+ */
+ alloc_entries = dsk->u.entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ /*
+ * Row-store internal page entries map one-to-two to the number
+ * of physical entries on the page (each entry is a key and
+ * location cookie pair).
+ */
+ alloc_entries = dsk->u.entries / 2;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * If the "all values empty" flag is set, each physical entry
+ * is a key, so in-memory entries map one-to-one to physical
+ * entries. If the "no empty values" flag is set, physical
+ * entries are key/value pairs, so there are half as many
+ * in-memory entries. If neither flag is set, some keys have
+ * values and some don't: we have to walk the page to figure
+ * it out.
+ */
+ if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
+ alloc_entries = dsk->u.entries;
+ else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
+ alloc_entries = dsk->u.entries / 2;
+ else
+ WT_RET(__inmem_row_leaf_entries(
+ session, dsk, &alloc_entries));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Allocate and initialize a new WT_PAGE. */
+ WT_RET(__wt_page_alloc(
+ session, dsk->type, dsk->recno, alloc_entries, 1, &page));
+ page->dsk = dsk;
+ F_SET_ATOMIC(page, flags);
+
+ /*
+ * Track the memory allocated to build this page so we can update the
+ * cache statistics in a single call.
+ */
+ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ __inmem_col_fix(session, page);
+ break;
+ case WT_PAGE_COL_INT:
+ __inmem_col_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__inmem_col_var(session, page, &size));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__inmem_row_int(session, page, &size));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__inmem_row_leaf(session, page));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Update the page's in-memory size and the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+
+ /* Link the new internal page to the parent. */
+ if (ref != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_parent_ref = ref;
+ break;
+ }
+ ref->page = page;
+ }
+
+ *pagep = page;
+ return (0);
+
+err: __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __inmem_col_fix --
+ * Build in-memory index for fixed-length column-store leaf pages.
+ */
+static void
+__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ const WT_PAGE_HEADER *dsk;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+
+ page->pg_fix_bitf = WT_PAGE_HEADER_BYTE(btree, dsk);
+}
+
+/*
+ * __inmem_col_int --
+ * Build in-memory index for column-store internal pages.
+ */
+static void
+__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ WT_PAGE_INDEX *pindex;
+ WT_REF **refp, *ref;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /*
+ * Walk the page, building references: the page contains address
+ * cells (location cookies), and each cell carries the starting
+ * record number of the subtree it references.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ refp = pindex->index;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ref = *refp++;
+ ref->home = page;
+
+ __wt_cell_unpack(cell, unpack);
+ ref->addr = cell;
+ ref->key.recno = unpack->v;
+ }
+}
+
+/*
+ * __inmem_col_var_repeats --
+ * Count the number of repeat entries on the page.
+ */
+static int
+__inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, counting entries for the repeats array. */
+ *np = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (__wt_cell_rle(unpack) > 1)
+ ++*np;
+ }
+ return (0);
+}
+
+/*
+ * __inmem_col_var --
+ * Build in-memory index for variable-length, data-only leaf pages in
+ * column-store trees.
+ */
+static int
+__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_COL_RLE *repeats;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ uint64_t recno, rle;
+ size_t bytes_allocated;
+ uint32_t i, indx, n, repeat_off;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_var_recno;
+
+ repeats = NULL;
+ repeat_off = 0;
+ unpack = &_unpack;
+ bytes_allocated = 0;
+
+ /*
+ * Walk the page, building references: the page contains unsorted value
+ * items. The value items are on-page (WT_CELL_VALUE), overflow items
+ * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL).
+ */
+ indx = 0;
+ cip = page->pg_var_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell));
+ cip++;
+
+ /*
+ * Add records with repeat counts greater than 1 to an array we
+ * use for fast lookups. The first entry we find needing the
+ * repeats array triggers a re-walk from the start of the page
+ * to determine the size of the array.
+ */
+ rle = __wt_cell_rle(unpack);
+ if (rle > 1) {
+ if (repeats == NULL) {
+ WT_RET(
+ __inmem_col_var_repeats(session, page, &n));
+ WT_RET(__wt_realloc_def(session,
+ &bytes_allocated, n + 1, &repeats));
+
+ page->pg_var_repeats = repeats;
+ page->pg_var_nrepeats = n;
+ *sizep += bytes_allocated;
+ }
+ repeats[repeat_off].indx = indx;
+ repeats[repeat_off].recno = recno;
+ repeats[repeat_off++].rle = rle;
+ }
+ indx++;
+ recno += rle;
+ }
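+
+ /*
+ * For example (illustrative values only): on a page starting at
+ * record 100 whose first two cells are single records and whose
+ * third cell has an RLE count of 5, the repeats entry would be
+ * { indx 2, recno 102, rle 5 }.
+ */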
+
+ return (0);
+}
+
+/*
+ * __inmem_row_int --
+ * Build in-memory index for row-store internal pages.
+ */
+static int
+__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ dsk = page->dsk;
+
+ WT_RET(__wt_scr_alloc(session, 0, &current));
+
+ /*
+ * Walk the page, instantiating keys: the page contains sorted key and
+ * location cookie pairs. Keys are on-page/overflow items and location
+ * cookies are WT_CELL_ADDR_XXX items.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ refp = pindex->index;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ref = *refp;
+ ref->home = page;
+
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ /*
+ * Note: we don't Huffman encode internal page keys, so
+ * there's no decoding work to do.
+ */
+ __wt_ref_key_onpage_set(page, ref, unpack);
+ break;
+ case WT_CELL_KEY_OVFL:
+ /* Instantiate any overflow records. */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, page->type, unpack, current));
+
+ WT_ERR(__wt_row_ikey_incr(session, page,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ current->data, current->size, &ref->key.ikey));
+
+ *sizep += sizeof(WT_IKEY) + current->size;
+ break;
+ case WT_CELL_ADDR_DEL:
+ /*
+ * A cell may reference a deleted leaf page: if a leaf
+ * page was deleted without being read (fast truncate),
+ * and the deletion committed, but older transactions
+ * in the system required the previous version of the
+ * page to remain available, a special deleted-address
+ * type cell is written. The only reason we'd ever see
+ * that cell on a page we're reading is if we crashed
+ * and recovered (otherwise a version of the page w/o
+ * that cell would have eventually been written). If we
+ * crash and recover to a page with a deleted-address
+ * cell, we want to discard the page from the backing
+ * store (it was never discarded), and, of course, by
+ * definition no earlier transaction will ever need it.
+ *
+ * Re-create the state of a deleted page.
+ */
+ ref->addr = cell;
+ ref->state = WT_REF_DELETED;
+ ++refp;
+
+ /*
+ * If the tree is already dirty and so will be written,
+ * mark the page dirty. (We want to free the deleted
+ * pages, but if the handle is read-only or if the
+ * application never modifies the tree, we're not able
+ * to do so.)
+ */
+ if (btree->modified) {
+ WT_ERR(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+ break;
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ ref->addr = cell;
+ ++refp;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(&current);
+ return (ret);
+}
+
+/*
+ * __inmem_row_leaf_entries --
+ * Return the number of entries for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf_entries(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i, nindx;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+
+ /*
+ * Leaf row-store page entries map at most one-to-one to the number
+ * of physical entries on the page (each physical entry might be
+ * a key without a subsequent data item). To avoid over-allocation in
+ * workloads without empty data items, first walk the page counting the
+ * number of keys, then allocate the indices.
+ *
+ * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or
+ * overflow (WT_CELL_KEY_OVFL) items, data are either non-existent or a
+ * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item.
+ */
+ nindx = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ ++nindx;
+ break;
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ }
+
+ *nindxp = nindx;
+ return (0);
+}
+
+/*
+ * __inmem_row_leaf --
+ * Build in-memory index for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ WT_ROW *rip;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, building indices. */
+ rip = page->pg_row_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY_OVFL:
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_KEY:
+ /*
+ * Simple keys without compression (not Huffman encoded
+ * or prefix compressed) can be directly referenced on
+ * the page to avoid repeatedly unpacking their cells.
+ */
+ if (!btree->huffman_key && unpack->prefix == 0)
+ __wt_row_leaf_key_set(page, rip, unpack);
+ else
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_VALUE:
+ /*
+ * Simple values without compression can be directly
+ * referenced on the page to avoid repeatedly unpacking
+ * their cells.
+ */
+ if (!btree->huffman_value)
+ __wt_row_leaf_value_set(page, rip - 1, unpack);
+ break;
+ case WT_CELL_VALUE_OVFL:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ }
+
+ /*
+ * We do not currently instantiate keys on leaf pages when the page is
+ * loaded; they're instantiated on demand.
+ */
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
new file mode 100644
index 00000000000..9cd6f8310af
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_read --
+ * Read a page from the file.
+ */
+int
+__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_ITEM tmp;
+ WT_PAGE *page;
+ WT_PAGE_STATE previous_state;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ page = NULL;
+
+ /*
+ * Don't pass an allocated buffer to the underlying block read function,
+ * force allocation of new memory of the appropriate size.
+ */
+ WT_CLEAR(tmp);
+
+ /*
+ * Attempt to set the state to WT_REF_READING for normal reads, or
+ * to WT_REF_LOCKED for deleted pages. If successful, we've won the
+ * race: read the page.
+ */
+ if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
+ previous_state = WT_REF_DISK;
+ else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ previous_state = WT_REF_DELETED;
+ else
+ /* Lost the race: the page is already in memory or being read. */
+ return (0);
+
+ /*
+ * Get the address: if there is no address, the page was deleted, but a
+ * subsequent search or insert is forcing re-creation of the name space.
+ * Otherwise, there's an address, read the backing disk page and build
+ * an in-memory version of the page.
+ */
+ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL) {
+ WT_ASSERT(session, previous_state == WT_REF_DELETED);
+
+ WT_ERR(__wt_btree_new_leaf_page(session, &page));
+ ref->page = page;
+ } else {
+ /* Read the backing disk page. */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+
+ /* Build the in-memory version of the page. */
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_READ,
+ "page %p: %s", page, __wt_page_type_string(page->type)));
+
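+ /*
+ * The page must be fully built before it's made visible to other
+ * threads; WT_PUBLISH includes the write barrier that enforces
+ * this ordering.
+ */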
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+ return (0);
+
+err: /*
+ * If the function building an in-memory version of the page failed,
+ * it discarded the page, but not the disk image. Discard the page
+ * and separately discard the disk image in all cases.
+ */
+ if (ref->page != NULL)
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state, previous_state);
+
+ __wt_buf_free(session, &tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
new file mode 100644
index 00000000000..25b4bfc3005
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_kv_return --
+ * Return a page referenced key/value pair to the application.
+ */
+int
+__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_CURSOR *cursor;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint8_t v;
+
+ btree = S2BT(session);
+
+ page = cbt->ref->page;
+ cursor = &cbt->iface;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true; specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page. */
+ v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
+ return (__wt_buf_set(session, &cursor->value, &v, 1));
+ case WT_PAGE_COL_VAR:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true; specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page cell. */
+ cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ rip = &page->pg_row_d[cbt->slot];
+
+ /*
+ * If the cursor references a WT_INSERT item, take its key.
+ * Else, if we have an exact match, we copied the key in the
+ * search function, take it from there.
+ * If we don't have an exact match, take the key from the
+ * original page.
+ */
+ if (cbt->ins != NULL) {
+ cursor->key.data = WT_INSERT_KEY(cbt->ins);
+ cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
+ } else if (cbt->compare == 0) {
+ cursor->key.data = cbt->search_key.data;
+ cursor->key.size = cbt->search_key.size;
+ } else
+ WT_RET(__wt_row_leaf_key(
+ session, page, rip, &cursor->key, 0));
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, &cursor->value))
+ return (0);
+
+ /*
+ * Take the value from the original page cell (which may be
+ * empty).
+ */
+ if ((cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
+ cursor->value.size = 0;
+ return (0);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* The value is an on-page cell, unpack and expand it as necessary. */
+ __wt_cell_unpack(cell, &unpack);
+ WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
new file mode 100644
index 00000000000..10366e91a0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -0,0 +1,2520 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __wt_stuff; typedef struct __wt_stuff WT_STUFF;
+struct __wt_track; typedef struct __wt_track WT_TRACK;
+struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED;
+
+/*
+ * There's a bunch of stuff we pass around during salvage, group it together
+ * to make the code prettier.
+ */
+struct __wt_stuff {
+ WT_SESSION_IMPL *session; /* Salvage session */
+
+ WT_TRACK **pages; /* Pages */
+ uint32_t pages_next; /* Next empty slot */
+ size_t pages_allocated; /* Bytes allocated */
+
+ WT_TRACK **ovfl; /* Overflow pages */
+ uint32_t ovfl_next; /* Next empty slot */
+ size_t ovfl_allocated; /* Bytes allocated */
+
+ WT_REF root_ref; /* Created root page */
+
+ uint8_t page_type; /* Page type */
+
+ /* Set if we must free blocks backing merged page ranges. */
+ int merge_free;
+
+ WT_ITEM *tmp1; /* Verbose print buffer */
+ WT_ITEM *tmp2; /* Verbose print buffer */
+
+ uint64_t fcnt; /* Progress counter */
+};
+
+/*
+ * WT_TRACK_SHARED --
+ * Information shared between pages being merged.
+ */
+struct __wt_track_shared {
+ uint32_t ref; /* Reference count */
+
+ /*
+ * Physical information about the file block.
+ */
+ WT_ADDR addr; /* Page address */
+ uint32_t size; /* Page size */
+ uint64_t gen; /* Page generation */
+
+ /*
+ * Pages that reference overflow pages contain a list of the overflow
+ * pages they reference. We start out with a list of addresses, and
+ * convert to overflow array slots during the reconciliation of page
+ * references to overflow records.
+ */
+ WT_ADDR *ovfl_addr; /* Overflow pages by address */
+ uint32_t *ovfl_slot; /* Overflow pages by slot */
+ uint32_t ovfl_cnt; /* Overflow reference count */
+};
+
+/*
+ * WT_TRACK --
+ * Structure to track chunks, one per chunk; we start out with a chunk per
+ * page (either leaf or overflow), but when we find overlapping key ranges, we
+ * split the leaf page chunks up, one chunk for each unique key range.
+ */
+struct __wt_track {
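+/*
+ * The trk_XXX macros indirect through the WT_TRACK_SHARED structure, so
+ * chunks split from the same physical page share one copy of the block
+ * information.
+ */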
+#define trk_addr shared->addr.addr
+#define trk_addr_size shared->addr.size
+#define trk_gen shared->gen
+#define trk_ovfl_addr shared->ovfl_addr
+#define trk_ovfl_cnt shared->ovfl_cnt
+#define trk_ovfl_slot shared->ovfl_slot
+#define trk_size shared->size
+ WT_TRACK_SHARED *shared; /* Shared information */
+
+ WT_STUFF *ss; /* Enclosing stuff */
+
+ union {
+ struct {
+#undef row_start
+#define row_start u.row._row_start
+ WT_ITEM _row_start; /* Row-store start range */
+#undef row_stop
+#define row_stop u.row._row_stop
+ WT_ITEM _row_stop; /* Row-store stop range */
+ } row;
+
+ struct {
+#undef col_start
+#define col_start u.col._col_start
+ uint64_t _col_start; /* Col-store start range */
+#undef col_stop
+#define col_stop u.col._col_stop
+ uint64_t _col_stop; /* Col-store stop range */
+#undef col_missing
+#define col_missing u.col._col_missing
+ uint64_t _col_missing; /* Col-store missing range */
+ } col;
+ } u;
+
+#define WT_TRACK_CHECK_START 0x01 /* Row: initial key updated */
+#define WT_TRACK_CHECK_STOP 0x02 /* Row: last key updated */
+#define WT_TRACK_MERGE 0x04 /* Page requires merging */
+#define WT_TRACK_OVFL_REFD 0x08 /* Overflow page referenced */
+ u_int flags;
+};
+
+static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *);
+static int __slvg_col_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t);
+static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *);
+static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_compare(const void *, const void *);
+static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, int);
+static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_row_build_leaf(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *);
+static int __slvg_row_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t);
+static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static int __slvg_row_trk_update_start(
+ WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *);
+static int __slvg_trk_compare_addr(const void *, const void *);
+static int __slvg_trk_compare_gen(const void *, const void *);
+static int __slvg_trk_compare_key(const void *, const void *);
+static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, int);
+static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *,
+ size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **);
+static int __slvg_trk_leaf(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *);
+static int __slvg_trk_ovfl(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_split(WT_SESSION_IMPL *, WT_TRACK *, WT_TRACK **);
+
+/*
+ * __wt_bt_salvage --
+ * Salvage a Btree.
+ */
+int
+__wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_STUFF *ss, stuff;
+ uint32_t i, leaf_cnt;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ WT_CLEAR(stuff);
+ ss = &stuff;
+ ss->session = session;
+ ss->page_type = WT_PAGE_INVALID;
+
+ /* Allocate temporary buffers. */
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2));
+
+ /*
+ * Step 1:
+ * Inform the underlying block manager that we're salvaging the file.
+ */
+ WT_ERR(bm->salvage_start(bm, session));
+
+ /*
+ * Step 2:
+ * Read the file and build in-memory structures that reference any leaf
+ * or overflow page. Any pages other than leaf or overflow pages are
+ * added to the free list.
+ *
+ * Turn off read checksum and verification error messages while we're
+ * reading the file, we expect to see corrupted blocks.
+ */
+ F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ ret = __slvg_read(session, ss);
+ F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ WT_ERR(ret);
+
+ /*
+ * Step 3:
+ * Discard any page referencing a non-existent overflow page. We do
+ * this before checking overlapping key ranges on the grounds that a
+ * bad key range we can use is better than a terrific key range that
+ * references pages we don't have. On the other hand, we subsequently
+ * discard key ranges where there are better overlapping ranges, and
+ * it would be better if we let the availability of an overflow value
+ * inform our choices as to the key ranges we select, ideally on a
+ * per-key basis.
+ *
+ * A complicating problem is found in variable-length column-store
+ * objects, where we potentially split key ranges within RLE units.
+ * For example, if there's a page with rows 15-20 and we later find
+ * row 17 with a larger LSN, the range splits into 3 chunks, 15-16,
+ * 17, and 18-20. If rows 15-20 were originally a single value (an
+ * RLE of 6), and that record is an overflow record, we end up with
+ * two chunks, both of which want to reference the same overflow value.
+ *
+ * Instead of the approach just described, we're first discarding any
+ * pages referencing non-existent overflow pages, then we're reviewing
+ * our key ranges and discarding any that overlap. We're doing it that
+ * way for a few reasons: absent corruption, missing overflow items are
+ * strong arguments the page was replaced (on the other hand, some kind
+ * of file corruption is probably why we're here); it's a significant
+ * amount of additional complexity to simultaneously juggle overlapping
+ * ranges and missing overflow items; finally, because WiredTiger
+ * supports very large page sizes, real-world applications usually
+ * don't have a lot of overflow items anyway.
+ *
+ * Step 4:
+ * Add unreferenced overflow page blocks to the free list so they are
+ * reused immediately.
+ */
+ if (ss->ovfl_next != 0) {
+ WT_ERR(__slvg_ovfl_reconcile(session, ss));
+ WT_ERR(__slvg_ovfl_discard(session, ss));
+ }
+
+ /*
+ * Step 5:
+ * Walk the list of pages looking for overlapping ranges to resolve.
+ * If we find a range that needs to be resolved, set a global flag
+ * and a per WT_TRACK flag on the pages requiring modification.
+ *
+ * This requires sorting the page list by key, and secondarily by LSN.
+ *
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it and clean up after it in reconciliation because it doesn't cost
+ * much and future column-store formats or operations might allow for
+ * fixed-length format ranges to overlap during salvage, and I don't
+ * want to have to retrofit the code later.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key);
+ if (ss->page_type == WT_PAGE_ROW_LEAF)
+ WT_ERR(__slvg_row_range(session, ss));
+ else
+ WT_ERR(__slvg_col_range(session, ss));
+
+ /*
+ * Step 6:
+ * We may have lost key ranges in column-store databases, that is, some
+ * part of the record number space is gone. Look for missing ranges.
+ */
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__slvg_col_range_missing(session, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ break;
+ }
+
+ /*
+ * Step 7:
+ * Build an internal page that references all of the leaf pages,
+ * and write it, as well as any merged pages, to the file.
+ *
+ * Count how many leaf pages we have (we could track this during the
+ * array shuffling/splitting, but that's a lot harder).
+ */
+ for (leaf_cnt = i = 0; i < ss->pages_next; ++i)
+ if (ss->pages[i] != NULL)
+ ++leaf_cnt;
+ if (leaf_cnt != 0)
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(
+ __slvg_col_build_internal(session, leaf_cnt, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(
+ __slvg_row_build_internal(session, leaf_cnt, ss));
+ break;
+ }
+
+ /*
+ * Step 8:
+ * If we had to merge key ranges, we have to do a final pass through
+ * the leaf page array and discard file pages used during key merges.
+ * We can't do it earlier: if we free'd the leaf pages we're merging as
+ * we merged them, the write of subsequent leaf pages or the internal
+ * page might allocate those free'd file blocks, and if the salvage run
+ * subsequently fails, we'd have overwritten pages used to construct the
+ * final key range. In other words, if the salvage run fails, we don't
+ * want to overwrite data the next salvage run might need.
+ */
+ if (ss->merge_free)
+ WT_ERR(__slvg_merge_block_free(session, ss));
+
+ /*
+ * Step 9:
+ * Evict the newly created root page, creating a checkpoint.
+ */
+ if (ss->root_ref.page != NULL) {
+ btree->ckpt = ckptbase;
+ ret = __wt_rec_evict(session, &ss->root_ref, 1);
+ ss->root_ref.page = NULL;
+ btree->ckpt = NULL;
+ }
+
+ /*
+ * Step 10:
+ * Inform the underlying block manager that we're done.
+ */
+err: WT_TRET(bm->salvage_end(bm, session));
+
+ /* Discard any root page we created. */
+ if (ss->root_ref.page != NULL)
+ __wt_ref_out(session, &ss->root_ref);
+
+ /* Discard the leaf and overflow page memory. */
+ WT_TRET(__slvg_cleanup(session, ss));
+
+ /* Discard temporary buffers. */
+ __wt_scr_free(&ss->tmp1);
+ __wt_scr_free(&ss->tmp2);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, ss->fcnt));
+
+ return (ret);
+}
+
+/*
+ * __slvg_read --
+ * Read the file and build a table of the pages we can use.
+ */
+static int
+__slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(as);
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ size_t addr_size;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int eof, valid;
+
+ bm = S2BT(session)->bm;
+ WT_ERR(__wt_scr_alloc(session, 0, &as));
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ for (;;) {
+ /* Get the next block address from the block manager. */
+ WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof));
+ if (eof)
+ break;
+
+ /* Report progress every 10 chunks. */
+ if (++ss->fcnt % 10 == 0)
+ WT_ERR(__wt_progress(session, NULL, ss->fcnt));
+
+ /*
+ * Read (and potentially decompress) the block; the underlying
+ * block manager might return only good blocks if checksums are
+ * configured, or both good and bad blocks if we're relying on
+ * compression.
+ *
+ * Report the block's status to the block manager.
+ */
+ if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0)
+ valid = 1;
+ else {
+ valid = 0;
+ if (ret == WT_ERROR)
+ ret = 0;
+ WT_ERR(ret);
+ }
+ WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid));
+ if (!valid)
+ continue;
+
+ /* Create a printable version of the address. */
+ WT_ERR(bm->addr_string(bm, session, as, addr, addr_size));
+
+ /*
+ * Make sure it's an expected page type for the file.
+ *
+ * We only care about leaf and overflow pages from here on out;
+ * discard all of the others. We put them on the free list now,
+ * because we might as well overwrite them: we want the file to
+ * grow as little as possible (or shrink), and future salvage
+ * calls don't need them either.
+ */
+ dsk = buf->data;
+ switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page ignored %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ /*
+ * Verify the page. It's unlikely a page could have a valid
+ * checksum and still be broken, but paranoia is healthy in
+ * salvage. Regardless, verify does return failure because it
+ * detects failures we'd expect to see in a corrupted file, like
+ * overflow references past the end of the file or overflow
+ * references to non-existent pages; we might as well discard
+ * these pages now.
+ */
+ if (__wt_verify_dsk(session, as->data, buf) != 0) {
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page failed verify %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "tracking %s page, generation %" PRIu64 " %s",
+ __wt_page_type_string(dsk->type), dsk->write_gen,
+ (const char *)as->data));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (ss->page_type == WT_PAGE_INVALID)
+ ss->page_type = dsk->type;
+ if (ss->page_type != dsk->type)
+ WT_ERR_MSG(session, WT_ERROR,
+ "file contains multiple file formats (both "
+ "%s and %s), and cannot be salvaged",
+ __wt_page_type_string(ss->page_type),
+ __wt_page_type_string(dsk->type));
+
+ WT_ERR(__slvg_trk_leaf(
+ session, dsk, addr, addr_size, ss));
+ break;
+ case WT_PAGE_OVFL:
+ WT_ERR(__slvg_trk_ovfl(
+ session, dsk, addr, addr_size, ss));
+ break;
+ }
+ }
+
+err: __wt_scr_free(&as);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __slvg_trk_init --
+ * Initialize tracking information for a page.
+ */
+static int
+__slvg_trk_init(WT_SESSION_IMPL *session,
+ uint8_t *addr, size_t addr_size,
+ uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp)
+{
+ WT_DECL_RET;
+ WT_TRACK *trk;
+
+ WT_RET(__wt_calloc_def(session, 1, &trk));
+ WT_ERR(__wt_calloc_def(session, 1, &trk->shared));
+ trk->shared->ref = 1;
+
+ trk->ss = ss;
+ WT_ERR(__wt_strndup(session, addr, addr_size, &trk->trk_addr));
+ trk->trk_addr_size = (uint8_t)addr_size;
+ trk->trk_size = size;
+ trk->trk_gen = gen;
+
+ *retp = trk;
+ return (0);
+
+err: __wt_free(session, trk->trk_addr);
+ __wt_free(session, trk->shared);
+ __wt_free(session, trk);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_split --
+ * Split a tracked chunk.
+ */
+static int
+__slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp)
+{
+ WT_TRACK *trk;
+
+ WT_RET(__wt_calloc_def(session, 1, &trk));
+
+ trk->shared = orig->shared;
+ trk->ss = orig->ss;
+
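+ /* Both the original and the new chunk now reference the shared info. */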
+ ++orig->shared->ref;
+
+ *newp = trk;
+ return (0);
+}
+
+/*
+ * __slvg_trk_leaf --
+ * Track a leaf page.
+ */
+static int
+__slvg_trk_leaf(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_TRACK *trk;
+ uint64_t stop_recno;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ page = NULL;
+ trk = NULL;
+
+ /* Re-allocate the array of pages, as necessary. */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+
+ /* Allocate a WT_TRACK entry for this new page and fill it in. */
+ WT_RET(__slvg_trk_init(
+ session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store fixed-size format: start and stop keys can be
+ * taken from the block's header, and the page doesn't contain
+ * overflow items.
+ */
+ trk->col_start = dsk->recno;
+ trk->col_stop = dsk->recno + (dsk->u.entries - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store variable-length format: the start key can be
+ * taken from the block's header; the stop key requires walking
+ * the page.
+ */
+ stop_recno = dsk->recno;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ stop_recno += __wt_cell_rle(unpack);
+ }
+
+ trk->col_start = dsk->recno;
+ trk->col_stop = stop_recno - 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+
+ /* Column-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store format: copy the first and last keys on the page.
+ * Keys are prefix-compressed; the simplest and slowest thing
+ * to do is instantiate the in-memory page, then instantiate
+ * and copy the full keys, then free the page. We do this
+ * on every leaf page, and if you need to speed up the salvage,
+ * it's probably a great place to start.
+ */
+ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page));
+ WT_ERR(__wt_row_leaf_key_copy(session,
+ page, &page->pg_row_d[0], &trk->row_start));
+ WT_ERR(__wt_row_leaf_key_copy(session, page,
+ &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop));
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_start.data, trk->row_start.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s start key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_stop.data, trk->row_stop.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s stop key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ }
+
+ /* Row-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ }
+ ss->pages[ss->pages_next++] = trk;
+
+ if (0) {
+err: __wt_free(session, trk);
+ }
+ if (page != NULL)
+ __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_ovfl --
+ * Track an overflow page.
+ */
+static int
+__slvg_trk_ovfl(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+
+ /*
+ * Reallocate the overflow page array as necessary, then save the
+ * page's location information.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl));
+
+ WT_RET(__slvg_trk_init(
+ session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
+ ss->ovfl[ss->ovfl_next++] = trk;
+
+ return (0);
+}
+
+/*
+ * __slvg_trk_leaf_ovfl --
+ * Search a leaf page for overflow items.
+ */
+static int
+__slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i, ovfl_cnt;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+
+ /*
+ * Two passes: count the overflow items, then copy them into an
+ * allocated array.
+ */
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl)
+ ++ovfl_cnt;
+ }
+ if (ovfl_cnt == 0)
+ return (0);
+
+ /* Allocate room for the array of overflow addresses and fill it in. */
+ WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr));
+ trk->trk_ovfl_cnt = ovfl_cnt;
+
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl) {
+ WT_RET(__wt_strndup(session, unpack->data,
+ unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr));
+ trk->trk_ovfl_addr[ovfl_cnt].size =
+ (uint8_t)unpack->size;
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s overflow reference %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ __wt_addr_string(session,
+ unpack->data, unpack->size, trk->ss->tmp2)));
+
+ if (++ovfl_cnt == trk->trk_ovfl_cnt)
+ break;
+ }
+ }
+
+ return (0);
+}
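The two-pass shape above -- count the overflow cells, allocate an exactly-sized array, then walk the cells again to fill it -- avoids growing the array incrementally. A minimal standalone sketch of the same pattern on plain integers follows; every name in it is invented for illustration, none of it is WiredTiger code.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Collect the even values from a list: count first, then fill. */
static int
collect_even(const int *src, size_t n, int **outp, size_t *cntp)
{
	size_t cnt, i;
	int *out;

	*outp = NULL;
	*cntp = 0;

	/* Pass 1: count the matching items. */
	for (i = cnt = 0; i < n; ++i)
		if (src[i] % 2 == 0)
			++cnt;
	if (cnt == 0)
		return (0);

	/* Allocate exactly the room required. */
	if ((out = malloc(cnt * sizeof(int))) == NULL)
		return (ENOMEM);

	/* Pass 2: copy the matching items, stopping at the count. */
	for (i = cnt = 0; i < n; ++i)
		if (src[i] % 2 == 0)
			out[cnt++] = src[i];

	*outp = out;
	*cntp = cnt;
	return (0);
}

int
main(void)
{
	int src[] = { 1, 2, 3, 4, 6 };
	int *out;
	size_t cnt, i;

	if (collect_even(src, 5, &out, &cnt) == 0) {
		for (i = 0; i < cnt; ++i)
			printf("%d\n", out[i]);
		free(out);
	}
	return (0);
}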
+
+/*
+ * __slvg_col_range --
+ * Figure out the leaf pages we need and free the leaf pages we don't.
+ *
+ * When pages split, the key range is split across multiple pages. If not all
+ * of the old versions of the page are overwritten, or not all of the new pages
+ * are written, or some of the pages are corrupted, salvage will read different
+ * pages with overlapping key ranges, at different LSNs.
+ *
+ * We salvage all of the key ranges we find, at the latest LSN value: this means
+ * we may resurrect pages of deleted items, as page deletion doesn't write leaf
+ * pages and salvage will read and instantiate the contents of an old version of
+ * the deleted page.
+ *
+ * The leaf page array is sorted in key order, and secondarily on LSN: what this
+ * means is that for each new key range, the first page we find is the best page
+ * for that key. The process is to walk forward from each page until we reach
+ * a page with a starting key after the current page's stopping key.
+ *
+ * For each page, check to see if it overlaps the current page's key range.
+ * If it does, resolve the overlap. Because WiredTiger rarely splits pages,
+ * overlap resolution usually means discarding a page because the key ranges
+ * are the same, and one of the pages is simply an old version of the other.
+ *
+ * However, it's possible more complex resolution is necessary. For example,
+ * here's an improbably complex list of page ranges and LSNs:
+ *
+ * Page Range LSN
+ * 30 A-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * We walk forward from each page reviewing all other pages in the array that
+ * overlap the range. For each overlap, the current or the overlapping
+ * page is updated so the page with the most recent information for any range
+ * "owns" that range. Here's an example for page 30.
+ *
+ * Review page 31: because page 31 has the range C-D and a higher LSN than page
+ * 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D
+ * range to page 31. The new track element would be inserted into the array
+ * with the following result:
+ *
+ * Page Range LSN
+ * 30 A-C 3 << Changed WT_TRACK element
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 30 E-G 3 << New WT_TRACK element
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * Continue the review of the first element, using its new values.
+ *
+ * Review page 32: because page 32 has the range B-C and a higher LSN than page
+ * 30, page 30's A-C range would be truncated, conceding the B-C range to page
+ * 32.
+ * 30 A-B 3
+ * E-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ *
+ * Review page 33: because page 33 has a starting key (C) past page 30's ending
+ * key (B), we stop evaluating page 30's A-B range, as there can be no further
+ * overlaps.
+ *
+ * This process is repeated for each page in the array.
+ *
+ * When page 33 is processed, we'd discover that page 33's C-F range overlaps
+ * page 30's E-G range, and page 30's E-G range would be updated, conceding the
+ * E-F range to page 33.
+ *
+ * This is not computationally expensive: because the leaf array is sorted by
+ * starting key we never walk far forward, and because WiredTiger splits are
+ * rare, the chance of finding the kind of range overlap that requires
+ * re-sorting the array is small.
+ */
+static int
+__slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ uint32_t i, j;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ if (ss->pages[j]->col_start >
+ ss->pages[i]->col_stop)
+ break;
+
+ /* There's an overlap, fix it up. */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_col_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
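The nested scan above looks quadratic but is cheap in practice: the array is sorted by start key, so the inner loop breaks as soon as an entry starts past the current entry's stop. Here's a hedged sketch of the same early-terminating scan over integer ranges -- detection only, with invented types, not WiredTiger code; the ranges mirror the table in the comment above.

#include <stdio.h>

struct range { unsigned long start, stop; };

/* The ranges must be sorted by start; report each overlapping pair. */
static void
report_overlaps(const struct range *r, size_t n)
{
	size_t i, j;

	for (i = 0; i < n; ++i)
		for (j = i + 1; j < n; ++j) {
			if (r[j].start > r[i].stop)
				break;	/* no later entry can overlap */
			printf("ranges %zu and %zu overlap\n", i, j);
		}
}

int
main(void)
{
	struct range r[] = {	/* A-G, B-C, C-D, C-D, C-F, F-M, H-O */
		{ 'A', 'G' }, { 'B', 'C' }, { 'C', 'D' },
		{ 'C', 'D' }, { 'C', 'F' }, { 'F', 'M' }, { 'H', 'O' }
	};

	report_overlaps(r, sizeof(r) / sizeof(r[0]));
	return (0);
}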
+
+/*
+ * __slvg_col_range_overlap --
+ * Two column-store key ranges overlap, deal with it.
+ */
+static int
+__slvg_col_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB pages are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+ /* Case #2/8, #10, #11 */
+ if (a_trk->col_start > b_trk->col_start)
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+		 * is more desirable, than b_trk. In cases #1, #4 and #9, where
+		 * the start of the range is the same for the two pages, this
+		 * simplifies things: it guarantees a_trk has a higher LSN than
+		 * b_trk.
+ */
+ if (a_trk->col_stop >= b_trk->col_stop)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+			 * key range from b_trk.
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+			 * key range from a_trk.
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+ * Third, set its start key to be the first key after the stop key of
+ * the middle chunk (that's b_trk), and its stop key to be the stop key
+ * of the original chunk, and call __slvg_col_trk_update_start. That
+ * function will re-sort the WT_TRACK array as necessary to move our
+ * new entry into the right sorted location.
+ */
+ new->col_start = b_trk->col_stop + 1;
+ new->col_stop = a_trk->col_stop;
+ __slvg_col_trk_update_start(a_slot + 1, ss);
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
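Because the sort guarantees a_trk never starts after b_trk, the eleven diagrammed cases collapse into the four comparisons above. A small sketch of that dispatch on integer record ranges -- the case labels follow the comment, everything else is invented for illustration:

#include <assert.h>
#include <stdio.h>

/*
 * Classify an overlap between ranges A and B, where the caller
 * guarantees A starts no later than B and the two ranges overlap.
 */
static const char *
overlap_case(unsigned long a_start, unsigned long a_stop,
    unsigned long b_start, unsigned long b_stop)
{
	assert(a_start <= b_start && b_start <= a_stop);

	if (a_start == b_start)			/* Cases #1, #4, #9 */
		return (a_stop >= b_stop ?
		    "#1/#4: A is a superset" : "#9: B is a superset");
	if (a_stop == b_stop)			/* Case #6 */
		return ("#6: B is a suffix of A");
	if (a_stop < b_stop)			/* Cases #3/#7 */
		return ("#3/#7: B overlaps A's end");
	return ("#5: B is in the middle of A");	/* Case #5 */
}

int
main(void)
{
	printf("%s\n", overlap_case(30, 50, 35, 40));	/* prints case #5 */
	return (0);
}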
+
+/*
+ * __slvg_col_trk_update_start --
+ * Update a column-store page's start key after an overlap.
+ */
+static void
+__slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ trk = ss->pages[slot];
+
+ /*
+ * If we deleted an initial piece of the WT_TRACK name space, it may no
+ * longer be in the right location.
+ *
+	 * For example, imagine page #1 has the key range 30-50, it split, and
+	 * we wrote page #2 with key range 30-40 and page #3 with key range
+	 * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the
+	 * key ranges were sorted, pages #2 and #1 came first (because their
+	 * start keys are earlier than page #3's), and page #2 came before page
+	 * #1 because of its higher LSN. When we resolve the overlap between
+	 * page #2 and page #1, we truncate the initial key range of page #1,
+	 * and it now sorts after page #3, because it has the same starting key
+	 * of 40 and a lower LSN.
+ *
+ * We have already updated b_trk's start key; what we may have to do is
+ * re-sort some number of elements in the list.
+ */
+ for (i = slot + 1; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+ if (ss->pages[i]->col_start > trk->col_stop)
+ break;
+ }
+ i -= slot;
+ if (i > 1)
+ qsort(ss->pages + slot, (size_t)i,
+ sizeof(WT_TRACK *), __slvg_trk_compare_key);
+}
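Only a bounded window of the array can be out of order after a start key grows: the slots from the adjusted entry up to the first entry starting past its stop. The function therefore re-sorts just that window. A hedged standalone sketch of the same window re-sort, with invented types:

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long start, stop; };

static int
cmp_range_start(const void *a, const void *b)
{
	const struct range *x = a, *y = b;

	return (x->start < y->start ? -1 : (x->start > y->start ? 1 : 0));
}

int
main(void)
{
	/*
	 * Slot 0's start key was just advanced from 5 to 41 after an
	 * overlap was resolved, so it may belong later in the array.
	 */
	struct range r[] = {
		{ 41, 45 }, { 10, 20 }, { 30, 40 }, { 44, 50 }, { 60, 70 }
	};
	size_t i, n = sizeof(r) / sizeof(r[0]);

	/* Walk forward until an entry starts past the adjusted stop. */
	for (i = 1; i < n; ++i)
		if (r[i].start > r[0].stop)
			break;

	/* Re-sort only the window that can be out of order. */
	if (i > 1)
		qsort(r, i, sizeof(r[0]), cmp_range_start);

	for (i = 0; i < n; ++i)
		printf("%lu-%lu\n", r[i].start, r[i].stop);
	return (0);
}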
+
+/*
+ * __slvg_col_range_missing --
+ * Detect missing ranges from column-store files.
+ */
+static int
+__slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint64_t r;
+ uint32_t i;
+
+ for (i = 0, r = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+ if (trk->col_start != r + 1) {
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s column-store missing range from %"
+ PRIu64 " to %" PRIu64 " inclusive",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ r + 1, trk->col_start - 1));
+
+ /*
+ * We need to instantiate deleted items for the missing
+ * record range.
+ */
+ trk->col_missing = r + 1;
+ F_SET(trk, WT_TRACK_MERGE);
+ }
+ r = trk->col_stop;
+ }
+ return (0);
+}
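Column-store records are numbered contiguously from 1, and by this point the page array is sorted and non-overlapping, so a missing range shows up whenever a start record isn't exactly one past the previous stop. A minimal sketch of the same gap check (illustrative only, not WiredTiger code):

#include <stdio.h>

struct range { unsigned long start, stop; };

/* Ranges are sorted and non-overlapping; records are numbered from 1. */
static void
report_gaps(const struct range *r, size_t n)
{
	unsigned long last_stop;
	size_t i;

	for (i = 0, last_stop = 0; i < n; ++i) {
		if (r[i].start != last_stop + 1)
			printf("missing records %lu-%lu\n",
			    last_stop + 1, r[i].start - 1);
		last_stop = r[i].stop;
	}
}

int
main(void)
{
	struct range r[] = { { 1, 10 }, { 14, 20 }, { 21, 30 } };

	report_gaps(r, sizeof(r) / sizeof(r[0]));	/* missing 11-13 */
	return (0);
}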
+
+/*
+ * __slvg_modify_init --
+ * Initialize a salvage page's modification information.
+ */
+static int
+__slvg_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * __slvg_col_build_internal --
+ * Build a column-store in-memory page that references all of the leaf
+ * pages we've found.
+ */
+static int
+__slvg_col_build_internal(
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ WT_TRACK *trk;
+ uint32_t i;
+
+ addr = NULL;
+
+ /* Allocate a column-store root (internal) page and fill it in. */
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page));
+ WT_ERR(__slvg_modify_init(session, page));
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+
+ ref = *refp++;
+ ref->home = page;
+ ref->page = NULL;
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_strndup(
+ session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+ addr->size = trk->trk_addr_size;
+ addr->type =
+ trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+ ref->addr = addr;
+ addr = NULL;
+
+ ref->key.recno = trk->col_start;
+ ref->state = WT_REF_DISK;
+
+ /*
+ * If the page's key range is unmodified from when we read it
+ * (in other words, we didn't merge part of this page with
+ * another page), we can use the page without change, and the
+ * only thing we need to do is mark all overflow records the
+ * page references as in-use.
+ *
+ * If we did merge with another page, we have to build a page
+ * reflecting the updated key range. Note, that requires an
+ * additional pass to free the merge page's backing blocks.
+ */
+ if (F_ISSET(trk, WT_TRACK_MERGE)) {
+ ss->merge_free = 1;
+
+ WT_ERR(__slvg_col_build_leaf(session, trk, ref));
+ } else
+ WT_ERR(__slvg_ovfl_ref_all(session, trk));
+ ++ref;
+ }
+
+ __wt_root_ref_init(&ss->root_ref, page, 1);
+
+ if (0) {
+err: if (addr != NULL)
+ __wt_free(session, addr);
+ __wt_page_out(session, &page);
+ }
+ return (ret);
+}
+
+/*
+ * __slvg_col_build_leaf --
+ * Build a column-store leaf page for a merged page.
+ */
+static int
+__slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
+{
+ WT_COL *save_col_var;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint64_t skip, take;
+ uint32_t *entriesp, save_entries;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_RET(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ entriesp = page->type == WT_PAGE_COL_VAR ?
+ &page->pg_var_entries : &page->pg_fix_entries;
+
+ save_col_var = page->pg_var_d;
+ save_entries = *entriesp;
+
+ /*
+ * Calculate the number of K/V entries we are going to skip, and
+ * the total number of K/V entries we'll take from this page.
+ */
+ cookie->skip = skip = trk->col_start - page->pg_var_recno;
+ cookie->take = take = (trk->col_stop - trk->col_start) + 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding first %" PRIu64 " records, "
+ "then taking %" PRIu64 " records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ skip, take));
+
+ /* Set the referenced flag on overflow pages we're using. */
+ if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take));
+
+ /*
+ * If we're missing some part of the range, the real start range is in
+ * trk->col_missing, else, it's in trk->col_start. Update the parent's
+ * reference as well as the page itself.
+ */
+ if (trk->col_missing == 0)
+ page->pg_var_recno = trk->col_start;
+ else {
+ page->pg_var_recno = trk->col_missing;
+ cookie->missing = trk->col_start - trk->col_missing;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge inserting %" PRIu64 " missing records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ cookie->missing));
+ }
+ ref->key.recno = page->pg_var_recno;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /* Reset the page. */
+ page->pg_var_d = save_col_var;
+ *entriesp = save_entries;
+
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __slvg_col_ovfl_single --
+ * Find a single overflow record in the merge page's list, and mark it as
+ * referenced.
+ */
+static int
+__slvg_col_ovfl_single(
+ WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack)
+{
+ WT_TRACK *ovfl;
+ uint32_t i;
+
+ /*
+ * Search the list of overflow records for this page -- we should find
+ * exactly one match, and we mark it as referenced.
+ */
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i) {
+ ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]];
+ if (unpack->size == ovfl->trk_addr_size &&
+ memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0)
+ return (__slvg_ovfl_ref(session, ovfl, 0));
+ }
+
+ WT_PANIC_RET(session,
+ EINVAL, "overflow record at column-store page merge not found");
+}
+
+/*
+ * __slvg_col_ovfl --
+ * Mark overflow items referenced by the merged page.
+ */
+static int
+__slvg_col_ovfl(WT_SESSION_IMPL *session,
+ WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take)
+{
+ WT_CELL_UNPACK unpack;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_DECL_RET;
+ uint64_t recno, start, stop;
+ uint32_t i;
+
+ /*
+ * Merging a variable-length column-store page, and we took some number
+ * of records, figure out which (if any) overflow records we used.
+ */
+ recno = page->pg_var_recno;
+ start = recno + skip;
+ stop = (recno + skip + take) - 1;
+
+ WT_COL_FOREACH(page, cip, i) {
+ cell = WT_COL_PTR(page, cip);
+ __wt_cell_unpack(cell, &unpack);
+ recno += __wt_cell_rle(&unpack);
+
+		/*
+		 * I keep getting this calculation wrong, so here's the logic.
+		 * Start is the first record we want, stop is the last record
+		 * we want. The record number has already been incremented one
+		 * past the maximum record number for this page entry, that is,
+		 * it's set to the first record number for the next page entry.
+		 * The test against start is greater-than (not greater-than-or-
+		 * equal) because of that increment: if the record number
+		 * equals start, this entry ended just before start, and we
+		 * want the next entry. The test against stop is greater-than
+		 * (not greater-than-or-equal) because stop is the last record
+		 * wanted: if the record number equals stop, the next entry
+		 * still begins at a wanted record.
+		 */
+ if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) {
+ ret = __slvg_col_ovfl_single(session, trk, &unpack);
+
+ /*
+ * When handling overlapping ranges on variable-length
+ * column-store leaf pages, we split ranges without
+ * considering if we were splitting RLE units. (See
+ * note at the beginning of this file for explanation
+ * of the overall process.) If the RLE unit was on-page,
+ * we can simply write it again. If the RLE unit was an
+ * overflow value that's already been used by another
+ * row (from some other page created by a range split),
+ * there's not much to do, this row can't reference an
+ * overflow record we don't have: delete the row.
+ */
+ if (ret == EBUSY) {
+ __wt_cell_type_reset(session,
+ cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL);
+ ret = 0;
+ }
+ WT_RET(ret);
+ }
+ if (recno > stop)
+ break;
+ }
+ return (0);
+}
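The comment in the loop above deserves a concrete check: with RLE entries covering records 1-10 and 11-20, a merge taking records 10 through 11 must visit both entries, and the strict greater-than tests do exactly that. A hedged sketch of the same increment-then-test logic on plain integers (none of this is WiredTiger code):

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/* Two RLE entries: records 1-10 and 11-20. */
	unsigned long rle[] = { 10, 10 };
	unsigned long recno, start, stop;
	int visited[] = { 0, 0 };
	size_t i;

	start = 10;			/* first record wanted */
	stop = 11;			/* last record wanted */

	for (recno = 1, i = 0; i < 2; ++i) {
		recno += rle[i];	/* first recno of the next entry */

		/* recno > start: this entry's last record is wanted. */
		if (recno > start)
			visited[i] = 1;

		/* recno > stop: the next entry starts past the range. */
		if (recno > stop)
			break;
	}

	assert(visited[0] && visited[1]);
	return (0);
}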
+
+/*
+ * __slvg_row_range --
+ * Figure out the leaf pages we need and discard everything else. At the
+ * same time, tag the overflow pages they reference.
+ */
+static int
+__slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ WT_BTREE *btree;
+ uint32_t i, j;
+ int cmp;
+
+ btree = S2BT(session);
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ WT_RET(__wt_compare(session, btree->collator,
+ &ss->pages[j]->row_start, &ss->pages[i]->row_stop,
+ &cmp));
+ if (cmp > 0)
+ break;
+
+ /* There's an overlap, fix it up. */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_row_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __slvg_row_range_overlap --
+ * Two row-store key ranges overlap, deal with it.
+ */
+static int
+__slvg_row_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+ int start_cmp, stop_cmp;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ btree = S2BT(session);
+
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB pages are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+#define A_TRK_START (&a_trk->row_start)
+#define A_TRK_STOP (&a_trk->row_stop)
+#define B_TRK_START (&b_trk->row_start)
+#define B_TRK_STOP (&b_trk->row_stop)
+#define SLOT_START(i) (&ss->pages[i]->row_start)
+#define __slvg_key_copy(session, dst, src) \
+ __wt_buf_set(session, dst, (src)->data, (src)->size)
+
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp));
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp));
+
+ if (start_cmp > 0) /* Case #2/8, #10, #11 */
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (start_cmp == 0) { /* Case #1, #4, #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+		 * is more desirable, than b_trk. In cases #1, #4 and #9, where
+		 * the start of the range is the same for the two pages, this
+		 * simplifies things: it guarantees a_trk has a higher LSN than
+		 * b_trk.
+ */
+ if (stop_cmp >= 0)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp == 0) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp < 0) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+			 * key range from b_trk.
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+			 * key range from a_trk.
+ */
+ WT_RET(__slvg_key_copy(
+ session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+	 * Third, set its stop key to be the stop key of the original chunk,
+ * and call __slvg_row_trk_update_start. That function will both set
+ * the start key to be the first key after the stop key of the middle
+ * chunk (that's b_trk), and re-sort the WT_TRACK array as necessary to
+ * move our new entry into the right sorted location.
+ */
+ WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP));
+ WT_RET(
+ __slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss));
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(new, WT_TRACK_CHECK_START);
+ F_SET(a_trk, WT_TRACK_CHECK_STOP);
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
+
+/*
+ * __slvg_row_trk_update_start --
+ * Update a row-store page's start key after an overlap.
+ */
+static int
+__slvg_row_trk_update_start(
+ WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(dsk);
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_TRACK *trk;
+ uint32_t i;
+ int cmp, found;
+
+ btree = S2BT(session);
+ page = NULL;
+ found = 0;
+
+ trk = ss->pages[slot];
+
+ /*
+ * If we deleted an initial piece of the WT_TRACK name space, it may no
+ * longer be in the right location.
+ *
+	 * For example, imagine page #1 has the key range 30-50, it split, and
+	 * we wrote page #2 with key range 30-40 and page #3 with key range
+	 * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the
+	 * key ranges were sorted, pages #2 and #1 came first (because their
+	 * start keys are earlier than page #3's), and page #2 came before page
+	 * #1 because of its higher LSN. When we resolve the overlap between
+	 * page #2 and page #1, we truncate the initial key range of page #1,
+	 * and it now sorts after page #3, because it has the same starting key
+	 * of 40 and a lower LSN.
+ *
+ * First, update the WT_TRACK start key based on the specified stop key.
+ *
+	 * Read and instantiate the WT_TRACK page (we don't have to verify the
+	 * page, nor do we have to be quiet on error; we've already read this
+	 * page successfully).
+ */
+ WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk));
+ WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, &page));
+
+ /*
+ * Walk the page, looking for a key sorting greater than the specified
+ * stop key -- that's our new start key.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp));
+ if (cmp > 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ /*
+	 * We know that at least one key on the page sorts after the specified
+	 * stop key; otherwise the page would have entirely overlapped, we
+	 * would have discarded it, and we wouldn't be here. Therefore, this
+	 * test is safe. (But it never hurts to check.)
+ */
+ WT_ERR_TEST(!found, WT_ERROR);
+ WT_ERR(__slvg_key_copy(session, &trk->row_start, key));
+
+ /*
+ * We may need to re-sort some number of elements in the list. Walk
+ * forward in the list until reaching an entry which cannot overlap
+ * the adjusted entry. If it's more than a single slot, re-sort the
+ * entries.
+ */
+ for (i = slot + 1; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+ WT_ERR(__wt_compare(session,
+ btree->collator, SLOT_START(i), &trk->row_stop, &cmp));
+ if (cmp > 0)
+ break;
+ }
+ i -= slot;
+ if (i > 1)
+ qsort(ss->pages + slot, (size_t)i,
+ sizeof(WT_TRACK *), __slvg_trk_compare_key);
+
+err: if (page != NULL)
+ __wt_page_out(session, &page);
+ __wt_scr_free(&dsk);
+ __wt_scr_free(&key);
+
+ return (ret);
+}
+
+/*
+ * __slvg_row_build_internal --
+ * Build a row-store in-memory page that references all of the leaf
+ * pages we've found.
+ */
+static int
+__slvg_row_build_internal(
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ WT_TRACK *trk;
+ uint32_t i;
+
+ addr = NULL;
+
+ /* Allocate a row-store root (internal) page and fill it in. */
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page));
+ WT_ERR(__slvg_modify_init(session, page));
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+
+ ref = *refp++;
+ ref->home = page;
+ ref->page = NULL;
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_strndup(
+ session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+ addr->size = trk->trk_addr_size;
+ addr->type =
+ trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+ ref->addr = addr;
+ addr = NULL;
+
+ __wt_ref_key_clear(ref);
+ ref->state = WT_REF_DISK;
+
+ /*
+ * If the page's key range is unmodified from when we read it
+ * (in other words, we didn't merge part of this page with
+ * another page), we can use the page without change, and the
+ * only thing we need to do is mark all overflow records the
+ * page references as in-use.
+ *
+ * If we did merge with another page, we have to build a page
+ * reflecting the updated key range. Note, that requires an
+ * additional pass to free the merge page's backing blocks.
+ */
+ if (F_ISSET(trk, WT_TRACK_MERGE)) {
+ ss->merge_free = 1;
+
+ WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss));
+ } else {
+ WT_ERR(__wt_row_ikey_incr(session, page, 0,
+ trk->row_start.data, trk->row_start.size,
+ &ref->key.ikey));
+
+ WT_ERR(__slvg_ovfl_ref_all(session, trk));
+ }
+ ++ref;
+ }
+
+ __wt_root_ref_init(&ss->root_ref, page, 0);
+
+ if (0) {
+err: if (addr != NULL)
+ __wt_free(session, addr);
+ __wt_page_out(session, &page);
+ }
+ return (ret);
+}
+
+/*
+ * __slvg_row_build_leaf --
+ * Build a row-store leaf page for a merged page.
+ */
+static int
+__slvg_row_build_leaf(
+ WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint32_t i, skip_start, skip_stop;
+ int cmp;
+
+ btree = S2BT(session);
+ page = NULL;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Allocate temporary space in which to instantiate the keys. */
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_ERR(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ /*
+ * Figure out how many page keys we want to take and how many we want
+ * to skip.
+ *
+ * If checking the starting range key, the key we're searching for will
+ * be equal to the starting range key. This is because we figured out
+ * the true merged-page start key as part of discarding initial keys
+ * from the page (see the __slvg_row_range_overlap function, and its
+ * calls to __slvg_row_trk_update_start for more information).
+ *
+ * If checking the stopping range key, we want the keys on the page that
+ * are less-than the stopping range key. This is because we copied a
+ * key from another page to define this page's stop range: that page is
+ * the page that owns the "equal to" range space.
+ */
+ skip_start = skip_stop = 0;
+ if (F_ISSET(trk, WT_TRACK_CHECK_START))
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * >= is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_start, &cmp));
+ if (cmp >= 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding leading key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_start;
+ }
+ if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
+ WT_ROW_FOREACH_REVERSE(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * < is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_stop, &cmp));
+ if (cmp < 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding trailing key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_stop;
+ }
+
+ /* We should have selected some entries, but not the entire page. */
+ WT_ASSERT(session,
+ skip_start + skip_stop > 0 &&
+ skip_start + skip_stop < page->pg_row_entries);
+
+ /*
+ * Take a copy of this page's first key to define the start of
+	 * Take a copy of this page's first key to define the start of
+	 * its range. The key may require processing; otherwise, it's
+ */
+ rip = page->pg_row_d + skip_start;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_row_ikey_incr(session,
+ ref->home, 0, key->data, key->size, &ref->key.ikey));
+
+ /* Set the referenced flag on overflow pages we're using. */
+ if (trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_row_ovfl(session,
+ trk, page, skip_start, page->pg_row_entries - skip_stop));
+
+ /*
+ * Change the page to reflect the correct record count: there is no
+ * need to copy anything on the page itself, the entries value limits
+ * the number of page items.
+ */
+ page->pg_row_entries -= skip_stop;
+ cookie->skip = skip_start;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /* Reset the page. */
+ page->pg_row_entries += skip_stop;
+
+ /*
+ * Discard our hazard pointer and evict the page, updating the
+ * parent's reference.
+ */
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ __wt_scr_free(&key);
+
+ return (ret);
+}
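The asymmetric comparisons above (>= for the start key, < for the stop key) encode ownership: the merged page owns the key equal to its start, while the key equal to its stop belongs to the neighboring page. A minimal sketch of the same trimming on a sorted string array, using strcmp in place of the btree collator (illustrative only):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Page keys in sorted order. */
	const char *keys[] = { "a", "c", "e", "g", "i" };
	size_t i, n = sizeof(keys) / sizeof(keys[0]);
	size_t skip_start, skip_stop;
	const char *start = "c", *stop = "g";	/* merged range [c, g) */

	/* >= is correct: the start key itself belongs to this page. */
	for (skip_start = 0; skip_start < n; ++skip_start)
		if (strcmp(keys[skip_start], start) >= 0)
			break;

	/* < is correct: the stop key belongs to the neighboring page. */
	for (skip_stop = 0; skip_stop < n; ++skip_stop)
		if (strcmp(keys[n - 1 - skip_stop], stop) < 0)
			break;

	for (i = skip_start; i < n - skip_stop; ++i)
		printf("%s\n", keys[i]);	/* prints c and e */
	return (0);
}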
+
+/*
+ * __slvg_row_ovfl_single --
+ * Find a single overflow record in the merge page's list, and mark it as
+ * referenced.
+ */
+static int
+__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell)
+{
+ WT_CELL_UNPACK unpack;
+ WT_TRACK *ovfl;
+ uint32_t i;
+
+ /* Unpack the cell, and check if it's an overflow record. */
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type != WT_CELL_KEY_OVFL &&
+ unpack.type != WT_CELL_VALUE_OVFL)
+ return (0);
+
+ /*
+ * Search the list of overflow records for this page -- we should find
+ * exactly one match, and we mark it as referenced.
+ */
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i) {
+ ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]];
+ if (unpack.size == ovfl->trk_addr_size &&
+ memcmp(unpack.data, ovfl->trk_addr, unpack.size) == 0)
+ return (__slvg_ovfl_ref(session, ovfl, 1));
+ }
+
+ WT_PANIC_RET(session,
+ EINVAL, "overflow record at row-store page merge not found");
+}
+
+/*
+ * __slvg_row_ovfl --
+ * Mark overflow items referenced by the merged page.
+ */
+static int
+__slvg_row_ovfl(WT_SESSION_IMPL *session,
+ WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop)
+{
+ WT_CELL *cell;
+ WT_ROW *rip;
+ void *copy;
+
+ /*
+ * We're merging a row-store page, and we took some number of records,
+ * figure out which (if any) overflow records we used.
+ */
+ for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) {
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, NULL, &cell, NULL, NULL);
+ if (cell != NULL)
+ WT_RET(__slvg_row_ovfl_single(session, trk, cell));
+ cell = __wt_row_leaf_value_cell(page, rip, NULL);
+ if (cell != NULL)
+ WT_RET(__slvg_row_ovfl_single(session, trk, cell));
+ }
+ return (0);
+}
+
+/*
+ * __slvg_trk_compare_addr --
+ * Compare two WT_TRACK array entries by address cookie.
+ */
+static int
+__slvg_trk_compare_addr(const void *a, const void *b)
+{
+ WT_DECL_RET;
+ WT_TRACK *a_trk, *b_trk;
+ size_t len;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ /*
+ * We don't care about the order because these are opaque cookies --
+ * we're just sorting them so we can binary search instead of linear
+ * search.
+ */
+ len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size);
+ ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len);
+ if (ret == 0)
+ ret = a_trk->trk_addr_size > b_trk->trk_addr_size ? -1 : 1;
+ return (ret);
+}
+
+/*
+ * __slvg_ovfl_compare --
+ * Bsearch comparison routine for the overflow array.
+ */
+static int
+__slvg_ovfl_compare(const void *a, const void *b)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_TRACK *trk;
+ size_t len;
+
+ addr = (WT_ADDR *)a;
+ trk = *(WT_TRACK **)b;
+
+ len = WT_MIN(trk->trk_addr_size, addr->size);
+ ret = memcmp(addr->addr, trk->trk_addr, len);
+ if (ret == 0 && addr->size != trk->trk_addr_size)
+ ret = addr->size < trk->trk_addr_size ? -1 : 1;
+ return (ret);
+}
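Note the comparator asymmetry: the qsort comparator receives two array elements, while the bsearch comparator receives the search key as its first argument and an array element as its second. A hedged sketch of the same sort-then-binary-search shape on string keys (illustrative types, not the WiredTiger ones):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item { const char *key; };

/* qsort comparator: both arguments are struct item elements. */
static int
cmp_items(const void *a, const void *b)
{
	return (strcmp(((const struct item *)a)->key,
	    ((const struct item *)b)->key));
}

/* bsearch comparator: the key is a string, the element a struct item. */
static int
cmp_key_item(const void *key, const void *elem)
{
	return (strcmp((const char *)key,
	    ((const struct item *)elem)->key));
}

int
main(void)
{
	struct item items[] = { { "cc" }, { "aa" }, { "bb" } };
	struct item *match;

	qsort(items, 3, sizeof(items[0]), cmp_items);
	match = bsearch("bb", items, 3, sizeof(items[0]), cmp_key_item);
	printf("%s\n", match == NULL ? "not found" : match->key);
	return (0);
}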
+
+/*
+ * __slvg_ovfl_reconcile --
+ * Review relationships between leaf pages and the overflow pages, delete
+ * leaf pages until there's a one-to-one relationship between leaf and overflow
+ * pages.
+ */
+static int
+__slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_TRACK **searchp, *trk;
+ uint32_t i, j, *slot;
+
+ slot = NULL;
+
+ /*
+ * If an overflow page is referenced more than once, discard leaf pages
+ * with the lowest LSNs until overflow pages are only referenced once.
+ *
+ * This requires sorting the page list by LSN, and the overflow array
+ * by address cookie.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen);
+ qsort(ss->ovfl,
+ (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr);
+
+ /*
+ * Walk the list of pages and discard any pages referencing non-existent
+ * overflow pages or referencing overflow pages also referenced by pages
+	 * with higher LSNs. We just sorted the page list by LSN, high to
+	 * low, so we don't have to do explicit testing of the page LSNs: the
+	 * first page to reference an overflow page is the best page to own it.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0)
+ continue;
+
+ WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot));
+ for (j = 0; j < trk->trk_ovfl_cnt; ++j) {
+ addr = &trk->trk_ovfl_addr[j];
+ searchp = bsearch(addr, ss->ovfl, ss->ovfl_next,
+ sizeof(WT_TRACK *), __slvg_ovfl_compare);
+
+ /*
+ * If the overflow page doesn't exist or if another page
+ * has already claimed it, this leaf page isn't usable.
+ */
+ if (searchp != NULL &&
+ !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) {
+ /*
+ * Convert each block address into a slot in the
+ * list of overflow pages as we go.
+ */
+ slot[j] = (uint32_t)(searchp - ss->ovfl);
+ F_SET(*searchp, WT_TRACK_OVFL_REFD);
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s references unavailable overflow page %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(session,
+ addr->addr, addr->size, ss->tmp2)));
+
+ /*
+ * Clear the "referenced" flag for any overflow pages
+			 * already claimed by this leaf page, so some other
+			 * page might claim them.
+ */
+ while (j > 0)
+ F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD);
+ trk = NULL;
+ WT_ERR(__slvg_trk_free(session, &ss->pages[i], 1));
+ break;
+ }
+
+ /*
+		 * We now have a reference to the overflow WT_TRACK and no
+		 * longer need the page's address array, so discard it. Note:
+		 * we potentially freed the WT_TRACK in the loop above, so
+		 * check that it's still valid.
+ */
+ if (trk == NULL)
+ __wt_free(session, slot);
+ else {
+ __slvg_trk_free_addr(session, trk);
+
+ trk->trk_ovfl_slot = slot;
+ slot = NULL;
+ }
+ }
+ return (0);
+
+err: __wt_free(session, slot);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_compare_key --
+ * Compare two WT_TRACK array entries by key, and secondarily, by LSN.
+ */
+static int
+__slvg_trk_compare_key(const void *a, const void *b)
+{
+ WT_SESSION_IMPL *session;
+ WT_TRACK *a_trk, *b_trk;
+ uint64_t a_gen, a_recno, b_gen, b_recno;
+ int cmp;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ if (a_trk == NULL)
+ return (b_trk == NULL ? 0 : 1);
+ if (b_trk == NULL)
+ return (-1);
+
+ switch (a_trk->ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ a_recno = a_trk->col_start;
+ b_recno = b_trk->col_start;
+ if (a_recno == b_recno)
+ break;
+ if (a_recno > b_recno)
+ return (1);
+ if (a_recno < b_recno)
+ return (-1);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * XXX
+ * __wt_compare can potentially fail, and we're ignoring that
+ * error because this routine is called as an underlying qsort
+ * routine.
+ */
+ session = a_trk->ss->session;
+ (void)__wt_compare(session, S2BT(session)->collator,
+ &a_trk->row_start, &b_trk->row_start, &cmp);
+ if (cmp != 0)
+ return (cmp);
+ break;
+ }
+
+ /*
+ * If the primary keys compare equally, differentiate based on LSN.
+ * Sort from highest LSN to lowest, that is, the earlier pages in
+ * the array are more desirable.
+ */
+ a_gen = a_trk->trk_gen;
+ b_gen = b_trk->trk_gen;
+ return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
+}
+
+/*
+ * __slvg_trk_compare_gen --
+ * Compare two WT_TRACK array entries by LSN.
+ */
+static int
+__slvg_trk_compare_gen(const void *a, const void *b)
+{
+ WT_TRACK *a_trk, *b_trk;
+ uint64_t a_gen, b_gen;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ /*
+ * Sort from highest LSN to lowest, that is, the earlier pages in the
+ * array are more desirable.
+ */
+ a_gen = a_trk->trk_gen;
+ b_gen = b_trk->trk_gen;
+ return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
+}
+
+/*
+ * __slvg_merge_block_free --
+ * Clean up backing file and overflow blocks after the merge phase.
+ */
+static int
+__slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ /* Free any underlying file blocks for merged pages. */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+ if (F_ISSET(trk, WT_TRACK_MERGE))
+ WT_RET(__slvg_trk_free(session, &ss->pages[i], 1));
+ }
+
+ /* Free any unused overflow records. */
+ return (__slvg_ovfl_discard(session, ss));
+}
+
+/*
+ * __slvg_ovfl_ref --
+ * Reference an overflow page, checking for multiple references.
+ */
+static int
+__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, int multi_panic)
+{
+ if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
+ if (!multi_panic)
+ return (EBUSY);
+ WT_PANIC_RET(session, EINVAL,
+ "overflow record unexpectedly referenced multiple times "
+ "during leaf page merge");
+ }
+
+ F_SET(trk, WT_TRACK_OVFL_REFD);
+ return (0);
+}
+
+/*
+ * __slvg_ovfl_ref_all --
+ * Reference all of the page's overflow pages.
+ */
+static int
+__slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ uint32_t i;
+
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_ovfl_ref(
+ session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1));
+
+ return (0);
+}
+
+/*
+ * __slvg_ovfl_discard --
+ * Discard unused overflow pages.
+ */
+static int
+__slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ /*
+ * Walk the overflow page array: if an overflow page isn't referenced,
+ * add its file blocks to the free list.
+ *
+ * Clear the reference flag (it's reused to figure out if the overflow
+ * record is referenced, but never used, by merged pages).
+ */
+ for (i = 0; i < ss->ovfl_next; ++i) {
+ if ((trk = ss->ovfl[i]) == NULL)
+ continue;
+
+ if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
+ F_CLR(trk, WT_TRACK_OVFL_REFD);
+ continue;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s unused overflow page",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1)));
+ WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 1));
+ }
+
+ return (0);
+}
+
+/*
+ * __slvg_cleanup --
+ * Discard memory allocated to the page and overflow arrays.
+ */
+static int
+__slvg_cleanup(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ uint32_t i;
+
+ /* Discard the leaf page array. */
+ for (i = 0; i < ss->pages_next; ++i)
+ if (ss->pages[i] != NULL)
+ WT_RET(__slvg_trk_free(session, &ss->pages[i], 0));
+ __wt_free(session, ss->pages);
+
+ /* Discard the ovfl page array. */
+ for (i = 0; i < ss->ovfl_next; ++i)
+ if (ss->ovfl[i] != NULL)
+ WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 0));
+ __wt_free(session, ss->ovfl);
+
+ return (0);
+}
+
+/*
+ * __slvg_trk_free_addr --
+ * Discard address information.
+ */
+static void
+__slvg_trk_free_addr(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ uint32_t i;
+
+ if (trk->trk_ovfl_addr != NULL) {
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i)
+ __wt_free(session, trk->trk_ovfl_addr[i].addr);
+ __wt_free(session, trk->trk_ovfl_addr);
+ }
+}
+
+/*
+ * __slvg_trk_free_block --
+ * Discard underlying blocks.
+ */
+static int
+__slvg_trk_free_block(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ WT_BM *bm;
+
+ bm = S2BT(session)->bm;
+
+ /*
+ * If freeing underlying file blocks or overflow pages, this is a page
+ * we were tracking but eventually decided not to use.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s blocks discarded: discard freed file bytes %" PRIu32,
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size));
+
+ return (bm->free(bm, session, trk->trk_addr, trk->trk_addr_size));
+}
+
+/*
+ * __slvg_trk_free --
+ * Discard a WT_TRACK structure and (optionally) its underlying blocks.
+ */
+static int
+__slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, int free_on_last_ref)
+{
+ WT_TRACK *trk;
+
+ trk = *trkp;
+ *trkp = NULL;
+
+ /*
+ * If we're the last user of shared information, clean up.
+ */
+ WT_ASSERT(session, trk->shared->ref > 0);
+ if (--trk->shared->ref == 0) {
+ /*
+ * If the free-on-last-ref flag is set, this chunk isn't going
+ * to use the backing physical blocks. As we're the last user
+ * of those blocks, nobody is going to use them and they can be
+ * discarded.
+ */
+ if (free_on_last_ref)
+ WT_RET(__slvg_trk_free_block(session, trk));
+
+ __wt_free(session, trk->trk_addr);
+
+ __slvg_trk_free_addr(session, trk);
+
+ __wt_free(session, trk->trk_ovfl_slot);
+
+ __wt_free(session, trk->shared);
+ }
+
+ if (trk->ss->page_type == WT_PAGE_ROW_LEAF) {
+ __wt_buf_free(session, &trk->row_start);
+ __wt_buf_free(session, &trk->row_stop);
+ }
+
+ __wt_free(session, trk);
+
+ return (0);
+}
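WT_TRACK teardown above is a plain reference count over the shared information: every caller frees the per-chunk state, and only the last reference tears down (and optionally discards blocks for) the shared state. A hedged miniature of the same ownership scheme, with all names invented:

#include <assert.h>
#include <stdlib.h>

struct shared { unsigned ref; /* jointly owned state */ };
struct chunk { struct shared *shared; /* plus per-chunk state */ };

/* Free a chunk; the last reference also frees the shared state. */
static void
chunk_free(struct chunk **chunkp)
{
	struct chunk *chunk;

	chunk = *chunkp;
	*chunkp = NULL;

	assert(chunk->shared->ref > 0);
	if (--chunk->shared->ref == 0)
		free(chunk->shared);

	free(chunk);	/* per-chunk state is always freed */
}

int
main(void)
{
	struct shared *s;
	struct chunk *a, *b;

	/* Two chunks share one piece of state (allocation checks omitted). */
	s = calloc(1, sizeof(*s));
	a = calloc(1, sizeof(*a));
	b = calloc(1, sizeof(*b));
	s->ref = 2;
	a->shared = b->shared = s;

	chunk_free(&a);		/* shared state survives */
	chunk_free(&b);		/* last reference frees it */
	return (0);
}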
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
new file mode 100644
index 00000000000..3da0bcf346c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);
+
+/*
+ * __wt_btree_stat_init --
+ * Initialize the Btree statistics.
+ */
+int
+__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_DSRC_STATS *stats;
+ WT_REF *next_walk;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ stats = &btree->dhandle->stats;
+
+ WT_RET(bm->stat(bm, session, stats));
+
+ WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
+ WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
+ WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
+ WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
+ WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
+ WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
+
+ /* Everything else is really, really expensive. */
+ if (!F_ISSET(cst, WT_CONN_STAT_ALL))
+ return (0);
+
+ next_walk = NULL;
+ while ((ret =
+ __wt_tree_walk(session, &next_walk, 0)) == 0 && next_walk != NULL)
+ WT_RET(__stat_page(session, next_walk->page, stats));
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __stat_page --
+ * Stat any Btree page.
+ */
+static int
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * All internal pages and overflow pages are trivial, all we track is
+ * a count of the page type.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ WT_STAT_INCR(stats, btree_column_fix);
+ WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
+ break;
+ case WT_PAGE_COL_INT:
+ WT_STAT_INCR(stats, btree_column_internal);
+ pindex = WT_INTL_INDEX_COPY(page);
+ WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_RET(__stat_page_col_var(page, stats));
+ break;
+ case WT_PAGE_OVFL:
+ WT_STAT_INCR(stats, btree_overflow);
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_STAT_INCR(stats, btree_row_internal);
+ pindex = WT_INTL_INDEX_COPY(page);
+ WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__stat_page_row_leaf(page, stats));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __stat_page_col_var --
+ * Stat a WT_PAGE_COL_VAR page.
+ */
+static int
+__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint32_t i;
+ int orig_deleted;
+
+ unpack = &_unpack;
+
+ WT_STAT_INCR(stats, btree_column_variable);
+
+ /*
+ * Walk the page, counting regular and overflow data items, and checking
+ * to be sure any updates weren't deletions. If the item was updated,
+ * assume it was updated by an item of the same size (it's expensive to
+ * figure out if it will require the same space or not, especially if
+ * there's Huffman encoding).
+ */
+ WT_COL_FOREACH(page, cip, i) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ orig_deleted = 1;
+ WT_STAT_INCR(stats, btree_column_deleted);
+ } else {
+ orig_deleted = 0;
+ __wt_cell_unpack(cell, unpack);
+ WT_STAT_INCRV(
+ stats, btree_entries, __wt_cell_rle(unpack));
+ }
+
+ /*
+ * Walk the insert list, checking for changes. For each insert
+ * we find, correct the original count based on its state.
+ */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
+ upd = ins->upd;
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (orig_deleted)
+ continue;
+ WT_STAT_INCR(stats, btree_column_deleted);
+ WT_STAT_DECR(stats, btree_entries);
+ } else {
+ if (!orig_deleted)
+ continue;
+ WT_STAT_DECR(stats, btree_column_deleted);
+ WT_STAT_INCR(stats, btree_entries);
+ }
+ }
+ }
+ return (0);
+}
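
The insert-list correction above follows one rule: the on-disk cell fixes an original deleted/not-deleted state, and an update only moves the counters when it flips that state. A compact model of the rule, with hypothetical types:

    #include <stdint.h>

    struct counts {
        int64_t entries;    /* live key/value pairs */
        int64_t deleted;    /* deleted entries */
    };

    /*
     * Apply one update's effect on the counters, relative to the
     * entry's original on-disk state; an update that doesn't change
     * the deleted/not-deleted state leaves the counters alone.
     */
    static void
    apply_update(struct counts *c, int orig_deleted, int upd_deleted)
    {
        if (upd_deleted && !orig_deleted) {
            ++c->deleted;
            --c->entries;
        } else if (!upd_deleted && orig_deleted) {
            --c->deleted;
            ++c->entries;
        }
    }
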
+
+/*
+ * __stat_page_row_leaf --
+ * Stat a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_INSERT *ins;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ uint32_t cnt, i;
+
+ WT_STAT_INCR(stats, btree_row_leaf);
+
+ /*
+ * Stat any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ cnt = 0;
+ WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
+ if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ ++cnt;
+
+ /* Stat the page's K/V pairs. */
+ WT_ROW_FOREACH(page, rip, i) {
+ upd = WT_ROW_UPDATE(page, rip);
+ if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
+ ++cnt;
+
+ /* Stat inserted K/V pairs. */
+ WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
+ if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ ++cnt;
+ }
+
+ WT_STAT_INCRV(stats, btree_entries, cnt);
+
+ return (0);
+}
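
Applications normally consume these numbers through a statistics cursor rather than by calling __wt_btree_stat_init directly; the tree walk guarded by WT_CONN_STAT_ALL above only runs when "all" statistics are requested. A sketch of that public-API path, assuming an already-open WT_CONNECTION (the file URI is hypothetical):

    #include <stdio.h>
    #include <wiredtiger.h>

    static int
    dump_stats(WT_CONNECTION *conn)
    {
        WT_CURSOR *cursor;
        WT_SESSION *session;
        const char *desc, *pvalue;
        uint64_t value;
        int ret, tret;

        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);

        /* "statistics=(all)" triggers the expensive full tree walk. */
        if ((ret = session->open_cursor(session,
            "statistics:file:example.wt", NULL,
            "statistics=(all)", &cursor)) != 0)
            return (ret);

        while ((ret = cursor->next(cursor)) == 0 &&
            (ret = cursor->get_value(
            cursor, &desc, &pvalue, &value)) == 0)
            printf("%s: %s\n", desc, pvalue);
        ret = ret == WT_NOTFOUND ? 0 : ret;

        if ((tret = session->close(session, NULL)) != 0 && ret == 0)
            ret = tret;
        return (ret);
    }
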
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
new file mode 100644
index 00000000000..607e7919513
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -0,0 +1,373 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sync_file --
+ * Flush pages for a specific file.
+ */
+static int
+__sync_file(WT_SESSION_IMPL *session, int syncop)
+{
+ struct timespec end, start;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *walk;
+ WT_TXN *txn;
+ uint64_t internal_bytes, leaf_bytes;
+ uint64_t internal_pages, leaf_pages;
+ uint32_t flags;
+
+ btree = S2BT(session);
+
+ flags = WT_READ_CACHE | WT_READ_NO_GEN;
+ walk = NULL;
+ txn = &session->txn;
+
+ internal_bytes = leaf_bytes = 0;
+ internal_pages = leaf_pages = 0;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ WT_RET(__wt_epoch(session, &start));
+
+ switch (syncop) {
+ case WT_SYNC_WRITE_LEAVES:
+ /*
+ * Write all immediately available, dirty in-cache leaf pages.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock; serialize so that multiple threads don't walk the tree
+ * at the same time.
+ */
+ if (!btree->modified)
+ return (0);
+ __wt_spin_lock(session, &btree->flush_lock);
+ if (!btree->modified) {
+ __wt_spin_unlock(session, &btree->flush_lock);
+ return (0);
+ }
+
+ flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /* Write dirty pages if nobody beat us to it. */
+ page = walk->page;
+ if (__wt_page_is_modified(page)) {
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ __wt_txn_refresh(session, 1);
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ case WT_SYNC_CHECKPOINT:
+ /*
+ * We cannot check the tree modified flag in the case of a
+ * checkpoint, the checkpoint code has already cleared it.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock; serialize so that multiple threads don't walk the tree
+ * at the same time. We're holding the schema lock, but need the
+ * lower-level lock as well.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * When internal pages are being reconciled by checkpoint their
+ * child pages cannot disappear from underneath them or be split
+ * into them, nor can underlying blocks be freed until the block
+ * lists for the checkpoint are stable. Set the checkpointing
+ * flag to block eviction of dirty pages until the checkpoint's
+ * internal page pass is complete, then wait for any existing
+ * eviction to complete.
+ */
+ btree->checkpointing = 1;
+
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+ }
+
+ /* Write all dirty in-cache pages. */
+ flags |= WT_READ_NO_EVICT;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /*
+ * Write dirty pages, unless we can be sure they only
+ * became dirty after the checkpoint started.
+ *
+ * We can skip dirty pages if:
+ * (1) they are leaf pages;
+ * (2) there is a snapshot transaction active (which
+ * is the case in ordinary application checkpoints
+ * but not all internal cases); and
+ * (3) the first dirty update on the page is
+ * sufficiently recent that the checkpoint
+ * transaction would skip them.
+ */
+ page = walk->page;
+ mod = page->modify;
+ if (__wt_page_is_modified(page) &&
+ (WT_PAGE_IS_INTERNAL(page) ||
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
+ TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ internal_bytes +=
+ page->memory_footprint;
+ ++internal_pages;
+ } else {
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ }
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_ERR(__wt_epoch(session, &end));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of internal\n\t"
+ "Took: %" PRIu64 "ms",
+ syncop == WT_SYNC_WRITE_LEAVES ?
+ "WRITE_LEAVES" : "CHECKPOINT",
+ leaf_bytes, leaf_pages, internal_bytes, internal_pages,
+ WT_TIMEDIFF(end, start) / WT_MILLION));
+ }
+
+err: /* On error, clear any left-over tree walk. */
+ if (walk != NULL)
+ WT_TRET(__wt_page_release(session, walk, flags));
+
+ if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0)
+ __wt_txn_release_snapshot(session);
+
+ if (btree->checkpointing) {
+ /*
+ * Clear the checkpoint flag and push the change; not required,
+ * but publishing the change means stalled eviction gets moving
+ * as soon as possible.
+ */
+ btree->checkpointing = 0;
+ WT_FULL_BARRIER();
+
+ /*
+ * Wake the eviction server, in case application threads have
+ * stalled while the eviction server decided it couldn't make
+ * progress. Without this, application threads will be stalled
+ * until the eviction server next wakes.
+ */
+ WT_TRET(__wt_evict_server_wake(session));
+ }
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ /*
+ * Leaves are written before a checkpoint (or as part of a file close,
+ * before checkpointing the file). Start a flush to stable storage,
+ * but don't wait for it.
+ */
+ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
+ WT_RET(btree->bm->sync(btree->bm, session, 1));
+
+ return (ret);
+}
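
The three-part skip test in the checkpoint case above reduces to a single predicate: a dirty page must be written unless it is a leaf, the transaction has a snapshot, and the page first became dirty after the snapshot's maximum ID. A standalone model of that predicate (hypothetical types; TXNID_LE is modeled as a plain comparison):

    #include <stdbool.h>
    #include <stdint.h>

    struct mpage { bool is_internal; uint64_t first_dirty_txn; };
    struct mtxn  { bool has_snapshot; uint64_t snap_max; };

    /* Return true if a dirty page must be written by the checkpoint. */
    static bool
    must_write(const struct mpage *p, const struct mtxn *t)
    {
        return (p->is_internal ||
            !t->has_snapshot ||
            p->first_dirty_txn <= t->snap_max);
    }
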
+
+/*
+ * __evict_file --
+ * Discard pages for a specific file.
+ */
+static int
+__evict_file(WT_SESSION_IMPL *session, int syncop)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *next_ref, *ref;
+ int eviction_enabled;
+
+ btree = S2BT(session);
+ eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION);
+
+ /*
+ * We need exclusive access to the file -- disable ordinary eviction
+ * and drain any blocks already queued.
+ */
+ if (eviction_enabled)
+ WT_RET(__wt_evict_file_exclusive_on(session));
+
+ /* Make sure the oldest transaction ID is up-to-date. */
+ __wt_txn_update_oldest(session);
+
+ /* Walk the tree, discarding pages. */
+ next_ref = NULL;
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+ while ((ref = next_ref) != NULL) {
+ page = ref->page;
+
+ /*
+ * Eviction can fail when a page in the evicted page's subtree
+ * switches state. For example, if we don't evict a page marked
+ * empty, because we expect it to be merged into its parent, it
+ * might no longer be empty after it's reconciled, in which case
+ * eviction of its parent would fail. We can either walk the
+ * tree multiple times (until it's finally empty), or reconcile
+ * each page to get it to its final state before considering if
+ * it's an eviction target or will be merged into its parent.
+ *
+ * Don't limit this test to any particular page type: limiting it
+ * tends to introduce bugs when the reconciliation of other page
+ * types changes, and there's no advantage to doing so.
+ *
+ * Eviction can also fail because an update cannot be written.
+ * If sessions have disjoint sets of files open, updates in a
+ * no-longer-referenced file may not yet be globally visible,
+ * and the write will fail with EBUSY. Our caller handles that
+ * error, retrying later.
+ */
+ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
+ WT_ERR(__wt_rec_write(session, ref, NULL, WT_EVICTING));
+
+ /*
+ * We can't evict the page just returned to us (it marks our
+ * place in the tree), so move the walk to one page ahead of
+ * the page being evicted. Note, we reconciled the returned
+ * page first: if reconciliation of that page were to change
+ * the shape of the tree, and we did the next walk call before
+ * the reconciliation, the next walk call could miss a page in
+ * the tree.
+ */
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+
+ switch (syncop) {
+ case WT_SYNC_CLOSE:
+ /*
+ * Evict the page.
+ * Do not attempt to evict pages expected to be merged
+ * into their parents, with the exception that the root
+ * page can't be merged, it must be written.
+ */
+ if (__wt_ref_is_root(ref) ||
+ page->modify == NULL ||
+ !F_ISSET(page->modify, WT_PM_REC_EMPTY))
+ WT_ERR(__wt_rec_evict(session, ref, 1));
+ break;
+ case WT_SYNC_DISCARD:
+ case WT_SYNC_DISCARD_FORCE:
+ /*
+ * Discard the page, whether clean or dirty.
+ *
+ * Clean the page, both to keep statistics correct, and
+ * to let the page-discard function assert no dirty page
+ * is ever discarded.
+ */
+ if (__wt_page_is_modified(page)) {
+ page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ }
+ /*
+ * If the page contains an update that is too recent to
+ * evict, stop. This should never happen during
+ * connection close, and in other paths our caller
+ * should be prepared to deal with this case.
+ */
+ if (syncop == WT_SYNC_DISCARD &&
+ page->modify != NULL &&
+ !__wt_txn_visible_all(session,
+ page->modify->rec_max_txn))
+ return (EBUSY);
+ if (syncop == WT_SYNC_DISCARD_FORCE)
+ F_SET(session, WT_SESSION_DISCARD_FORCE);
+ __wt_ref_out(session, ref);
+ /*
+ * In case we don't discard the whole tree, make sure
+ * that future readers know that the page is no longer
+ * in cache.
+ */
+ ref->state = WT_REF_DISK;
+ F_CLR(session, WT_SESSION_DISCARD_FORCE);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ if (0) {
+err: /* On error, clear any left-over tree walk. */
+ if (next_ref != NULL)
+ WT_TRET(__wt_page_release(
+ session, next_ref, WT_READ_NO_EVICT));
+ }
+
+ if (eviction_enabled)
+ __wt_evict_file_exclusive_off(session);
+
+ return (ret);
+}
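
The walk in __evict_file advances to the next page before discarding the current one, the standard way to keep a traversal valid while destroying what it visits. The same shape in miniature, with a linked list standing in for the Btree walk (hypothetical types):

    #include <stdlib.h>

    struct node { struct node *next; };

    static void
    discard_all(struct node *head)
    {
        struct node *cur, *next;

        for (cur = head; cur != NULL; cur = next) {
            next = cur->next;   /* step past the node first */
            free(cur);          /* now safe to discard it */
        }
    }
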
+
+/*
+ * __wt_cache_op --
+ * Cache operations.
+ */
+int
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+
+ switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_CLOSE:
+ /*
+ * Set the checkpoint reference for reconciliation; it's ugly,
+ * but drilling a function parameter path from our callers to
+ * the reconciliation of the tree's root page is going to be
+ * worse.
+ */
+ WT_ASSERT(session, btree->ckpt == NULL);
+ btree->ckpt = ckptbase;
+ break;
+ }
+
+ switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_WRITE_LEAVES:
+ WT_ERR(__sync_file(session, op));
+ break;
+ case WT_SYNC_CLOSE:
+ case WT_SYNC_DISCARD:
+ case WT_SYNC_DISCARD_FORCE:
+ WT_ERR(__evict_file(session, op));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_CLOSE:
+ btree->ckpt = NULL;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_upgrade.c b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
new file mode 100644
index 00000000000..d65c8793fbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_upgrade --
+ * Upgrade a file.
+ */
+int
+__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_UNUSED(cfg);
+
+ /* There's nothing to upgrade, yet. */
+ WT_RET(__wt_progress(session, NULL, 1));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
new file mode 100644
index 00000000000..e7caf02fd2f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -0,0 +1,666 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's a bunch of stuff we pass around during verification; group it
+ * together to make the code prettier.
+ */
+typedef struct {
+ uint64_t record_total; /* Total record count */
+
+ WT_ITEM *max_key; /* Largest key */
+ WT_ITEM *max_addr; /* Largest key page */
+
+ uint64_t fcnt; /* Progress counter */
+
+ int dump_address; /* Debugging hooks */
+ int dump_pages;
+ int dump_blocks;
+
+ WT_ITEM *tmp1; /* Temporary buffer */
+ WT_ITEM *tmp2; /* Temporary buffer */
+} WT_VSTUFF;
+
+static void __verify_checkpoint_reset(WT_VSTUFF *);
+static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *);
+static int __verify_config_offsets(WT_SESSION_IMPL *, const char *[], int *);
+static int __verify_overflow(
+ WT_SESSION_IMPL *, const uint8_t *, size_t, WT_VSTUFF *);
+static int __verify_overflow_cell(
+ WT_SESSION_IMPL *, WT_REF *, int *, WT_VSTUFF *);
+static int __verify_row_int_key_order(
+ WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *);
+static int __verify_row_leaf_key_order(
+ WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+
+/*
+ * __wt_verify --
+ * Verify a file.
+ */
+int
+__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckptbase, *ckpt;
+ WT_DECL_RET;
+ WT_VSTUFF *vs, _vstuff;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int bm_start, quit;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ ckptbase = NULL;
+ bm_start = 0;
+
+ WT_CLEAR(_vstuff);
+ vs = &_vstuff;
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
+
+ /* Check configuration strings. */
+ WT_ERR(__verify_config(session, cfg, vs));
+
+ /* Optionally dump specific block offsets. */
+ WT_ERR(__verify_config_offsets(session, cfg, &quit));
+ if (quit)
+ goto done;
+
+ /* Get a list of the checkpoints for this file. */
+ WT_ERR(
+ __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));
+
+ /* Inform the underlying block manager we're verifying. */
+ WT_ERR(bm->verify_start(bm, session, ckptbase));
+ bm_start = 1;
+
+ /* Loop through the file's checkpoints, verifying each one. */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
+ "%s: checkpoint %s", btree->dhandle->name, ckpt->name));
+
+ /* Fake checkpoints require no work. */
+ if (F_ISSET(ckpt, WT_CKPT_FAKE))
+ continue;
+
+ /* House-keeping between checkpoints. */
+ __verify_checkpoint_reset(vs);
+
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "%s: checkpoint %s",
+ btree->dhandle->name, ckpt->name));
+#endif
+ /* Load the checkpoint. */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt->raw.data, ckpt->raw.size,
+ root_addr, &root_addr_size, 1));
+
+ /*
+ * Ignore trees with no root page.
+ * Verify, then discard the checkpoint from the cache.
+ */
+ if (root_addr_size != 0 &&
+ (ret = __wt_btree_tree_open(
+ session, root_addr, root_addr_size)) == 0) {
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address ||
+ vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "Root: %s %s",
+ __wt_addr_string(session,
+ root_addr, root_addr_size, vs->tmp1),
+ __wt_page_type_string(
+ btree->root.page->type)));
+#endif
+ ret = __verify_tree(session, &btree->root, vs);
+
+ WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ }
+
+ /* Unload the checkpoint. */
+ WT_TRET(bm->checkpoint_unload(bm, session));
+ WT_ERR(ret);
+ }
+
+done:
+err: /* Inform the underlying block manager we're done. */
+ if (bm_start)
+ WT_TRET(bm->verify_end(bm, session));
+
+ /* Discard the list of checkpoints. */
+ if (ckptbase != NULL)
+ __wt_meta_ckptlist_free(session, ckptbase);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, vs->fcnt));
+
+ /* Free allocated memory. */
+ __wt_scr_free(&vs->max_key);
+ __wt_scr_free(&vs->max_addr);
+ __wt_scr_free(&vs->tmp1);
+ __wt_scr_free(&vs->tmp2);
+
+ return (ret);
+}
+
+/*
+ * __verify_config --
+ * Debugging: verification supports dumping pages in various formats.
+ */
+static int
+__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs)
+{
+ WT_CONFIG_ITEM cval;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_address", &cval));
+ vs->dump_address = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_blocks", &cval));
+ vs->dump_blocks = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_pages", &cval));
+ vs->dump_pages = cval.val != 0;
+
+#if !defined(HAVE_DIAGNOSTIC)
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_RET_MSG(session, ENOTSUP,
+ "the WiredTiger library was not built in diagnostic mode");
+#endif
+ return (0);
+}
+
+/*
+ * __verify_config_offsets --
+ * Debugging: optionally dump specific blocks from the file.
+ */
+static int
+__verify_config_offsets(WT_SESSION_IMPL *session, const char *cfg[], int *quitp)
+{
+ WT_CONFIG list;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_RET;
+ u_long offset;
+
+ *quitp = 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_offsets", &cval));
+ WT_RET(__wt_config_subinit(session, &list, &cval));
+ while ((ret = __wt_config_next(&list, &k, &v)) == 0) {
+ /*
+ * Quit after dumping the requested blocks. (That's hopefully
+ * what the user wanted, all of this stuff is just hooked into
+ * verify because that's where we "dump blocks" for debugging.)
+ */
+ *quitp = 1;
+ if (v.len != 0 || sscanf(k.str, "%lu", &offset) != 1)
+ WT_RET_MSG(session, EINVAL,
+ "unexpected dump offset format");
+#if !defined(HAVE_DIAGNOSTIC)
+ WT_RET_MSG(session, ENOTSUP,
+ "the WiredTiger library was not built in diagnostic mode");
+#else
+ WT_TRET(
+ __wt_debug_offset_blind(session, (wt_off_t)offset, NULL));
+#endif
+ }
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
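
From the application side, all of this is reached through WT_SESSION::verify, and the dump_offsets list parsed above arrives as configuration. A sketch of a caller, assuming an open WT_CONNECTION and a diagnostic build for the dump options (the file name is hypothetical):

    #include <wiredtiger.h>

    static int
    verify_example(WT_CONNECTION *conn)
    {
        WT_SESSION *session;
        int ret, tret;

        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);

        /* Ordinary verification of every checkpoint in the file. */
        ret = session->verify(session, "file:example.wt", NULL);

        /* Diagnostic builds only: dump two specific block offsets. */
        if (ret == 0)
            ret = session->verify(session,
                "file:example.wt", "dump_offsets=[8192,16384]");

        if ((tret = session->close(session, NULL)) != 0 && ret == 0)
            ret = tret;
        return (ret);
    }
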
+
+/*
+ * __verify_checkpoint_reset --
+ * Reset anything needing to be reset for each new checkpoint verification.
+ */
+static void
+__verify_checkpoint_reset(WT_VSTUFF *vs)
+{
+ /*
+ * Key order is per checkpoint, reset the data length that serves as a
+ * flag value.
+ */
+ vs->max_addr->size = 0;
+
+ /* Record total is per checkpoint, reset the record count. */
+ vs->record_total = 0;
+}
+
+/*
+ * __verify_tree --
+ * Verify a tree, recursively descending through it in depth-first fashion.
+ * The page argument was physically verified (so we know it's correctly formed),
+ * and the in-memory version built. Our job is to check logical relationships
+ * in the page and in the tree.
+ */
+static int
+__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+ WT_BM *bm;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *child_ref;
+ uint64_t recno;
+ uint32_t entry, i;
+ int found;
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+
+ unpack = &_unpack;
+ WT_CLEAR(*unpack); /* -Wuninitialized */
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address)
+ WT_RET(__wt_msg(session, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#endif
+
+ /*
+ * The page's physical structure was verified when it was read into
+ * memory by the read server thread, and then the in-memory version
+ * of the page was built. Now we make sure the page and tree are
+ * logically consistent.
+ *
+ * !!!
+ * The problem: (1) the read server has to build the in-memory version
+ * of the page because the read server is the thread that flags when
+ * any thread can access the page in the tree; (2) we can't build the
+ * in-memory version of the page until the physical structure is known
+ * to be OK, so the read server has to verify at least the physical
+ * structure of the page; (3) doing complete page verification requires
+ * reading additional pages (for example, overflow keys imply reading
+ * overflow pages in order to test the key's order in the page); (4)
+ * the read server cannot read additional pages because it will hang
+ * waiting on itself. For this reason, we split page verification
+ * into a physical verification, which allows the in-memory version
+ * of the page to be built, and then a subsequent logical verification
+ * which happens here.
+ *
+ * Report progress every 10 pages.
+ */
+ if (++vs->fcnt % 10 == 0)
+ WT_RET(__wt_progress(session, NULL, vs->fcnt));
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Optionally dump the blocks or page in debugging mode. */
+ if (vs->dump_blocks)
+ WT_RET(__wt_debug_disk(session, page->dsk, NULL));
+ if (vs->dump_pages)
+ WT_RET(__wt_debug_page(session, page, NULL));
+#endif
+
+ /*
+ * Column-store key order checks: check the page's record number and
+ * then update the total record count.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ recno = page->pg_fix_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_INT:
+ recno = page->pg_intl_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_VAR:
+ recno = page->pg_var_recno;
+recno_chk: if (recno != vs->record_total + 1)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s has a starting record of %" PRIu64
+ " when the expected starting record is %" PRIu64,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ recno, vs->record_total + 1);
+ break;
+ }
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ vs->record_total += page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ recno = 0;
+ WT_COL_FOREACH(page, cip, i)
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ ++recno;
+ else {
+ __wt_cell_unpack(cell, unpack);
+ recno += __wt_cell_rle(unpack);
+ }
+ vs->record_total += recno;
+ break;
+ }
+
+ /*
+ * Row-store leaf page key order check: it's a depth-first traversal,
+ * the first key on this page should be larger than any key previously
+ * seen.
+ */
+ switch (page->type) {
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_row_leaf_key_order(session, ref, vs));
+ break;
+ }
+
+ /* If it's not the root page, unpack the parent cell. */
+ if (!__wt_ref_is_root(ref)) {
+ __wt_cell_unpack(ref->addr, unpack);
+
+ /* Compare the parent cell against the page type. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (unpack->raw != WT_CELL_ADDR_LEAF &&
+ unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (unpack->raw != WT_CELL_ADDR_INT)
+celltype_err: WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s, is referenced in "
+ "its parent by a cell of type %s",
+ __wt_page_addr_string(
+ session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(unpack->raw));
+ break;
+ }
+ }
+
+ /*
+ * Check overflow pages. We check overflow cells separately from other
+ * tests that walk the page as it's simpler, and I don't care much how
+ * fast table verify runs.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_overflow_cell(session, ref, &found, vs));
+ if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
+ break;
+
+ /*
+ * Object if a leaf-no-overflow address cell references a page
+ * with overflow keys, but don't object if a leaf address cell
+ * references a page without overflow keys. Reconciliation
+ * doesn't guarantee every leaf page without overflow items will
+ * be a leaf-no-overflow type.
+ */
+ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s and referenced in its "
+ "parent by a cell of type %s, contains overflow "
+ "items",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
+ break;
+ }
+
+ /* Check tree connections and recursively descend the tree. */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * record number should be 1 more than the total records
+ * reviewed to this point.
+ */
+ ++entry;
+ if (child_ref->key.recno != vs->record_total + 1) {
+ WT_RET_MSG(session, WT_ERROR,
+ "the starting record number in entry %"
+ PRIu32 " of the column internal page at "
+ "%s is %" PRIu64 " and the expected "
+ "starting record number is %" PRIu64,
+ entry,
+ __wt_page_addr_string(
+ session, child_ref, vs->tmp1),
+ child_ref->key.recno,
+ vs->record_total + 1);
+ }
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * key should be larger than the largest key previously
+ * reviewed.
+ *
+ * The 0th key of any internal page is magic, and we
+ * can't test against it.
+ */
+ ++entry;
+ if (entry != 1)
+ WT_RET(__verify_row_int_key_order(
+ session, page, child_ref, entry, vs));
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+ return (0);
+}
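
The column-store checks in __verify_tree all hinge on one running invariant: in a depth-first walk, each page's starting record number must be exactly one past the records counted so far. A minimal model (hypothetical types):

    #include <stdint.h>

    struct vstate { uint64_t record_total; };

    /* Return 0 if the page's starting recno is consistent, -1 if not. */
    static int
    check_page_recno(
        struct vstate *vs, uint64_t start_recno, uint64_t entries)
    {
        if (start_recno != vs->record_total + 1)
            return (-1);    /* a gap or overlap in the key space */
        vs->record_total += entries;
        return (0);
    }
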
+
+/*
+ * __verify_row_int_key_order --
+ * Compare a key on an internal page to the largest key we've seen so
+ * far; update the largest key we've seen so far to that key.
+ */
+static int
+__verify_row_int_key_order(WT_SESSION_IMPL *session,
+ WT_PAGE *parent, WT_REF *ref, uint32_t entry, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_ITEM item;
+ int cmp;
+
+ btree = S2BT(session);
+
+ /* The maximum key is set, we updated it from a leaf page first. */
+ WT_ASSERT(session, vs->max_addr->size != 0);
+
+ /* Get the parent page's internal key. */
+ __wt_ref_key(parent, ref, &item.data, &item.size);
+
+ /* Compare the key against the largest key we've seen so far. */
+ WT_RET(__wt_compare(
+ session, btree->collator, &item, vs->max_key, &cmp));
+ if (cmp <= 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the internal key in entry %" PRIu32 " on the page at %s "
+ "sorts before the last key appearing on page %s, earlier "
+ "in the tree",
+ entry,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ (char *)vs->max_addr->data);
+
+ /* Update the largest key we've seen to the key just checked. */
+ WT_RET(__wt_buf_set(session, vs->max_key, item.data, item.size));
+ (void)__wt_page_addr_string(session, ref, vs->max_addr);
+
+ return (0);
+}
+
+/*
+ * __verify_row_leaf_key_order --
+ * Compare the first key on a leaf page to the largest key we've seen so
+ * far; update the largest key we've seen so far to the last key on the page.
+ */
+static int
+__verify_row_leaf_key_order(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ int cmp;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * If a tree is empty (just created), it won't have keys; if there
+ * are no keys, we're done.
+ */
+ if (page->pg_row_entries == 0)
+ return (0);
+
+ /*
+ * We visit our first leaf page before setting the maximum key (the 0th
+ * keys on the internal pages leading to the smallest leaf in the tree
+ * are all empty entries).
+ */
+ if (vs->max_addr->size != 0) {
+ WT_RET(__wt_row_leaf_key_copy(
+ session, page, page->pg_row_d, vs->tmp1));
+
+ /*
+ * Compare the key against the largest key we've seen so far.
+ *
+ * If we're comparing against a key taken from an internal page,
+ * we can compare equal (which is an expected path, the internal
+ * page key is often a copy of the leaf page's first key). But,
+ * in the case of the 0th slot on an internal page, the last key
+ * we've seen was a key from a previous leaf page, and it's not
+ * OK to compare equally in that case.
+ */
+ WT_RET(__wt_compare(session,
+ btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp));
+ if (cmp < 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the first key on the page at %s sorts equal to or "
+ "less than a key appearing on the page at %s, "
+ "earlier in the tree",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ (char *)vs->max_addr->data);
+ }
+
+ /* Update the largest key we've seen to the last key on this page. */
+ WT_RET(__wt_row_leaf_key_copy(session, page,
+ page->pg_row_d + (page->pg_row_entries - 1), vs->max_key));
+ (void)__wt_page_addr_string(session, ref, vs->max_addr);
+
+ return (0);
+}
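
Taken together, the two key-order checks maintain a single invariant: the walk remembers the largest key seen so far, internal keys must sort strictly after it, and leaf keys may also compare equal (an internal key is often a copy of a leaf page's first key). The acceptance rule in isolation, assuming cmp is the collator's result for the new key against that maximum:

    /* Return nonzero if the new key preserves the ordering invariant. */
    static int
    key_order_ok(int cmp, int key_is_internal)
    {
        return (key_is_internal ? cmp > 0 : cmp >= 0);
    }
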
+
+/*
+ * __verify_overflow_cell --
+ * Verify any overflow cells on the page.
+ */
+static int
+__verify_overflow_cell(
+ WT_SESSION_IMPL *session, WT_REF *ref, int *found, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t cell_num, i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ *found = 0;
+
+ /*
+ * If a tree is empty (just created), it won't have a disk image;
+ * if there is no disk image, we're done.
+ */
+ if ((dsk = ref->page->dsk) == NULL)
+ return (0);
+
+ /* Walk the disk page, verifying pages referenced by overflow cells. */
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_VALUE_OVFL:
+ *found = 1;
+ WT_ERR(__verify_overflow(
+ session, unpack->data, unpack->size, vs));
+ break;
+ }
+ }
+
+ return (0);
+
+err: WT_RET_MSG(session, ret,
+ "cell %" PRIu32 " on page at %s references an overflow item at %s "
+ "that failed verification",
+ cell_num - 1,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_addr_string(session, unpack->data, unpack->size, vs->tmp2));
+}
+
+/*
+ * __verify_overflow --
+ * Read in an overflow page and check it.
+ */
+static int
+__verify_overflow(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_VSTUFF *vs)
+{
+ WT_BM *bm;
+ const WT_PAGE_HEADER *dsk;
+
+ bm = S2BT(session)->bm;
+
+ /* Read and verify the overflow item. */
+ WT_RET(__wt_bt_read(session, vs->tmp1, addr, addr_size));
+
+ /*
+ * The physical page has already been verified, but we haven't confirmed
+ * it was an overflow page, only that it was a valid page. Confirm it's
+ * the type of page we expected.
+ */
+ dsk = vs->tmp1->data;
+ if (dsk->type != WT_PAGE_OVFL)
+ WT_RET_MSG(session, WT_ERROR,
+ "overflow referenced page at %s is not an overflow page",
+ __wt_addr_string(session, addr, addr_size, vs->tmp1));
+
+ WT_RET(bm->verify_addr(bm, session, addr, addr_size));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
new file mode 100644
index 00000000000..a14f9f1078e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -0,0 +1,739 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __err_cell_corrupted(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __err_cell_type(
+ WT_SESSION_IMPL *, uint32_t, const char *, uint8_t, uint8_t);
+static int __err_eof(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __verify_dsk_chunk(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, uint32_t);
+static int __verify_dsk_col_fix(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_int(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_var(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_memsize(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *);
+static int __verify_dsk_row(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+
+#define WT_ERR_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ goto err; \
+} while (0)
+
+#define WT_RET_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ return (WT_ERROR); \
+} while (0)
+
+/*
+ * __wt_verify_dsk_image --
+ * Verify a single block as read from disk.
+ */
+int
+__wt_verify_dsk_image(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, size_t size)
+{
+ const uint8_t *p, *end;
+ u_int i;
+ uint8_t flags;
+
+ /* Check the page type. */
+ switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_OVFL:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ break;
+ case WT_PAGE_INVALID:
+ default:
+ WT_RET_VRFY(session,
+ "page at %s has an invalid type of %" PRIu32,
+ addr, dsk->type);
+ }
+
+ /* Check the page record number. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ if (dsk->recno != 0)
+ break;
+ WT_RET_VRFY(session,
+ "%s page at %s has a record number of zero",
+ __wt_page_type_string(dsk->type), addr);
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_OVFL:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if (dsk->recno == 0)
+ break;
+ WT_RET_VRFY(session,
+ "%s page at %s has a non-zero record number",
+ __wt_page_type_string(dsk->type), addr);
+ }
+
+ /* Check the page flags. */
+ flags = dsk->flags;
+ if (LF_ISSET(WT_PAGE_COMPRESSED))
+ LF_CLR(WT_PAGE_COMPRESSED);
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
+ LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+ WT_RET_VRFY(session,
+ "page at %s has invalid flags combination: 0x%"
+ PRIx8,
+ addr, dsk->flags);
+ if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
+ LF_CLR(WT_PAGE_EMPTY_V_ALL);
+ if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+ LF_CLR(WT_PAGE_EMPTY_V_NONE);
+ }
+ if (flags != 0)
+ WT_RET_VRFY(session,
+ "page at %s has invalid flags set: 0x%" PRIx8,
+ addr, flags);
+
+ /* Check that the unused page header bytes are zeroed. */
+ for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "page at %s has non-zero unused page header bytes",
+ addr);
+
+ /*
+ * Any bytes after the data chunk should be nul bytes; ignore if the
+ * size is 0, that allows easy checking of disk images where we don't
+ * have the size.
+ */
+ if (size != 0) {
+ p = (uint8_t *)dsk + dsk->mem_size;
+ end = (uint8_t *)dsk + size;
+ for (; p < end; ++p)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "%s page at %s has non-zero trailing bytes",
+ __wt_page_type_string(dsk->type), addr);
+ }
+
+ /* Verify the items on the page. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ return (__verify_dsk_col_int(session, addr, dsk));
+ case WT_PAGE_COL_FIX:
+ return (__verify_dsk_col_fix(session, addr, dsk));
+ case WT_PAGE_COL_VAR:
+ return (__verify_dsk_col_var(session, addr, dsk));
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ return (__verify_dsk_row(session, addr, dsk));
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_OVFL:
+ return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
+ WT_ILLEGAL_VALUE(session);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_verify_dsk --
+ * Verify a single Btree page as read from disk.
+ */
+int
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
+{
+ return (__wt_verify_dsk_image(session, addr, buf->data, buf->size));
+}
+
+/*
+ * __verify_dsk_row --
+ * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
+ */
+static int
+__verify_dsk_row(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_ITEM(last_ovfl);
+ WT_DECL_ITEM(last_pfx);
+ WT_DECL_RET;
+ WT_ITEM *last;
+ enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
+ void *huffman;
+ uint32_t cell_num, cell_type, i, key_cnt, prefix;
+ uint8_t *end;
+ int cmp;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
+ last = last_ovfl;
+
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_cell_type = FIRST;
+ cell_num = 0;
+ key_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
+ ret = __err_cell_corrupted(session, cell_num, addr);
+ goto err;
+ }
+
+ /* Check the raw and collapsed cell types. */
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /*
+ * Check ordering relationships between the WT_CELL entries.
+ * For row-store internal pages, check for:
+ * two values in a row,
+ * two keys in a row,
+ * a value as the first cell on a page.
+ * For row-store leaf pages, check for:
+ * two values in a row,
+ * a value as the first cell on a page.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ ++key_cnt;
+ switch (last_cell_type) {
+ case FIRST:
+ case WAS_VALUE:
+ break;
+ case WAS_KEY:
+ if (dsk->type == WT_PAGE_ROW_LEAF)
+ break;
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent keys",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_KEY;
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ switch (last_cell_type) {
+ case FIRST:
+ WT_ERR_VRFY(session,
+ "page at %s begins with a value", addr);
+ case WAS_KEY:
+ break;
+ case WAS_VALUE:
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent values",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_VALUE;
+ break;
+ }
+
+ /* Check if any referenced item has a valid address. */
+ switch (cell_type) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_VALUE_OVFL:
+ if (!bm->addr_valid(bm,
+ session, unpack->data, unpack->size))
+ goto eof;
+ break;
+ }
+
+ /*
+ * Remaining checks are for key order and prefix compression.
+ * If this cell isn't a key, we're done, move to the next cell.
+ * If this cell is an overflow item, instantiate the key and
+ * compare it with the last key. Otherwise, we have to deal
+ * with prefix compression.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+ goto key_compare;
+ default:
+ /* Not a key -- continue with the next cell. */
+ continue;
+ }
+
+ /*
+ * Prefix compression checks.
+ *
+ * Confirm the first non-overflow key on a page has a zero
+ * prefix compression count.
+ */
+ prefix = unpack->prefix;
+ if (last_pfx->size == 0 && prefix != 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " key on page at %s is the first "
+ "non-overflow key on the page and has a non-zero "
+ "prefix compression value",
+ cell_num, addr);
+
+ /* Confirm the prefix compression count is possible. */
+ if (cell_num > 1 && prefix > last->size)
+ WT_ERR_VRFY(session,
+ "key %" PRIu32 " on page at %s has a prefix "
+ "compression count of %" PRIu32 ", larger than "
+ "the length of the previous key, %" WT_SIZET_FMT,
+ cell_num, addr, prefix, last->size);
+
+ /*
+ * If Huffman decoding is required, unpack the cell to build the
+ * key, then resolve the prefix. Otherwise, we can do it faster
+ * internally because we don't have to shuffle memory around as
+ * much.
+ */
+ if (huffman != NULL) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+
+ /*
+ * If there's a prefix, make sure there's enough buffer
+ * space, then shift the decoded data past the prefix
+ * and copy the prefix into place. Take care with the
+ * pointers: current->data may be pointing inside the
+ * buffer.
+ */
+ if (prefix != 0) {
+ WT_ERR(__wt_buf_grow(
+ session, current, prefix + current->size));
+ memmove((uint8_t *)current->mem + prefix,
+ current->data, current->size);
+ memcpy(current->mem, last->data, prefix);
+ current->data = current->mem;
+ current->size += prefix;
+ }
+ } else {
+ /*
+ * Get the cell's data/length and make sure we have
+ * enough buffer space.
+ */
+ WT_ERR(__wt_buf_init(
+ session, current, prefix + unpack->size));
+
+ /* Copy the prefix then the data into place. */
+ if (prefix != 0)
+ memcpy(current->mem, last->data, prefix);
+ memcpy((uint8_t *)current->mem + prefix, unpack->data,
+ unpack->size);
+ current->size = prefix + unpack->size;
+ }
+
+key_compare: /*
+ * Compare the current key against the last key.
+ *
+ * Be careful about the 0th key on internal pages: we only store
+ * the first byte and custom collators may not be able to handle
+ * truncated keys.
+ */
+ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
+ (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
+ WT_ERR(__wt_compare(
+ session, btree->collator, last, current, &cmp));
+ if (cmp >= 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " and %" PRIu32 " keys on "
+ "page at %s are incorrectly sorted",
+ cell_num - 2, cell_num, addr);
+ }
+
+ /*
+ * Swap the buffers: last always references the last key entry,
+ * last_pfx and last_ovfl reference the last prefix-compressed
+ * and last overflow key entries. Current gets pointed to the
+ * buffer we're not using this time around, which is where the
+ * next key goes.
+ */
+ last = current;
+ if (cell_type == WT_CELL_KEY) {
+ current = last_pfx;
+ last_pfx = last;
+ } else {
+ current = last_ovfl;
+ last_ovfl = last;
+ }
+ WT_ASSERT(session, last != current);
+ }
+ WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ /*
+ * On row-store internal pages, and on row-store leaf pages, where the
+ * "no empty values" flag is set, the key count should be equal to half
+ * the number of physical entries. On row-store leaf pages where the
+ * "all empty values" flag is set, the key count should be equal to the
+ * number of physical entries.
+ */
+ if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s has a key count of %" PRIu32 " and a "
+ "physical entry count of %" PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
+ key_cnt != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'all empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
+ key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'no empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+
+ if (0) {
+eof: ret = __err_eof(session, cell_num, addr);
+ }
+
+ if (0) {
+err: if (ret == 0)
+ ret = WT_ERROR;
+ }
+ __wt_scr_free(&current);
+ __wt_scr_free(&last_pfx);
+ __wt_scr_free(&last_ovfl);
+ return (ret);
+}
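
The non-Huffman branch above rebuilds each prefix-compressed key by gluing "prefix" bytes of the previous key onto the cell's suffix. That reconstruction in isolation, as a hypothetical helper (the caller guarantees buf holds at least prefix + suffix_len bytes):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static size_t
    key_decompress(uint8_t *buf, const uint8_t *last_key, size_t prefix,
        const uint8_t *suffix, size_t suffix_len)
    {
        memcpy(buf, last_key, prefix);              /* shared prefix */
        memcpy(buf + prefix, suffix, suffix_len);   /* cell's suffix */
        return (prefix + suffix_len);
    }
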
+
+/*
+ * __verify_dsk_col_int --
+ * Walk a WT_PAGE_COL_INT disk page and verify it.
+ */
+static int
+__verify_dsk_col_int(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t cell_num, i;
+ uint8_t *end;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, addr));
+
+ /* Check the raw and collapsed cell types. */
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+
+ /* Check if any referenced item is entirely in the file. */
+ if (!bm->addr_valid(bm, session, unpack->data, unpack->size))
+ return (__err_eof(session, cell_num, addr));
+ }
+ WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ return (0);
+}
+
+/*
+ * __verify_dsk_col_fix --
+ * Walk a WT_PAGE_COL_FIX disk page and verify it.
+ */
+static int
+__verify_dsk_col_fix(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ uint32_t datalen;
+
+ btree = S2BT(session);
+
+ datalen = __bitstr_size(btree->bitcnt * dsk->u.entries);
+ return (__verify_dsk_chunk(session, addr, dsk, datalen));
+}
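
Fixed-length column-store entries are bitcnt-bit values packed into a bit string, so the expected data length is the total bit count rounded up to whole bytes; that is presumably all __bitstr_size computes. The arithmetic by itself:

    #include <stdint.h>

    /* Bytes needed for "entries" values of "bitcnt" bits each. */
    static uint32_t
    fix_datalen(uint32_t bitcnt, uint32_t entries)
    {
        return ((bitcnt * entries + 7) / 8);
    }
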
+
+/*
+ * __verify_dsk_col_var --
+ * Walk a WT_PAGE_COL_VAR disk page and verify it.
+ */
+static int
+__verify_dsk_col_var(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ size_t last_size;
+ uint32_t cell_num, cell_type, i;
+ int last_deleted;
+ const uint8_t *last_data;
+ uint8_t *end;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_data = NULL;
+ last_size = 0;
+ last_deleted = 0;
+
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, addr));
+
+ /* Check the raw and collapsed cell types. */
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /* Check if any referenced item is entirely in the file. */
+ if (cell_type == WT_CELL_VALUE_OVFL &&
+ !bm->addr_valid(bm, session, unpack->data, unpack->size))
+ return (__err_eof(session, cell_num, addr));
+
+ /*
+ * Compare the last two items and see if reconciliation missed
+ * a chance for RLE encoding. We don't have to care about data
+ * encoding or anything else, a byte comparison is enough.
+ */
+ if (last_deleted == 1) {
+ if (cell_type == WT_CELL_DEL)
+ goto match_err;
+ } else
+ if (cell_type == WT_CELL_VALUE &&
+ last_data != NULL &&
+ last_size == unpack->size &&
+ memcmp(last_data, unpack->data, last_size) == 0)
+match_err: WT_RET_VRFY(session,
+ "data entries %" PRIu32 " and %" PRIu32
+ " on page at %s are identical and should "
+ "have been run-length encoded",
+ cell_num - 1, cell_num, addr);
+
+ switch (cell_type) {
+ case WT_CELL_DEL:
+ last_deleted = 1;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ last_deleted = 0;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE:
+ last_deleted = 0;
+ last_data = unpack->data;
+ last_size = unpack->size;
+ break;
+ }
+ }
+ WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ return (0);
+}
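
The RLE check above flags adjacent cells that reconciliation should have merged: two byte-identical values, or two deletes, in a row. The comparison at its core (a deleted cell is represented here by a NULL data pointer, mirroring last_data above):

    #include <stddef.h>
    #include <string.h>

    /* Return nonzero if two adjacent cells should have been one RLE cell. */
    static int
    rle_missed(const void *a, size_t alen, const void *b, size_t blen)
    {
        if (a == NULL && b == NULL)     /* two deletes in a row */
            return (1);
        return (a != NULL && b != NULL &&
            alen == blen && memcmp(a, b, alen) == 0);
    }
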
+
+/*
+ * __verify_dsk_memsize --
+ * Verify the last cell on the page matches the page's memory size.
+ */
+static int
+__verify_dsk_memsize(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
+{
+ size_t len;
+
+ /*
+ * We use the fact that cells exactly fill a page to detect the case of
+ * a row-store leaf page where the last cell is a key (that is, there's
+ * no subsequent value cell). Check for any page type containing cells.
+ */
+ len = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell);
+ if (len == 0)
+ return (0);
+ WT_RET_VRFY(session,
+ "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data "
+ "after the last cell",
+ __wt_page_type_string(dsk->type), addr, len);
+}
+
+/*
+ * __verify_dsk_chunk --
+ * Verify a Chunk O' Data on a Btree page.
+ */
+static int
+__verify_dsk_chunk(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen)
+{
+ WT_BTREE *btree;
+ uint8_t *p, *end;
+
+ btree = S2BT(session);
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ /*
+ * Fixed-length column-store and overflow pages are simple chunks of
+ * data.
+ */
+ if (datalen == 0)
+ WT_RET_VRFY(session,
+ "%s page at %s has no data",
+ __wt_page_type_string(dsk->type), addr);
+
+ /* Verify the data doesn't overflow the end of the page. */
+ p = WT_PAGE_HEADER_BYTE(btree, dsk);
+ if (p + datalen > end)
+ WT_RET_VRFY(session,
+ "data on page at %s extends past the end of the page",
+ addr);
+
+ /* Any bytes after the data chunk should be nul bytes. */
+ for (p += datalen; p < end; ++p)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "%s page at %s has non-zero trailing bytes",
+ __wt_page_type_string(dsk->type), addr);
+
+ return (0);
+}
+
+/*
+ * __err_cell_corrupted --
+ * Generic corrupted cell, we couldn't read it.
+ */
+static int
+__err_cell_corrupted(
+ WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "item %" PRIu32 " on page at %s is a corrupted cell",
+ entry_num, addr);
+}
+
+/*
+ * __err_cell_type --
+ * Generic illegal cell type for a particular page type error.
+ */
+static int
+__err_cell_type(WT_SESSION_IMPL *session,
+ uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type)
+{
+ switch (cell_type) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ if (dsk_type == WT_PAGE_COL_INT ||
+ dsk_type == WT_PAGE_ROW_INT)
+ return (0);
+ break;
+ case WT_CELL_DEL:
+ if (dsk_type == WT_PAGE_COL_VAR)
+ return (0);
+ break;
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ if (dsk_type == WT_PAGE_ROW_INT ||
+ dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT_PFX:
+ if (dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL_RM:
+ /*
+ * Removed overflow cells are in-memory only, it's an error to
+ * ever see one on a disk page.
+ */
+ break;
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk_type == WT_PAGE_COL_VAR ||
+ dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ default:
+ break;
+ }
+
+ WT_RET_VRFY(session,
+ "illegal cell and page type combination: cell %" PRIu32
+ " on page at %s is a %s cell on a %s page",
+ entry_num, addr,
+ __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type));
+}
+
+/*
+ * __err_eof --
+ * Generic item references non-existent file pages error.
+ */
+static int
+__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "off-page item %" PRIu32
+ " on page at %s references non-existent file pages",
+ entry_num, addr);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
new file mode 100644
index 00000000000..ef35d215ec0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -0,0 +1,285 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_tree_walk --
+ * Move to the next/previous page in the tree.
+ */
+int
+__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *couple, *ref;
+ WT_TXN_STATE *txn_state;
+ int descending, prev, skip;
+ uint32_t slot;
+
+ btree = S2BT(session);
+ descending = 0;
+
+ /*
+ * Tree walks are special: they look inside page structures that splits
+ * may want to free. Publish that the tree is active during this
+ * window.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+
+ /*
+ * !!!
+ * Fast-truncate currently only works on row-store trees.
+ */
+ if (btree->type != BTREE_ROW)
+ LF_CLR(WT_READ_TRUNCATE);
+
+ prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;
+
+ /*
+ * Pin a transaction ID, required to safely look at page index
+ * structures, if our caller has not already done so.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.last_running;
+ else
+ txn_state = NULL;
+
+ /*
+ * There are multiple reasons and approaches to walking the in-memory
+ * tree:
+ *
+ * (1) finding pages to evict (the eviction server);
+ * (2) writing just dirty leaves or internal nodes (checkpoint);
+ * (3) discarding pages (close);
+ * (4) truncating pages in a range (fast truncate);
+ * (5) skipping pages based on outside information (compaction);
+ * (6) cursor scans (applications).
+ *
+ * Except for cursor scans and compaction, the walk is limited to the
+ * cache, no pages are read. In all cases, hazard pointers protect the
+ * walked pages from eviction.
+ *
+ * Walks use hazard-pointer coupling through the tree and that's OK
+ * (hazard pointers can't deadlock, so there's none of the usual
+ * problems found when logically locking up a btree). If the eviction
+ * thread tries to evict the active page, it fails because of our
+ * hazard pointer. If eviction tries to evict our parent, that fails
+ * because the parent has a child page that can't be discarded. We do
+ * play one game: don't couple up to our parent and then back down to a
+ * new leaf, couple to the next page to which we're descending, it
+ * saves a hazard-pointer swap for each cursor page movement.
+ *
+ * !!!
+ * NOTE: we depend on the fact it's OK to release a page we don't hold,
+ * that is, it's OK to release couple when couple is set to NULL.
+ *
+ * Take a copy of any held page and clear the return value. Remember
+ * the hazard pointer we're currently holding.
+ *
+ * We may be passed a pointer to btree->evict_page that we are clearing
+ * here. We check when discarding pages that we're not discarding that
+ * page, so this clear must be done before the page is released.
+ */
+ couple = ref = *refp;
+ *refp = NULL;
+
+ /* If no page is active, begin a walk from the start of the tree. */
+ if (ref == NULL) {
+ ref = &btree->root;
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+
+ascend: /*
+ * If the active page was the root, we've reached the walk's end.
+ * Release any hazard-pointer we're holding.
+ */
+ if (__wt_ref_is_root(ref)) {
+ WT_ERR(__wt_page_release(session, couple, flags));
+ goto done;
+ }
+
+ /* Figure out the current slot in the WT_REF array. */
+ __wt_page_refp(session, ref, &pindex, &slot);
+
+ if (0) {
+restart: /*
+ * The page we're moving to might have split, in which case find
+ * the last position we held.
+ *
+ * If we were starting a tree walk, begin again.
+ *
+ * If we were in the process of descending, repeat the descent.
+ * If we were moving within a single level of the tree, repeat
+ * the last move.
+ */
+ ref = couple;
+ if (ref == &btree->root) {
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if (descending)
+ goto descend;
+ }
+
+ for (;;) {
+ /*
+ * If we're at the last/first slot on the page, return this page
+ * in post-order traversal. Otherwise we move to the next/prev
+ * slot and left/right-most element in its subtree.
+ */
+ if ((prev && slot == 0) ||
+ (!prev && slot == pindex->entries - 1)) {
+ ref = ref->home->pg_intl_parent_ref;
+
+ /* Optionally skip internal pages. */
+ if (LF_ISSET(WT_READ_SKIP_INTL))
+ goto ascend;
+
+ /*
+ * We've ascended the tree and are returning an internal
+ * page. If it's the root, discard our hazard pointer,
+ * otherwise, swap our hazard pointer for the page we'll
+ * return.
+ */
+ if (__wt_ref_is_root(ref))
+ WT_ERR(__wt_page_release(
+ session, couple, flags));
+ else {
+ /*
+ * Locate the reference to our parent page then
+ * swap our child hazard pointer for the parent.
+ * We don't handle a restart return because it
+ * would require additional complexity in the
+ * restart code (ascent code somewhat like the
+ * descent code already there), and it's not a
+ * possible return: we're moving to the parent
+ * of the current child, not another child of
+ * the same parent, there's no way our parent
+ * split.
+ */
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if ((ret = __wt_page_swap(
+ session, couple, ref, flags)) != 0) {
+ WT_TRET(__wt_page_release(
+ session, couple, flags));
+ WT_ERR(ret);
+ }
+ }
+
+ *refp = ref;
+ goto done;
+ }
+
+ if (prev)
+ --slot;
+ else
+ ++slot;
+
+ for (descending = 0;;) {
+ ref = pindex->index[slot];
+
+ if (LF_ISSET(WT_READ_CACHE)) {
+ /*
+ * Only look at unlocked pages in memory:
+ * fast-path some common cases.
+ */
+ if (LF_ISSET(WT_READ_NO_WAIT) &&
+ ref->state != WT_REF_MEM)
+ break;
+ } else if (LF_ISSET(WT_READ_TRUNCATE)) {
+ /*
+ * If deleting a range, try to delete the page
+ * without instantiating it.
+ */
+ WT_ERR(__wt_delete_page(session, ref, &skip));
+ if (skip)
+ break;
+ } else if (LF_ISSET(WT_READ_COMPACT)) {
+ /*
+ * Skip deleted pages, rewriting them doesn't
+ * seem useful.
+ */
+ if (ref->state == WT_REF_DELETED)
+ break;
+
+ /*
+ * If the page is in-memory, we want to look at
+ * it (it may have been modified and written,
+ * and the current location is the interesting
+ * one in terms of compaction, not the original
+ * location). If the page isn't in-memory, test
+ * if the page will help with compaction, don't
+ * read it if we don't have to.
+ */
+ if (ref->state == WT_REF_DISK) {
+ WT_ERR(__wt_compact_page_skip(
+ session, ref, &skip));
+ if (skip)
+ break;
+ }
+ } else {
+ /*
+ * If iterating a cursor, try to skip deleted
+ * pages that are visible to us.
+ */
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref))
+ break;
+ }
+
+ ret = __wt_page_swap(session, couple, ref, flags);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ if (ret == WT_RESTART)
+ goto restart;
+ WT_ERR(ret);
+
+ /*
+ * Entering a new page: configure for traversal of any
+ * internal page's children, else return (or optionally
+ * skip) the leaf page.
+ */
+descend: couple = ref;
+ page = ref->page;
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_COL_INT) {
+ pindex = WT_INTL_INDEX_COPY(page);
+ slot = prev ? pindex->entries - 1 : 0;
+ descending = 1;
+ } else if (LF_ISSET(WT_READ_SKIP_LEAF))
+ goto ascend;
+ else {
+ *refp = ref;
+ goto done;
+ }
+ }
+ }
+
+done:
+err: if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ WT_LEAVE_PAGE_INDEX(session);
+ return (ret);
+}
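+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the traversal order __wt_tree_walk
+ * produces, not part of the WiredTiger sources. It assumes a toy tree with
+ * parent pointers and no concurrency: there is no hazard-pointer coupling,
+ * splitting or page reading here. The WT_EDITOR_SKETCH guard, TOY_NODE,
+ * toy_slot() and toy_walk_next() are all hypothetical names.
+ */
+typedef struct toy_node {
+ struct toy_node *parent; /* NULL for the root */
+ struct toy_node **child; /* NULL for a leaf */
+ int entries; /* Number of children */
+} TOY_NODE;
+
+/* Return a node's slot in its parent's child array. */
+static int
+toy_slot(TOY_NODE *node)
+{
+ int i;
+
+ for (i = 0; i < node->parent->entries; ++i)
+ if (node->parent->child[i] == node)
+ return (i);
+ return (-1); /* Unreachable in a consistent tree. */
+}
+
+/*
+ * Return the next node of a forward walk (NULL at the end): leaves in key
+ * order, each internal node after all of its children, the root last.
+ */
+static TOY_NODE *
+toy_walk_next(TOY_NODE *root, TOY_NODE *node)
+{
+ int slot;
+
+ if (node == NULL) /* Starting the walk. */
+ node = root;
+ else {
+ if (node == root) /* The root ends the walk. */
+ return (NULL);
+ slot = toy_slot(node);
+ if (slot == node->parent->entries - 1)
+ return (node->parent); /* Ascend, post-order. */
+ node = node->parent->child[slot + 1];
+ }
+ while (node->child != NULL) /* Descend to the left-most leaf. */
+ node = node->child[0];
+ return (node);
+}
+#endif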
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
new file mode 100644
index 00000000000..3a4a2a2987d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -0,0 +1,223 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __col_insert_alloc(
+ WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);
+
+/*
+ * __wt_col_modify --
+ * Column-store delete, insert, and update.
+ */
+int
+__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_ITEM _value;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd;
+ size_t ins_size, upd_size;
+ u_int i, skipdepth;
+ int append, logged;
+
+ btree = cbt->btree;
+ ins = NULL;
+ page = cbt->ref->page;
+ append = logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove) {
+ if (btree->type == BTREE_COL_FIX) {
+ value = &_value;
+ value->data = "";
+ value->size = 1;
+ } else
+ value = NULL;
+ } else {
+ /*
+ * There's some chance the application specified a record past
+ * the last record on the page. If that's the case, and we're
+ * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
+ * append list, not the update list. In addition, a recno of
+ * 0 implies an append operation: we're allocating a new row.
+ */
+ if (recno == 0 ||
+ recno > (btree->type == BTREE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page)))
+ append = 1;
+ }
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Delete, insert or update a column-store entry.
+ *
+ * If modifying a previously modified record, create a new WT_UPDATE
+ * entry and have a serialized function link it into an existing
+ * WT_INSERT entry's WT_UPDATE list.
+ *
+ * Else, allocate an insert array as necessary, build a WT_INSERT and
+ * WT_UPDATE structure pair, and call a serialized function to insert
+ * the WT_INSERT structure.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL) {
+ /*
+ * If we are restoring updates that couldn't be evicted, the
+ * key must not exist on the new page.
+ */
+ WT_ASSERT(session, upd == NULL);
+
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = cbt->ins->upd));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, &cbt->ins->upd, &upd, upd_size));
+ } else {
+ /* Allocate the append/update list reference as necessary. */
+ if (append) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_append, ins_headp, 1);
+ ins_headp = &page->modify->mod_append[0];
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp, 1);
+ ins_headp = &page->modify->mod_update[0];
+ } else {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp,
+ page->pg_var_entries);
+ ins_headp = &page->modify->mod_update[cbt->slot];
+ }
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD may
+ * have just been allocated, the WT_INSERT certainly was).
+ */
+ WT_ERR(__col_insert_alloc(
+ session, recno, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, or there was
+ * no search because the record number has not been allocated
+ * yet, the cursor's information cannot be correct: the
+ * search couldn't have initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Append or insert the WT_INSERT structure. */
+ if (append)
+ WT_ERR(__wt_col_append_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, &cbt->recno, skipdepth));
+ else
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ /* If the update was successful, add it to the in-memory log. */
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
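+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the "set next first, then publish the
+ * head" ordering used for the WT_UPDATE chain above; not part of the
+ * WiredTiger sources. C11 atomics stand in for the serialized function and
+ * its barriers, and a single writer is assumed (in WiredTiger the serial
+ * lock provides that). TOY_UPD and toy_upd_prepend() are hypothetical
+ * names.
+ */
+#include <stdatomic.h>
+
+typedef struct toy_upd {
+ struct toy_upd *next;
+ int value;
+} TOY_UPD;
+
+/*
+ * Prepend an update so a concurrent reader walking the chain sees either
+ * the old head or the fully-initialized new entry, never a half-built one.
+ */
+static void
+toy_upd_prepend(TOY_UPD *_Atomic *headp, TOY_UPD *new_upd)
+{
+ /* Point the new entry at the current chain... */
+ new_upd->next = atomic_load_explicit(headp, memory_order_relaxed);
+ /* ...then make it visible, with release semantics. */
+ atomic_store_explicit(headp, new_upd, memory_order_release);
+}
+#endif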
+
+/*
+ * __col_insert_alloc --
+ * Column-store insert: allocate a WT_INSERT structure and fill it in.
+ */
+static int
+__col_insert_alloc(WT_SESSION_IMPL *session,
+ uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+ WT_INSERT *ins;
+ size_t ins_size;
+
+ /*
+ * Allocate the WT_INSERT structure and skiplist pointers, then copy
+ * the record number into place.
+ */
+ ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
+ WT_RET(__wt_calloc(session, 1, ins_size, &ins));
+
+ WT_INSERT_RECNO(ins) = recno;
+
+ *insp = ins;
+ *ins_sizep = ins_size;
+ return (0);
+}
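+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal, single-threaded sketch of the ins_stack
+ * bookkeeping used above, not part of the WiredTiger sources. The search
+ * records, per level, the address of the pointer the new entry must be
+ * linked through, then links it in; the real code defers that final step
+ * to a serialized function. TOY_DEPTH, TOY_INS and toy_skip_insert() are
+ * hypothetical names, and "head" is a sentinel node standing in for
+ * WT_INSERT_HEAD.
+ */
+#define TOY_DEPTH 4
+
+typedef struct toy_ins {
+ uint64_t recno;
+ struct toy_ins *next[TOY_DEPTH];
+} TOY_INS;
+
+static void
+toy_skip_insert(TOY_INS *head, TOY_INS *new_ins, u_int skipdepth)
+{
+ TOY_INS *node, **stack[TOY_DEPTH];
+ int i;
+
+ /* Search: per level, remember where the new entry belongs. */
+ for (node = head, i = TOY_DEPTH - 1; i >= 0; --i) {
+ while (node->next[i] != NULL &&
+ node->next[i]->recno < new_ins->recno)
+ node = node->next[i];
+ stack[i] = &node->next[i];
+ }
+
+ /* Link the new entry in, bottom level first. */
+ for (i = 0; i < (int)skipdepth; ++i) {
+ new_ins->next[i] = *stack[i];
+ *stack[i] = new_ins;
+ }
+}
+#endif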
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
new file mode 100644
index 00000000000..e4083e2282f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -0,0 +1,199 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_col_search --
+ * Search a column-store tree for a specific record-based key.
+ */
+int
+__wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t base, indx, limit;
+ int depth;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ if (page->type != WT_PAGE_COL_INT)
+ break;
+
+ WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ base = pindex->entries;
+ descent = pindex->index[base - 1];
+
+ /* Fast path appends. */
+ if (recno >= descent->key.recno)
+ goto descend;
+
+ /* Binary search of internal pages. */
+ for (base = 0,
+ limit = pindex->entries - 1; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+
+ if (recno == descent->key.recno)
+ break;
+ if (recno < descent->key.recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+descend: /*
+ * Reference the slot used for next step down the tree.
+ *
+ * Base is the smallest index greater than recno and may be the
+ * (last + 1) index. The slot for descent is the one before
+ * base.
+ */
+ if (recno != descent->key.recno) {
+ /*
+ * We don't have to correct for base == 0 because the
+ * only way for base to be 0 is if recno is the page's
+ * starting recno.
+ */
+ WT_ASSERT(session, base > 0);
+ descent = pindex->index[base - 1];
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ page = current->page;
+ cbt->ref = current;
+ cbt->recno = recno;
+ cbt->compare = 0;
+
+ /*
+ * Set the on-page slot to an impossible value larger than any possible
+ * slot (it's used to interpret the search function's return after the
+ * search returns an insert list for a page that has no entries).
+ */
+ cbt->slot = UINT32_MAX;
+
+ /*
+ * Search the leaf page. We do not check in the search path for a
+ * record greater than the maximum record in the tree; in that case,
+ * we arrive here with a record that's impossibly large for the page.
+ */
+ if (page->type == WT_PAGE_COL_FIX) {
+ if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
+ cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
+ goto past_end;
+ } else
+ ins_head = WT_COL_UPDATE_SINGLE(page);
+ } else
+ if ((cip = __col_var_search(page, recno)) == NULL) {
+ cbt->recno = __col_var_last_recno(page);
+ goto past_end;
+ } else {
+ cbt->slot = WT_COL_SLOT(page, cip);
+ ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ }
+
+ /*
+ * We have a match on the page, check for an update. Check the page's
+ * update list (fixed-length), or slot's update list (variable-length)
+ * for a better match. The only better match we can find is an exact
+ * match, otherwise the existing match on the page is the one we want.
+ * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
+ * until we know we have a useful entry.
+ */
+ if ((ins = __col_insert_search(
+ ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
+ if (recno == WT_INSERT_RECNO(ins)) {
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+ }
+ return (0);
+
+past_end:
+ /*
+ * A record past the end of the page's standard information. Check the
+ * append list; by definition, any record on the append list is closer
+ * than the last record on the page, so it's a better choice for return.
+ * This is a rarely used path: we normally find exact matches, because
+ * column-store files are dense, but in this case the caller searched
+ * past the end of the table.
+ *
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the position when we do the work.
+ */
+ cbt->ins_head = WT_COL_APPEND(page);
+ if (recno == UINT64_MAX)
+ cbt->ins = NULL;
+ else
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
+ if (cbt->ins == NULL)
+ cbt->compare = -1;
+ else {
+ cbt->recno = WT_INSERT_RECNO(cbt->ins);
+ if (recno == cbt->recno)
+ cbt->compare = 0;
+ else if (recno < cbt->recno)
+ cbt->compare = 1;
+ else
+ cbt->compare = -1;
+ }
+
+ /*
+ * Note if the record is past the maximum record in the tree, the cursor
+ * search functions need to know for fixed-length column-stores because
+ * appended records implicitly create any skipped records, and cursor
+ * search functions have to handle that case.
+ */
+ if (cbt->compare == -1)
+ F_SET(cbt, WT_CBT_MAX_RECORD);
+ return (0);
+}
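+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the internal-page binary search above,
+ * not part of the WiredTiger sources. The loop maintains the invariant
+ * that "base" ends as the smallest slot whose starting recno is greater
+ * than the search recno, so the child to descend into is at base - 1. It
+ * assumes recno is at least the page's starting recno, as the real
+ * search's assertion does. toy_descend_slot() is a hypothetical name.
+ */
+static uint32_t
+toy_descend_slot(
+ const uint64_t *start_recno, uint32_t entries, uint64_t recno)
+{
+ uint32_t base, indx, limit;
+
+ for (base = 0, limit = entries; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ if (start_recno[indx] <= recno) {
+ base = indx + 1;
+ --limit;
+ }
+ }
+
+ /*
+ * For example, starting recnos {1, 100, 200} and a search for 150
+ * end with base == 2: slot 1 covers the record.
+ */
+ return (base - 1);
+}
+#endif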
diff --git a/src/third_party/wiredtiger/src/btree/rec_evict.c b/src/third_party/wiredtiger/src/btree/rec_evict.c
new file mode 100644
index 00000000000..4696e78059e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_evict.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int);
+static void __rec_discard_tree(WT_SESSION_IMPL *, WT_REF *, int, int);
+static void __rec_excl_clear(WT_SESSION_IMPL *);
+static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_REF *);
+static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int);
+static int __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *);
+
+/*
+ * __wt_rec_evict --
+ * Reconciliation plus eviction.
+ */
+int
+__wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_TXN_STATE *txn_state;
+ int istree;
+
+ page = ref->page;
+ istree = 0;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p (%s)", page, __wt_page_type_string(page->type)));
+
+ /*
+ * Pin the oldest transaction ID: eviction looks at page structures
+ * that are freed when no transaction in the system needs them.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.oldest_id;
+ else
+ txn_state = NULL;
+
+ /*
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block our eviction of the page. If the
+ * check fails (for example, we find a child page that can't be merged),
+ * we're done. We have to make this check for clean pages, too: while
+ * it's unlikely eviction would choose an internal page with children,
+ * that's not disallowed anywhere.
+ */
+ WT_ERR(__rec_review(session, ref, exclusive, 1, &istree));
+
+ /*
+ * Update the page's modification reference, reconciliation might have
+ * changed it.
+ */
+ mod = page->modify;
+
+ /* Count evictions of internal pages during normal operation. */
+ if (!exclusive &&
+ (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
+ }
+
+ /* Discard any subtree rooted in this page. */
+ if (istree)
+ __rec_discard_tree(session, ref, exclusive, 1);
+
+ /* Update the reference and discard the page. */
+ if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
+ WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);
+
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ __rec_page_clean_update(session, ref);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
+ } else {
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ WT_ERR(
+ __rec_page_dirty_update(session, ref, exclusive));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
+ }
+
+ if (0) {
+err: /*
+ * If unable to evict this page, release exclusive reference(s)
+ * we've acquired.
+ */
+ if (!exclusive)
+ __rec_excl_clear(session);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
+ }
+ session->excl_next = 0;
+
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ return (ret);
+}
+
+/*
+ * __rec_page_clean_update --
+ * Update a clean page's reference on eviction.
+ */
+static void
+__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /*
+ * Discard the page and update the reference structure; if the page has
+ * an address, it's a disk page; if it has no address, it's a deleted
+ * page re-instantiated (for example, by searching) and never written.
+ */
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state,
+ ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
+}
+
+/*
+ * __rec_page_dirty_update --
+ * Update a dirty page's reference on eviction.
+ */
+static int
+__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_ADDR *addr;
+ WT_PAGE *parent;
+ WT_PAGE_MODIFY *mod;
+
+ parent = ref->home;
+ mod = ref->page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference a deleted page. The fact that
+ * reconciliation left the page "empty" means there's no older
+ * transaction in the system that might need to see an earlier
+ * version of the page. For that reason, we clear the address
+ * of the page, if we're forced to "read" into that namespace,
+ * we'll instantiate a new page instead of trying to read from
+ * the backing store.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ */
+ __wt_ref_out(session, ref);
+ ref->addr = NULL;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /* Split the page in memory. */
+ WT_RET(__wt_split_evict(session, ref, exclusive));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference the replacement page.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ *addr = mod->mod_replace;
+ mod->mod_replace.addr = NULL;
+ mod->mod_replace.size = 0;
+
+ __wt_ref_out(session, ref);
+ ref->addr = addr;
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_discard_tree --
+ * Discard the tree rooted in a page (that is, any pages merged into
+ * it), then the page itself.
+ */
+static void
+__rec_discard_tree(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top)
+{
+ WT_REF *child;
+
+ switch (ref->page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ /* For each entry in the page... */
+ WT_INTL_FOREACH_BEGIN(session, ref->page, child) {
+ if (child->state == WT_REF_DISK ||
+ child->state == WT_REF_DELETED)
+ continue;
+ WT_ASSERT(session,
+ exclusive || child->state == WT_REF_LOCKED);
+ __rec_discard_tree(session, child, exclusive, 0);
+ } WT_INTL_FOREACH_END;
+ /* FALLTHROUGH */
+ default:
+ if (!top)
+ __wt_ref_out(session, ref);
+ break;
+ }
+}
+
+/*
+ * __rec_review --
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block its eviction.
+ */
+static int
+__rec_review(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top, int *istree)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *child;
+ uint32_t flags;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Get exclusive access to the page if our caller doesn't have the tree
+ * locked down.
+ */
+ if (!exclusive) {
+ WT_RET(__hazard_exclusive(session, ref, top));
+
+ /*
+ * Now the page is locked, remove it from the LRU eviction
+ * queue. We have to do this before freeing the page memory or
+ * otherwise touching the reference because eviction paths
+ * assume a non-NULL reference on the queue is pointing at
+ * valid memory.
+ */
+ __wt_evict_list_clear_page(session, ref);
+ }
+
+ /*
+ * Recurse through the page's subtree: this happens first because we
+ * have to write pages in depth-first order, otherwise we'll dirty
+ * pages after we've written them.
+ */
+ if (WT_PAGE_IS_INTERNAL(page))
+ WT_INTL_FOREACH_BEGIN(session, page, child) {
+ switch (child->state) {
+ case WT_REF_DISK: /* On-disk */
+ case WT_REF_DELETED: /* On-disk, deleted */
+ break;
+ case WT_REF_MEM: /* In-memory */
+ /*
+ * Tell our caller if there's a subtree so we
+ * know to do a full walk when discarding the
+ * page.
+ */
+ *istree = 1;
+ WT_RET(__rec_review(
+ session, child, exclusive, 0, istree));
+ break;
+ case WT_REF_LOCKED: /* Being evicted */
+ case WT_REF_READING: /* Being read */
+ case WT_REF_SPLIT: /* Being split */
+ return (EBUSY);
+ WT_ILLEGAL_VALUE(session);
+ }
+ } WT_INTL_FOREACH_END;
+
+ mod = page->modify;
+
+ /*
+ * If the tree was deepened, there's a requirement that newly created
+ * internal pages not be evicted until all threads are known to have
+ * exited the original page index array, because evicting an internal
+ * page discards its WT_REF array, and a thread traversing the original
+ * page index array might see a freed WT_REF. During the split we set
+ * a transaction value, once that's globally visible, we know we can
+ * evict the created page.
+ */
+ if (!exclusive && mod != NULL && WT_PAGE_IS_INTERNAL(page) &&
+ !__wt_txn_visible_all(session, mod->mod_split_txn))
+ return (EBUSY);
+
+ /*
+ * If the file is being checkpointed, we can't evict dirty pages:
+ * if we write a page and free the previous version of the page, that
+ * previous version might be referenced by an internal page already
+ * written in the checkpoint, leaving the checkpoint inconsistent.
+ *
+ * Don't rely on new updates being skipped by the transaction used
+ * for transaction reads: (1) there are paths that dirty pages for
+ * artificial reasons; (2) internal pages aren't transactional; and
+ * (3) if an update was skipped during the checkpoint (leaving the page
+ * dirty), then rolled back, we could still successfully overwrite a
+ * page and corrupt the checkpoint.
+ *
+ * Further, we can't race with the checkpoint's reconciliation of
+ * an internal page as we evict a clean child from the page's subtree.
+ * This works in the usual way: eviction locks the page and then checks
+ * for existing hazard pointers, the checkpoint thread reconciling an
+ * internal page acquires hazard pointers on child pages it reads, and
+ * is blocked by the exclusive lock.
+ */
+ if (mod != NULL && btree->checkpointing &&
+ (__wt_page_is_modified(page) ||
+ F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
+ return (EBUSY);
+ }
+
+ /*
+ * Fail if any page in the top-level page's subtree won't be merged into
+ * its parent; the page that cannot be merged must be evicted first.
+ * The test is necessary but should not fire much: the eviction code is
+ * biased for leaf pages, an internal page shouldn't be selected for
+ * eviction until its children have been evicted.
+ *
+ * We have to write dirty pages to know their final state: a page
+ * marked empty may have had records added since reconciliation.
+ * Writing the page is expensive, so do a cheap test first: if it
+ * doesn't seem likely a subtree page can be merged, quit.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ /*
+ * If the page is dirty and can possibly change state, write it so we
+ * know the final state.
+ *
+ * If we have an exclusive lock (we're discarding the tree), assert
+ * there are no updates we cannot read.
+ *
+ * Otherwise, if the top-level page we're evicting is a leaf page, set
+ * the update-restore flag, so reconciliation will write blocks it can
+ * write and create a list of skipped updates for blocks it cannot
+ * write. This is how forced eviction of huge pages works: we take a
+ * big page and reconcile it into blocks, some of which we write and
+ * discard, the rest of which we re-create as smaller in-memory pages,
+ * (restoring the updates that stopped us from writing the block), and
+ * inserting the whole mess into the page's parent.
+ *
+ * Don't set the update-restore flag for internal pages, they don't
+ * have updates that can be saved and restored.
+ *
+ * Don't set the update-restore flag for small pages. (If a small
+ * page were selected by eviction and then modified, and we configure it
+ * for update-restore, we'll end up splitting one or two pages into the
+ * parent, which is a waste of effort. If we don't set update-restore,
+ * eviction will return EBUSY, which makes more sense, the page was just
+ * modified.)
+ *
+ * Don't set the update-restore flag for any page other than the
+ * top one; only the reconciled top page goes through the split path
+ * (and child pages are pages we expect to merge into the top page;
+ * they are not expected to split).
+ */
+ if (__wt_page_is_modified(page)) {
+ flags = WT_EVICTING;
+ if (exclusive)
+ LF_SET(WT_SKIP_UPDATE_ERR);
+ else if (top && !WT_PAGE_IS_INTERNAL(page) &&
+ page->memory_footprint > 10 * btree->maxleafpage)
+ LF_SET(WT_SKIP_UPDATE_RESTORE);
+ WT_RET(__wt_rec_write(session, ref, NULL, flags));
+ WT_ASSERT(session,
+ !__wt_page_is_modified(page) ||
+ LF_ISSET(WT_SKIP_UPDATE_RESTORE));
+ } else {
+ /*
+ * If the page was ever modified, make sure all of the updates
+ * on the page are old enough they can be discarded from cache.
+ */
+ if (!exclusive && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ return (EBUSY);
+ }
+
+ /*
+ * Repeat the test: fail if any page in the top-level page's subtree
+ * won't be merged into its parent.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ return (0);
+}
+
+/*
+ * __rec_excl_clear --
+ * Discard exclusive access and return a page's subtree to availability.
+ */
+static void
+__rec_excl_clear(WT_SESSION_IMPL *session)
+{
+ WT_REF *ref;
+ uint32_t i;
+
+ for (i = 0; i < session->excl_next; ++i) {
+ if ((ref = session->excl[i]) == NULL)
+ break;
+ WT_ASSERT(session,
+ ref->state == WT_REF_LOCKED && ref->page != NULL);
+ ref->state = WT_REF_MEM;
+ }
+}
+
+/*
+ * __hazard_exclusive --
+ * Request exclusive access to a page.
+ */
+static int
+__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
+{
+ /*
+ * Make sure there is space to track exclusive access so we can unlock
+ * to clean up.
+ */
+ WT_RET(__wt_realloc_def(session, &session->excl_allocated,
+ session->excl_next + 1, &session->excl));
+
+ /*
+ * Request exclusive access to the page. The top-level page should
+ * already be in the locked state, lock child pages in memory.
+ * If another thread already has this page, give up.
+ */
+ if (!top && !WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED))
+ return (EBUSY); /* We couldn't change the state. */
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ session->excl[session->excl_next++] = ref;
+
+ /* Check for a matching hazard pointer. */
+ if (__wt_page_hazard_check(session, ref->page) == NULL)
+ return (0);
+
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p hazard request failed", ref->page));
+ return (EBUSY);
+}
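+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the locking step in
+ * __hazard_exclusive, not part of the WiredTiger sources: atomically move
+ * a reference from "in memory" to "locked", then fail if a reader still
+ * holds a hazard pointer. C11 atomics stand in for WT_ATOMIC_CAS4, and
+ * for simplicity this sketch unlocks when the hazard check fails, where
+ * the real code defers cleanup to __rec_excl_clear. TOY_REF,
+ * TOY_MEM/TOY_LOCKED, hazard_held and toy_lock_exclusive() are
+ * hypothetical names.
+ */
+#include <stdatomic.h>
+#include <stdbool.h>
+
+enum { TOY_MEM, TOY_LOCKED };
+
+typedef struct {
+ _Atomic int state;
+} TOY_REF;
+
+static bool
+toy_lock_exclusive(TOY_REF *ref, bool (*hazard_held)(TOY_REF *))
+{
+ int expected = TOY_MEM;
+
+ if (!atomic_compare_exchange_strong(
+ &ref->state, &expected, TOY_LOCKED))
+ return (false); /* Another thread owns the page. */
+
+ if (hazard_held(ref)) { /* A reader still has it pinned. */
+ atomic_store(&ref->state, TOY_MEM);
+ return (false);
+ }
+ return (true); /* Exclusive access granted. */
+}
+#endif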
diff --git a/src/third_party/wiredtiger/src/btree/rec_split.c b/src/third_party/wiredtiger/src/btree/rec_split.c
new file mode 100644
index 00000000000..babec2cc295
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_split.c
@@ -0,0 +1,1121 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning: global variables to allow the binary to be patched; we don't yet have
+ * any real understanding of what might be useful to surface to applications.
+ */
+static u_int __split_deepen_max_internal_image = 100;
+static u_int __split_deepen_min_child = 10;
+static u_int __split_deepen_per_child = 100;
+static u_int __split_deepen_split_child = 100;
+
+/*
+ * Track allocation increments, matching the cache calculations, which add an
+ * estimate of allocation overhead to every object.
+ */
+#define WT_MEMSIZE_ADD(total, len) do { \
+ total += (len) + WT_ALLOC_OVERHEAD; \
+} while (0)
+#define WT_MEMSIZE_TRANSFER(from_decr, to_incr, len) do { \
+ WT_MEMSIZE_ADD(from_decr, len); \
+ WT_MEMSIZE_ADD(to_incr, len); \
+} while (0)
+
+/*
+ * __split_oldest_gen --
+ * Calculate the oldest active split generation.
+ */
+static uint64_t
+__split_oldest_gen(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *s;
+ uint64_t gen, oldest;
+ u_int i, session_cnt;
+
+ conn = S2C(session);
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1;
+ i < session_cnt;
+ i++, s++)
+ if (((gen = s->split_gen) != 0) && gen < oldest)
+ oldest = gen;
+
+ return (oldest);
+}
+
+/*
+ * __split_stash_add --
+ * Add a new entry into the session's split stash list.
+ */
+static int
+__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
+{
+ WT_SPLIT_STASH *stash;
+
+ WT_ASSERT(session, p != NULL);
+
+ /* Grow the list as necessary. */
+ WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
+ session->split_stash_cnt + 1, &session->split_stash));
+
+ stash = session->split_stash + session->split_stash_cnt++;
+ stash->split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ stash->p = p;
+ stash->len = len;
+
+ WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);
+
+ /* See if we can free any previous entries. */
+ if (session->split_stash_cnt > 1)
+ __wt_split_stash_discard(session);
+
+ return (0);
+}
+
+/*
+ * __wt_split_stash_discard --
+ * Discard any memory from a session's split stash that we can.
+ */
+void
+__wt_split_stash_discard(WT_SESSION_IMPL *session)
+{
+ WT_SPLIT_STASH *stash;
+ uint64_t oldest;
+ size_t i;
+
+ /* Get the oldest split generation. */
+ oldest = __split_oldest_gen(session);
+
+ for (i = 0, stash = session->split_stash;
+ i < session->split_stash_cnt;
+ ++i, ++stash) {
+ if (stash->p == NULL)
+ continue;
+ else if (stash->split_gen >= oldest)
+ break;
+ /*
+ * It's a bad thing if another thread is in this memory after
+ * we free it, make sure nothing good happens to that thread.
+ */
+ WT_STAT_FAST_CONN_ATOMIC_DECRV(
+ session, rec_split_stashed_bytes, stash->len);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(
+ session, rec_split_stashed_objects);
+ __wt_overwrite_and_free_len(session, stash->p, stash->len);
+ }
+
+ /*
+ * If there are enough free slots at the beginning of the list, shuffle
+ * everything down.
+ */
+ if (i > 100 || i == session->split_stash_cnt)
+ if ((session->split_stash_cnt -= i) > 0)
+ memmove(session->split_stash, stash,
+ session->split_stash_cnt * sizeof(*stash));
+}
+
+/*
+ * __wt_split_stash_discard_all --
+ * Discard all memory from a session's split stash.
+ */
+void
+__wt_split_stash_discard_all(
+ WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
+{
+ WT_SPLIT_STASH *stash;
+ size_t i;
+
+ /*
+ * This function is called during WT_CONNECTION.close to discard any
+ * memory that remains. For that reason, we take two WT_SESSION_IMPL
+ * arguments: session_safe is still linked to the WT_CONNECTION and
+ * can be safely used for calls to other WiredTiger functions, while
+ * session is the WT_SESSION_IMPL we're cleaning up.
+ */
+ for (i = 0, stash = session->split_stash;
+ i < session->split_stash_cnt;
+ ++i, ++stash)
+ if (stash->p != NULL)
+ __wt_free(session_safe, stash->p);
+
+ __wt_free(session_safe, session->split_stash);
+ session->split_stash_cnt = session->split_stash_alloc = 0;
+}
+
+/*
+ * __split_safe_free --
+ * Free a buffer if we can be sure no thread is accessing it, or schedule
+ * it to be freed otherwise.
+ */
+static int
+__split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
+{
+ /*
+ * We have swapped something in a page: if we don't have exclusive
+ * access, check whether there are other threads in the same tree.
+ */
+ if (!exclusive &&
+ __split_oldest_gen(session) == S2C(session)->split_gen + 1)
+ exclusive = 1;
+
+ if (exclusive) {
+ __wt_free(session, p);
+ return (0);
+ }
+
+ return (__split_stash_add(session, p, s));
+}
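+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal, single-threaded model of the split-generation
+ * reclamation scheme above, not part of the WiredTiger sources (no memory
+ * barriers, fixed reader count). Readers publish the generation they
+ * entered under; a retired buffer is stamped with the generation in force
+ * when it was unpublished and may be freed only once every active reader
+ * has moved past that generation. All toy_* names are hypothetical.
+ */
+#include <stdbool.h>
+
+#define TOY_READERS 8
+
+static uint64_t toy_global_gen = 1; /* Bumped on each retirement. */
+static uint64_t toy_reader_gen[TOY_READERS]; /* 0 when a reader is idle. */
+
+/* The oldest generation any active reader might still be using. */
+static uint64_t
+toy_oldest_gen(void)
+{
+ uint64_t gen, oldest;
+ int i;
+
+ for (i = 0, oldest = toy_global_gen + 1; i < TOY_READERS; ++i)
+ if ((gen = toy_reader_gen[i]) != 0 && gen < oldest)
+ oldest = gen;
+ return (oldest);
+}
+
+/* A retired buffer is safe to free once it predates every active reader. */
+static bool
+toy_can_free(uint64_t retired_gen)
+{
+ return (retired_gen < toy_oldest_gen());
+}
+#endif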
+
+/*
+ * __split_should_deepen --
+ * Return if we should deepen the tree.
+ */
+static int
+__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * Splits are based either on the number of child pages that will be
+ * created by the split (splitting an internal page that will be slow
+ * to search), or on the memory footprint of the parent page (avoiding
+ * an internal page that will eat up all of the cache and put eviction
+ * pressure on the system).
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+
+ /*
+ * Deepen the tree if the page's memory footprint is larger than the
+ * maximum size for a page in memory. We need an absolute minimum
+ * number of entries in order to split the page: if there is a single
+ * huge key, splitting won't help.
+ */
+ if (page->memory_footprint > S2BT(session)->maxmempage &&
+ pindex->entries >= __split_deepen_min_child)
+ return (1);
+
+ /*
+ * Deepen the tree if the page's memory footprint is at least N
+ * times the maximum internal page size chunk in the backing file and
+ * the split will result in at least N children in the newly created
+ * intermediate layer.
+ */
+ if (page->memory_footprint >
+ __split_deepen_max_internal_image * S2BT(session)->maxintlpage &&
+ pindex->entries >=
+ (__split_deepen_per_child * __split_deepen_split_child))
+ return (1);
+
+ return (0);
+}
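+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of how the tuning knobs above translate
+ * into a child count when the split runs, not part of the WiredTiger
+ * sources; toy_deepen_children() is a hypothetical name. With the defaults
+ * (per-child 100, minimum 10), a 25,000-entry index deepens into 250
+ * children and a 500-entry index into the 10-child minimum.
+ */
+static uint32_t
+toy_deepen_children(uint32_t entries)
+{
+ uint32_t children;
+
+ children = entries / __split_deepen_per_child;
+ if (children < __split_deepen_min_child)
+ children = __split_deepen_min_child;
+ return (children);
+}
+#endif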
+
+/*
+ * __split_ovfl_key_cleanup --
+ * Handle cleanup for on-page row-store overflow keys.
+ */
+static int
+__split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK kpack;
+ WT_IKEY *ikey;
+ uint32_t cell_offset;
+
+ /*
+ * A key being discarded (page split) or moved to a different page (page
+ * deepening) may be an on-page overflow key. Clear any reference to an
+ * underlying disk image, and, if the key hasn't been deleted, delete it
+ * along with any backing blocks.
+ */
+ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL)
+ return (0);
+ if ((cell_offset = ikey->cell_offset) == 0)
+ return (0);
+
+ /* Leak blocks rather than try this twice. */
+ ikey->cell_offset = 0;
+
+ cell = WT_PAGE_REF_OFFSET(page, cell_offset);
+ __wt_cell_unpack(cell, &kpack);
+ if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM)
+ WT_RET(__wt_ovfl_discard(session, cell));
+
+ return (0);
+}
+
+/*
+ * __split_ref_instantiate --
+ * Instantiate key/address pairs in memory in service of a split.
+ */
+static int
+__split_ref_instantiate(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK unpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ size_t size;
+ void *key;
+
+ /*
+ * Instantiate row-store keys, and column- and row-store addresses in
+ * the WT_REF structures referenced by a page that's being split (and
+ * deepening the tree). The WT_REF structures aren't moving, but the
+ * index references are moving from the page we're splitting to a set
+ * of child pages, and so we can no longer reference the block image
+ * that remains with the page being split.
+ *
+ * Track how much memory the parent is losing and the child gaining.
+ *
+ * No locking is required to update the WT_REF structure because we're
+ * the only thread splitting the parent page, and there's no way for
+ * readers to race with our updates of single pointers. The changes
+ * have to be written before the page goes away, of course, our caller
+ * owns that problem.
+ *
+ * Row-store keys, first.
+ */
+ if (page->type == WT_PAGE_ROW_INT) {
+ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
+ __wt_ref_key(page, ref, &key, &size);
+ WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
+ ref->key.ikey = ikey;
+ } else {
+ WT_RET(__split_ovfl_key_cleanup(session, page, ref));
+ WT_MEMSIZE_ADD(*parent_decrp,
+ sizeof(WT_IKEY) + ikey->size);
+ }
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_IKEY) + ikey->size);
+ }
+
+ /*
+ * If there's no address (the page has never been written), or the
+ * address has been instantiated, there's no work to do. Otherwise,
+ * get the address from the on-page cell.
+ */
+ if ((addr = ref->addr) == NULL)
+ return (0);
+ if (__wt_off_page(page, addr))
+ WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
+ sizeof(WT_ADDR) + addr->size);
+ else {
+ __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
+ WT_RET(__wt_calloc_def(session, 1, &addr));
+ if ((ret = __wt_strndup(
+ session, unpack.data, unpack.size, &addr->addr)) != 0) {
+ __wt_free(session, addr);
+ return (ret);
+ }
+ addr->size = (uint8_t)unpack.size;
+ addr->type =
+ unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
+ ref->addr = addr;
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
+ }
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
+ */
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp, first;
+
+ btree = S2BT(session);
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ recno = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
+
+ first = 1;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ if (last->size == 0) {
+ if (first)
+ first = 0;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+}
+#endif
+
+/*
+ * __split_deepen --
+ * Split an internal page in-memory, deepening the tree.
+ */
+static int
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
+ WT_REF **alloc_refp;
+ WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
+ size_t child_incr, parent_decr, parent_incr, size;
+ uint32_t children, chunk, i, j, remain, slots;
+ int panic;
+ void *p;
+
+ alloc_index = NULL;
+ parent_incr = parent_decr = 0;
+ panic = 0;
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+
+ /*
+ * Create N children, unless we are dealing with a large page without
+ * many entries, in which case split into the minimum number of pages.
+ */
+ children = WT_MAX(pindex->entries / __split_deepen_per_child,
+ __split_deepen_min_child);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
+ parent, pindex->entries, children));
+
+ /*
+ * If the workload is prepending/appending to the tree, we could deepen
+ * without bound. Don't let that happen, keep the first/last pages of
+ * the tree at their current level.
+ *
+ * XXX
+ * To improve this, we could track which pages were last merged into
+ * this page by eviction, and leave those pages alone, to prevent any
+ * sustained insert into the tree from deepening a single location.
+ */
+#undef SPLIT_CORRECT_1
+#define SPLIT_CORRECT_1 1 /* First page correction */
+#undef SPLIT_CORRECT_2
+#define SPLIT_CORRECT_2 2 /* First/last page correction */
+
+ /*
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
+ * the first/last slots of the allocated WT_PAGE_INDEX to point to the
+ * first/last pages we're keeping at the current level, and the rest of
+ * the slots to point to new WT_REF objects.
+ */
+ size = sizeof(WT_PAGE_INDEX) +
+ (children + SPLIT_CORRECT_2) * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children + SPLIT_CORRECT_2;
+ alloc_index->index[0] = pindex->index[0];
+ alloc_index->index[alloc_index->entries - 1] =
+ pindex->index[pindex->entries - 1];
+ for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++alloc_refp, ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, alloc_refp));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
+ }
+
+ /* Allocate child pages, and connect them into the new page index. */
+ chunk = (pindex->entries - SPLIT_CORRECT_2) / children;
+ remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1);
+ for (parent_refp = pindex->index + SPLIT_CORRECT_1,
+ alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, parent->type, 0, slots, 0, &child));
+
+ /*
+ * Initialize the parent page's child reference; we need a copy
+ * of the page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (parent->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(parent, *parent_refp, &p, &size);
+ WT_ERR(
+ __wt_row_ikey(session, 0, p, size, &ref->key.ikey));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY) + size);
+ } else
+ ref->key.recno = (*parent_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (parent->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*parent_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_only_modify_set(session, child);
+
+ /*
+ * Once the split goes live, the newly created internal pages
+ * might be evicted and their WT_REF structures freed. If those
+ * pages are evicted before threads exit the previous page index
+ * array, a thread might see a freed WT_REF. Set the eviction
+ * transaction requirement for the newly created internal pages.
+ */
+ child->modify->mod_split_txn = __wt_txn_new_id(session);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+ * the page that has a page index entry for the WT_REF is about
+ * to change.
+ */
+ child_incr = 0;
+ child_pindex = WT_INTL_INDEX_COPY(child);
+ for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
+ WT_ERR(__split_ref_instantiate(session,
+ parent, *parent_refp, &parent_decr, &child_incr));
+ *child_refp++ = *parent_refp++;
+
+ WT_MEMSIZE_TRANSFER(
+ parent_decr, child_incr, sizeof(WT_REF));
+ }
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1);
+ WT_ASSERT(session,
+ parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
+
+ /*
+ * Update the parent's index; this is the update which splits the page,
+ * making the change visible to threads descending the tree. From now
+ * on, we're committed to the split. If any subsequent work fails, we
+ * have to panic because we potentially have threads of control using
+ * the new page index we just swapped in.
+ *
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * parent remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ panic = 1;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * The moved reference structures now reference the wrong parent page,
+ * and we have to fix that up. The problem is revealed when a thread
+ * of control searches for a page's reference structure slot, and fails
+ * to find it because the page it's searching no longer references it.
+ * When that failure happens, the thread waits for the reference's home
+ * page to be updated, which we do here: walk the children and fix them
+ * up.
+ *
+ * We're not acquiring hazard pointers on these pages, they cannot be
+ * evicted because of the eviction transaction value set above.
+ */
+ for (parent_refp = alloc_index->index,
+ i = alloc_index->entries; i > 0; ++parent_refp, --i) {
+ parent_ref = *parent_refp;
+ WT_ASSERT(session, parent_ref->home == parent);
+ if (parent_ref->state != WT_REF_MEM)
+ continue;
+
+ /*
+ * We left the first/last children of the parent at the current
+ * level to avoid bad split patterns, they might be leaf pages;
+ * check the page type before we continue.
+ */
+ child = parent_ref->page;
+ if (!WT_PAGE_IS_INTERNAL(child))
+ continue;
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, child);
+#endif
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's parent reference may not be wrong: we
+ * opened up access from the top of the tree already,
+ * and pages may have been read in since then. Check,
+ * and only update pages that reference the original
+ * page; those must be wrong.
+ */
+ if (child_ref->home == parent) {
+ child_ref->home = child;
+ child_ref->ref_hint = 0;
+ }
+ } WT_INTL_FOREACH_END;
+ }
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ alloc_index = NULL;
+
+ /*
+ * We can't free the previous parent's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_MEMSIZE_ADD(parent_decr, size);
+ WT_ERR(__split_safe_free(session, 0, pindex, size));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ if (0) {
+err: __wt_free_ref_index(session, parent, alloc_index, 1);
+
+ /*
+ * If panic is set, we saw an error after opening up the tree
+ * to descent through the parent page's new index. There is
+ * nothing we can do, the tree is inconsistent and there are
+ * threads potentially active in both versions of the tree.
+ */
+ if (panic)
+ ret = __wt_panic(session);
+ }
+ return (ret);
+}
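+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal check of the chunk/remain arithmetic in
+ * __split_deepen, not part of the WiredTiger sources;
+ * toy_partition_check() is a hypothetical name. After the two first/last
+ * slots are reserved, each child takes "chunk" entries and the final child
+ * absorbs the remainder, so remain == chunk + ((entries - 2) % children)
+ * and every slot is assigned exactly once. E.g., 1,002 entries split into
+ * 10 children gives chunk == remain == 100.
+ */
+#include <assert.h>
+
+static void
+toy_partition_check(uint32_t entries, uint32_t children)
+{
+ uint32_t chunk, remain;
+
+ chunk = (entries - 2) / children;
+ remain = (entries - 2) - chunk * (children - 1);
+
+ /* The first children - 1 chunks plus the remainder cover it all. */
+ assert(chunk * (children - 1) + remain == entries - 2);
+ /* The last child is never smaller than the others. */
+ assert(remain >= chunk);
+}
+#endif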
+
+/*
+ * __split_inmem_build --
+ * Instantiate a page in a multi-block set, when an update couldn't be
+ * written.
+ */
+static int
+__split_inmem_build(
+ WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ WT_UPD_SKIPPED *skip;
+ uint64_t recno;
+ uint32_t i, slot;
+
+ WT_CLEAR(cbt);
+ cbt.iface.session = &session->iface;
+ cbt.btree = S2BT(session);
+
+ /*
+ * We can find unresolved updates, which can't be written, when
+ * attempting to evict a page. This code re-creates the in-memory
+ * page and applies the unresolved updates to that page.
+ *
+ * Clear the disk image and link the page into the passed-in WT_REF to
+ * simplify error handling: our caller will not discard the disk image
+ * when discarding the original page, and our caller will discard the
+ * allocated page on error, when discarding the allocated WT_REF.
+ */
+ WT_RET(__wt_page_inmem(
+ session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page));
+ multi->skip_dsk = NULL;
+
+ if (orig->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Re-create each modification we couldn't write. */
+ for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ switch (orig->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Build a key. */
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(skip->ins);
+
+ /* Search the page. */
+ WT_ERR(__wt_col_search(session, recno, ref, &cbt));
+
+ /* Apply the modification. */
+ WT_ERR(__wt_col_modify(
+ session, &cbt, recno, NULL, upd, 0));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /* Build a key. */
+ if (skip->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, skip->rip);
+ upd = orig->pg_row_upd[slot];
+ orig->pg_row_upd[slot] = NULL;
+
+ WT_ERR(__wt_row_leaf_key(
+ session, orig, skip->rip, key, 0));
+ } else {
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Search the page. */
+ WT_ERR(__wt_row_search(session, key, ref, &cbt, 1));
+
+ /* Apply the modification. */
+ WT_ERR(
+ __wt_row_modify(session, &cbt, key, NULL, upd, 0));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * We modified the page above, which will have set the first dirty
+ * transaction to the last transaction current running. However, the
+ * updates we installed may be older than that. Take the oldest active
+ * transaction ID to make sure these updates are not skipped by a
+ * checkpoint.
+ */
+ page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id;
+
+err: __wt_scr_free(&key);
+ /* Free any resources that may have been cached in the cursor. */
+ WT_TRET(__wt_btcur_close(&cbt));
+ return (ret);
+}
+
+/*
+ * __wt_multi_to_ref --
+ * Move a multi-block list into an array of WT_REF structures.
+ */
+int
+__wt_multi_to_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp)
+{
+ WT_ADDR *addr;
+ WT_IKEY *ikey;
+ WT_REF *ref;
+ size_t incr;
+
+ addr = NULL;
+ incr = 0;
+
+ /* In some cases, the underlying WT_REF has not yet been allocated. */
+ if (*refp == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, refp));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_REF));
+ }
+ ref = *refp;
+
+ /*
+ * Any parent reference must be filled in by our caller; the primary
+ * use of this function is when splitting into a parent page, and we
+ * aren't holding any locks here that would allow us to know which
+ * parent we'll eventually split into, if the tree is simultaneously
+ * being deepened.
+ */
+ ref->home = NULL;
+
+ if (multi->skip == NULL) {
+ /*
+ * Copy the address: we could simply take the buffer, but that
+ * would complicate error handling, freeing the reference array
+ * would have to avoid freeing the memory, and it's not worth
+ * the confusion.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &addr));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR));
+ ref->addr = addr;
+ addr->size = multi->addr.size;
+ addr->type = multi->addr.type;
+ WT_RET(__wt_strndup(session,
+ multi->addr.addr, addr->size, &addr->addr));
+ WT_MEMSIZE_ADD(incr, addr->size);
+ } else
+ WT_RET(__split_inmem_build(session, page, ref, multi));
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ikey = multi->key.ikey;
+ WT_RET(__wt_row_ikey(session, 0,
+ WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_IKEY) + ikey->size);
+ break;
+ default:
+ ref->key.recno = multi->key.recno;
+ break;
+ }
+
+ ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM;
+
+ /*
+ * If our caller wants to track the memory allocations, add what we
+ * allocated to the caller's running total.
+ */
+ if (incrp != NULL)
+ *incrp += incr;
+ return (0);
+}
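+
+/*
+ * A minimal caller sketch (not built; the __example_* name is hypothetical):
+ * convert each block created by a multi-block reconciliation into a WT_REF,
+ * accumulating the memory footprint adjustment the parent will be charged.
+ * This mirrors the loops in __split_evict_multi below and in the root-split
+ * code.
+ */
+#if 0
+static int
+__example_multi_to_ref(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF ***refsp, size_t *incrp)
+{
+ WT_PAGE_MODIFY *mod;
+ WT_REF **refs;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /* One WT_REF slot per block; the array starts out cleared. */
+ WT_RET(__wt_calloc_def(session, mod->mod_multi_entries, &refs));
+
+ /* Real callers free the array and refs on error, omitted here. */
+ for (i = 0; i < mod->mod_multi_entries; ++i)
+  WT_RET(__wt_multi_to_ref(session,
+      page, &mod->mod_multi[i], &refs[i], incrp));
+
+ *refsp = refs;
+ return (0);
+}
+#endif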
+
+/*
+ * __split_evict_multi --
+ * Resolve a multi-page split, inserting new information into the parent.
+ */
+static int
+__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent, *child;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF **alloc_refp, *parent_ref, ref_copy, **ref_tmp;
+ size_t parent_decr, parent_incr, size;
+ uint32_t i, j, parent_entries, result_entries, split_entries;
+ int complete, hazard, locked;
+
+ parent = NULL; /* -Wconditional-uninitialized */
+ alloc_index = NULL;
+ parent_ref = NULL;
+ ref_tmp = NULL;
+ parent_decr = parent_incr = 0;
+ complete = hazard = locked = 0;
+
+ child = ref->page;
+ mod = child->modify;
+
+ /*
+ * Convert the split page's multiblock reconciliation information into
+ * an array of page reference structures.
+ */
+ split_entries = mod->mod_multi_entries;
+ WT_RET(__wt_calloc_def(session, split_entries, &ref_tmp));
+ for (i = 0; i < split_entries; ++i)
+ WT_ERR(__wt_multi_to_ref(session,
+ child, &mod->mod_multi[i], &ref_tmp[i], &parent_incr));
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We could race with another thread deepening our parent. To deal
+ * with that, read the parent pointer each time we try to lock it, and
+ * check that it's still correct after it is locked.
+ */
+ for (;;) {
+ parent = ref->home;
+ F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret);
+ if (ret == 0) {
+ if (parent == ref->home)
+ break;
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+ continue;
+ }
+ __wt_yield();
+ }
+ locked = 1;
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it will no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ hazard = 1;
+ }
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+ parent_entries = pindex->entries;
+ result_entries = (parent_entries - 1) + split_entries;
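+ /*
+ * A worked example of the arithmetic above: if the parent's index is
+ * {A, B, C} (parent_entries == 3) and child B split into 4 blocks
+ * (split_entries == 4), the new index is {A, B1, B2, B3, B4, C}:
+ * result_entries == (3 - 1) + 4 == 6, the child's single slot
+ * replaced by the split pages' slots.
+ */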
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
+ if (pindex->index[i] == ref)
+ for (j = 0; j < split_entries; ++j) {
+ ref_tmp[j]->home = parent;
+ *alloc_refp++ = ref_tmp[j];
+
+ /*
+ * Clear the split reference as it moves to the
+ * allocated page index, so it never appears on
+ * both after an error.
+ */
+ ref_tmp[j] = NULL;
+ }
+ else
+ *alloc_refp++ = pindex->index[i];
+ __wt_free(session, ref_tmp);
+
+ /*
+ * Update the parent page's index: this update makes the split visible
+ * to threads descending the tree.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ alloc_index = NULL;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * Reset the page's original WT_REF field to split. Threads cursoring
+ * through the tree were blocked because that WT_REF state was set to
+ * locked. This update changes the locked state to split, unblocking
+ * those threads and causing them to re-calculate their position based
+ * on the updated parent page's index.
+ */
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
+
+ /*
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by simply freeing allocated
+ * memory because the original page is unchanged, we can continue to
+ * use it and we have not yet modified the parent. (See below for an
+ * exception, we cannot discard pages referencing unresolved changes.)
+ * Failures after we swap the new page index into the parent are also
+ * relatively benign because the split is OK and complete and the page
+ * is reset so it will be discarded by eviction. For that reason, we
+ * mostly ignore further errors unless there's a panic.
+ */
+ complete = 1;
+
+ /*
+ * The previous parent page's key for this child page may have been an
+ * on-page overflow key. In that case, if the key hasn't been deleted,
+ * delete it now, including its backing blocks. We are exchanging the
+ * WT_REF that referenced it for the split page WT_REFs and their keys,
+ * and there's no longer any reference to it. Done after completing the
+ * split (if we failed, we'd leak the underlying blocks, but the parent
+ * page would be unaffected).
+ */
+ if (parent->type == WT_PAGE_ROW_INT)
+ WT_TRET(__split_ovfl_key_cleanup(session, parent, ref));
+
+ /*
+ * We can't free the previous page index, or the page's original WT_REF
+ * structure and instantiated key, there may be threads using them. Add
+ * them to the session discard list, to be freed once we know it's safe.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, exclusive, pindex, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ if (parent->type == WT_PAGE_ROW_INT &&
+ (ikey = __wt_ref_key_instantiated(ref)) != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(session, exclusive, ikey, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ }
+ /*
+ * Take a copy of the ref in case we can free it immediately: we still
+ * need to discard the page.
+ */
+ ref_copy = *ref;
+ WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF)));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s split into parent %p %" PRIu32 " -> %" PRIu32
+ " (%" PRIu32 ")",
+ child, __wt_page_type_string(child->type), parent, parent_entries,
+ result_entries, result_entries - parent_entries));
+
+ /*
+ * Simple page splits trickle up the tree, that is, as leaf pages grow
+ * large enough and are evicted, they'll split into their parent. And,
+ * as that parent grows large enough and is evicted, it will split into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ * However, this only helps if, first, the pages are evicted (and
+ * we resist evicting internal pages for obvious reasons), and second,
+ * the tree is closed and re-opened from a disk image, which may be
+ * a rare event.
+ * To avoid the case of internal pages becoming too large when they
+ * aren't being evicted, check internal pages each time a leaf page is
+ * split into them. If it's big enough, deepen the tree at that point.
+ * Do the check here because we've just grown the parent page and
+ * are holding it locked.
+ */
+ if (ret == 0 && !exclusive && __split_should_deepen(session, parent))
+ ret = __split_deepen(session, parent);
+
+err: if (locked)
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+
+ if (hazard)
+ WT_TRET(__wt_hazard_clear(session, parent));
+
+ /*
+ * Discard the child; test for split completion instead of errors, there
+ * might be a relatively innocuous error, and if we split the parent, we
+ * want to discard the child.
+ */
+ if (complete) {
+ /*
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ if (__wt_page_is_modified(child)) {
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, child);
+ }
+ __wt_ref_out(session, &ref_copy);
+ }
+
+ /*
+ * A note on error handling: in the case of evicting a page that has
+ * unresolved changes, we just instantiated some in-memory pages that
+ * reflect those unresolved changes. The problem is those pages
+ * reference the same WT_UPDATE chains as the page we're splitting,
+ * that is, we simply copied references into the new pages. If the
+ * split fails, the original page is fine, but discarding the created
+ * page would free those update chains, and that's wrong. There isn't
+ * an easy solution, there's a lot of small memory allocations in some
+ * common code paths, and unwinding those changes will be difficult.
+ * For now, leak the memory by not discarding the instantiated pages.
+ */
+ __wt_free_ref_index(session, NULL, alloc_index, 0);
+ if (ref_tmp != NULL) {
+ for (i = 0; i < split_entries; ++i)
+ __wt_free_ref(session, child, ref_tmp[i], 0);
+ __wt_free(session, ref_tmp);
+ }
+
+ /*
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened.
+ */
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_evict_single --
+ * Resolve a single page split, replacing a page with a new version.
+ */
+static int
+__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ /* Build the new page. */
+ memset(&new, 0, sizeof(new));
+ WT_RET(__split_inmem_build(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * Discard the original page. Pages with unresolved changes are not
+ * marked clean during reconciliation, do it now.
+ */
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ __wt_page_out(session, &page);
+
+ /* Swap the new page into place. */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+}
+
+/*
+ * __wt_split_evict --
+ * Resolve a page split.
+ */
+int
+__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ uint32_t split_entries;
+
+ /*
+ * There are two cases entering this code. First, an in-memory page that
+ * got too large, we forcibly evicted it, and there wasn't anything to
+ * write. (Imagine two threads updating a small set of keys on a leaf page.
+ * The page is too large, so we try to evict it, but after reconciliation
+ * there's only a small amount of data (so it's a single page we can't
+ * split), and because there are two threads, there's some data we can't
+ * write (so we can't evict it). In that case, we take advantage of the
+ * fact we have exclusive access to the page and rewrite it in memory.)
+ *
+ * Second, a real split where we reconciled a page and it turned into a
+ * lot of pages.
+ */
+ split_entries = ref->page->modify->mod_multi_entries;
+ return (split_entries == 1 ?
+ __split_evict_single(session, ref) :
+ __split_evict_multi(session, ref, exclusive));
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_track.c b/src/third_party/wiredtiger/src/btree/rec_track.c
new file mode 100644
index 00000000000..92282393a23
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_track.c
@@ -0,0 +1,904 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Estimated memory cost for a structure on the overflow lists, the size of
+ * the structure plus two pointers (assume the average skip list depth is 2).
+ */
+#define WT_OVFL_SIZE(s) \
+ (sizeof(s) + 2 * sizeof(void *))
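+
+/*
+ * A worked example of the estimate (structure and pointer sizes are
+ * illustrative, they vary by platform): with 8B pointers, a 24B
+ * WT_OVFL_REUSE, a 20B address cookie and a 1KB value, the bytes charged
+ * against the page are WT_OVFL_SIZE(WT_OVFL_REUSE) + addr_size +
+ * value_size, that is, (24 + 2 * 8) + 20 + 1024 = 1084.
+ */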
+
+/*
+ * __ovfl_track_init --
+ * Initialize the overflow tracking structure.
+ */
+static int
+__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (__wt_calloc_def(session, 1, &page->modify->ovfl_track));
+}
+
+/*
+ * __ovfl_discard_verbose --
+ * Dump information about a discard overflow record.
+ */
+static int
+__ovfl_discard_verbose(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag)
+{
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 512, &tmp));
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "discard: %s%s%p %s",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(session, unpack->data, unpack->size, tmp)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_discard_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_OVFL_TRACK *track;
+ size_t i;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp)
+ (void)__ovfl_discard_verbose(session, page, *cellp, "dump");
+}
+#endif
+
+/*
+ * __ovfl_discard_wrapup --
+ * Resolve the page's overflow discard list after a page is written.
+ */
+static int
+__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+ uint32_t i;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp) {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(
+ session, page, *cellp, "free"));
+
+ /* Discard each cell's overflow item. */
+ WT_RET(__wt_ovfl_discard(session, *cellp));
+ }
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (ret);
+}
+
+/*
+ * __ovfl_discard_wrapup_err --
+ * Resolve the page's overflow discard list after an error occurs.
+ */
+static int
+__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_discard_add --
+ * Add a new entry to the page's list of overflow records that have been
+ * discarded.
+ */
+int
+__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ track = page->modify->ovfl_track;
+ WT_RET(__wt_realloc_def(session, &track->discard_allocated,
+ track->discard_entries + 1, &track->discard));
+ track->discard[track->discard_entries++] = cell;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_discard_free --
+ * Free the page's list of discarded overflow record addresses.
+ */
+void
+__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+}
+
+/*
+ * __ovfl_reuse_verbose --
+ * Dump information about a reuse overflow record.
+ */
+static int
+__ovfl_reuse_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "reuse: %s%s%p %s (%s%s%s) {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp),
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) &&
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "",
+ WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_reuse_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0])
+ (void)__ovfl_reuse_verbose(session, page, reuse, "dump");
+}
+#endif
+
+/*
+ * __ovfl_reuse_skip_search --
+ * Return the first, not in-use, matching value in the overflow reuse list.
+ */
+static WT_OVFL_REUSE *
+__ovfl_reuse_skip_search(
+ WT_OVFL_REUSE **head, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e, *next;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Values are not unique, and it's possible to have long lists
+ * of identical overflow items. (We've seen it in benchmarks.)
+ * Move through a list of identical items at the current level
+ * as long as the next one is also identical and in-use;
+ * otherwise, drop down a level. When at the bottom level,
+ * return the item if it's reusable, else NULL.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp == 0 && (*e)->value_size == value_size) {
+ if (i == 0)
+ return (F_ISSET(*e,
+ WT_OVFL_REUSE_INUSE) ? NULL : *e);
+ if ((next = (*e)->next[i]) == NULL ||
+ !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+ next->value_size != len || memcmp(
+ WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_reuse_skip_search_stack --
+ * Search an overflow reuse skiplist, returning an insert/remove stack.
+ */
+static void
+__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head,
+ WT_OVFL_REUSE ***stack, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
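+
+/*
+ * A worked example of the insert stack: searching a three-level list for
+ * value "m" where level 0 is {d, k, r}, level 1 is {d, r} and level 2 is
+ * {r}, the returned stack addresses k's level-0 next pointer, d's level-1
+ * next pointer and the head's level-2 slot. Inserting a depth-2 node for
+ * "m" copies those slots into m's next array and then redirects them to
+ * "m", exactly as __wt_ovfl_reuse_add does below.
+ */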
+
+/*
+ * __ovfl_reuse_wrapup --
+ * Resolve the page's overflow reuse list after a page is written.
+ */
+static int
+__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ size_t incr, decr;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that aren't in-use, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record without an in-use flag, clear
+ * the flags for the next run.
+ *
+ * As part of the pass through the lowest level, figure out how much
+ * space we added/subtracted from the page, and update its footprint.
+ * We don't get it exactly correct because we don't know the depth of
+ * the skiplist here, but it's close enough, and figuring out the
+ * memory footprint change in the reconciliation wrapup code means
+ * fewer atomic updates and less code overall.
+ */
+ incr = decr = 0;
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED))
+ incr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ F_CLR(reuse,
+ WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));
+ decr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_RET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+
+ if (incr > decr)
+ __wt_cache_page_inmem_incr(session, page, incr - decr);
+ if (decr > incr)
+ __wt_cache_page_inmem_decr(session, page, decr - incr);
+ return (0);
+}
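+
+/*
+ * A worked example of the footprint netting above (sizes illustrative):
+ * if this run added two 100-byte entries the page keeps (incr == 200)
+ * and discarded one 300-byte entry from an earlier run (decr == 300),
+ * the page footprint shrinks by the 100-byte difference in a single
+ * adjustment, rather than by separate atomic increment and decrement
+ * operations.
+ */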
+
+/*
+ * __ovfl_reuse_wrapup_err --
+ * Resolve the page's overflow reuse list after an error occurs.
+ */
+static int
+__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that were just added, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record with a just-added flag, clear the
+ * flags for the next run.
+ */
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
+ F_CLR(reuse, WT_OVFL_REUSE_INUSE);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_TRET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+ return (0);
+}
+
+/*
+ * __wt_ovfl_reuse_search --
+ * Search the page's list of overflow records for a match.
+ */
+int
+__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint8_t **addrp, size_t *addr_sizep,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ *addrp = NULL;
+ *addr_sizep = 0;
+
+ if (page->modify->ovfl_track == NULL)
+ return (0);
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * The search function returns the first matching record in the list
+ * which does not have the in-use flag set, or NULL.
+ */
+ if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
+ return (0);
+
+ *addrp = WT_OVFL_REUSE_ADDR(reuse);
+ *addr_sizep = reuse->addr_size;
+ F_SET(reuse, WT_OVFL_REUSE_INUSE);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
+ return (1);
+}
+
+/*
+ * __wt_ovfl_reuse_add --
+ * Add a new entry to the page's list of overflow records tracked for
+ * reuse.
+ */
+int
+__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH];
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_REUSE structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_REUSE structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_REUSE) +
+ skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &reuse));
+ p = (uint8_t *)reuse +
+ sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *);
+ reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse);
+ reuse->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ reuse->value_offset = WT_PTRDIFF32(p, reuse);
+ reuse->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_reuse_skip_search_stack(head, stack, value, value_size);
+ for (i = 0; i < skipdepth; ++i) {
+ reuse->next[i] = *stack[i];
+ *stack[i] = reuse;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add"));
+
+ return (0);
+}
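+
+/*
+ * A sketch of the single allocation laid out above (skipdepth == 2 shown;
+ * field widths are illustrative):
+ *
+ * | WT_OVFL_REUSE | next[0] | next[1] | address cookie | value |
+ *                                     ^addr_offset     ^value_offset
+ *
+ * Both offsets are measured from the start of the structure, which is why
+ * a single byte is enough for addr_offset: the structure and skiplist
+ * pointers that precede the cookie total well under 255 bytes.
+ */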
+
+/*
+ * __wt_ovfl_reuse_free --
+ * Free the page's list of overflow records tracked for reuse.
+ */
+void
+__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE *reuse;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (reuse = mod->ovfl_track->ovfl_reuse[0];
+ reuse != NULL; reuse = next) {
+ next = reuse->next[0];
+ __wt_free(session, reuse);
+ }
+}
+
+/*
+ * __ovfl_txnc_verbose --
+ * Dump information about a transaction-cached overflow record.
+ */
+static int
+__ovfl_txnc_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp),
+ txnc->current,
+ WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_txnc_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ for (txnc = head[0]; txnc != NULL; txnc = txnc->next[0])
+ (void)__ovfl_txnc_verbose(session, page, txnc, "dump");
+}
+#endif
+
+/*
+ * __ovfl_txnc_skip_search --
+ * Return the first matching addr in the overflow transaction-cache list.
+ */
+static WT_OVFL_TXNC *
+__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp == 0 && (*e)->addr_size == addr_size)
+ return (*e);
+
+ /*
+ * If the skiplist address is larger than the search address, or
+ * they compare equally and the skiplist address is longer than
+ * the search address, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_txnc_skip_search_stack --
+ * Search an overflow transaction-cache skiplist, returning an
+ * insert/remove stack.
+ */
+static void
+__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head,
+ WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist addr is larger than the search addr, or
+ * they compare equally and the skiplist addr is longer than
+ * the search addr, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
+
+/*
+ * __ovfl_txnc_wrapup --
+ * Resolve the page's transaction-cache list.
+ */
+static int
+__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **e, **head, *txnc;
+ size_t decr;
+ int i;
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /*
+ * Discard any transaction-cache records with transaction IDs earlier
+ * than any in the system.
+ *
+ * First, walk the overflow transaction-cache skip lists (except for
+ * the lowest level), fixing up links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!__wt_txn_visible_all(session, (*e)->current)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /* Second, discard any no longer needed transaction-cache records. */
+ decr = 0;
+ for (e = &head[0]; (txnc = *e) != NULL;) {
+ if (!__wt_txn_visible_all(session, txnc->current)) {
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ decr += WT_OVFL_SIZE(WT_OVFL_TXNC) +
+ txnc->addr_size + txnc->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_txnc_verbose(session, page, txnc, "free"));
+ __wt_free(session, txnc);
+ }
+
+ if (decr != 0)
+ __wt_cache_page_inmem_decr(session, page, decr);
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_search --
+ * Search the page's list of transaction-cache overflow records for a
+ * match.
+ */
+int
+__wt_ovfl_txnc_search(
+ WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify->ovfl_track == NULL)
+ return (WT_NOTFOUND);
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ if ((txnc = __ovfl_txnc_skip_search(head, addr, addr_size)) == NULL)
+ return (WT_NOTFOUND);
+
+ store->data = WT_OVFL_TXNC_VALUE(txnc);
+ store->size = txnc->value_size;
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_add --
+ * Add a new entry to the page's list of transaction-cached overflow
+ * records.
+ */
+int
+__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_TXNC structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_TXNC) +
+ skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &txnc));
+ p = (uint8_t *)txnc +
+ sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
+ txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
+ txnc->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ txnc->value_offset = WT_PTRDIFF32(p, txnc);
+ txnc->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ txnc->current = __wt_txn_new_id(session);
+
+ __wt_cache_page_inmem_incr(session, page,
+ WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
+ for (i = 0; i < skipdepth; ++i) {
+ txnc->next[i] = *stack[i];
+ *stack[i] = txnc;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_free --
+ * Free the page's list of transaction-cached overflow records.
+ */
+void
+__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC *txnc;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (txnc = mod->ovfl_track->ovfl_txnc[0];
+ txnc != NULL; txnc = next) {
+ next = txnc->next[0];
+ __wt_free(session, txnc);
+ }
+}
+
+/*
+ * __wt_ovfl_track_wrapup --
+ * Resolve the page's overflow tracking on reconciliation success.
+ */
+int
+__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_ovfl_track_wrapup_err --
+ * Resolve the page's overflow tracking on reconciliation error.
+ */
+int
+__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup_err(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup_err(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
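+
+/*
+ * A minimal caller sketch (not built; __example_track_resolve is
+ * hypothetical): reconciliation resolves overflow tracking in its wrapup
+ * path, taking the success variant once the new blocks are durable and the
+ * error variant if the write fails, so just-added records are backed out.
+ */
+#if 0
+static int
+__example_track_resolve(WT_SESSION_IMPL *session, WT_PAGE *page, int ret)
+{
+ if (ret == 0)
+  WT_RET(__wt_ovfl_track_wrapup(session, page));
+ else
+  WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+ return (ret);
+}
+#endif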
diff --git a/src/third_party/wiredtiger/src/btree/rec_write.c b/src/third_party/wiredtiger/src/btree/rec_write.c
new file mode 100644
index 00000000000..1b3a9a0898f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_write.c
@@ -0,0 +1,5521 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY;
+struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
+struct __rec_kv; typedef struct __rec_kv WT_KV;
+
+/*
+ * Reconciliation is the process of taking an in-memory page, walking each entry
+ * in the page, building a backing disk image in a temporary buffer representing
+ * that information, and writing that buffer to disk. What could be simpler?
+ *
+ * WT_RECONCILE --
+ * Information tracking a single page reconciliation.
+ */
+typedef struct {
+ WT_REF *ref; /* Page being reconciled */
+ WT_PAGE *page;
+ uint32_t flags; /* Caller's configuration */
+
+ WT_ITEM dsk; /* Temporary disk-image buffer */
+
+ /* Track whether all changes to the page are written. */
+ uint64_t max_txn;
+ uint64_t skipped_txn;
+ uint32_t orig_write_gen;
+
+ /*
+ * If page updates are skipped because they are as yet unresolved, or
+ * the page has updates we cannot discard, the page is left "dirty":
+ * the page cannot be discarded and a subsequent reconciliation will
+ * be necessary to discard the page.
+ */
+ int leave_dirty;
+
+ /*
+ * Raw compression (don't get me started, as if normal reconciliation
+ * wasn't bad enough). If an application wants absolute control over
+ * what gets written to disk, we give it a list of byte strings and it
+ * gives us back an image that becomes a file block. Because we don't
+ * know the number of items we're storing in a block until we've done
+ * a lot of work, we turn off most compression: dictionary, copy-cell,
+ * prefix and row-store internal page suffix compression are all off.
+ */
+ int raw_compression;
+ uint32_t raw_max_slots; /* Raw compression array sizes */
+ uint32_t *raw_entries; /* Raw compression slot entries */
+ uint32_t *raw_offsets; /* Raw compression slot offsets */
+ uint64_t *raw_recnos; /* Raw compression recno count */
+ WT_ITEM raw_destination; /* Raw compression destination buffer */
+
+ /*
+ * Track if reconciliation has seen any overflow items. If a leaf page
+ * with no overflow items is written, the parent page's address cell is
+ * set to the leaf-no-overflow type. This means we can delete the leaf
+ * page without reading it because we don't have to discard any overflow
+ * items it might reference.
+ *
+ * The test is per-page reconciliation, that is, once we see an
+ * overflow item on the page, all subsequent leaf pages written for the
+ * page will not be leaf-no-overflow type, regardless of whether or not
+ * they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow
+ * item, only that if it is set, the page contains no overflow items.
+ *
+ * The reason is because of raw compression: there's no easy/fast way to
+ * figure out if the rows selected by raw compression included overflow
+ * items, and the optimization isn't worth another pass over the data.
+ */
+ int ovfl_items;
+
+ /*
+ * Track if reconciliation of a row-store leaf page has seen empty (zero
+ * length) values. We don't write out anything for empty values, so if
+ * there are empty values on a page, we have to make two passes over the
+ * page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise,
+ * a page with only empty values is another common data set, and keys on
+ * that page will be equal to the number of entries. In both cases, set
+ * a flag in the page's on-disk header.
+ *
+ * The test is per-page reconciliation as described above for the
+ * overflow-item test.
+ */
+ int all_empty_value, any_empty_value;
+
+ /*
+ * Reconciliation gets tricky if we have to split a page, which happens
+ * when the disk image we create exceeds the page type's maximum disk
+ * image size.
+ *
+ * First, the sizes of the page we're building. If WiredTiger is doing
+ * page layout, page_size is the same as page_size_max. We accumulate
+ * the maximum page size of raw data and when we reach that size, we
+ * split the page into multiple chunks, eventually compressing those
+ * chunks. When the application is doing page layout (raw compression
+ * is configured), page_size can continue to grow past page_size_max,
+ * and we keep accumulating raw data until the raw compression callback
+ * accepts it.
+ */
+ uint32_t page_size; /* Current page size */
+ uint32_t page_size_max; /* Maximum on-disk page size */
+
+ /*
+ * Second, the split size: if we're doing the page layout, split to a
+ * smaller-than-maximum page size when a split is required so we don't
+ * repeatedly split a packed page.
+ */
+ uint32_t split_size; /* Split page size */
+
+ /*
+ * The problem with splits is we've done a lot of work by the time we
+ * realize we're going to have to split, we don't want to start over.
+ *
+ * To keep from having to start over when we hit the maximum page size,
+ * we track the page information when we approach a split boundary.
+ * If we eventually have to split, we walk this structure and pretend
+ * we were splitting all along. After that, we continue to append to
+ * this structure, and eventually walk it to create a new internal page
+ * that references all of our split pages.
+ */
+ struct __rec_boundary {
+ /*
+ * The start field records location in the initial split buffer,
+ * that is, the first byte of the split chunk recorded before we
+ * decide to split a page; the offset between the first byte of
+ * chunk[0] and the first byte of chunk[1] is chunk[0]'s length.
+ *
+ * Once we split a page, we stop filling in the start field, as
+ * we're writing the split chunks as we find them.
+ */
+ uint8_t *start; /* Split's first byte */
+
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk. These fields are used both
+ * to write the split chunk, and to create a new internal page
+ * to reference the split pages.
+ */
+ uint64_t recno; /* Split's starting record */
+ uint32_t entries; /* Split's entries */
+
+ WT_ADDR addr; /* Split's written location */
+ uint32_t size; /* Split's size */
+ uint32_t cksum; /* Split's checksum */
+ void *dsk; /* Split's disk image */
+
+ /*
+ * When busy pages get large, we need to be able to evict them
+ * even when they contain unresolved updates, or updates which
+ * cannot be evicted because of running transactions. In such
+ * cases, break the page into multiple blocks, write the blocks
+ * that can be evicted, saving lists of updates for blocks that
+ * cannot be evicted, then re-instantiate the blocks that cannot
+ * be evicted as new, in-memory pages, restoring the updates on
+ * those pages.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * The key for a row-store page; no column-store key is needed
+ * because the page's recno, stored in the recno field, is the
+ * column-store key.
+ */
+ WT_ITEM key; /* Promoted row-store key */
+
+ /*
+ * During wrapup, after reconciling the root page, we write a
+ * final block as part of a checkpoint. If raw compression
+ * was configured, that block may have already been compressed.
+ */
+ int already_compressed;
+ } *bnd; /* Saved boundaries */
+ uint32_t bnd_next; /* Next boundary slot */
+ uint32_t bnd_next_max; /* Maximum boundary slots used */
+ size_t bnd_entries; /* Total boundary slots */
+ size_t bnd_allocated; /* Bytes allocated */
+
+ /*
+ * We track the total number of page entries copied into split chunks
+ * so we can easily figure out how many entries are in the current split
+ * chunk.
+ */
+ uint32_t total_entries; /* Total entries in splits */
+
+ /*
+ * And there's state information as to where in this process we are:
+ * (1) tracking split boundaries because we can still fit more split
+ * chunks into the maximum page size, (2) tracking the maximum page
+ * size boundary because we can't fit any more split chunks into the
+ * maximum page size, (3) not performing boundary checks because it's
+ * either not useful with the current page size configuration, or
+ * because we've already been forced to split.
+ */
+ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
+ SPLIT_MAX=1, /* Next: the maximum page boundary */
+ SPLIT_TRACKING_OFF=2, /* No boundary checks */
+ SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ bnd_state;
+
+ /*
+ * We track current information about the current record number, the
+ * number of entries copied into the temporary buffer, where we are
+ * in the temporary buffer, and how much memory remains. Those items
+ * are packaged here rather than passing pointers to stack locations
+ * around the code.
+ */
+ uint64_t recno; /* Current record number */
+ uint32_t entries; /* Current number of entries */
+ uint8_t *first_free; /* Current first free byte */
+ size_t space_avail; /* Remaining space in this chunk */
+
+ /*
+ * While reviewing updates for each page, we store skipped updates here,
+ * and then move them to per-block areas as the blocks are defined.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * We don't need to keep the 0th key around on internal pages, the
+ * search code ignores them as nothing can sort less by definition.
+ * There's some trickiness here, see the code for comments on how
+ * these fields work.
+ */
+ int cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * WT_DICTIONARY --
+ * We optionally build a dictionary of row-store values for leaf
+ * pages. Where two value cells are identical, only write the value
+ * once, the second and subsequent copies point to the original cell.
+ * The dictionary is fixed size, but organized in a skip-list to make
+ * searches faster.
+ */
+ struct __rec_dictionary {
+ uint64_t hash; /* Hash value */
+ void *cell; /* Matching cell */
+
+ u_int depth; /* Skiplist */
+ WT_DICTIONARY *next[0];
+ } **dictionary; /* Dictionary */
+ u_int dictionary_next, dictionary_slots; /* Next, max entries */
+ /* Skiplist head. */
+ WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
+
+ /*
+ * WT_KV--
+ * An on-page key/value item we're building.
+ */
+ struct __rec_kv {
+ WT_ITEM buf; /* Data */
+ WT_CELL cell; /* Cell and cell's length */
+ size_t cell_len;
+ size_t len; /* Total length of cell + data */
+ } k, v; /* Key/Value being built */
+
+ WT_ITEM *cur, _cur; /* Key/Value being built */
+ WT_ITEM *last, _last; /* Last key/value built */
+
+ int key_pfx_compress; /* If can prefix-compress next key */
+ int key_pfx_compress_conf; /* If prefix compression configured */
+ int key_sfx_compress; /* If can suffix-compress next key */
+ int key_sfx_compress_conf; /* If suffix compression configured */
+
+ int is_bulk_load; /* If it's a bulk load */
+
+ WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+
+ int tested_ref_state; /* Debugging information */
+} WT_RECONCILE;
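+
+/*
+ * An illustrative pass through the boundary states above (sizes are
+ * hypothetical): with a 32KB maximum page size and a smaller split size,
+ * reconciliation starts in SPLIT_BOUNDARY, saving a boundary each time the
+ * buffer crosses a split-size increment. If the data grows past the maximum
+ * page size, it switches to SPLIT_MAX, writes the chunks saved so far, and
+ * then runs in SPLIT_TRACKING_OFF, writing each subsequent chunk as it
+ * fills. With raw compression, SPLIT_TRACKING_RAW lets the compression
+ * callback choose the block boundaries instead.
+ */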
+
+static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
+static void __rec_cell_build_addr(
+ WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
+static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
+static int __rec_cell_build_val(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, uint64_t);
+static int __rec_child_deleted(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *);
+static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_var(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
+ WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t);
+static int __rec_destroy_session(WT_SESSION_IMPL *);
+static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
+static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_row_leaf(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_row_leaf_insert(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
+static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
+static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_row_promote(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
+static int __rec_split_write(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_write_init(WT_SESSION_IMPL *,
+ WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_write_wrapup_err(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+
+static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
+static int __rec_dictionary_lookup(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
+static void __rec_dictionary_reset(WT_RECONCILE *);
+
+/*
+ * __wt_rec_write --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+int
+__wt_rec_write(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_RECONCILE *r;
+ int locked;
+
+ conn = S2C(session);
+ page = ref->page;
+ mod = page->modify;
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ if (!__wt_page_is_modified(page))
+ WT_RET_MSG(session, WT_ERROR,
+ "Attempt to reconcile a clean page.");
+
+ WT_RET(__wt_verbose(session,
+ WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+
+ /* Record the most recent transaction ID we will *not* write. */
+ mod->disk_snap_min = session->txn.snap_min;
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_write_init(
+ session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
+
+ /*
+ * The compaction process looks at the page's modification information;
+ * if compaction is running, lock the page down.
+ *
+ * Otherwise, flip on the scanning flag: obsolete updates cannot be
+ * freed while reconciliation is in progress.
+ */
+ locked = 0;
+ if (conn->compact_in_memory_pass) {
+ locked = 1;
+ WT_PAGE_LOCK(session, page);
+ } else
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ if (ret == 0)
+ break;
+ __wt_yield();
+ }
+
+ /* Reconcile the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (salvage != NULL)
+ ret = __rec_col_fix_slvg(session, r, page, salvage);
+ else
+ ret = __rec_col_fix(session, r, page);
+ break;
+ case WT_PAGE_COL_INT:
+ ret = __rec_col_int(session, r, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __rec_col_var(session, r, page, salvage);
+ break;
+ case WT_PAGE_ROW_INT:
+ ret = __rec_row_int(session, r, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __rec_row_leaf(session, r, page, salvage);
+ break;
+ WT_ILLEGAL_VALUE_SET(session);
+ }
+
+ /* Wrap up the page reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_wrapup(session, r, page);
+ else
+ WT_TRET(__rec_write_wrapup_err(session, r, page));
+
+ /* Release the page lock if we're holding one. */
+ if (locked)
+ WT_PAGE_UNLOCK(session, page);
+ else
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+
+ /*
+ * Clean up the boundary structures: some workloads result in millions
+ * of these structures, and if associated with some random session that
+ * got roped into doing forced eviction, they won't be discarded for the
+ * life of the session.
+ */
+ __rec_bnd_cleanup(session, r, 0);
+
+ WT_RET(ret);
+
+ /*
+ * Root pages are special, splits have to be done, we can't put it off
+ * as the parent's problem any more.
+ */
+ if (__wt_ref_is_root(ref))
+ return (__rec_root_write(session, page, flags));
+
+ /*
+ * Otherwise, mark the page's parent dirty.
+ * Don't mark the tree dirty: if this reconciliation is in service of a
+ * checkpoint, it's cleared the tree's dirty flag, and we don't want to
+ * set it again as part of that walk.
+ */
+ return (__wt_page_parent_modify_set(session, ref, 1));
+}
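+
+/*
+ * A minimal caller sketch (not built; __example_evict_write is
+ * hypothetical, the flag shown is the one tested above): eviction
+ * reconciles a page to decide whether its blocks can be written and the
+ * page discarded.
+ */
+#if 0
+static int
+__example_evict_write(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /* The salvage cookie is NULL outside of salvage operations. */
+ return (__wt_rec_write(session, ref, NULL, WT_EVICTING));
+}
+#endif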
+
+/*
+ * __rec_root_write --
+ * Handle the write of a root page.
+ */
+static int
+__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_PAGE *next;
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF fake_ref;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /*
+ * If a single root page was written (either an empty page or there was
+ * a 1-for-1 page swap), we've written root and checkpoint, we're done.
+ * If the root page split, write the resulting WT_REF array. We already
+ * have an infrastructure for writing pages, create a fake root page and
+ * write it instead of adding code to write blocks based on the list of
+ * blocks resulting from a multiblock reconciliation.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ return (0);
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
+ "root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
+
+ /*
+ * Create a new root page, initialize the array of child references,
+ * mark it dirty, then write it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ pindex = WT_INTL_INDEX_COPY(next);
+ for (i = 0; i < mod->mod_multi_entries; ++i) {
+ WT_ERR(__wt_multi_to_ref(session,
+ next, &mod->mod_multi[i], &pindex->index[i], NULL));
+ pindex->index[i]->home = next;
+ }
+
+ /*
+ * We maintain a list of pages written for the root in order to free the
+ * backing blocks the next time the root is written.
+ */
+ mod->mod_root_split = next;
+
+ WT_ERR(__wt_page_modify_init(session, next));
+ __wt_page_only_modify_set(session, next);
+
+ /*
+ * Fake up a reference structure, and write the next root page.
+ */
+ __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
+ return (__wt_rec_write(session, &fake_ref, NULL, flags));
+
+err: __wt_page_out(session, &next);
+ return (ret);
+}
+
+/*
+ * __rec_raw_compression_config --
+ * Configure raw compression.
+ */
+static inline int
+__rec_raw_compression_config(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Check if raw compression configured. */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress_raw == NULL)
+ return (0);
+
+ /* Only for row-store and variable-length column-store objects. */
+ if (page->type == WT_PAGE_COL_FIX)
+ return (0);
+
+ /*
+ * Raw compression cannot support dictionary compression. (Technically,
+ * we could still use the raw callback on column-store variable length
+ * internal pages with dictionary compression configured, because
+ * dictionary compression only applies to column-store leaf pages, but
+ * that seems an unlikely use case.)
+ */
+ if (btree->dictionary != 0)
+ return (0);
+
+ /* Raw compression cannot support prefix compression. */
+ if (btree->prefix_compression != 0)
+ return (0);
+
+ /*
+ * Raw compression is also turned off during salvage: we can't allow
+ * pages to split during salvage, raw compression has no point if it
+ * can't manipulate the page size.
+ */
+ if (salvage != NULL)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __rec_write_init --
+ * Initialize the reconciliation structure.
+ */
+static int
+__rec_write_init(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, &r));
+
+ *(WT_RECONCILE **)reconcilep = r;
+ session->reconcile_cleanup = __rec_destroy_session;
+
+ /* Connect pointers/buffers. */
+ r->cur = &r->_cur;
+ r->last = &r->_last;
+
+ /* Disk buffers need to be aligned for writing. */
+ F_SET(&r->dsk, WT_ITEM_ALIGNED);
+ }
+
+ /* Remember the configuration. */
+ r->ref = ref;
+ r->page = page;
+ r->flags = flags;
+
+ /* Track if the page can be marked clean. */
+ r->leave_dirty = 0;
+
+ /* Raw compression. */
+ r->raw_compression =
+ __rec_raw_compression_config(session, page, salvage);
+ r->raw_destination.flags = WT_ITEM_ALIGNED;
+
+ /* Track overflow items. */
+ r->ovfl_items = 0;
+
+ /* Track empty values. */
+ r->all_empty_value = 1;
+ r->any_empty_value = 0;
+
+ /* The list of cached, skipped updates. */
+ r->skip_next = 0;
+
+ /*
+ * Dictionary compression only writes repeated values once. We grow
+ * the dictionary as necessary, always using the largest size we've
+ * seen.
+ *
+ * Reset the dictionary.
+ *
+ * Sanity check the size: 100 slots is the smallest dictionary we use.
+ */
+ if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
+ WT_RET(__rec_dictionary_init(session,
+ r, btree->dictionary < 100 ? 100 : btree->dictionary));
+ __rec_dictionary_reset(r);
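+ /*
+ * For example (hypothetical numbers): a tree configured with
+ * dictionary=500 allocates 500 slots on first use; if a later
+ * reconciliation in this session needs only 200 slots, the existing
+ * 500-slot dictionary is reused and simply reset.
+ */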
+
+ /*
+ * Suffix compression shortens internal page keys by discarding trailing
+ * bytes that aren't necessary for tree navigation. We don't do suffix
+ * compression if there is a custom collator because we don't know what
+ * bytes a custom collator might use. Some custom collators (for
+ * example, a collator implementing reverse ordering of strings) won't
+ * have any problem with suffix compression: if there's ever a reason to
+ * implement suffix compression for custom collators, we can add a
+ * setting to the collator, configured when the collator is added, that
+ * turns on suffix compression.
+ *
+ * The raw compression routines don't even consider suffix compression,
+ * but it doesn't hurt to confirm that.
+ */
+ r->key_sfx_compress_conf = 0;
+ if (btree->collator == NULL &&
+ btree->internal_key_truncate && !r->raw_compression)
+ r->key_sfx_compress_conf = 1;
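+ /*
+ * For example (hypothetical keys): if the last key on one split chunk
+ * is "and" and the first key on the next is "applesauce", suffix
+ * compression promotes just "ap" to the internal page, which is
+ * enough to route searches correctly.
+ */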
+
+ /*
+ * Prefix compression discards repeated prefix bytes from row-store leaf
+ * page keys.
+ */
+ r->key_pfx_compress_conf = 0;
+ if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF)
+ r->key_pfx_compress_conf = 1;
+
+ r->salvage = salvage;
+
+ /* Save the page's write generation before reading the page. */
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Running transactions may update the page after we write it, so
+ * this is the highest ID we can be confident we will see.
+ */
+ r->skipped_txn = S2C(session)->txn_global.last_running;
+
+ return (0);
+}
+
+/*
+ * __rec_destroy --
+ * Clean up the reconciliation structure.
+ */
+static void
+__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
+{
+ WT_RECONCILE *r;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL)
+ return;
+ *(WT_RECONCILE **)reconcilep = NULL;
+
+ __wt_buf_free(session, &r->dsk);
+
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ __wt_buf_free(session, &r->raw_destination);
+
+ __rec_bnd_cleanup(session, r, 1);
+
+ __wt_free(session, r->skip);
+
+ __wt_buf_free(session, &r->k.buf);
+ __wt_buf_free(session, &r->v.buf);
+ __wt_buf_free(session, &r->_cur);
+ __wt_buf_free(session, &r->_last);
+
+ __rec_dictionary_free(session, r);
+
+ __wt_free(session, r);
+}
+
+/*
+ * __rec_destroy_session --
+ * Clean up the reconciliation structure, session version.
+ */
+static int
+__rec_destroy_session(WT_SESSION_IMPL *session)
+{
+ __rec_destroy(session, &session->reconcile);
+ return (0);
+}
+
+/*
+ * __rec_bnd_cleanup --
+ * Cleanup the boundary structure information.
+ */
+static void
+__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
+{
+ WT_BOUNDARY *bnd;
+ uint32_t i, last_used;
+
+ if (r->bnd == NULL)
+ return;
+
+ /*
+ * Free the boundary structures' memory. In the case of normal cleanup,
+ * discard any memory we won't reuse in the next reconciliation; in the
+ * case of destruction, discard everything.
+ *
+ * During some big-page evictions we have seen boundary arrays that have
+ * millions of elements. That should not be a normal event, but if the
+ * memory is associated with a random session, it won't be discarded
+ * until the session is closed. If there are more than 10,000 boundary
+ * structure elements, destroy the boundary array and we'll start over.
+ */
+ if (destroy || r->bnd_entries > 10 * 1000) {
+ for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ __wt_buf_free(session, &bnd->key);
+ }
+ __wt_free(session, r->bnd);
+ r->bnd_next = 0;
+ r->bnd_entries = r->bnd_allocated = 0;
+ } else {
+ /*
+ * The boundary-next field points to the next boundary structure
+ * we were going to use, but there's no requirement that the value
+ * be incremented before reconciliation updates the structure it
+ * points to, that is, there's no guarantee elements of the next
+ * boundary structure are still unchanged. Be defensive, clean
+ * up the "next" structure as well as the ones we know we used.
+ */
+ last_used = r->bnd_next;
+ if (last_used < r->bnd_entries)
+ ++last_used;
+ for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ }
+ }
+}
+
+/*
+ * __rec_skip_update_save --
+ * Save a skipped WT_UPDATE list for later restoration.
+ */
+static int
+__rec_skip_update_save(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &r->skip_allocated, r->skip_next + 1, &r->skip));
+ r->skip[r->skip_next].ins = ins;
+ r->skip[r->skip_next].rip = rip;
+ ++r->skip_next;
+ return (0);
+}
+
+/*
+ * __rec_skip_update_move --
+ * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * block's list.
+ */
+static int
+__rec_skip_update_move(
+ WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
+ bnd->skip[bnd->skip_next] = *skip;
+ ++bnd->skip_next;
+
+ skip->ins = NULL;
+ skip->rip = NULL;
+ return (0);
+}
+
+/*
+ * __rec_txn_read --
+ * Return the first visible update in a list (or NULL if none are visible),
+ * set a flag if any updates were skipped, track the maximum transaction ID on
+ * the page.
+ */
+static inline int
+__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
+{
+ WT_ITEM ovfl;
+ WT_PAGE *page;
+ WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ size_t notused;
+ uint64_t max_txn, min_txn, txnid;
+ int skipped;
+
+ *updp = NULL;
+
+ page = r->page;
+
+ /*
+ * If we're called with a WT_INSERT reference, use its WT_UPDATE
+ * list; otherwise, use the on-page row-store WT_UPDATE list.
+ */
+ upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
+ skipped = 0;
+
+ for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
+ upd != NULL; upd = upd->next) {
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
+
+ /* Track the largest/smallest transaction IDs on the list. */
+ if (TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+ if (TXNID_LT(txnid, min_txn))
+ min_txn = txnid;
+ if (TXNID_LT(txnid, r->skipped_txn) &&
+ !__wt_txn_visible_all(session, txnid))
+ r->skipped_txn = txnid;
+
+ /*
+ * Record whether any updates were skipped on the way to finding
+ * the first visible update.
+ *
+ * If updates were skipped before the one being written, future
+ * reads without intervening modifications to the page could
+ * see a different value; if no updates were skipped, the page
+ * can safely be marked clean and does not need to be
+ * reconciled until modified again.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
+ }
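+ /*
+ * For example (hypothetical IDs): given an update list with entries
+ * from transactions 20 and 10 (newest first), a reader that can only
+ * see transaction 10 returns the older update and sets skipped,
+ * because a future read could legitimately see the newer one.
+ */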
+
+ /*
+ * Track the maximum transaction ID in the page. We store this in the
+ * page at the end of reconciliation if no updates are skipped; it's
+ * used to avoid evicting clean pages from memory with changes required
+ * to satisfy a snapshot read.
+ */
+ if (TXNID_LT(r->max_txn, max_txn))
+ r->max_txn = max_txn;
+
+ /*
+ * If all updates are globally visible and no updates were skipped, the
+ * page can be marked clean and we're done, regardless of whether we're
+ * evicting or checkpointing.
+ *
+ * The oldest transaction ID may have moved while we were scanning the
+ * page, so it is possible to skip an update but then find that by the
+ * end of the scan, all updates are stable.
+ */
+ if (__wt_txn_visible_all(session, max_txn) && !skipped)
+ return (0);
+
+ /*
+ * If some updates are not globally visible, or were skipped, the page
+ * cannot be marked clean.
+ */
+ r->leave_dirty = 1;
+
+ /* If we're not evicting, we're done, we know what we'll write. */
+ if (!F_ISSET(r, WT_EVICTING))
+ return (0);
+
+ /* In some cases, there had better not be any updates we can't write. */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /*
+ * If evicting and we aren't able to save/restore the not-yet-visible
+ * updates, the page can't be evicted.
+ */
+ if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
+ return (EBUSY);
+
+ /*
+ * Evicting a page with not-yet-visible updates: save and restore the
+ * list of updates on a newly instantiated page.
+ *
+ * The order of the updates on the list matters, so we can't move only
+ * the unresolved updates; we have to move the entire update list.
+ *
+ * Clear the returned update so our caller ignores the key/value pair
+ * in the case of an insert/append entry (everything we need is in the
+ * update list), and otherwise writes the original on-page key/value
+ * pair to which the update list applies.
+ */
+ *updp = NULL;
+
+ /*
+ * Handle the case where we don't want to write an original on-page value
+ * item to disk because it's been updated or removed.
+ *
+ * Here's the deal: an overflow value was updated or removed and its
+ * backing blocks freed. If any transaction in the system might still
+ * read the value, a copy was cached in page reconciliation tracking
+ * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
+ * then chose the page and we're splitting it up in order to push parts
+ * of it out of memory.
+ *
+ * We could write the original on-page value item to disk... if we had
+ * a copy. The cache may not have a copy (a globally visible update
+ * would have kept a value from ever being cached), or an update that
+ * subsequently became globally visible could cause a cached value to be
+ * discarded. Either way, once there's a globally visible update, we
+ * may not have the value.
+ *
+ * Fortunately, if there's a globally visible update we don't care about
+ * the original version, so we simply ignore it; no transaction can ever
+ * try to read it. If there isn't a globally visible update, there had
+ * better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but (1) we
+ * are planning on re-instantiating this page in memory, it isn't going
+ * to disk, and (2) the value item is eventually going to be discarded,
+ * that seems like a waste of a write. Instead, find the cached value
+ * and append it to the update list we're saving for later restoration.
+ */
+ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn)) {
+ WT_RET(__wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl));
+ /*
+ * Create an update structure with an impossibly low transaction
+ * ID and append it to the update list we're about to save.
+ * Restoring that update list when this page is re-instantiated
+ * creates an update for the key/value pair visible to every
+ * running transaction in the system, ensuring the on-page value
+ * will be ignored.
+ */
+ WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
+ upd_ovfl->txnid = WT_TXN_NONE;
+ for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ ;
+ upd->next = upd_ovfl;
+ }
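+ /*
+ * Appending the cached value with an impossibly low transaction ID
+ * (WT_TXN_NONE) means every running transaction sees it once the list
+ * is restored, so the removed on-page overflow value can never be
+ * read again.
+ */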
+
+ return (__rec_skip_update_save(session, r, ins, rip));
+}
+
+/*
+ * CHILD_RELEASE --
+ * Macros to clean up during internal-page reconciliation, releasing the
+ * hazard pointer we're holding on child pages.
+ */
+#undef CHILD_RELEASE
+#define CHILD_RELEASE(session, hazard, ref) do { \
+ if (hazard) { \
+ hazard = 0; \
+ WT_TRET( \
+ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
+ } \
+} while (0)
+#undef CHILD_RELEASE_ERR
+#define CHILD_RELEASE_ERR(session, hazard, ref) do { \
+ CHILD_RELEASE(session, hazard, ref); \
+ WT_ERR(ret); \
+} while (0)
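+ /*
+ * A minimal usage sketch (hypothetical caller): acquire the hazard
+ * pointer via __rec_child_modify, process the child, then release it
+ * on both the normal and error paths:
+ *
+ *	WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ *	... process the child page ...
+ *	CHILD_RELEASE_ERR(session, hazard, ref);
+ */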
+
+/*
+ * __rec_child_modify --
+ * Return whether the internal page's child references any modifications.
+ */
+static int
+__rec_child_modify(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep)
+{
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+
+ /* We may acquire a hazard pointer our caller must release. */
+ *hazardp = 0;
+
+#define WT_CHILD_IGNORE 1 /* Deleted child: ignore */
+#define WT_CHILD_MODIFIED 2 /* Modified child */
+#define WT_CHILD_PROXY 3 /* Deleted child: proxy */
+ *statep = 0;
+
+ /*
+ * This function is called when walking an internal page to decide how
+ * to handle child pages referenced by the internal page, specifically
+ * if the child page is to be merged into its parent.
+ *
+ * Internal pages are reconciled for two reasons: first, when evicting
+ * an internal page; second, by the checkpoint code when writing
+ * internal pages. During eviction, the subtree is locked down so all
+ * pages should be in the WT_REF_DISK or WT_REF_LOCKED state. During
+ * checkpoint, any eviction that might affect our review of an internal
+ * page is prohibited; however, as the subtree is not reserved for our
+ * exclusive use, there are other page states that must be considered.
+ */
+ for (;; __wt_yield())
+ switch (r->tested_ref_state = ref->state) {
+ case WT_REF_DISK:
+ /* On disk, not modified by definition. */
+ goto done;
+
+ case WT_REF_DELETED:
+ /*
+ * The child is in a deleted state.
+ *
+ * It's possible the state could change underneath us as
+ * the page is read in, and we can race between checking
+ * for a deleted state and looking at the transaction ID
+ * to see if the delete is visible to us. Lock down the
+ * structure.
+ */
+ if (!WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ break;
+ ret = __rec_child_deleted(session, r, ref, statep);
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ goto done;
+
+ case WT_REF_LOCKED:
+ /*
+ * Locked.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page was selected by
+ * the eviction server for eviction).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, the child is being
+ * considered by the eviction server or the child is a
+ * fast-delete page being read. The eviction may have
+ * started before the checkpoint and so we must wait
+ * for the eviction to be resolved. I suspect we could
+ * handle fast-delete reads, but we can't distinguish
+ * between the two and fast-delete reads aren't expected
+ * to be common.
+ */
+ break;
+
+ case WT_REF_MEM:
+ /*
+ * In memory.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page belongs to a file
+ * being discarded from the cache during close).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, acquire a hazard pointer
+ * so the child isn't evicted, it's an in-memory case.
+ *
+ * This call cannot return split/restart; dirty page
+ * eviction is shut out during checkpoint, and all
+ * splits in process will have completed before we
+ * walk any pages for checkpoint.
+ */
+ if ((ret = __wt_page_in(session, ref,
+ WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ *hazardp = 1;
+ goto in_memory;
+
+ case WT_REF_READING:
+ /*
+ * Being read, not modified by definition.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+ * normally have caused eviction to fail, and exclusive
+ * eviction shouldn't ever see pages being read.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ goto done;
+
+ case WT_REF_SPLIT:
+ /*
+ * The page was split out from under us.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+ * have caused eviction to fail.
+ *
+ * We should never be here during checkpoint: dirty page
+ * eviction is shut out during checkpoint, and all splits
+ * in process will have completed before we walk any
+ * pages for checkpoint.
+ */
+ WT_ASSERT(session, ref->state != WT_REF_SPLIT);
+ /* FALLTHROUGH */
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+in_memory:
+ /*
+ * In-memory states: the child is potentially modified if the page's
+ * modify structure has been instantiated. If the modify structure
+ * exists and the page has actually been modified, set that state.
+ * If that's not the case, we would normally use the original cell's
+ * disk address as our reference, but, if we're forced to instantiate
+ * a deleted child page and it's never modified, we end up here with
+ * a page that has a modify structure, no modifications, and no disk
+ * address. Ignore those pages: they're not modified and there is no
+ * reason to write the cell.
+ */
+ mod = ref->page->modify;
+ if (mod != NULL && mod->flags != 0)
+ *statep = WT_CHILD_MODIFIED;
+ else if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ CHILD_RELEASE(session, *hazardp, ref);
+ }
+
+done: WT_HAVE_DIAGNOSTIC_YIELD;
+ return (ret);
+}
+
+/*
+ * __rec_child_deleted --
+ * Handle pages with leaf pages in the WT_REF_DELETED state.
+ */
+static int
+__rec_child_deleted(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
+{
+ WT_BM *bm;
+ WT_PAGE_DELETED *page_del;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ bm = S2BT(session)->bm;
+ page_del = ref->page_del;
+
+ /*
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are
+ * a special case during reconciliation. First, if the deletion was a
+ * result of a session truncate call, the deletion may not be visible to
+ * us. In that case, we proceed as with any change that's not visible
+ * during reconciliation by setting the skipped flag and ignoring the
+ * change for the purposes of writing the internal page.
+ *
+ * In this case, there must be an associated page-deleted structure, and
+ * it holds the transaction ID we care about.
+ */
+ if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
+ /*
+ * In some cases, there had better not be any updates we can't
+ * write.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /* If this page cannot be evicted, quit now. */
+ if (F_ISSET(r, WT_EVICTING))
+ return (EBUSY);
+ }
+
+ /*
+ * The deletion is visible to us, deal with any underlying disk blocks.
+ *
+ * First, check to see if there is an address associated with this leaf:
+ * if there isn't, we're done, the underlying page is already gone. If
+ * the page still exists, check for any transactions in the system that
+ * might want to see the page's state before it's deleted.
+ *
+ * If any such transactions exist, we cannot discard the underlying leaf
+ * page to the block manager because the transaction may eventually read
+ * it. However, this write might be part of a checkpoint, and should we
+ * recover to that checkpoint, we'll need to delete the leaf page, else
+ * we'd leak it. The solution is to write a proxy cell on the internal
+ * page ensuring the leaf page is eventually discarded.
+ *
+ * If no such transactions exist, we can discard the leaf page to the
+ * block manager and no cell needs to be written at all. We do this
+ * outside of the underlying tracking routines because this action is
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function
+ * instantiates an entirely new page.)
+ */
+ if (ref->addr != NULL &&
+ (page_del == NULL ||
+ __wt_txn_visible_all(session, page_del->txnid))) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
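+ /*
+ * For example: a leaf deleted by a globally visible truncate has its
+ * backing blocks freed here and needs no cell at all; a leaf whose
+ * delete isn't yet visible to all running transactions keeps its
+ * address and is written as a proxy cell instead.
+ */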
+
+ /*
+ * Minor memory cleanup: if a truncate call deleted this page and we
+ * were ever forced to instantiate the page in memory, we would have
+ * built a list of updates in the page reference in order to be able
+ * to abort the truncate. It's a cheap test to make that memory go
+ * away, and we do it here because there's really nowhere else we do
+ * the checks. In short, if we have such a list, and the backing address
+ * blocks are gone, there can't be any transaction that can abort.
+ */
+ if (ref->addr == NULL && page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ /*
+ * If there's still a disk address, we have to write a proxy record;
+ * otherwise, we can safely ignore this child page.
+ */
+ *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY;
+ return (0);
+}
+
+/*
+ * __rec_incr --
+ * Update the memory tracking structure for a set of new entries.
+ */
+static inline void
+__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
+{
+ /*
+ * The buffer code is fragile and prone to off-by-one errors -- check
+ * for overflow in diagnostic mode.
+ */
+ WT_ASSERT(session, r->space_avail >= size);
+ WT_ASSERT(session,
+ WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+
+ r->entries += v;
+ r->space_avail -= size;
+ r->first_free += size;
+}
+
+/*
+ * __rec_copy_incr --
+ * Copy a key/value cell and buffer pair into the new image.
+ */
+static inline void
+__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
+{
+ size_t len;
+ uint8_t *p, *t;
+
+ /*
+ * If there's only one chunk of data to copy (because the cell and data
+ * are being copied from the original disk page), the cell length won't
+ * be set, the WT_ITEM data/length will reference the data to be copied.
+ *
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
+ * the copy in-line.
+ */
+ for (p = (uint8_t *)r->first_free,
+ t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
+ *p++ = *t++;
+
+ /* The data can be quite large -- call memcpy. */
+ if (kv->buf.size != 0)
+ memcpy(p, kv->buf.data, kv->buf.size);
+
+ WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+ __rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __rec_dict_replace --
+ * Check for a dictionary match.
+ */
+static int
+__rec_dict_replace(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
+{
+ WT_DICTIONARY *dp;
+ uint64_t offset;
+
+ /*
+ * We optionally create a dictionary of values and only write a unique
+ * value once per page, using a special "copy" cell for all subsequent
+ * copies of the value. We have to do the cell build and resolution at
+ * this low level because we need physical cell offsets for the page.
+ *
+ * Sanity check: short-data cells can be smaller than dictionary-copy
+ * cells. If the data is already small, don't bother doing the work.
+ * This isn't just work avoidance: on-page cells can't grow as a result
+ * of writing a dictionary-copy cell; the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell;
+ * if we grow the cell after that test we'll potentially write off the
+ * end of the buffer's memory.
+ */
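+ /*
+ * For example (hypothetical values): if a 500-byte value appears three
+ * times on the page, the first occurrence is written in full and its
+ * cell location recorded; the second and third are written as small
+ * copy cells holding the byte offset back to the first.
+ */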
+ if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+ return (0);
+ WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ if (dp == NULL)
+ return (0);
+
+ /*
+ * If the dictionary cell reference is not set, we're creating a new
+ * entry in the dictionary, update its location.
+ *
+ * If the dictionary cell reference is set, we have a matching value.
+ * Create a copy cell instead.
+ */
+ if (dp->cell == NULL)
+ dp->cell = r->first_free;
+ else {
+ offset = WT_PTRDIFF(r->first_free, dp->cell);
+ val->len = val->cell_len =
+ __wt_cell_pack_copy(&val->cell, rle, offset);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rec_key_state_update --
+ * Update prefix and suffix compression based on the last key.
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
+{
+ WT_ITEM *a;
+
+ /*
+ * If writing an overflow key onto the page, don't update the "last key"
+ * value, and leave the state of prefix compression alone. (If we are
+ * currently doing prefix compression, we have a key state which will
+ * continue to work, we're just skipping the key just created because
+ * it's an overflow key and doesn't participate in prefix compression.
+ * If we are not currently doing prefix compression, we can't start, an
+ * overflow key doesn't give us any state.)
+ *
+ * Additionally, if we wrote an overflow key onto the page, turn off the
+ * suffix compression of row-store internal node keys. (When we split,
+ * "last key" is the largest key on the previous page, and "cur key" is
+ * the first key on the next page, which is being promoted. In some
+ * cases we can discard bytes from the "cur key" that are not needed to
+ * distinguish between the "last key" and "cur key", compressing the
+ * size of keys on internal nodes. If we just built an overflow key,
+ * we're not going to update the "last key", making suffix compression
+ * impossible for the next key. Alternatively, we could remember where
+ * the last key was on the page, detect it's an overflow key, read it
+ * from disk and do suffix compression, but that's too much work for an
+ * unlikely event.)
+ *
+ * If we're not writing an overflow key on the page, update the last-key
+ * value and turn on both prefix and suffix compression.
+ */
+ if (ovfl_key)
+ r->key_sfx_compress = 0;
+ else {
+ a = r->cur;
+ r->cur = r->last;
+ r->last = a;
+
+ r->key_pfx_compress = r->key_pfx_compress_conf;
+ r->key_sfx_compress = r->key_sfx_compress_conf;
+ }
+}
+
+/*
+ * Macros to convert fixed-length entries to/from bytes.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
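+ /*
+ * For example, with bitcnt 8 a 4096-byte image holds 4096 entries, and
+ * with bitcnt 1 it holds 32768; WT_FIX_ENTRIES_TO_BYTES rounds the
+ * entry bits up to a byte boundary.
+ */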
+
+/*
+ * __rec_leaf_page_max --
+ * Figure out the maximum leaf page size for the reconciliation.
+ */
+static inline uint32_t
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ uint32_t page_size;
+
+ btree = S2BT(session);
+ page = r->page;
+
+ page_size = 0;
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Fixed-length objects are a problem: if
+ * there's a big missing range, we could theoretically have to
+ * write large numbers of missing objects.
+ */
+ page_size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree,
+ r->salvage->take + r->salvage->missing), btree->allocsize);
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Variable-length objects aren't usually a
+ * problem: because of RLE encoding, we can write any number of
+ * deleted records in a single page entry; we just need to
+ * ensure that additional entry fits.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ default:
+ /*
+ * Row-store pages can't grow: salvage never does anything
+ * other than reduce the size of a page read from disk.
+ */
+ break;
+ }
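+ /*
+ * For example (hypothetical salvage numbers): a fixed-length store
+ * that lost a chunk of a million records must write a million deleted
+ * records, so the size calculated above can greatly exceed the
+ * configured maximum leaf page size.
+ */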
+
+ /*
+ * Default size for variable-length column-store and row-store pages
+ * during salvage is the maximum leaf page size.
+ */
+ if (page_size < btree->maxleafpage)
+ page_size = btree->maxleafpage;
+
+ /*
+ * The page we read from the disk should be smaller than the page size
+ * we just calculated; check out of paranoia.
+ */
+ if (page_size < page->dsk->mem_size)
+ page_size = page->dsk->mem_size;
+
+ /*
+ * Salvage is the backup plan: don't let this fail.
+ */
+ return (page_size * 2);
+}
+
+/*
+ * __rec_split_bnd_init --
+ * Initialize a single boundary structure.
+ */
+static void
+__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
+{
+ bnd->start = NULL;
+
+ bnd->recno = 0;
+ bnd->entries = 0;
+
+ __wt_free(session, bnd->addr.addr);
+ WT_CLEAR(bnd->addr);
+ bnd->size = 0;
+ bnd->cksum = 0;
+ __wt_free(session, bnd->dsk);
+
+ __wt_free(session, bnd->skip);
+ bnd->skip_next = 0;
+ bnd->skip_allocated = 0;
+
+ /* Ignore the key; we re-use that memory in each new reconciliation. */
+
+ bnd->already_compressed = 0;
+}
+
+/*
+ * __rec_split_bnd_grow --
+ * Grow the boundary array as necessary.
+ */
+static int
+__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /*
+ * Make sure there's enough room for another boundary. The calculation
+ * is +2, because when filling in the current boundary's information,
+ * we save the start point of the next boundary (for example, a record
+ * number or key), in the (current + 1) slot.
+ *
+ * For the same reason, we're always initializing one ahead.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd));
+ r->bnd_entries = r->bnd_allocated / sizeof(r->bnd[0]);
+
+ __rec_split_bnd_init(session, &r->bnd[r->bnd_next + 1]);
+
+ return (0);
+}
+
+/*
+ * __rec_split_init --
+ * Initialization for the reconciliation split functions.
+ */
+static int
+__rec_split_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t corrected_page_size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * The maximum leaf page size governs when an in-memory leaf page splits
+ * into multiple on-disk pages; however, salvage can't be allowed to
+ * split; there's no parent page yet. If we're doing salvage, override
+ * the caller's selection of a maximum page size, choosing a page size
+ * that ensures we won't split.
+ */
+ if (r->salvage != NULL)
+ max = __rec_leaf_page_max(session, r);
+
+ /*
+ * Set the page sizes. If we're doing the page layout, the maximum page
+ * size is the same as the page size. If the application is doing page
+ * layout (raw compression is configured), we accumulate some amount of
+ * additional data because we don't know how well it will compress, and
+ * we don't want to increment our way up to the amount of data needed by
+ * the application to successfully compress to the target page size.
+ */
+ r->page_size = r->page_size_max = max;
+ if (r->raw_compression)
+ r->page_size *= 10;
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+
+ /*
+ * Clear the disk page's header and block-manager space, set the page
+ * type (the type doesn't change, and setting it later would require
+ * additional code in a few different places).
+ */
+ dsk = r->dsk.mem;
+ memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
+ dsk->type = page->type;
+
+ /*
+ * If we have to split, we want to choose a smaller page size for the
+ * split pages, because otherwise we could end up splitting one large
+ * packed page over and over. We don't want to pick the minimum size
+ * either, because that penalizes an application that did a bulk load
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
+ *
+ * The maximum page size may be a multiple of the split page size (for
+ * example, there's a maximum page size of 128KB, but because the table
+ * is active and we don't want to split a lot, the split size is 20KB).
+ * The maximum page size may NOT be an exact multiple of the split page
+ * size.
+ *
+ * It's lots of work to build these pages and we don't want to start over
+ * when we reach the maximum page size (it's painful to restart after
+ * creating overflow items and compacted data, for example, as those
+ * items have already been written to disk). So, the loop calls the
+ * helper functions when approaching a split boundary, and we save the
+ * information at that point. That allows us to go back and split the
+ * page at the boundary points if we eventually overflow the maximum
+ * page size.
+ *
+ * Finally, all this doesn't matter for fixed-size column-store pages,
+ * raw compression, and salvage. Fixed-size column store pages can
+ * split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In raw compression, the
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
+ */
+ if (r->raw_compression || r->salvage != NULL) {
+ r->split_size = 0;
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ r->split_size = r->page_size_max;
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else {
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
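+ /*
+ * For example (hypothetical sizes): with a 128KB maximum page size and
+ * a 75% split percentage, __wt_split_page_size yields roughly 96KB
+ * split chunks; boundary information is saved as each 96KB chunk
+ * fills, so a page overflowing 128KB can be split at those points.
+ */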
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /* Initialize the first boundary. */
+ r->bnd_next = 0;
+ WT_RET(__rec_split_bnd_grow(session, r));
+ __rec_split_bnd_init(session, &r->bnd[0]);
+ r->bnd[0].recno = recno;
+ r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
+
+ /* Initialize the starting record number. */
+ r->recno = recno;
+
+ /* New page, compression off. */
+ r->key_pfx_compress = r->key_sfx_compress = 0;
+
+ return (0);
+}
+
+/*
+ * __rec_is_checkpoint --
+ * Return whether we're writing a checkpoint.
+ */
+static int
+__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+{
+ /*
+ * Check to see if we're going to create a checkpoint.
+ *
+ * This function exists as a place to hang this comment.
+ *
+ * Any time we write the root page of the tree without splitting we are
+ * creating a checkpoint (and have to tell the underlying block manager
+ * so it creates and writes the additional information checkpoints
+ * require). However, checkpoints are completely consistent, and so we
+ * have to resolve information about the blocks we're expecting to free
+ * as part of the checkpoint, before writing the checkpoint. In short,
+ * we don't do checkpoint writes here; clear the boundary information as
+ * a reminder and create the checkpoint during wrapup.
+ */
+ if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ bnd->addr.addr = NULL;
+ bnd->addr.size = 0;
+ bnd->addr.type = 0;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote_cell --
+ * Get a key from a cell for the purposes of promotion.
+ */
+static int
+__rec_split_row_promote_cell(
+ WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack;
+
+ btree = S2BT(session);
+ kpack = &_kpack;
+
+ /*
+ * The cell had better have a zero-length prefix and not be a copy cell;
+ * the first cell on a page cannot refer to an earlier cell on the page.
+ */
+ cell = WT_PAGE_HEADER_BYTE(btree, dsk);
+ __wt_cell_unpack(cell, kpack);
+ WT_ASSERT(session,
+ kpack->prefix == 0 && kpack->raw != WT_CELL_VALUE_COPY);
+
+ WT_RET(__wt_cell_data_copy(session, dsk->type, kpack, key));
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote --
+ * Key promotion for a row-store.
+ */
+static int
+__rec_split_row_promote(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(update);
+ WT_DECL_RET;
+ WT_ITEM *max;
+ WT_UPD_SKIPPED *skip;
+ size_t cnt, len, size;
+ uint32_t i;
+ const uint8_t *pa, *pb;
+ int cmp;
+
+ /*
+ * For a column-store, the promoted key is the recno and we already have
+ * a copy. For a row-store, it's the first key on the page, a variable-
+ * length byte string; get a copy.
+ *
+ * This function is called from the split code at each split boundary,
+ * but that means we're not called before the first boundary, and we
+ * will eventually have to get the first key explicitly when splitting
+ * a page.
+ *
+ * For the current slot, take the last key we built, after doing suffix
+ * compression. The "last key we built" describes some process: before
+ * calling the split code, we must place the last key on the page before
+ * the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going
+ * to compare them for suffix compression.
+ *
+ * Suffix compression is a hack to shorten keys on internal pages. We
+ * only need enough bytes in the promoted key to ensure searches go to
+ * the correct page: the promoted key has to be larger than the last key
+ * on the leaf page preceding it, but we don't need any more bytes than
+ * that. In other words, we can discard any suffix bytes not required
+ * to distinguish between the key being promoted and the last key on the
+ * leaf page preceding it. This can only be done for the first level of
+ * internal pages: you cannot repeat suffix truncation as you split up
+ * the tree, because it loses too much information.
+ *
+ * Note #1: if the last key on the previous page was an overflow key,
+ * we don't have the in-memory key against which to compare, and don't
+ * try to do suffix compression. The code for that case turns suffix
+ * compression off for the next key, we don't have to deal with it here.
+ */
+ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
+ return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
+
+ btree = S2BT(session);
+ WT_RET(__wt_scr_alloc(session, 0, &update));
+
+ /*
+ * Note #2: if we skipped updates, an update key may be larger than the
+ * last key stored in the previous block (probable for append-centric
+ * workloads). If there are skipped updates, check for one larger than
+ * the last key and smaller than the current key.
+ */
+ max = r->last;
+ for (i = r->skip_next; i > 0; --i) {
+ skip = &r->skip[i - 1];
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, skip->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(skip->ins);
+ update->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Compare against the current key; it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
+
+ /* Compare against the last key; it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
+
+ /*
+ * The skipped updates are in key-sort order so the entry we're
+ * looking for is either the last one or the next-to-last one
+ * in the list. Once we've compared an entry against the last
+ * key on the page, we're done.
+ */
+ break;
+ }
+
+ /*
+ * The largest key on the last block must sort before the current key,
+ * so we'll either find a larger byte value in the current key, or the
+ * current key will be a longer key, and the interesting byte is one
+ * past the length of the shorter key.
+ */
+ pa = max->data;
+ pb = r->cur->data;
+ len = WT_MIN(max->size, r->cur->size);
+ size = len + 1;
+ for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb)
+ if (*pa != *pb) {
+ if (size != cnt) {
+ WT_STAT_FAST_DATA_INCRV(session,
+ rec_suffix_compression, size - cnt);
+ size = cnt;
+ }
+ break;
+ }
+ ret = __wt_buf_set(session, key, r->cur->data, size);
+
+err: __wt_scr_free(&update);
+ return (ret);
+}
+
+/*
+ * __rec_split --
+ * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ */
+static int
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_BOUNDARY *last, *next;
+ WT_PAGE_HEADER *dsk;
+ uint32_t len;
+
+ /*
+ * We should never split during salvage, and we're about to drop core
+ * because there's no parent page.
+ */
+ if (r->salvage != NULL)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(r->page->type));
+
+ /*
+ * Handle page-buffer size tracking; we have to do this work in every
+ * reconciliation loop, and I don't want to repeat the code that many
+ * times.
+ */
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
+
+ /* Hitting a page boundary resets the dictionary, in all cases. */
+ __rec_dictionary_reset(r);
+
+ /*
+ * There are 3 cases we have to handle.
+ *
+ * #1
+ * About to cross a split boundary: save current boundary information
+ * and return.
+ *
+ * #2
+ * About to cross the maximum boundary: use saved boundary information
+ * to write all of the split pages.
+ *
+ * #3
+ * About to cross a split boundary, but we've either already done the
+ * split thing when we approached the maximum boundary, in which
+ * case we write the page and keep going, or we were never tracking
+ * split boundaries at all.
+ *
+ * Cases #1 and #2 are the hard ones: we're called when we're about to
+ * cross each split boundary, and we save information away so we can
+ * split if we have to. We're also called when we're about to cross
+ * the maximum page boundary: in that case, we do the actual split and
+ * clean up all the previous boundaries, then keep going.
+ */
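+ /*
+ * A concrete walkthrough (hypothetical sizes): with 96KB split chunks
+ * in a 128KB page, case #1 fires as each chunk fills, saving a
+ * boundary; when the page itself fills, case #2 writes the saved
+ * chunks and switches to SPLIT_TRACKING_OFF; after that, case #3
+ * writes a block each time a split-size chunk fills.
+ */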
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY: /* Case #1 */
+ /*
+ * Save the information about where we are when the split would
+ * have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer address and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->start = r->first_free;
+ next->entries = 0;
+
+ /*
+ * Set the space available to another split-size chunk, if we
+ * have one. If we don't have room for another split chunk,
+ * add whatever space remains in the maximum page size, and
+ * hope it's enough.
+ */
+ len = WT_PTRDIFF32(r->first_free, dsk);
+ if (len + r->split_size <= r->page_size)
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ else {
+ r->bnd_state = SPLIT_MAX;
+ r->space_avail = r->page_size -
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ }
+ break;
+ case SPLIT_MAX: /* Case #2 */
+ /*
+ * It didn't all fit into a single page.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF: /* Case #3 */
+ /*
+ * It didn't all fit, but either we've already noticed it and
+ * are now processing the rest of the page at the split-size
+ * boundaries, or the split size was the same as the page size,
+ * so we never bothered with saving split-point information.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(__rec_split_write(session, r, last, &r->dsk, 0));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_raw_worker --
+ * Handle the raw compression page reconciliation bookkeeping.
+ */
+static int
+__rec_split_raw_worker(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COMPRESSOR *compressor;
+ WT_DECL_RET;
+ WT_ITEM *dst, *write_ref;
+ WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_SESSION *wt_session;
+ size_t corrected_page_size, len, result_len;
+ uint64_t recno;
+ uint32_t entry, i, result_slots, slots;
+ int last_block;
+ uint8_t *dsk_start;
+
+ wt_session = (WT_SESSION *)session;
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ unpack = &_unpack;
+ compressor = btree->compressor;
+ dst = &r->raw_destination;
+ dsk = r->dsk.mem;
+
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next];
+ next = last + 1;
+
+ /*
+ * Build arrays of offsets and cumulative counts of cells and rows in
+ * the page: the offset is the byte offset to the possible split point
+ * (adjusted for an initial chunk that cannot be compressed); entries
+ * is the cumulative page entries covered by the byte offset; recnos is
+ * the cumulative rows covered by the byte offset.
+ */
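+ /*
+ * For example (hypothetical layout): row-store pages can split only at
+ * keys, so each key cell past the first allocation size of data adds a
+ * candidate slot; raw_offsets[i] then holds the compressible byte
+ * offset of the i-th candidate split point and raw_entries[i] the
+ * number of cells it covers.
+ */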
+ if (r->entries >= r->raw_max_slots) {
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ r->raw_max_slots = 0;
+
+ i = r->entries + 100;
+ WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ r->raw_max_slots = i;
+ }
+
+ /*
+ * We're going to walk the disk image, which requires setting the
+ * number of entries.
+ */
+ dsk->u.entries = r->entries;
+
+ /*
+ * We track the record number at each column-store split point, set an
+ * initial value.
+ */
+ recno = 0;
+ if (dsk->type == WT_PAGE_COL_VAR)
+ recno = last->recno;
+
+ entry = slots = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++entry;
+
+ /*
+ * Row-store pages can split at keys, but not at values;
+ * column-store pages can split at values.
+ */
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk->type == WT_PAGE_COL_INT) {
+ recno = unpack->v;
+ break;
+ }
+ if (dsk->type == WT_PAGE_COL_VAR) {
+ recno += __wt_cell_rle(unpack);
+ break;
+ }
+ r->raw_entries[slots] = entry;
+ continue;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We can't compress the first 64B of the block (it must be
+ * written without compression), and a possible split point
+ * may appear in that 64B; keep it simple, ignore the first
+ * allocation size of data; anybody splitting smaller than
+ * that (as calculated before compression) is doing it wrong.
+ */
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
+
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ r->raw_recnos[slots] = recno;
+ r->raw_entries[slots] = entry;
+ }
+
+ /*
+ * If we haven't managed to find at least one split point, we're done;
+ * don't bother calling the underlying compression function.
+ */
+ if (slots == 0) {
+ result_len = 0;
+ result_slots = 0;
+ goto no_slots;
+ }
+
+ /* The slot at the array's end is the total length of the data. */
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
+
+ /*
+ * Allocate a destination buffer. If there's a pre-size function, use
+ * it to determine the destination buffer's minimum size, otherwise the
+ * destination buffer is documented to be at least the maximum object
+ * size.
+ *
+ * The destination buffer really only needs to be large enough for the
+ * target block size, corrected for the requirements of the underlying
+ * block manager. If the target block size is 8KB, that's a multiple
+ * of 512B and so the underlying block manager is fine with it. But...
+ * we don't control what the pre_size method returns us as a required
+ * size, and we don't want to document the compress_raw method has to
+ * skip bytes in the buffer because that's confusing, so do something
+ * more complicated. First, find out how much space the compress_raw
+ * function might need, either the value returned from pre_size, or the
+ * maximum object size. Add the compress-skip bytes, and then correct
+ * that value for the underlying block manager. As a result, we have
+ * a destination buffer that's the right "object" size when calling the
+ * compress_raw method, and there are bytes in the header just for us.
+ */
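+ /*
+ * For example (hypothetical sizes): if pre_size reports 32KB, we size
+ * the buffer at 32KB plus WT_BLOCK_COMPRESS_SKIP header bytes, then
+ * let the block manager round that up to its allocation size.
+ */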
+ if (compressor->pre_size == NULL)
+ result_len = r->page_size_max;
+ else
+ WT_RET(compressor->pre_size(compressor, wt_session,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ (size_t)r->raw_offsets[slots], &result_len));
+ corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, dst, corrected_page_size));
+
+ /*
+ * Copy the header bytes into the destination buffer, then call the
+ * compression function.
+ */
+ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
+ ret = compressor->compress_raw(compressor, wt_session,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->raw_offsets, slots,
+ (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ result_len, no_more_rows, &result_len, &result_slots);
+ switch (ret) {
+ case EAGAIN:
+ /*
+ * The compression function wants more rows; accumulate and
+ * retry.
+ *
+ * Reset the resulting slots count, just in case the compression
+ * function modified it before giving up.
+ */
+ result_slots = 0;
+ break;
+ case 0:
+ /*
+ * If the compression function returned zero result slots, it's
+ * giving up and we write the original data. (This is a pretty
+ * bad result: we've not done compression on a block much larger
+ * than the maximum page size, but once compression gives up,
+ * there's not much else we can do.)
+ *
+ * If the compression function returned non-zero result slots,
+ * we were successful and have a block to write.
+ */
+ if (result_slots == 0) {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ /*
+ * If there are no more rows, we can write the original
+ * data from the original buffer.
+ */
+ if (no_more_rows)
+ break;
+
+ /*
+ * Copy the original data to the destination buffer, as
+ * if the compression function simply copied it. Take
+ * all but the last row of the original data (the last
+ * row has to be set as the key for the next block).
+ */
+ result_slots = slots - 1;
+ result_len = r->raw_offsets[result_slots];
+ WT_RET(__wt_buf_grow(
+ session, dst, result_len + WT_BLOCK_COMPRESS_SKIP));
+ memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ result_len);
+
+ /*
+ * Mark it as uncompressed so the standard compression
+ * function is called before the buffer is written.
+ */
+ last->already_compressed = 0;
+ } else {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_ok);
+
+ /*
+ * If there are more rows and the compression function
+ * consumed all of the current data, there are problems:
+ * First, with row-store objects, we're potentially
+ * skipping updates; we must have a key for the next
+ * block so we know with what block a skipped update is
+ * associated. Second, if the compression function
+ * compressed all of the data, we're not pushing it
+ * hard enough (unless we got lucky and gave it exactly
+ * the right amount to work with, which is unlikely).
+ * Handle both problems by accumulating more data any
+ * time we're not writing the last block and compression
+ * ate all of the rows.
+ */
+ if (result_slots == slots && !no_more_rows)
+ result_slots = 0;
+ else
+ last->already_compressed = 1;
+ }
+ break;
+ default:
+ return (ret);
+ }
+
+no_slots:
+ /*
+ * Check for the last block we're going to write: if no more rows and
+ * we failed to compress anything, or we compressed everything, it's
+ * the last block.
+ */
+ last_block = no_more_rows &&
+ (result_slots == 0 || result_slots == slots);
+
+ if (result_slots != 0) {
+ /*
+ * We have a block, finalize the header information.
+ */
+ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst = dst->mem;
+ dsk_dst->recno = last->recno;
+ dsk_dst->mem_size =
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst->u.entries = r->raw_entries[result_slots - 1];
+
+ /*
+ * There is likely a remnant in the working buffer that didn't
+ * get compressed; copy it down to the start of the buffer and
+ * update the starting record number, free space and so on.
+ * !!!
+ * Note the use of memmove: the source and destination buffers
+ * can overlap.
+ */
+ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
+
+ r->entries -= r->raw_entries[result_slots - 1];
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+ /*
+ * Set the key for the next block (before writing the block, a
+ * key range is needed in that code).
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ next->recno = r->raw_recnos[result_slots];
+ break;
+ case WT_PAGE_COL_VAR:
+ next->recno = r->raw_recnos[result_slots - 1];
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ next->recno = 0;
+ if (!last_block) {
+ /*
+ * Confirm there was uncompressed data remaining
+ * in the buffer; we're about to read it for the
+ * next chunk's initial key.
+ */
+ WT_ASSERT(session, len > 0);
+ WT_RET(__rec_split_row_promote_cell(
+ session, dsk, &next->key));
+ }
+ break;
+ }
+ write_ref = dst;
+ } else if (no_more_rows) {
+ /*
+ * Compression failed and there are no more rows to accumulate,
+ * write the original buffer instead.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ dsk->recno = last->recno;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->u.entries = r->entries;
+
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ write_ref = &r->dsk;
+ last->already_compressed = 0;
+ } else {
+ /*
+ * Compression failed, there are more rows to accumulate and the
+ * compression function wants to try again; increase the size of
+ * the "page" and try again after we accumulate some more rows.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = r->page_size * 2;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->page_size *= 2;
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ return (0);
+ }
+
+ /* We have a block, update the boundary counter. */
+ ++r->bnd_next;
+
+ /*
+ * If we are writing the whole page in our first/only attempt, it might
+ * be a checkpoint (checkpoints are only a single page, by definition).
+ * Further, checkpoints aren't written here, the wrapup functions do the
+ * write, and they do the write from the original buffer location. If
+ * it's a checkpoint and the block isn't in the right buffer, copy it.
+ *
+ * If it's not a checkpoint, write the block.
+ */
+ if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (write_ref == dst)
+ WT_RET(__wt_buf_set(
+ session, &r->dsk, dst->mem, dst->size));
+ } else
+ WT_RET(
+ __rec_split_write(session, r, last, write_ref, last_block));
+ return (0);
+}
+
+/*
+ * __rec_raw_decompress --
+ * Decompress a raw-compressed image.
+ */
+static int
+__rec_raw_decompress(
+ WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER const *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ dsk = image;
+
+ /*
+ * We skipped an update and we can't write a block, but unfortunately,
+ * the block has already been compressed. Decompress the block so we
+ * can subsequently re-instantiate it in memory.
+ */
+ WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp));
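+ /*
+ * The leading WT_BLOCK_COMPRESS_SKIP bytes of the image are never
+ * compressed; copy them as-is, then decompress the rest.
+ */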
+ memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(btree->compressor,
+ &session->iface,
+ (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP,
+ size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(__wt_illegal_value(session, btree->dhandle->name));
+
+ WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp));
+ WT_ASSERT(session, __wt_verify_dsk_image(
+ session, "[raw evict split]", tmp->data, dsk->mem_size) == 0);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_raw --
+ * Raw compression split routine.
+ */
+static inline int
+__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ return (__rec_split_raw_worker(session, r, 0));
+}
+
+/*
+ * __rec_split_finish_std --
+ * Finish processing a page, standard version.
+ */
+static int
+__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_PAGE_HEADER *dsk;
+
+ /* Adjust the boundary information based on our split status. */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY:
+ case SPLIT_MAX:
+ /*
+ * We never split: the reconciled page fit into the maximum page
+ * size. Change the first boundary slot to represent the full
+ * page (the first boundary slot is largely correct, just update
+ * the number of entries).
+ */
+ r->bnd_next = 0;
+ break;
+ case SPLIT_TRACKING_OFF:
+ /*
+ * If we have already split, or aren't tracking boundaries, put
+ * the remaining data in the next boundary slot.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ break;
+ case SPLIT_TRACKING_RAW:
+ /*
+ * We were configured for raw compression, but never actually
+ * wrote anything.
+ */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We only arrive here with no entries to write if the page was entirely
+ * empty, and if the page is empty, we merge it into its parent during
+ * the parent's reconciliation. A page with skipped updates isn't truly
+ * empty, continue on.
+ */
+ if (r->entries == 0 && r->skip_next == 0)
+ return (0);
+
+ /* Set the boundary reference and increment the count. */
+ bnd = &r->bnd[r->bnd_next++];
+ bnd->entries = r->entries;
+
+ /* Finalize the header information. */
+ dsk = r->dsk.mem;
+ dsk->recno = bnd->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+
+ /* If this is a checkpoint, we're done, otherwise write the page. */
+ return (
+ __rec_is_checkpoint(r, bnd) ? 0 :
+ __rec_split_write(session, r, bnd, &r->dsk, 1));
+}
+
+/*
+ * __rec_split_finish --
+ * Finish processing a page.
+ */
+static int
+__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /* We're done reconciling, write the final page. */
+ if (r->raw_compression && r->entries != 0) {
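+ /*
+ * Raw compression may not write out all the accumulated
+ * entries in a single call, loop until none remain.
+ */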
+ while (r->entries != 0)
+ WT_RET(__rec_split_raw_worker(session, r, 1));
+ } else
+ WT_RET(__rec_split_finish_std(session, r));
+
+ return (0);
+}
+
+/*
+ * __rec_split_fixup --
+ * Fix up after crossing the maximum page boundary.
+ */
+static int
+__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ uint32_t i, len;
+ uint8_t *dsk_start;
+
+ /*
+ * When we overflow physical limits of the page, we walk the list of
+ * split chunks we've created and write those pages out, then update
+ * the caller's information.
+ */
+ btree = S2BT(session);
+
+ /*
+ * The data isn't laid out on a page boundary or nul padded; copy it to
+ * a clean, aligned, padded buffer before writing it.
+ *
+ * Allocate a scratch buffer to hold the new disk image. Copy the
+ * WT_PAGE_HEADER header onto the scratch buffer, most of the header
+ * information remains unchanged between the pages.
+ */
+ WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
+ dsk = tmp->mem;
+ memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
+
+ /*
+ * For each split chunk we've created, update the disk image and copy
+ * it into place.
+ */
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
+ /* Copy the page contents to the temporary buffer. */
+ len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
+ memcpy(dsk_start, bnd->start, len);
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = bnd->recno;
+ dsk->u.entries = bnd->entries;
+ dsk->mem_size =
+ tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
+ WT_ERR(__rec_split_write(session, r, bnd, tmp, 0));
+ }
+
+ /*
+ * There is probably a remnant in the working buffer that didn't get
+ * written; copy it down to the beginning of the working buffer, and
+ * update the starting record number.
+ *
+ * Confirm the remnant is no larger than the available split buffer.
+ *
+ * Fix up our caller's information.
+ */
+ len = WT_PTRDIFF32(r->first_free, bnd->start);
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, EINVAL,
+ "Reconciliation remnant too large for the split buffer");
+
+ dsk = r->dsk.mem;
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, bnd->start, len);
+
+ r->entries -= r->total_entries;
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_write --
+ * Write a disk block out for the split helper functions.
+ */
+static int
+__rec_split_write(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+ WT_UPD_SKIPPED *skip;
+ size_t addr_size;
+ uint32_t bnd_slot, i, j;
+ int cmp;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ dsk = buf->mem;
+ page = r->page;
+ mod = page->modify;
+
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Set the zero-length value flag in the page header. */
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
+
+ if (r->entries != 0 && r->all_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
+ if (r->entries != 0 && !r->any_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
+ }
+
+ /* Initialize the address (set the page type for the parent). */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ bnd->addr.type = WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ bnd->addr.type = WT_ADDR_INT;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
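+ /*
+ * Save the block's size; the checksum is only calculated later,
+ * and only if the block might be re-used.
+ */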
+ bnd->size = (uint32_t)buf->size;
+ bnd->cksum = 0;
+
+ /*
+ * Check if we've skipped updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a skipped
+ * update that doesn't belong to the block, they're in sorted order.
+ *
+ * This code requires a key be filled in for the next block (or the
+ * last block flag be set, if there's no next block).
+ */
+ for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
+ /* The last block gets all remaining skipped updates. */
+ if (last_block) {
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ continue;
+ }
+
+ /*
+ * Get the skipped update's key and compare it with this block's
+ * key range. If the skipped update list belongs with the block
+ * we're about to write, move it to the per-block memory. Check
+ * only to the first update that doesn't go with the block, they
+ * must be in sorted order.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
+ goto skip_check_complete;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, skip->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &(bnd + 1)->key, &cmp));
+ if (cmp >= 0)
+ goto skip_check_complete;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ }
+
+skip_check_complete:
+ /*
+ * If there are updates that weren't moved to the block, shuffle them to
+ * the beginning of the cached list (we maintain the skipped updates in
+ * sorted order, new skipped updates must be appended to the list).
+ */
+ for (j = 0; i < r->skip_next; ++j, ++i)
+ r->skip[j] = r->skip[i];
+ r->skip_next = j;
+
+ /*
+ * If we had to skip updates in order to build this disk image, we can't
+ * actually write it. Instead, we will re-instantiate the page using the
+ * disk image and the list of updates we skipped.
+ *
+ * If the buffer is compressed (raw compression was configured), we have
+ * to decompress it so we can instantiate it later.
+ */
+ if (bnd->skip != NULL) {
+ if (bnd->already_compressed)
+ WT_ERR(__rec_raw_decompress(
+ session, buf->data, buf->size, &bnd->dsk));
+ else {
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &bnd->dsk));
+ WT_ASSERT(session, __wt_verify_dsk_image(session,
+ "[evict split]", buf->data, buf->size) == 0);
+ }
+ goto done;
+ }
+
+ /*
+ * If we wrote this block before, re-use it. Pages get written in the
+ * same block order every time, only check the appropriate slot. The
+ * expensive part of this test is the checksum, only do that work when
+ * there has been or will be a reconciliation of this page involving
+ * split pages. This test isn't perfect: we're doing a checksum if a
+ * previous reconciliation of the page split or if we will split this
+ * time, but that test won't calculate a checksum on the first block
+ * the first time the page splits.
+ */
+ bnd_slot = (uint32_t)(bnd - r->bnd);
+ if (bnd_slot > 1 ||
+ (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) {
+ /*
+ * There are page header fields which need to be cleared to get
+ * consistent checksums: specifically, the write generation and
+ * the memory owned by the block manager. We are reusing the
+ * same buffer space each time, clear it before calculating the
+ * checksum.
+ */
+ dsk->write_gen = 0;
+ memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
+ bnd->cksum = __wt_cksum(buf->data, buf->size);
+
+ if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) &&
+ mod->mod_multi_entries > bnd_slot) {
+ multi = &mod->mod_multi[bnd_slot];
+ if (multi->size == bnd->size &&
+ multi->cksum == bnd->cksum) {
+ multi->addr.reuse = 1;
+ bnd->addr = multi->addr;
+
+ WT_STAT_FAST_DATA_INCR(session, rec_page_match);
+ goto done;
+ }
+ }
+ }
+
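+ /* Write the block, then save a copy of its address cookie. */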
+ WT_ERR(__wt_bt_write(session,
+ buf, addr, &addr_size, 0, bnd->already_compressed));
+ WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
+ bnd->addr.size = (uint8_t)addr_size;
+
+done:
+err: __wt_scr_free(&key);
+ return (ret);
+}
+
+/*
+ * __wt_bulk_init --
+ * Bulk insert initialization.
+ */
+int
+__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE_INDEX *pindex;
+ WT_RECONCILE *r;
+ uint64_t recno;
+
+ btree = S2BT(session);
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ */
+ if (!btree->bulk_load_ok)
+ WT_RET_MSG(session, EINVAL,
+ "bulk-load is only possible for newly created trees");
+
+ /* Set a reference to the empty leaf page. */
+ pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ cbulk->ref = pindex->index[0];
+ cbulk->leaf = cbulk->ref->page;
+
+ WT_RET(
+ __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
+ r = cbulk->reconcile;
+ r->is_bulk_load = 1;
+
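+ /*
+ * Set the initial record number: column-store records are numbered
+ * from 1, row stores don't use record numbers.
+ */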
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ recno = 1;
+ break;
+ case BTREE_ROW:
+ recno = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (__rec_split_init(
+ session, r, cbulk->leaf, recno, btree->maxleafpage));
+}
+
+/*
+ * __wt_bulk_wrapup --
+ * Bulk insert cleanup.
+ */
+int
+__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE *parent;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ if (cbulk->entry != 0)
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ break;
+ case BTREE_COL_VAR:
+ if (cbulk->rle != 0)
+ WT_RET(__wt_bulk_insert_var(session, cbulk));
+ break;
+ case BTREE_ROW:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__rec_split_finish(session, r));
+ WT_RET(__rec_write_wrapup(session, r, r->page));
+
+ /* Mark the page's parent dirty. */
+ parent = r->ref->home;
+ WT_RET(__wt_page_modify_init(session, parent));
+ __wt_page_modify_set(session, parent);
+
+ __rec_destroy(session, &cbulk->reconcile);
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_row --
+ * Row-store bulk insert.
+ */
+int
+__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_KV *key, *val;
+ WT_RECONCILE *r;
+ int ovfl_key;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ key = &r->k;
+ val = &r->v;
+ WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
+ cursor->key.data, cursor->key.size, &ovfl_key));
+ WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
+ cursor->value.data, cursor->value.size, (uint64_t)0));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else {
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key written
+ * to the new page, and (unless we're already working
+ * with an overflow key), rebuild the key without prefix
+ * compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+ return (0);
+}
+
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ * Check if a bulk-loaded fixed-length column store page needs to split.
+ */
+static inline int
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_RECONCILE *r;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ if (cbulk->entry == cbulk->nrecs) {
+ if (cbulk->entry != 0) {
+ /*
+ * If the page is full, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+ }
+ cbulk->entry = 0;
+ cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_fix --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (cbulk->bitmap) {
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries =
+ WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+ }
+
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
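+ /* Set a single bit-field entry from the cursor's value. */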
+ __bit_setv(r->first_free,
+ cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ ++cbulk->entry;
+ ++r->recno;
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_var --
+ * Variable-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ /*
+ * Store the bulk cursor's last buffer, not the current value: we're
+ * creating a duplicate count, which means we want the previous value
+ * seen, not the current value.
+ */
+ val = &r->v;
+ WT_RET(__rec_cell_build_val(
+ session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, cbulk->rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += cbulk->rle;
+
+ return (0);
+}
+
+/*
+ * __rec_vtype --
+ * Return a value cell's address type.
+ */
+static inline u_int
+__rec_vtype(WT_ADDR *addr)
+{
+ if (addr->type == WT_ADDR_INT)
+ return (WT_CELL_ADDR_INT);
+ if (addr->type == WT_ADDR_LEAF)
+ return (WT_CELL_ADDR_LEAF);
+ return (WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __rec_col_int --
+ * Reconcile a column-store internal page.
+ */
+static int
+__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_DECL_RET;
+ WT_KV *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ int hazard, state;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_intl_recno, btree->maxintlpage));
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /* Update the starting record number in case we split. */
+ r->recno = ref->key.recno;
+
+ /*
+ * Modified child.
+ * The page may be emptied or internally created during a split.
+ * Deleted/split pages are merged into the parent and discarded.
+ */
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = NULL;
+ child = ref->page;
+ if (state != 0) {
+ /*
+ * Currently the only non-zero returned state possible
+ * for a column-store page is child-modified (all other
+ * states are part of the fast-truncate support, which
+ * is row-store only).
+ */
+ WT_ASSERT(session, state == WT_CHILD_MODIFIED);
+
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Column-store pages are almost never empty, as
+ * discarding a page would remove a chunk of the
+ * name space. The exceptions are pages created
+ * when the tree is created, and never filled.
+ */
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ WT_ERR(__rec_col_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ /*
+ * Build the value cell. The child page address is in one of 3
+ * places: if the page was replaced, the page's modify structure
+ * references it and we set addr just above in the switch
+ * statement. Else, the WT_REF->addr reference points to an
+ * on-page cell or an off-page WT_ADDR structure: if it's an
+ * on-page cell, we copy it from the page; else we build a new
+ * cell.
+ */
+ if (addr == NULL && __wt_off_page(page, ref->addr))
+ addr = ref->addr;
+ if (addr == NULL) {
+ __wt_cell_unpack(ref->addr, vpack);
+ val->buf.data = ref->addr;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+ } else
+ __rec_cell_build_addr(r, addr->addr, addr->size,
+ __rec_vtype(addr), ref->key.recno);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(session, r));
+ else
+ WT_ERR(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_col_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Update the starting record number in case we split. */
+ r->recno = multi->key.recno;
+
+ /* Build the value cell. */
+ addr = &multi->addr;
+ __rec_cell_build_addr(r,
+ addr->addr, addr->size, __rec_vtype(addr), r->recno);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ }
+ return (0);
+}
+
+/*
+ * __rec_col_fix --
+ * Reconcile a fixed-width, column-store leaf page.
+ */
+static int
+__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint64_t recno;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* Update any changes to the original on-page data items. */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd != NULL)
+ __bit_setv_recno(page, WT_INSERT_RECNO(ins),
+ btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ }
+
+ /* Copy the updated, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
+
+ /* Calculate the number of additional entries that fit on the page. */
+ entry = page->pg_fix_entries;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(
+ btree, r->space_avail) - page->pg_fix_entries;
+ r->recno += entry;
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (;;) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ for (recno = WT_INSERT_RECNO(ins);
+ nrecs > 0 && r->recno < recno;
+ --nrecs, ++entry, ++r->recno)
+ __bit_setv(
+ r->first_free, entry, btree->bitcnt, 0);
+
+ if (nrecs > 0) {
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ --nrecs;
+ ++entry;
+ ++r->recno;
+ break;
+ }
+
+ /*
+ * If the page is full, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ }
+
+ /* Update the counters. */
+ __rec_incr(
+ session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_fix_slvg --
+ * Reconcile a fixed-width, column-store leaf page created during salvage.
+ */
+static int
+__rec_col_fix_slvg(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ uint64_t page_start, page_take;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ /*
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it during salvage and clean up after it here because it doesn't
+ * cost much and future column-store formats or operations might allow
+ * for fixed-length format ranges to overlap during salvage, and I
+ * don't want to have to retrofit the code later.
+ */
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* We may not be taking all of the entries on the original page. */
+ page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take;
+ page_start = salvage->skip == 0 ? 0 : salvage->skip;
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+
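+ /* Fill in any missing records with zero entries. */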
+ for (; nrecs > 0 && salvage->missing > 0;
+ --nrecs, --salvage->missing, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt, 0);
+
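+ /* Copy the records we're taking from the original page. */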
+ for (; nrecs > 0 && page_take > 0;
+ --nrecs, --page_take, ++page_start, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ __bit_getv(page->pg_fix_bitf,
+ (uint32_t)page_start, btree->bitcnt));
+
+ r->recno += entry;
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /*
+ * We can't split during salvage -- if everything didn't fit, it's
+ * all gone wrong.
+ */
+ if (salvage->missing != 0 || page_take != 0)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(page->type));
+
+ /* Write the page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_var_helper --
+ * Create a column-store variable length record cell and write it onto a
+ * page.
+ */
+static int
+__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_SALVAGE_COOKIE *salvage,
+ WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * Occasionally, salvage needs to discard records from the beginning or
+ * end of the page, and because the items may be part of a RLE cell, do
+ * the adjustments here. It's not a mistake that we don't bother
+ * telling our caller when we've handled all the records it cares
+ * about and it could quit processing the page: salvage is a rare
+ * operation and we don't want to complicate our caller's loop.
+ */
+ if (salvage != NULL) {
+ if (salvage->done)
+ return (0);
+ if (salvage->skip != 0) {
+ if (rle <= salvage->skip) {
+ salvage->skip -= rle;
+ return (0);
+ }
+ rle -= salvage->skip;
+ salvage->skip = 0;
+ }
+ if (salvage->take != 0) {
+ if (rle <= salvage->take)
+ salvage->take -= rle;
+ else {
+ rle = salvage->take;
+ salvage->take = 0;
+ }
+ if (salvage->take == 0)
+ salvage->done = 1;
+ }
+ }
+
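+ /*
+ * Build the cell to be written: a deleted cell, an overflow
+ * value cell, or an ordinary value cell.
+ */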
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else if (overflow_type) {
+ val->cell_len = __wt_cell_pack_ovfl(
+ &val->cell, overflow_type, rle, value->size);
+ val->buf.data = value->data;
+ val->buf.size = value->size;
+ val->len = val->cell_len + value->size;
+ } else
+ WT_RET(__rec_cell_build_val(
+ session, r, value->data, value->size, rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (!deleted && !overflow_type && btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += rle;
+
+ return (0);
+}
+
+/*
+ * __rec_col_var --
+ * Reconcile a variable-width column-store leaf page.
+ */
+static int
+__rec_col_var(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_COL *cip;
+ WT_DECL_ITEM(orig);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_ITEM *last;
+ WT_UPDATE *upd;
+ uint64_t n, nrepeat, repeat_count, rle, src_recno;
+ uint32_t i, size;
+ int deleted, last_deleted, orig_deleted, update_no_copy;
+ const void *data;
+
+ btree = S2BT(session);
+ last = r->last;
+ vpack = &_vpack;
+
+ WT_RET(__wt_scr_alloc(session, 0, &orig));
+ data = NULL;
+ size = 0;
+ upd = NULL;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_var_recno, btree->maxleafpage));
+
+ /*
+ * The salvage code may be calling us to reconcile a page where there
+ * were missing records in the column-store name space. If taking the
+ * first record from the page, it might be a deleted record, so we
+ * have to give the RLE code a chance to figure that out. Else, if
+ * not taking the first record from the page, write a single element
+ * representing the missing records onto a new page. (Don't pass the
+ * salvage cookie to our helper function in this case, we're handling
+ * one of the salvage cookie fields on our own, and we don't need the
+ * helper function's assistance.)
+ */
+ rle = 0;
+ last_deleted = 0;
+ if (salvage != NULL && salvage->missing != 0) {
+ if (salvage->skip == 0) {
+ rle = salvage->missing;
+ last_deleted = 1;
+
+ /*
+ * Correct the number of records we're going to "take",
+ * pretending the missing records were on the page.
+ */
+ salvage->take += salvage->missing;
+ } else
+ WT_ERR(__rec_col_var_helper(
+ session, r, NULL, NULL, 1, 0, salvage->missing));
+ }
+
+ /*
+ * We track two data items through this loop: the previous (last) item
+ * and the current item: if the last item is the same as the current
+ * item, we increment the RLE count for the last item; if the last item
+ * is different from the current item, we write the last item onto the
+ * page, and replace it with the current item. The r->recno counter
+ * tracks records written to the page, and is incremented by the helper
+ * function immediately after writing records to the page. The record
+ * number of our source record, that is, the current item, is maintained
+ * in src_recno.
+ */
+ src_recno = r->recno + rle;
+
+ /* For each entry in the in-memory page... */
+ WT_COL_FOREACH(page, cip, i) {
+ ovfl_state = OVFL_IGNORE;
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ nrepeat = 1;
+ ins = NULL;
+ orig_deleted = 1;
+ } else {
+ __wt_cell_unpack(cell, vpack);
+ nrepeat = __wt_cell_rle(vpack);
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+
+ /*
+ * If the original value is "deleted", there's no value
+ * to compare, we're done.
+ */
+ orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0;
+ if (orig_deleted)
+ goto record_loop;
+
+ /*
+ * Overflow items are tricky: we don't know until we're
+ * finished processing the set of values if we need the
+ * overflow value or not. If we don't use the overflow
+ * item at all, we have to discard it from the backing
+ * file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is
+ * still needed by any running transaction, we'll cache
+ * a copy in the reconciliation tracking structures.
+ *
+ * Regardless, we avoid copying in overflow records: if
+ * there's a WT_INSERT entry that modifies a reference
+ * counted overflow record, we may have to write copies
+ * of the overflow record, and in that case we'll do the
+ * comparisons, but we don't read overflow items just to
+ * see if they match records on either side.
+ */
+ if (vpack->ovfl) {
+ ovfl_state = OVFL_UNUSED;
+ goto record_loop;
+ }
+
+ /*
+ * If data is Huffman encoded, we have to decode it in
+ * order to compare it with the last item we saw, which
+ * may have been an update string. This guarantees we
+ * find every single pair of objects we can RLE encode,
+ * including applications updating an existing record
+ * where the new value happens to match a Huffman-
+ * encoded value in a previous or next record.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_COL_VAR, vpack, orig));
+ }
+
+record_loop: /*
+ * Generate on-page entries: loop repeat records, looking for
+ * WT_INSERT entries matching the record number. The WT_INSERT
+ * lists are in sorted order, so we only need to check the next one.
+ */
+ for (n = 0;
+ n < nrepeat; n += repeat_count, src_recno += repeat_count) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
+ WT_ERR(__rec_txn_read(
+ session, r, ins, NULL, vpack, &upd));
+ ins = WT_SKIP_NEXT(ins);
+ }
+ if (upd != NULL) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = 0;
+
+ /*
+ * If doing update save and restore, there's an
+ * update that's not globally visible, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring the on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case: if we remove an underlying
+ * overflow object, checkpoint reconciliation
+ * should never see it again, there should be a
+ * visible update in the way.
+ *
+ * Write a placeholder.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ data = "@";
+ size = 1;
+ } else {
+ update_no_copy = 0; /* Maybe data copy */
+
+ /*
+ * The repeat count is the number of records up
+ * to the next WT_INSERT record, or up to the
+ * end of the entry if we have no more WT_INSERT
+ * records.
+ */
+ if (ins == NULL)
+ repeat_count = nrepeat - n;
+ else
+ repeat_count =
+ WT_INSERT_RECNO(ins) - src_recno;
+
+ deleted = orig_deleted;
+ if (deleted)
+ goto compare;
+
+ /*
+ * If we are handling overflow items, use the
+ * overflow item itself exactly once, after
+ * which we have to copy it into a buffer and
+ * from then on use a complete copy because we
+ * are re-creating a new overflow record each
+ * time.
+ */
+ switch (ovfl_state) {
+ case OVFL_UNUSED:
+ /*
+ * An as-yet-unused overflow item.
+ *
+ * We're going to copy the on-page cell,
+ * write out any record we're tracking.
+ */
+ if (rle != 0) {
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last,
+ last_deleted, 0, rle));
+ rle = 0;
+ }
+
+ last->data = vpack->data;
+ last->size = vpack->size;
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, 0,
+ WT_CELL_VALUE_OVFL, repeat_count));
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ ovfl_state = OVFL_USED;
+ continue;
+ case OVFL_USED:
+ /*
+ * Original is an overflow item; we wrote
+ * it once and now we need another copy;
+ * read it into memory.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_COL_VAR, vpack, orig));
+
+ ovfl_state = OVFL_IGNORE;
+ /* FALLTHROUGH */
+ case OVFL_IGNORE:
+ /*
+ * Original is an overflow item and we
+ * were forced to copy it into memory,
+ * or the original wasn't an overflow
+ * item; use the data copied into orig.
+ */
+ data = orig->data;
+ size = (uint32_t)orig->size;
+ break;
+ }
+ }
+
+compare: /*
+ * If we have a record against which to compare, and
+ * the records compare equal, increment the rle counter
+ * and continue. If the records don't compare equal,
+ * output the last record and swap the last and current
+ * buffers: do NOT update the starting record number,
+ * we've been doing that all along.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ rle += repeat_count;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ /*
+ * We can't simply assign the data values into
+ * the last buffer because they may have come
+ * from a copy built from an encoded/overflow
+ * cell and creating the next record is going
+ * to overwrite that memory. Check, because
+ * encoded/overflow cells aren't that common
+ * and we'd like to avoid the copy. If data
+ * was taken from the current unpack structure
+ * (which points into the page), or was taken
+ * from an update structure, we can just use
+ * the pointers, they're not moving.
+ */
+ if (data == vpack->data || update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+ last_deleted = deleted;
+ rle = repeat_count;
+ }
+
+ /*
+ * If we had a reference to an overflow record we never used,
+ * discard the underlying blocks, they're no longer useful.
+ *
+ * One complication: we must cache a copy before discarding the
+ * on-disk version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (ovfl_state == OVFL_UNUSED &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_cache(session, page, upd, vpack));
+ }
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ if (src_recno < n)
+ deleted = 1;
+ else {
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state. We always assign the
+ * data values to the buffer because they can only be
+ * the data from a WT_UPDATE structure.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ last->data = data;
+ last->size = size;
+ }
+ last_deleted = deleted;
+ rle = 1;
+ }
+ }
+
+ /* If we were tracking a record, write it. */
+ if (rle != 0)
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, last_deleted, 0, rle));
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&orig);
+ return (ret);
+}
+
+/*
+ * __rec_row_int --
+ * Reconcile a row-store internal page.
+ */
+static int
+__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_KV *key, *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ size_t size;
+ u_int vtype;
+ int hazard, key_onpage_ovfl, ovfl_key, state;
+ const void *p;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ key = &r->k;
+ kpack = &_kpack;
+ WT_CLEAR(*kpack); /* -Wuninitialized */
+ val = &r->v;
+ vpack = &_vpack;
+ WT_CLEAR(*vpack); /* -Wuninitialized */
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage));
+
+ /*
+ * Ideally, we'd never store the 0th key on row-store internal pages
+ * because it's never used during tree search and there's no reason
+ * to waste the space. The problem is how we do splits: when we split,
+ * we've potentially picked out several "split points" in the buffer
+ * which is overflowing the maximum page size, and when the overflow
+ * happens, we go back and physically split the buffer, at those split
+ * points, into new pages. It would be both difficult and expensive
+ * to re-process the 0th key at each split point to be an empty key,
+ * so we don't do that. However, we are reconciling an internal page
+ * for whatever reason, and the 0th key is known to be useless. We
+ * truncate the key to a single byte instead of removing it entirely;
+ * it simplifies various things in other parts of the code (we don't
+ * have to special case transforming the page from its disk image to
+ * its in-memory version, for example).
+ */
+ r->cell_zero = 1;
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * There are different paths if the key is an overflow item vs.
+ * a straight-forward on-page value. If an overflow item, we
+ * would have instantiated it, and we can use that fact to set
+ * things up.
+ *
+ * Note the cell reference and unpacked key cell are available
+ * only in the case of an instantiated, off-page key.
+ */
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey == NULL || ikey->cell_offset == 0) {
+ cell = NULL;
+ key_onpage_ovfl = 0;
+ } else {
+ cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ __wt_cell_unpack(cell, kpack);
+ key_onpage_ovfl =
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ }
+
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = ref->addr;
+ child = ref->page;
+ vtype = 0;
+
+ /* Deleted child we don't have to write. */
+ if (state == WT_CHILD_IGNORE) {
+ /*
+ * Overflow keys referencing discarded pages are no
+ * longer useful, schedule them for discard. Don't
+ * worry about instantiation, internal page keys are
+ * always instantiated. Don't worry about reuse,
+ * reusing this key in this reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ }
+
+ /* Deleted child requiring a proxy cell. */
+ if (state == WT_CHILD_PROXY)
+ vtype = WT_CELL_ADDR_DEL;
+
+ /*
+ * Modified child. Empty pages are merged into the parent and
+ * discarded.
+ */
+ if (state == WT_CHILD_MODIFIED)
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Overflow keys referencing empty pages are no
+ * longer useful, schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ /*
+ * Overflow keys referencing split pages are no
+ * longer useful (the split page's key is the
+ * interesting key); schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+
+ WT_ERR(__rec_row_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ /*
+ * If the page is replaced, the page's modify
+ * structure has the page's address.
+ */
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Build the value cell, the child page's address. Addr points
+ * to an on-page cell or an off-page WT_ADDR structure. The
+ * cell type has been set in the case of page deletion requiring
+ * a proxy cell, otherwise use the information from the addr or
+ * original cell.
+ */
+ if (__wt_off_page(page, addr)) {
+ p = addr->addr;
+ size = addr->size;
+ if (vtype == 0)
+ vtype = __rec_vtype(addr);
+ } else {
+ __wt_cell_unpack(ref->addr, vpack);
+ p = vpack->data;
+ size = vpack->size;
+ if (vtype == 0)
+ vtype = vpack->raw;
+ }
+ __rec_cell_build_addr(r, p, size, vtype, 0);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /*
+ * Build key cell.
+ * Truncate any 0th key, internal pages don't need 0th keys.
+ */
+ if (key_onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+ } else {
+ __wt_ref_key(page, ref, &p, &size);
+ WT_ERR(__rec_cell_build_int_key(
+ session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
+ }
+ r->cell_zero = 0;
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session,
+ r->cur, WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+ }
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *key, *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+ int ovfl_key;
+
+ mod = page->modify;
+
+ key = &r->k;
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Build the key and value cells. */
+ WT_RET(__rec_cell_build_int_key(session, r,
+ WT_IKEY_DATA(multi->key.ikey),
+ r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
+ r->cell_zero = 0;
+
+ addr = &multi->addr;
+ __rec_cell_build_addr(
+ r, addr->addr, addr->size, __rec_vtype(addr), 0);
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+ return (0);
+}
+
+/*
+ * __rec_row_leaf --
+ * Reconcile a row-store leaf page.
+ */
+static int
+__rec_row_leaf(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell, *val_cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_ITEM(tmpkey);
+ WT_DECL_ITEM(tmpval);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_INSERT *ins;
+ WT_KV *key, *val;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ size_t size;
+ uint64_t slvg_skip;
+ uint32_t i;
+ int dictionary, onpage_ovfl, ovfl_key;
+ const void *p;
+ void *copy;
+
+ btree = S2BT(session);
+ slvg_skip = salvage == NULL ? 0 : salvage->skip;
+
+ key = &r->k;
+ val = &r->v;
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage));
+
+ /*
+ * Write any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
+ WT_RET(__rec_row_leaf_insert(session, r, ins));
+
+ /*
+ * Temporary buffers in which to instantiate any uninstantiated keys
+ * or value items we need.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
+ WT_RET(__wt_scr_alloc(session, 0, &tmpval));
+
+ /* For each entry in the page... */
+ WT_ROW_FOREACH(page, rip, i) {
+ /*
+ * The salvage code, on some rare occasions, wants to reconcile
+ * a page but skip some leading records on the page. Because
+ * the row-store leaf reconciliation function copies keys from
+ * the original disk page, this is non-trivial -- just changing
+ * the in-memory pointers isn't sufficient, we have to change
+ * the WT_CELL structures on the disk page, too. It's ugly, but
+ * we pass in a value that tells us how many records to skip in
+ * this case.
+ */
+ if (slvg_skip != 0) {
+ --slvg_skip;
+ continue;
+ }
+
+ /*
+ * Figure out the key: set any cell reference (and unpack it),
+ * set any instantiated key reference.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (cell == NULL)
+ kpack = NULL;
+ else {
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ }
+
+ /* Unpack the on-page value cell, and look for an update. */
+ if ((val_cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ vpack = NULL;
+ else {
+ vpack = &_vpack;
+ __wt_cell_unpack(val_cell, vpack);
+ }
+ WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd));
+
+ /* Build value cell. */
+ dictionary = 0;
+ if (upd == NULL) {
+ /*
+ * When the page was read into memory, there may not
+ * have been a value item.
+ *
+ * If there was a value item, check if it's a dictionary
+ * cell (a copy of another item on the page). If it's a
+ * copy, we have to create a new value item as the old
+ * item might have been discarded from the page.
+ */
+ if (vpack == NULL) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else if (vpack->raw == WT_CELL_VALUE_COPY) {
+ /* If the item is Huffman encoded, decode it. */
+ if (btree->huffman_value == NULL) {
+ p = vpack->data;
+ size = vpack->size;
+ } else {
+ WT_ERR(__wt_huffman_decode(session,
+ btree->huffman_value,
+ vpack->data, vpack->size,
+ tmpval));
+ p = tmpval->data;
+ size = tmpval->size;
+ }
+ WT_ERR(__rec_cell_build_val(
+ session, r, p, size, (uint64_t)0));
+ dictionary = 1;
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing update save and restore in service
+ * of eviction, there's an update that's not
+ * globally visible, and the underlying value
+ * is a removed overflow value, we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring any on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ /*
+ * If the key is also a removed overflow item,
+ * don't write anything at all.
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is that
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__rec_cell_build_val(
+ session, r, "@", 1, (uint64_t)0));
+ } else {
+ val->buf.data = val_cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = 1;
+ }
+ } else {
+ /*
+ * If the original value was an overflow and we've not
+ * already done so, discard it. One complication: we
+ * must cache a copy before discarding the on-disk
+ * version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(
+ __wt_ovfl_cache(session, page, rip, vpack));
+
+ /* If this key/value pair was deleted, we're done. */
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ /*
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+ * example, cursor-based truncation), do
+ * it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, 1));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ }
+
+ /*
+ * If no value, nothing needs to be copied. Otherwise,
+ * build the value's WT_CELL chunk from the most recent
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size,
+ (uint64_t)0));
+ dictionary = 1;
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or by
+ * building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+ * Grow the buffer as necessary, ensuring the
+ * data has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, kpack, r->cur));
+ onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page and, unless we're already
+ * working with an overflow key, rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&tmpkey);
+ __wt_scr_free(&tmpval);
+ return (ret);
+}
+
+/*
+ * __rec_row_leaf_insert --
+ * Walk an insert chain, writing K/V pairs.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+ WT_BTREE *btree;
+ WT_KV *key, *val;
+ WT_UPDATE *upd;
+ int ovfl_key;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+ val = &r->v;
+
+ for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+ /* Look for an update. */
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ if (upd->size == 0) /* Build value cell. */
+ val->len = 0;
+ else
+ WT_RET(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size, (uint64_t)0));
+
+ /* Build key cell. */
+ WT_RET(__rec_cell_build_leaf_key(session, r,
+ WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_RET(__rec_split_raw(session, r));
+ continue;
+ }
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page and, unless we're already
+ * working with an overflow key, rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_split_discard --
+ * Discard the pages resulting from a previous split.
+ */
+static int
+__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+ WT_MULTI *multi;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * A page that split is being reconciled for the second, or subsequent
+ * time; discard underlying block space used in the last reconciliation
+ * that is not being reused for this reconciliation.
+ */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ if (multi->skip == NULL) {
+ if (multi->addr.reuse)
+ multi->addr.addr = NULL;
+ else {
+ WT_RET(bm->free(bm, session,
+ multi->addr.addr, multi->addr.size));
+ __wt_free(session, multi->addr.addr);
+ }
+ } else {
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ }
+ }
+ __wt_free(session, mod->mod_multi);
+ mod->mod_multi_entries = 0;
+
+ /*
+ * This routine would be trivial, and only walk a single page freeing
+ * any blocks written to support the split, except for root splits.
+ * In the case of root splits, we have to cope with multiple pages in
+ * a linked list, and we also have to discard overflow items written
+ * for the page.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (mod->mod_root_split == NULL)
+ break;
+ WT_RET(__rec_split_discard(session, mod->mod_root_split));
+ WT_RET(__wt_ovfl_track_wrapup(session, mod->mod_root_split));
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_write_wrapup --
+ * Finish the reconciliation.
+ */
+static int
+__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ mod = page->modify;
+ ref = r->ref;
+
+ /*
+ * This page may have previously been reconciled, and that information
+ * is now about to be replaced. Make sure it's discarded at some point,
+ * and clear the underlying modification information, we're creating a
+ * new reality.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case 0: /*
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened or previously deleted pages
+ * instantiated in memory.
+ *
+ * The exception is root pages: they are never tracked or free'd,
+ * they are checkpoints, and must be explicitly dropped.
+ */
+ if (__wt_ref_is_root(ref))
+ break;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice).
+ */
+ WT_RET(__wt_ref_info(
+ session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(
+ session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+ break;
+ case WT_PM_REC_EMPTY: /* Page deleted */
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /*
+ * Discard the multiple replacement blocks.
+ */
+ WT_RET(__rec_split_discard(session, page));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ /*
+ * Discard the replacement leaf page's blocks.
+ *
+ * The exception is root pages: they are never tracked or free'd,
+ * they are checkpoints, and must be explicitly dropped.
+ */
+ if (!__wt_ref_is_root(ref))
+ WT_RET(bm->free(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size));
+
+ /* Discard the replacement page's address. */
+ __wt_free(session, mod->mod_replace.addr);
+ mod->mod_replace.size = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_CLR(mod, WT_PM_REC_MASK);
+
+ /*
+ * Wrap up overflow tracking. If we are about to create a checkpoint,
+ * the system must be entirely consistent at that point (the underlying
+ * block manager is presumably going to do some action to resolve the
+ * list of allocated/free/whatever blocks that are associated with the
+ * checkpoint).
+ */
+ WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+ switch (r->bnd_next) {
+ case 0: /* Page delete */
+ WT_RET(__wt_verbose(
+ session, WT_VERB_RECONCILE, "page %p empty", page));
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
+
+ /* If this is the root page, we need to create a sync point. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(
+ bm->checkpoint(bm, session, NULL, btree->ckpt, 0));
+
+ /*
+ * If the page was empty, we want to discard it from the tree
+ * by discarding the parent's key when evicting the parent.
+ * Mark the page as deleted, then return success, leaving the
+ * page in memory. If the page is subsequently modified, that
+ * is OK, we'll just reconcile it again.
+ */
+ F_SET(mod, WT_PM_REC_EMPTY);
+ break;
+ case 1: /* 1-for-1 page swap */
+ /*
+ * Because WiredTiger's pages grow without splitting, we're
+ * replacing a single page with another single page most of
+ * the time.
+ */
+ bnd = &r->bnd[0];
+
+ /*
+ * If we're saving/restoring changes for this page, there's
+ * nothing to write. Allocate, then initialize the array of
+ * replacement blocks.
+ */
+ if (bnd->skip != NULL) {
+ WT_RET(__wt_calloc_def(
+ session, r->bnd_next, &mod->mod_multi));
+ multi = mod->mod_multi;
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ mod->mod_multi_entries = 1;
+
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If this is a root page, then we don't have an address and we
+ * have to create a sync point. The address was cleared when
+ * we were about to write the buffer so we know what to do here.
+ */
+ if (bnd->addr.addr == NULL)
+ WT_RET(__wt_bt_write(session,
+ &r->dsk, NULL, NULL, 1, bnd->already_compressed));
+ else {
+ mod->mod_replace = bnd->addr;
+ bnd->addr.addr = NULL;
+ }
+
+ F_SET(mod, WT_PM_REC_REPLACE);
+ break;
+ default: /* Page split */
+ WT_RET(__wt_verbose(session, WT_VERB_RECONCILE,
+ "page %p reconciled into %" PRIu32 " pages",
+ page, r->bnd_next));
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_STAT_FAST_DATA_INCR(
+ session, rec_multiblock_internal);
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Display the actual split keys. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) {
+ WT_DECL_ITEM(tkey);
+ WT_DECL_RET;
+ uint32_t i;
+
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &tkey));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_buf_set_printable(
+ session, tkey,
+ bnd->key.data, bnd->key.size));
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting key "
+ "%.*s",
+ (int)tkey->size,
+ (const char *)tkey->data));
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting recno %" PRIu64,
+ bnd->recno));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+err: __wt_scr_free(&tkey);
+ WT_RET(ret);
+ }
+ if (r->bnd_next > r->bnd_next_max) {
+ r->bnd_next_max = r->bnd_next;
+ WT_STAT_FAST_DATA_SET(
+ session, rec_multiblock_max, r->bnd_next_max);
+ }
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__rec_split_row(session, r, page));
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_RET(__rec_split_col(session, r, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If updates were skipped, the tree isn't clean. The checkpoint call
+ * cleared the tree's modified value before calling the eviction thread,
+ * so we must explicitly reset the tree's modified flag. We insert a
+ * barrier after the change for clarity (the requirement is the value
+ * be set before a subsequent checkpoint reads it, and because the
+ * current checkpoint is waiting on this reconciliation to complete,
+ * there's no risk of that happening).
+ *
+ * Otherwise, if no updates were skipped, we have a new maximum
+ * transaction written for the page (used to decide if a clean page can
+ * be evicted). The page only might be clean; if the write generation
+ * is unchanged since reconciliation started, clear it and update cache
+ * dirty statistics, if the write generation changed, then the page has
+ * been written since we started reconciliation, it cannot be
+ * discarded.
+ */
+ if (r->leave_dirty) {
+ mod->first_dirty_txn = r->skipped_txn;
+
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+ } else {
+ mod->rec_max_txn = r->max_txn;
+
+ if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_write_wrapup_err --
+ * Finish the reconciliation on error.
+ */
+static int
+__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * Clear the address-reused flag from the multiblock reconciliation
+ * information (otherwise we might think the backing block is being
+ * reused on a subsequent reconciliation where we want to free it).
+ */
+ if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i)
+ multi->addr.reuse = 0;
+
+ /*
+ * On error, discard blocks we've written, they're unreferenced by the
+ * tree. This is not a question of correctness, we're avoiding block
+ * leaks.
+ *
+ * Don't discard backing blocks marked for reuse, they remain part of
+ * a previous reconciliation.
+ */
+ WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ if (bnd->addr.addr != NULL) {
+ if (bnd->addr.reuse)
+ bnd->addr.addr = NULL;
+ else {
+ WT_TRET(bm->free(bm, session,
+ bnd->addr.addr, bnd->addr.size));
+ __wt_free(session, bnd->addr.addr);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_split_row --
+ * Split a row-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ uint32_t i;
+ size_t size;
+ void *p;
+
+ mod = page->modify;
+
+ /* We never set the first page's key, grab it from the original page. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+ else {
+ __wt_ref_key(ref->home, ref, &p, &size);
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
+ }
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
+ WT_RET(__wt_row_ikey(session, 0,
+ bnd->key.data, bnd->key.size, &multi->key.ikey));
+
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
+
+/*
+ * __rec_split_col --
+ * Split a column-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
+ multi->key.recno = bnd->recno;
+
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
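+
+/*
+ * Note: this routine mirrors __rec_split_row above; the only difference
+ * is that column-store replacement blocks are keyed by their starting
+ * record number rather than by an instantiated row-store key.
+ */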
+
+/*
+ * __rec_cell_build_int_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store internal page.
+ */
+static int
+__rec_cell_build_int_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ /* Copy the bytes into the "current" and key buffers. */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+ WT_RET(__wt_buf_set(session, &key->buf, data, size));
+
+ /* Create an overflow object if the data won't fit. */
+ if (size > btree->maxintlitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+
+ key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_leaf_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store leaf page.
+ */
+static int
+__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+ size_t pfx_max;
+ uint8_t pfx;
+ const uint8_t *a, *b;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ pfx = 0;
+ if (data == NULL)
+ /*
+ * When data is NULL, our caller has a prefix compressed key
+ * they can't use (probably because they just crossed a split
+ * point). Use the full key saved when last called, instead.
+ */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, r->cur->data, r->cur->size));
+ else {
+ /*
+ * Save a copy of the key for later reference: we use the full
+ * key for prefix-compression comparisons, and if we are, for
+ * any reason, unable to use the compressed key we generate.
+ */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+
+ /*
+ * Do prefix compression on the key. We know by definition the
+ * previous key sorts before the current key, which means the
+ * keys must differ and we just need to compare up to the
+ * shorter of the two keys.
+ */
+ if (r->key_pfx_compress) {
+ /*
+ * The prefix count is stored in a single byte, so we
+ * can't compress out more than UINT8_MAX bytes; limit
+ * the comparison to that.
+ */
+ pfx_max = UINT8_MAX;
+ if (size < pfx_max)
+ pfx_max = size;
+ if (r->last->size < pfx_max)
+ pfx_max = r->last->size;
+ for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
+ if (*a++ != *b++)
+ break;
+
+ /*
+ * Prefix compression may cost us CPU and memory when
+ * the page is re-loaded, don't do it unless there's
+ * reasonable gain.
+ */
+ if (pfx < btree->prefix_compression_min)
+ pfx = 0;
+ else
+ WT_STAT_FAST_DATA_INCRV(
+ session, rec_prefix_compression, pfx);
+ }
+
+ /* Copy the non-prefix bytes into the key buffer. */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, (uint8_t *)data + pfx, size - pfx));
+ }
+
+ /* Optionally compress the key using the Huffman engine. */
+ if (btree->huffman_key != NULL)
+ WT_RET(__wt_huffman_encode(session, btree->huffman_key,
+ key->buf.data, (uint32_t)key->buf.size, &key->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (key->buf.size > btree->maxleafitem) {
+ /*
+ * Overflow objects aren't prefix compressed -- rebuild any
+ * object that was prefix compressed.
+ */
+ if (pfx == 0) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+ return (
+ __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
+ }
+
+ key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
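+
+/*
+ * Worked example: if the previous key was "application" and the current
+ * key is "applied", the comparison loop above stops after 5 matching
+ * bytes ("appli"), so the cell stores pfx = 5 and only the 2-byte
+ * suffix "ed".  If btree->prefix_compression_min were larger than 5,
+ * pfx would be reset to 0 and the full key stored instead.
+ */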
+
+/*
+ * __rec_cell_build_addr --
+ * Process an address reference and return a cell structure to be stored
+ * on the page.
+ */
+static void
+__rec_cell_build_addr(WT_RECONCILE *r,
+ const void *addr, size_t size, u_int cell_type, uint64_t recno)
+{
+ WT_KV *val;
+
+ val = &r->v;
+
+ /*
+ * We don't check the address size because we can't store an address on
+ * an overflow page: if the address won't fit, the overflow page's
+ * address won't fit either. This possibility must be handled by Btree
+ * configuration, we have to disallow internal page sizes that are too
+ * small with respect to the largest address cookie the underlying block
+ * manager might return.
+ */
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = addr;
+ val->buf.size = size;
+ val->cell_len =
+ __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+}
+
+/*
+ * __rec_cell_build_val --
+ * Process a data item and return a WT_CELL structure and byte string to
+ * be stored on the page.
+ */
+static int
+__rec_cell_build_val(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = data;
+ val->buf.size = size;
+
+ /* Handle zero-length cells quickly. */
+ if (size != 0) {
+ /* Optionally compress the data using the Huffman engine. */
+ if (btree->huffman_value != NULL)
+ WT_RET(__wt_huffman_encode(
+ session, btree->huffman_value,
+ val->buf.data, (uint32_t)val->buf.size, &val->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (val->buf.size > btree->maxleafitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);
+
+ return (__rec_cell_build_ovfl(
+ session, r, val, WT_CELL_VALUE_OVFL, rle));
+ }
+ }
+ val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_ovfl --
+ * Store overflow items in the file, returning the address cookie.
+ */
+static int
+__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ page = r->page;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ /*
+ * See if this overflow record has already been written and reuse it if
+ * possible. Else, write a new overflow record.
+ */
+ if (!__wt_ovfl_reuse_search(session, page,
+ &addr, &size, kv->buf.data, kv->buf.size)) {
+ /* Allocate a buffer big enough to write the overflow record. */
+ size = kv->buf.size;
+ WT_RET(bm->write_size(bm, session, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+
+ /* Initialize the buffer: disk header and overflow record. */
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_PAGE_HEADER_SIZE);
+ dsk->type = WT_PAGE_OVFL;
+ dsk->u.datalen = (uint32_t)kv->buf.size;
+ memcpy(WT_PAGE_HEADER_BYTE(btree, dsk),
+ kv->buf.data, kv->buf.size);
+ dsk->mem_size = tmp->size =
+ WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
+
+ /* Write the buffer. */
+ addr = buf;
+ WT_ERR(__wt_bt_write(session, tmp, addr, &size, 0, 0));
+
+ /*
+ * Track the overflow record (unless it's a bulk load, which
+ * by definition won't ever reuse a record).
+ */
+ if (!r->is_bulk_load)
+ WT_ERR(__wt_ovfl_reuse_add(session, page,
+ addr, size, kv->buf.data, kv->buf.size));
+ }
+
+ /* Set the caller's K/V to reference the overflow record's address. */
+ WT_ERR(__wt_buf_set(session, &kv->buf, addr, size));
+
+ /* Build the cell and return. */
+ kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size);
+ kv->len = kv->cell_len + kv->buf.size;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
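+
+/*
+ * For reference, the buffer written above is a self-contained overflow
+ * disk page: a zeroed page header with type WT_PAGE_OVFL and the data
+ * length set, followed by the raw item bytes.  The address cookie
+ * returned by __wt_bt_write is what subsequently gets packed into the
+ * overflow cell on the parent page.
+ */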
+
+/*
+ * The dictionary --
+ * The rest of this file is support for dictionaries.
+ *
+ * It's difficult to write generic skiplist functions without turning a single
+ * memory allocation into two, or requiring a function call instead of a simple
+ * comparison. Fortunately, skiplists are relatively simple things and we can
+ * include them in-place. If you need generic skip-list functions to modify,
+ * this set wouldn't be a bad place to start.
+ *
+ * __rec_dictionary_skip_search --
+ * Search a dictionary skiplist.
+ */
+static WT_DICTIONARY *
+__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ if ((*e)->hash == hash) /* Exact match */
+ return (*e);
+ if ((*e)->hash > hash) { /* Drop down a level */
+ --i;
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
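+
+/*
+ * Worked example: searching for hash 30 in a list containing hashes
+ * 10, 30 and 50, we move right along a level while the next entry's
+ * hash is smaller than 30, drop down a level when it is larger, and
+ * return the entry as soon as its hash matches exactly.
+ */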
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ * Search a dictionary skiplist, returning an insert/remove stack.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+ WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+ if (*e == NULL || (*e)->hash > hash)
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ * Insert an entry into the dictionary skip-list.
+ */
+static void
+__rec_dictionary_skip_insert(
+ WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
+{
+ WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Insert the new entry into the skiplist. */
+ __rec_dictionary_skip_search_stack(head, stack, hash);
+ for (i = 0; i < e->depth; ++i) {
+ e->next[i] = *stack[i];
+ *stack[i] = e;
+ }
+}
+
+/*
+ * __rec_dictionary_init --
+ * Allocate and initialize the dictionary.
+ */
+static int
+__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
+{
+ u_int depth, i;
+
+ /* Free any previous dictionary. */
+ __rec_dictionary_free(session, r);
+
+ r->dictionary_slots = slots;
+ WT_RET(__wt_calloc(session,
+ r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
+ for (i = 0; i < r->dictionary_slots; ++i) {
+ depth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
+ &r->dictionary[i]));
+ r->dictionary[i]->depth = depth;
+ }
+ return (0);
+}
+
+/*
+ * __rec_dictionary_free --
+ * Free the dictionary.
+ */
+static void
+__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ u_int i;
+
+ if (r->dictionary == NULL)
+ return;
+
+ /*
+ * We don't correct dictionary_slots when we fail during allocation,
+ * but that's OK, the value is either NULL or a memory reference to
+ * be free'd.
+ */
+ for (i = 0; i < r->dictionary_slots; ++i)
+ __wt_free(session, r->dictionary[i]);
+ __wt_free(session, r->dictionary);
+}
+
+/*
+ * __rec_dictionary_reset --
+ * Reset the dictionary when reconciliation restarts and when crossing a
+ * page boundary (a potential split).
+ */
+static void
+__rec_dictionary_reset(WT_RECONCILE *r)
+{
+ if (r->dictionary_slots) {
+ r->dictionary_next = 0;
+ memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
+ }
+}
+
+/*
+ * __rec_dictionary_lookup --
+ * Check the dictionary for a matching value on this page.
+ */
+static int
+__rec_dictionary_lookup(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
+{
+ WT_DICTIONARY *dp, *next;
+ uint64_t hash;
+ int match;
+
+ *dpp = NULL;
+
+ /* Search the dictionary, and return any match we find. */
+ hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+ for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
+ dp != NULL && dp->hash == hash; dp = dp->next[0]) {
+ WT_RET(__wt_cell_pack_data_match(
+ dp->cell, &val->cell, val->buf.data, &match));
+ if (match) {
+ WT_STAT_FAST_DATA_INCR(session, rec_dictionary);
+ *dpp = dp;
+ return (0);
+ }
+ }
+
+ /*
+ * We're not doing value replacement in the dictionary. We stop adding
+ * new entries if we run out of empty dictionary slots (but continue to
+ * use the existing entries). I can't think of any reason a leaf page
+ * value is more likely to be seen because it was seen more recently
+ * than some other value: if we find working sets where that's not the
+ * case, it shouldn't be too difficult to maintain a pointer which is
+ * the next dictionary slot to re-use.
+ */
+ if (r->dictionary_next >= r->dictionary_slots)
+ return (0);
+
+ /*
+ * Set the hash value, we'll add this entry into the dictionary when we
+ * write it into the page's disk image buffer (because that's when we
+ * know where on the page it will be written).
+ */
+ next = r->dictionary[r->dictionary_next++];
+ next->cell = NULL; /* Not necessary, just cautious. */
+ next->hash = hash;
+ __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
+ *dpp = next;
+ return (0);
+}
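+
+/*
+ * Usage sketch (assumed; __rec_dict_replace's body is elsewhere in this
+ * file): the caller looks up the value about to be written, and on a
+ * match with an entry whose cell has already been written it can emit a
+ * small copy cell referencing the earlier on-page value instead of
+ * writing the value bytes a second time:
+ *
+ *	WT_DICTIONARY *dp;
+ *	WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ *	if (dp != NULL && dp->cell != NULL)
+ *		(build a copy cell referencing dp->cell)
+ */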
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
new file mode 100644
index 00000000000..308bc1f0dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);
+
+/*
+ * __wt_row_leaf_keys --
+ * Instantiate the interesting keys for random search of a page.
+ */
+int
+__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_ROW *rip;
+ uint32_t gap, i;
+
+ btree = S2BT(session);
+
+ if (page->pg_row_entries == 0) { /* Just checking... */
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+ return (0);
+ }
+
+ /*
+ * Row-store leaf pages are written as one big prefix-compressed chunk,
+ * that is, only the first key on the page is not prefix-compressed, and
+ * to instantiate the last key on the page, you have to take the first
+ * key on the page and roll it forward to the end of the page. We don't
+ * want to do that on every page access, of course, so we instantiate a
+ * set of keys, essentially creating prefix chunks on the page, where we
+ * can roll forward from the closest, previous, instantiated key. The
+ * complication is that not all keys on a page are equal: we're doing a
+ * binary search on the page, which means there are keys we look at a
+ * lot (every time we search the page), and keys we never look at unless
+ * they are actually being searched for. This function figures out the
+ * "interesting" keys on a page, and then we sequentially walk that list
+ * instantiating those keys.
+ *
+ * Allocate a bit array and figure out the set of "interesting" keys,
+ * marking up the array.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+ WT_RET(__wt_scr_alloc(session,
+ (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));
+
+ if ((gap = btree->key_gap) == 0)
+ gap = 1;
+ __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);
+
+ /* Instantiate the keys. */
+ for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
+ if (__bit_test(tmp->mem, i))
+ WT_ERR(__wt_row_leaf_key_work(
+ session, page, rip, key, 1));
+
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+
+err: __wt_scr_free(&key);
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __inmem_row_leaf_slots --
+ * Figure out the interesting slots of a page for random search, up to
+ * the specified depth.
+ */
+static void
+__inmem_row_leaf_slots(
+ uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
+{
+ uint32_t indx, limit;
+
+ if (entries < gap)
+ return;
+
+ /*
+ * !!!
+ * Don't clean this code up -- it deliberately looks like the binary
+ * search code.
+ *
+ * !!!
+ * There's got to be a function that would give me this information, but
+ * I don't see any performance reason we can't just do this recursively.
+ */
+ limit = entries;
+ indx = base + (limit >> 1);
+ __bit_set(list, indx);
+
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap);
+
+ base = indx + 1;
+ --limit;
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap);
+}
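+
+/*
+ * Worked example: with 8 entries and a key gap of 3, the recursion
+ * marks slot 4 (the page midpoint), then slots 2 and 6 (the midpoints
+ * of each half), and stops because the remaining ranges are smaller
+ * than the gap -- exactly the first slots a binary search of the page
+ * would visit.
+ */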
+
+/*
+ * __wt_row_leaf_key_copy --
+ * Get a copy of a row-store leaf-page key.
+ */
+int
+__wt_row_leaf_key_copy(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
+{
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /* The return buffer may only hold a reference to a key, copy it. */
+ if (!WT_DATA_IN_ITEM(key))
+ WT_RET(__wt_buf_set(session, key, key->data, key->size));
+
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_key_work --
+ * Return a reference to a row-store leaf-page key, optionally instantiating
+ * the key into the in-memory page.
+ */
+int
+__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate)
+{
+ enum { FORWARD, BACKWARD } direction;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_ROW *rip, *jump_rip;
+ size_t size;
+ u_int last_prefix;
+ int jump_slot_offset, slot_offset;
+ void *copy;
+ const void *p;
+
+ /*
+ * !!!
+ * It is unusual to call this function: most code should be calling the
+ * front-end, __wt_row_leaf_key, be careful if you're calling this code
+ * directly.
+ */
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ rip = rip_arg;
+
+ jump_rip = NULL;
+ jump_slot_offset = 0;
+ last_prefix = 0;
+
+ p = NULL; /* -Werror=maybe-uninitialized */
+ size = 0; /* -Werror=maybe-uninitialized */
+
+ direction = BACKWARD;
+ for (slot_offset = 0;;) {
+ if (0) {
+switch_and_jump: /* Switching to a forward roll. */
+ WT_ASSERT(session, direction == BACKWARD);
+ direction = FORWARD;
+
+ /* Skip list of keys with compatible prefixes. */
+ rip = jump_rip;
+ slot_offset = jump_slot_offset;
+ }
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out what the key looks like.
+ */
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, &p, &size);
+
+ /* 1: the test for a directly referenced on-page key. */
+ if (cell == NULL) {
+ keyb->data = p;
+ keyb->size = size;
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * This shouldn't normally happen, the fast-path code
+ * that front-ends this function will have figured it
+ * out before we were called.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0)
+ goto done;
+
+ /*
+ * This key is not an overflow key by definition and
+ * isn't compressed in any way, we can use it to roll
+ * forward.
+ * If rolling backward, switch directions.
+ * If rolling forward: there's a bug somewhere,
+ * we should have hit this key when rolling backward.
+ */
+ goto switch_and_jump;
+ }
+
+ /* 2: the test for an instantiated off-page key. */
+ if (ikey != NULL) {
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * Take a copy and wrap up.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0) {
+ keyb->data = p;
+ keyb->size = size;
+ goto done;
+ }
+
+ /*
+ * If we wanted a different key and this key is an
+ * overflow key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
+ goto next;
+
+ /*
+ * If we wanted a different key and this key is not an
+ * overflow key, it has a valid prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward, replace the key we've been
+ * building with this key, it's what we would have built
+ * anyway.
+ * In short: if it's not an overflow key, take a copy
+ * and roll forward.
+ */
+ keyb->data = p;
+ keyb->size = size;
+ direction = FORWARD;
+ goto next;
+ }
+
+ /*
+ * It must be an on-page cell, unpack it.
+ */
+ __wt_cell_unpack(cell, unpack);
+
+ /* 3: the test for an on-page reference to an overflow key. */
+ if (unpack->type == WT_CELL_KEY_OVFL) {
+ /*
+ * If this is the key we wanted from the start, we don't
+ * care if it's an overflow key, get a copy and wrap up.
+ *
+ * Avoid racing with reconciliation deleting overflow
+ * keys. Deleted overflow keys must be instantiated
+ * first, acquire the overflow lock and check. Read
+ * the key if we still need to do so, but holding the
+ * overflow lock. Note we are not using the version of
+ * the cell-data-ref calls that acquire the overflow
+ * lock and do a look-aside into the tracking cache:
+ * this is an overflow key, not a value, meaning it's
+ * instantiated before being deleted, not copied into
+ * the tracking cache.
+ */
+ if (slot_offset == 0) {
+ WT_ERR(
+ __wt_readlock(session, btree->ovfl_lock));
+ copy = WT_ROW_KEY_COPY(rip);
+ if (!__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &keyb->data, &keyb->size)) {
+ __wt_cell_unpack(cell, unpack);
+ ret = __wt_dsk_cell_data_ref(session,
+ WT_PAGE_ROW_LEAF, unpack, keyb);
+ }
+ WT_TRET(
+ __wt_readunlock(session, btree->ovfl_lock));
+ WT_ERR(ret);
+ break;
+ }
+
+ /*
+ * If we wanted a different key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ goto next;
+ }
+
+ /*
+ * 4: the test for an on-page reference to a key that isn't
+ * prefix compressed.
+ */
+ if (unpack->prefix == 0) {
+ /*
+ * The only reason to be here is a Huffman encoded key,
+ * a non-encoded key with no prefix compression should
+ * have been directly referenced, and we should not have
+ * needed to unpack its cell.
+ */
+ WT_ASSERT(session, btree->huffman_key != NULL);
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, it's
+ * what we want. Take a copy and wrap up.
+ *
+ * If we wanted a different key, this key has a valid
+ * prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward there's a bug, we should have
+ * found this key while rolling backwards and switched
+ * directions then.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, keyb));
+ if (slot_offset == 0)
+ goto done;
+ goto switch_and_jump;
+ }
+
+ /*
+ * 5: an on-page reference to a key that's prefix compressed.
+ * If rolling backward, keep looking for something we can
+ * use.
+ * If rolling forward, build the full key and keep rolling
+ * forward.
+ */
+ if (direction == BACKWARD) {
+ /*
+ * If there's a set of keys with identical prefixes, we
+ * don't want to instantiate each one, the prefixes are
+ * all the same.
+ *
+ * As we roll backward through the page, track the last
+ * time the prefix decreased in size, so we can start
+ * with that key during our roll-forward. For a page
+ * populated with a single key prefix, we'll be able to
+ * instantiate the key we want as soon as we find a key
+ * without a prefix.
+ */
+ if (slot_offset == 0)
+ last_prefix = unpack->prefix;
+ if (slot_offset == 0 || last_prefix > unpack->prefix) {
+ jump_rip = rip;
+ jump_slot_offset = slot_offset;
+ last_prefix = unpack->prefix;
+ }
+ }
+ if (direction == FORWARD) {
+ /*
+ * Get a reference to the current key's bytes. Usually
+ * we want bytes from the page, fast-path that case.
+ */
+ if (btree->huffman_key == NULL) {
+ p = unpack->data;
+ size = unpack->size;
+ } else {
+ if (tmp == NULL)
+ WT_ERR(
+ __wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, tmp));
+ p = tmp->data;
+ size = tmp->size;
+ }
+
+ /*
+ * Grow the buffer as necessary as well as ensure data
+ * has been copied into local buffer space, then append
+ * the suffix to the prefix already in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy data we
+ * don't need, truncate the item's data length to the
+ * prefix bytes.
+ */
+ keyb->size = unpack->prefix;
+ WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
+ memcpy((uint8_t *)keyb->data + keyb->size, p, size);
+ keyb->size += size;
+
+ if (slot_offset == 0)
+ break;
+ }
+
+next: switch (direction) {
+ case BACKWARD:
+ --rip;
+ ++slot_offset;
+ break;
+ case FORWARD:
+ ++rip;
+ --slot_offset;
+ break;
+ }
+ }
+
+ /*
+ * Optionally instantiate the key: there's a cost to figuring out a key
+ * value in a leaf page with prefix-compressed or Huffman encoded keys,
+ * amortize the cost by instantiating a copy of the calculated key in
+ * allocated memory. We don't instantiate keys when pages are first
+ * brought into memory because it's wasted effort if the page is only
+ * read by a cursor in sorted order. If, instead, the page is read by a
+ * cursor in reverse order, we immediately instantiate periodic keys for
+ * the page (otherwise the reverse walk would be insanely slow). If,
+ * instead, the page is randomly searched, we instantiate keys as they
+ * are accessed (meaning, for example, as long as the binary search only
+ * touches one-half of the page, the only keys we instantiate will be in
+ * that half of the page).
+ */
+ if (instantiate) {
+ copy = WT_ROW_KEY_COPY(rip_arg);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (ikey == NULL) {
+ WT_ERR(__wt_row_ikey(session,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ keyb->data, keyb->size, &ikey));
+
+ /*
+ * Serialize the swap of the key into place: on success,
+ * update the page's memory footprint, on failure, free
+ * the allocated memory.
+ */
+ if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
+ __wt_cache_page_inmem_incr(session,
+ page, sizeof(WT_IKEY) + ikey->size);
+ else
+ __wt_free(session, ikey);
+ }
+ }
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_row_ikey_incr --
+ * Instantiate a key in a WT_IKEY structure and increment the page's
+ * memory footprint.
+ */
+int
+__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+ WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));
+
+ __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);
+
+ return (0);
+}
+
+/*
+ * __wt_row_ikey --
+ * Instantiate a key in a WT_IKEY structure.
+ */
+int
+__wt_row_ikey(WT_SESSION_IMPL *session,
+ uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+ WT_IKEY *ikey;
+
+ /*
+ * Allocate memory for the WT_IKEY structure and the key, then copy
+ * the key into place.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
+ ikey->size = WT_STORE_SIZE(size);
+ ikey->cell_offset = cell_offset;
+ memcpy(WT_IKEY_DATA(ikey), key, size);
+
+ *(WT_IKEY **)ikeyp = ikey;
+ return (0);
+}
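+
+/*
+ * Note the allocation layout: the key bytes are stored immediately
+ * after the WT_IKEY structure in a single allocation, which is why one
+ * __wt_calloc of sizeof(WT_IKEY) + size is sufficient and WT_IKEY_DATA
+ * can derive the key's address from the structure's address alone.
+ */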
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
new file mode 100644
index 00000000000..e0036d14cbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -0,0 +1,346 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_modify_alloc --
+ * Allocate a page's modification structure.
+ */
+int
+__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_PAGE_MODIFY *modify;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_calloc_def(session, 1, &modify));
+
+ /*
+ * Select a spinlock for the page; let the barrier immediately below
+ * keep things from racing too badly.
+ */
+ modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+
+ /*
+ * Multiple threads of control may be searching and deciding to modify
+ * a page. If our modify structure is used, update the page's memory
+ * footprint, else discard the modify structure, another thread did the
+ * work.
+ */
+ if (WT_ATOMIC_CAS8(page->modify, NULL, modify))
+ __wt_cache_page_inmem_incr(session, page, sizeof(*modify));
+ else
+ __wt_free(session, modify);
+ return (0);
+}
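+
+/*
+ * The pattern above -- allocate, publish with an atomic compare-and-swap,
+ * free on failure -- recurs below (see WT_PAGE_ALLOC_AND_SWAP); it lets
+ * racing threads create shared structures without taking a lock.
+ */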
+
+/*
+ * __wt_row_modify --
+ * Row-store insert, update and delete.
+ */
+int
+__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd, **upd_entry;
+ size_t ins_size, upd_size;
+ uint32_t ins_slot;
+ u_int i, skipdepth;
+ int logged;
+
+ ins = NULL;
+ page = cbt->ref->page;
+ logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove)
+ value = NULL;
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Modify: allocate an update array as necessary, build a WT_UPDATE
+ * structure, and call a serialized function to insert the WT_UPDATE
+ * structure.
+ *
+ * Insert: allocate an insert array as necessary, build a WT_INSERT
+ * and WT_UPDATE structure pair, and call a serialized function to
+ * insert the WT_INSERT structure.
+ */
+ if (cbt->compare == 0) {
+ if (cbt->ins == NULL) {
+ /* Allocate an update array as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_upd, upd_entry, page->pg_row_entries);
+
+ /* Set the WT_UPDATE array reference. */
+ upd_entry = &page->pg_row_upd[cbt->slot];
+ } else
+ upd_entry = &cbt->ins->upd;
+
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = *upd_entry));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else {
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ /*
+ * We are restoring updates that couldn't be evicted,
+ * there should only be one update list per key.
+ */
+ WT_ASSERT(session, *upd_entry == NULL);
+ /*
+ * Set the "old" entry to the second update in the list
+ * so that the serialization function succeeds in
+ * swapping the first update into place.
+ */
+ old_upd = *upd_entry = upd->next;
+ }
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, upd_entry, &upd, upd_size));
+ } else {
+ /*
+ * Allocate the insert array as necessary.
+ *
+ * We allocate an additional insert array slot for insert keys
+ * sorting less than any key on the page. The test to select
+ * that slot is baroque: if the search returned the first page
+ * slot, we didn't end up processing an insert list, and the
+ * comparison value indicates the search key was smaller than
+ * the returned slot, then we're using the smallest-key insert
+ * slot. That's hard, so we set a flag.
+ */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_ins, ins_headp, page->pg_row_entries + 1);
+
+ ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
+ page->pg_row_entries : cbt->slot;
+ ins_headp = &page->pg_row_ins[ins_slot];
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
+ */
+ WT_ERR(__wt_row_insert_alloc(
+ session, key, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, the cursor's
+ * information cannot be correct, search couldn't have
+ * initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Insert the WT_INSERT structure. */
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ cbt->ins = NULL;
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_row_insert_alloc --
+ * Row-store insert: allocate a WT_INSERT structure and fill it in.
+ */
+int
+__wt_row_insert_alloc(WT_SESSION_IMPL *session,
+ WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+ WT_INSERT *ins;
+ size_t ins_size;
+
+ /*
+ * Allocate the WT_INSERT structure, next pointers for the skip list,
+ * and room for the key. Then copy the key into place.
+ */
+ ins_size = sizeof(WT_INSERT) +
+ skipdepth * sizeof(WT_INSERT *) + key->size;
+ WT_RET(__wt_calloc(session, 1, ins_size, &ins));
+
+ ins->u.key.offset = WT_STORE_SIZE(ins_size - key->size);
+ WT_INSERT_KEY_SIZE(ins) = WT_STORE_SIZE(key->size);
+ memcpy(WT_INSERT_KEY(ins), key->data, key->size);
+
+ *insp = ins;
+ if (ins_sizep != NULL)
+ *ins_sizep = ins_size;
+ return (0);
+}
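+
+/*
+ * For example, with a skip depth of 2 and a 10-byte key the allocation
+ * is laid out as [WT_INSERT][2 skiplist pointers][10 key bytes]:
+ * u.key.offset records where the key begins (sizeof(WT_INSERT) plus
+ * 2 * sizeof(WT_INSERT *)), so WT_INSERT_KEY can recover the key's
+ * address from the structure alone.
+ */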
+
+/*
+ * __wt_update_alloc --
+ * Allocate a WT_UPDATE structure and associated value and fill it in.
+ */
+int
+__wt_update_alloc(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
+{
+ WT_UPDATE *upd;
+ size_t size;
+
+ /*
+ * Allocate the WT_UPDATE structure and room for the value, then copy
+ * the value into place.
+ */
+ size = value == NULL ? 0 : value->size;
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
+ if (value == NULL)
+ WT_UPDATE_DELETED_SET(upd);
+ else {
+ upd->size = WT_STORE_SIZE(size);
+ memcpy(WT_UPDATE_DATA(upd), value->data, size);
+ }
+
+ *updp = upd;
+ *sizep = sizeof(WT_UPDATE) + size;
+ return (0);
+}
+
+/*
+ * __wt_update_obsolete_check --
+ * Check for obsolete updates.
+ */
+WT_UPDATE *
+__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_UPDATE *first, *next;
+
+ /*
+ * This function identifies obsolete updates, and truncates them from
+ * the rest of the chain; because this routine is called from inside
+ * a serialization function, the caller has responsibility for actually
+ * freeing the memory.
+ *
+ * Walk the list of updates, looking for obsolete updates at the end.
+ */
+ for (first = NULL; upd != NULL; upd = upd->next)
+ if (__wt_txn_visible_all(session, upd->txnid)) {
+ if (first == NULL)
+ first = upd;
+ } else if (upd->txnid != WT_TXN_ABORTED)
+ first = NULL;
+
+ /*
+ * We cannot discard this WT_UPDATE structure, we can only discard
+ * WT_UPDATE structures subsequent to it, other threads of control will
+ * terminate their walk in this element. Save a reference to the list
+ * we will discard, and terminate the list.
+ */
+ if (first != NULL &&
+ (next = first->next) != NULL &&
+ WT_ATOMIC_CAS8(first->next, next, NULL))
+ return (next);
+
+ return (NULL);
+}
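+
+/*
+ * Worked example: given the newest-to-oldest chain U1 -> U2 -> U3 where
+ * U2 and U3 are visible to all running transactions but U1 is not,
+ * "first" ends up pointing at U2; the compare-and-swap truncates the
+ * chain to U1 -> U2 and returns U3 for the caller to free.  U2 itself
+ * must survive because a concurrent walker may be positioned on it.
+ */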
+
+/*
+ * __wt_update_obsolete_free --
+ * Free an obsolete update list.
+ */
+void
+__wt_update_obsolete_free(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
+{
+ WT_UPDATE *next;
+ size_t size;
+
+ /* Free a WT_UPDATE list. */
+ for (size = 0; upd != NULL; upd = next) {
+ /* Deleted items have a dummy size: don't include that. */
+ size += sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ next = upd->next;
+ __wt_free(session, upd);
+ }
+ if (size != 0)
+ __wt_cache_page_inmem_decr(session, page, size);
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
new file mode 100644
index 00000000000..b190aaaded5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -0,0 +1,553 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_search_insert_append --
+ * Fast append search of a row-store insert list, creating a skiplist stack
+ * as we go.
+ */
+static inline int
+__wt_search_insert_append(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, int *donep)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *inshead;
+ WT_ITEM key;
+ int cmp, i;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ *donep = 0;
+
+ inshead = cbt->ins_head;
+ if ((ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (0);
+ key.data = WT_INSERT_KEY(ins);
+ key.size = WT_INSERT_KEY_SIZE(ins);
+
+ WT_RET(__wt_compare(session, collator, srch_key, &key, &cmp));
+ if (cmp >= 0) {
+ /*
+ * !!!
+ * We may race with another appending thread.
+ *
+ * To catch that case, rely on the atomic pointer read above
+ * and set the next stack to NULL here. If we have raced with
+ * another thread, one of the next pointers will not be NULL by
+ * the time they are checked against the next stack inside the
+ * serialized insert function.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) {
+ cbt->ins_stack[i] = (i == 0) ? &ins->next[0] :
+ (inshead->tail[i] != NULL) ?
+ &inshead->tail[i]->next[i] : &inshead->head[i];
+ cbt->next_stack[i] = NULL;
+ }
+ cbt->compare = -cmp;
+ cbt->ins = ins;
+ *donep = 1;
+ }
+ return (0);
+}
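+
+/*
+ * The point of this fast path: for append-mostly workloads the new key
+ * usually sorts after the current last element, so a single comparison
+ * against WT_SKIP_LAST replaces the full skiplist descent done by
+ * __wt_search_insert below.
+ */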
+
+/*
+ * __wt_search_insert --
+ * Search a row-store insert list, creating a skiplist stack as we go.
+ */
+int
+__wt_search_insert(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_INSERT *ins, **insp, *last_ins;
+ WT_INSERT_HEAD *inshead;
+ WT_ITEM key;
+ size_t match, skiphigh, skiplow;
+ int cmp, i;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ inshead = cbt->ins_head;
+ cmp = 0; /* -Wuninitialized */
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ match = skiphigh = skiplow = 0;
+ ins = last_ins = NULL;
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) {
+ if ((ins = *insp) == NULL) {
+ cbt->next_stack[i] = NULL;
+ cbt->ins_stack[i--] = insp--;
+ continue;
+ }
+
+ /*
+		 * The same node may be revisited as we drop down skiplist
+		 * levels; don't repeat the comparison, it might be expensive.
+ */
+ if (ins != last_ins) {
+ last_ins = ins;
+ key.data = WT_INSERT_KEY(ins);
+ key.size = WT_INSERT_KEY_SIZE(ins);
+ match = WT_MIN(skiplow, skiphigh);
+ WT_RET(__wt_compare_skip(
+ session, collator, srch_key, &key, &cmp, &match));
+ }
+
+ if (cmp > 0) { /* Keep going at this level */
+ insp = &ins->next[i];
+ skiplow = match;
+ } else if (cmp < 0) { /* Drop down a level */
+ cbt->next_stack[i] = ins;
+ cbt->ins_stack[i--] = insp--;
+ skiphigh = match;
+ } else
+ for (; i >= 0; i--) {
+ cbt->next_stack[i] = ins->next[i];
+ cbt->ins_stack[i] = &ins->next[i];
+ }
+ }
+
+ /*
+ * For every insert element we review, we're getting closer to a better
+ * choice; update the compare field to its new value. If we went past
+ * the last item in the list, return the last one: that is used to
+ * decide whether we are positioned in a skiplist.
+ */
+ cbt->compare = -cmp;
+ cbt->ins = (ins != NULL) ? ins : last_ins;
+ return (0);
+}
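+
+/*
+ * The loop above is the classic skiplist descent, extended to record the
+ * insert position at every level. A simplified sketch of the same search
+ * over a toy node type (hypothetical names, illustration only): stack[i]
+ * is left pointing at the last node at level i whose key is less than the
+ * search key, so an insert would splice in after stack[i] at each level.
+ *
+ *	#define	MAXDEPTH	10
+ *	struct node {
+ *		int key;
+ *		struct node *next[MAXDEPTH];
+ *	};
+ *
+ *	static void
+ *	skip_search(struct node *head, int key, struct node **stack)
+ *	{
+ *		struct node *cur;
+ *		int i;
+ *
+ *		for (cur = head, i = MAXDEPTH - 1; i >= 0; i--) {
+ *			while (cur->next[i] != NULL &&
+ *			    cur->next[i]->key < key)
+ *				cur = cur->next[i];
+ *			stack[i] = cur;
+ *		}
+ *	}
+ *
+ * The version above differs in that it tracks pointer slots rather than
+ * nodes (so the stack can reference either the list head or a node's next
+ * array), and skips already-matched key prefixes during comparisons.
+ */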
+
+/*
+ * __wt_row_search --
+ * Search a row-store tree for a specific key.
+ */
+int
+__wt_row_search(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ WT_ROW *rip;
+ size_t match, skiphigh, skiplow;
+ uint32_t base, indx, limit;
+ int append_check, cmp, depth, descend_right, done;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ item = &cbt->search_key;
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * The row-store search routine uses a different comparison API.
+ * The assumption is we're comparing more than a few keys with
+ * matching prefixes, and it's a win to avoid the memory fetches
+ * by skipping over those prefixes. That's done by tracking the
+ * length of the prefix match for the lowest and highest keys we
+ * compare as we descend the tree.
+ */
+ skiphigh = skiplow = 0;
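+
+	/*
+	 * A simplified sketch of the skipping comparison (the real
+	 * __wt_lex_compare_skip also handles WT_ITEM arguments and a
+	 * pluggable collator path):
+	 *
+	 *	static int
+	 *	lex_compare_skip(const uint8_t *a, size_t alen,
+	 *	    const uint8_t *b, size_t blen, size_t *matchp)
+	 *	{
+	 *		size_t i, len = alen < blen ? alen : blen;
+	 *
+	 *		for (i = *matchp; i < len; i++)
+	 *			if (a[i] != b[i]) {
+	 *				*matchp = i;
+	 *				return (a[i] < b[i] ? -1 : 1);
+	 *			}
+	 *		*matchp = i;
+	 *		return (alen == blen ? 0 : alen < blen ? -1 : 1);
+	 *	}
+	 *
+	 * Because the keys are sorted, any key that sorts between two keys
+	 * sharing an N-byte prefix with the search key also shares that
+	 * prefix, so each comparison can safely start at offset
+	 * WT_MIN(skiplow, skiphigh).
+	 */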
+
+ /*
+ * If a cursor repeatedly appends to the tree, compare the search key
+ * against the last key on each internal page during insert before
+ * doing the full binary search.
+ *
+ * Track if the descent is to the right-side of the tree, used to set
+ * the cursor's append history.
+ */
+ append_check = insert && cbt->append_tree;
+ descend_right = 1;
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /* Search the internal pages of the tree. */
+ cmp = -1;
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ if (page->type != WT_PAGE_ROW_INT)
+ break;
+
+ pindex = WT_INTL_INDEX_COPY(page);
+
+ /*
+ * Fast-path internal pages with one child, a common case for
+ * the root page in new trees.
+ */
+ if (pindex->entries == 1) {
+ descent = pindex->index[0];
+ goto descend;
+ }
+
+ /* Fast-path appends. */
+ if (append_check) {
+ descent = pindex->index[pindex->entries - 1];
+ __wt_ref_key(page, descent, &item->data, &item->size);
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp >= 0)
+ goto descend;
+
+ /* A failed append check turns off append checks. */
+ append_check = 0;
+ }
+
+ /*
+ * Binary search of the internal page. There are two versions
+ * (a default loop and an application-specified collation loop),
+ * because moving the collation test and error handling inside
+ * the loop costs about 5%.
+ *
+ * The 0th key on an internal page is a problem for a couple of
+ * reasons. First, we have to force the 0th key to sort less
+ * than any application key, so internal pages don't have to be
+ * updated if the application stores a new, "smallest" key in
+ * the tree. Second, reconciliation is aware of this and will
+ * store a byte of garbage in the 0th key, so the comparison of
+ * an application key and a 0th key is meaningless (but doing
+ * the comparison could still incorrectly modify our tracking
+ * of the leading bytes in each key that we can skip during the
+ * comparison). For these reasons, skip the 0th key.
+ */
+ base = 1;
+ limit = pindex->entries - 1;
+ if (collator == NULL)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+ __wt_ref_key(
+ page, descent, &item->data, &item->size);
+
+ match = WT_MIN(skiplow, skiphigh);
+ cmp = __wt_lex_compare_skip(
+ srch_key, item, &match);
+ if (cmp > 0) {
+ skiplow = match;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
+ skiphigh = match;
+ else
+ goto descend;
+ }
+ else
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+ __wt_ref_key(
+ page, descent, &item->data, &item->size);
+
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto descend;
+ }
+
+ /*
+ * Set the slot to descend the tree: descent is already set if
+ * there was an exact match on the page, otherwise, base is
+ * the smallest index greater than key, possibly (last + 1).
+ */
+ descent = pindex->index[base - 1];
+
+ /*
+ * If we end up somewhere other than the last slot, it's not a
+ * right-side descent.
+ */
+		if (pindex->entries != base)
+ descend_right = 0;
+
+descend: /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ skiphigh = skiplow = 0;
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ page = current->page;
+ cbt->ref = current;
+
+ /*
+ * In the case of a right-side tree descent during an insert, do a fast
+ * check for an append to the page, try to catch cursors appending data
+ * into the tree.
+ *
+ * It's tempting to make this test more rigorous: if a cursor inserts
+ * randomly into a two-level tree (a root referencing a single child
+ * that's empty except for an insert list), the right-side descent flag
+ * will be set and this comparison wasted. The problem resolves itself
+ * as the tree grows larger: either we're no longer doing right-side
+ * descent, or we'll avoid additional comparisons in internal pages,
+ * making up for the wasted comparison here. Similarly, the cursor's
+ * history is set any time it's an insert and a right-side descent,
+ * both to avoid a complicated/expensive test, and, in the case of
+ * multiple threads appending to the tree, we want to mark them all as
+ * appending, even if this test doesn't work.
+ */
+ if (insert && descend_right) {
+ cbt->append_tree = 1;
+
+ if (page->pg_row_entries == 0) {
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ } else {
+ cbt->slot = WT_ROW_SLOT(page,
+ page->pg_row_d + (page->pg_row_entries - 1));
+
+ cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ }
+
+ WT_ERR(
+ __wt_search_insert_append(session, cbt, srch_key, &done));
+ if (done)
+ return (0);
+
+ /*
+ * Don't leave the insert list head set, code external to the
+ * search uses it.
+ */
+ cbt->ins_head = NULL;
+ }
+
+ /*
+ * Binary search of the leaf page. There are two versions (a default
+ * loop and an application-specified collation loop), because moving
+ * the collation test and error handling inside the loop costs about 5%.
+ */
+ base = 0;
+ limit = page->pg_row_entries;
+ if (collator == NULL)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ rip = page->pg_row_d + indx;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+ match = WT_MIN(skiplow, skiphigh);
+ cmp = __wt_lex_compare_skip(srch_key, item, &match);
+ if (cmp > 0) {
+ skiplow = match;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
+ skiphigh = match;
+ else
+ goto leaf_match;
+ }
+ else
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ rip = page->pg_row_d + indx;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto leaf_match;
+ }
+
+ /*
+ * The best case is finding an exact match in the leaf page's WT_ROW
+ * array, probable for any read-mostly workload. Check that case and
+ * get out fast.
+ */
+ if (0) {
+leaf_match: cbt->compare = 0;
+ cbt->slot = WT_ROW_SLOT(page, rip);
+ return (0);
+ }
+
+ /*
+ * We didn't find an exact match in the WT_ROW array.
+ *
+ * Base is the smallest index greater than key and may be the 0th index
+ * or the (last + 1) index. Set the slot to be the largest index less
+ * than the key if that's possible (if base is the 0th index it means
+ * the application is inserting a key before any key found on the page).
+ *
+ * It's still possible there is an exact match, but it's on an insert
+ * list. Figure out which insert chain to search and then set up the
+ * return information assuming we'll find nothing in the insert list
+ * (we'll correct as needed inside the search routine, depending on
+ * what we find).
+ *
+ * If inserting a key smaller than any key found in the WT_ROW array,
+ * use the extra slot of the insert array, otherwise the insert array
+ * maps one-to-one to the WT_ROW array.
+ */
+ if (base == 0) {
+ cbt->compare = 1;
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ } else {
+ cbt->compare = -1;
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1));
+
+ cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ }
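+
+	/*
+	 * The slot and insert-list mapping used above, illustrated for a
+	 * page with three WT_ROW entries K0, K1, K2:
+	 *
+	 *	insert list:	smallest     0        1        2
+	 *	holds keys:	  < K0     K0..K1   K1..K2    > K2
+	 *
+	 * that is, insert list N holds keys sorting after K{N}, and the
+	 * extra "smallest" list holds keys sorting before every key on the
+	 * page.
+	 */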
+
+ /* If there's no insert list, we're done. */
+ if (WT_SKIP_FIRST(cbt->ins_head) == NULL)
+ return (0);
+
+ /*
+ * Test for an append first when inserting onto an insert list, try to
+ * catch cursors repeatedly inserting at a single point.
+ */
+ if (insert) {
+ WT_ERR(
+ __wt_search_insert_append(session, cbt, srch_key, &done));
+ if (done)
+ return (0);
+ }
+ WT_ERR(__wt_search_insert(session, cbt, srch_key));
+
+ return (0);
+
+	/* On error, release the page only if this search acquired it. */
+err:	if (leaf == NULL)
+ WT_TRET(__wt_page_release(session, current, 0));
+ return (ret);
+}
+
+/*
+ * __wt_row_random --
+ * Return a random key from a row-store tree.
+ */
+int
+__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_INSERT *p, *t;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+restart:
+ /* Walk the internal pages of the tree. */
+ current = &btree->root;
+ for (;;) {
+ page = current->page;
+ if (page->type != WT_PAGE_ROW_INT)
+ break;
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ descent = pindex->index[
+ __wt_random(session->rnd) % pindex->entries];
+
+ /*
+ * Swap the parent page for the child page; return on error,
+ * the swap function ensures we're holding nothing on failure.
+ */
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ current = descent;
+ continue;
+ }
+ /*
+ * Restart is returned if we find a page that's been split; the
+ * held page isn't discarded when restart is returned, discard
+ * it and restart the search from the top of the tree.
+ */
+ if (ret == WT_RESTART &&
+ (ret = __wt_page_release(session, current, 0)) == 0)
+ goto restart;
+ return (ret);
+ }
+
+ if (page->pg_row_entries != 0) {
+ /*
+ * The use case for this call is finding a place to split the
+ * tree. Cheat (it's not like this is "random", anyway), and
+ * make things easier by returning the first key on the page.
+ * If the caller is attempting to split a newly created tree,
+ * or a tree with just one big page, that's not going to work,
+ * check for that.
+ */
+ cbt->ref = current;
+ cbt->compare = 0;
+ pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ cbt->slot = pindex->entries < 2 ?
+ __wt_random(session->rnd) % page->pg_row_entries : 0;
+
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row_d + cbt->slot, &cbt->search_key, 0));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list, pick the key in the middle of that insert list.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ WT_ERR(WT_NOTFOUND);
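+	/*
+	 * Walk the list with two cursors: p advances two nodes for each
+	 * node t advances, so when p runs off the end of the list, t is
+	 * left near the middle.
+	 */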
+ for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) {
+ if ((p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+ if ((p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+ t = WT_SKIP_NEXT(t);
+ }
+ cbt->ref = current;
+ cbt->compare = 0;
+ cbt->ins = t;
+
+ return (0);
+
+err: WT_TRET(__wt_page_release(session, current, 0));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c
new file mode 100644
index 00000000000..c792cb4fcf2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config.c
@@ -0,0 +1,745 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_err --
+ * Error message and return for config string parse failures.
+ */
+static int
+__config_err(WT_CONFIG *conf, const char *msg, int err)
+{
+ WT_RET_MSG(conf->session, err,
+ "Error parsing '%.*s' at byte %u: %s",
+ (int)(conf->end - conf->orig), conf->orig,
+ (u_int)(conf->cur - conf->orig), msg);
+}
+
+/*
+ * __wt_config_initn --
+ * Initialize a config handle, used to iterate through a config string of
+ * specified length.
+ */
+int
+__wt_config_initn(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len)
+{
+ conf->session = session;
+ conf->orig = conf->cur = str;
+ conf->end = str + len;
+ conf->depth = 0;
+ conf->top = -1;
+ conf->go = NULL;
+
+ return (0);
+}
+
+/*
+ * __wt_config_init --
+ * Initialize a config handle, used to iterate through a NUL-terminated
+ * config string.
+ */
+int
+__wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str)
+{
+ size_t len;
+
+ len = (str == NULL) ? 0 : strlen(str);
+
+ return (__wt_config_initn(session, conf, str, len));
+}
+
+/*
+ * __wt_config_subinit --
+ * Initialize a config handle, used to iterate through a config string
+ * extracted from another config string (used for parsing nested
+ * structures).
+ */
+int
+__wt_config_subinit(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item)
+{
+ return (__wt_config_initn(session, conf, item->str, item->len));
+}
+
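+/*
+ * PUSH and CAP drive the tokenizer below: PUSH records the start of a key
+ * or value when the parser is at the tracked nesting level (the offset
+ * argument skips characters, such as an opening quote, that aren't part of
+ * the item), and CAP records the item's end, setting its length.
+ */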
+#define PUSH(i, t) do { \
+ if (conf->top == -1) \
+ conf->top = conf->depth; \
+ if (conf->depth == conf->top) { \
+ if (out->len > 0) \
+ return (__config_err(conf, \
+ "New value starts without a separator", \
+ EINVAL)); \
+ out->type = (t); \
+ out->str = (conf->cur + (i)); \
+ } \
+} while (0)
+
+#define CAP(i) do { \
+ if (conf->depth == conf->top) \
+ out->len = (size_t)((conf->cur + (i) + 1) - out->str); \
+} while (0)
+
+typedef enum {
+ A_LOOP, A_BAD, A_DOWN, A_UP, A_VALUE, A_NEXT, A_QDOWN, A_QUP,
+ A_ESC, A_UNESC, A_BARE, A_NUMBARE, A_UNBARE, A_UTF8_2,
+ A_UTF8_3, A_UTF8_4, A_UTF_CONTINUE
+} CONFIG_ACTION;
+
+/*
+ * static void *gostruct[] = {
+ * [0 ... 255] = &&l_bad,
+ * ['\t'] = &&l_loop, [' '] = &&l_loop,
+ * ['\r'] = &&l_loop, ['\n'] = &&l_loop,
+ * ['"'] = &&l_qup,
+ * [':'] = &&l_value, ['='] = &&l_value,
+ * [','] = &&l_next,
+ * // tracking [] and {} individually would allow fuller
+ * // validation but is really messy
+ * ['('] = &&l_up, [')'] = &&l_down,
+ * ['['] = &&l_up, [']'] = &&l_down,
+ * ['{'] = &&l_up, ['}'] = &&l_down,
+ * // bare identifiers
+ * ['-'] = &&l_numbare,
+ * ['0' ... '9'] = &&l_numbare,
+ * ['_'] = &&l_bare,
+ * ['A' ... 'Z'] = &&l_bare, ['a' ... 'z'] = &&l_bare,
+ * ['/'] = &&l_bare,
+ * };
+ */
+static const int8_t gostruct[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_LOOP, A_LOOP, A_BAD, A_BAD, A_LOOP, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_BAD, A_QUP,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UP, A_DOWN, A_BAD, A_BAD,
+ A_NEXT, A_NUMBARE, A_BARE, A_BARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_VALUE, A_BAD, A_BAD,
+ A_VALUE, A_BAD, A_BAD, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BARE, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gobare[] =
+ * {
+ * [0 ... 31] = &&l_bad,
+ * // could be more pedantic/validation-checking
+ * [32 ... 126] = &&l_loop,
+ * ['\t'] = &&l_unbare, [' '] = &&l_unbare,
+ * ['\r'] = &&l_unbare, ['\n'] = &&l_unbare,
+ * [':'] = &&l_unbare, ['='] = &&l_unbare,
+ * [','] = &&l_unbare,
+ * [')'] = &&l_unbare, [']'] = &&l_unbare, ['}'] = &&l_unbare,
+ * [127 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gobare[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_UNBARE, A_UNBARE, A_BAD, A_BAD, A_UNBARE, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gostring[] =
+ * {
+ * [0 ... 31] = &&l_bad, [127] = &&l_bad,
+ * [32 ... 126] = &&l_loop,
+ * ['\\'] = &&l_esc, ['"'] = &&l_qdown,
+ * [128 ... 191] = &&l_bad,
+ * [192 ... 223] = &&l_utf8_2,
+ * [224 ... 239] = &&l_utf8_3,
+ * [240 ... 247] = &&l_utf8_4,
+ * [248 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gostring[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_LOOP, A_QDOWN,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_ESC, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_4,
+ A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4,
+ A_UTF8_4, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goutf8_continue[] =
+ * {
+ * [0 ... 127] = &&l_bad,
+ * [128 ... 191] = &&l_utf_continue,
+ * [192 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t goutf8_continue[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goesc[] =
+ * {
+ * [0 ... 255] = &&l_bad,
+ * ['"'] = &&l_unesc, ['\\'] = &&l_unesc,
+ * ['/'] = &&l_unesc, ['b'] = &&l_unesc,
+ * ['f'] = &&l_unesc, ['n'] = &&l_unesc,
+ * ['r'] = &&l_unesc, ['t'] = &&l_unesc, ['u'] = &&l_unesc
+ * };
+ */
+static const int8_t goesc[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_UNESC, A_UNESC, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * __config_next --
+ * Get the next config item in the string without processing the value.
+ */
+static int
+__config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM *out = key;
+ int utf8_remain = 0;
+ static const WT_CONFIG_ITEM true_value = {
+ "", 0, 1, WT_CONFIG_ITEM_BOOL
+ };
+
+ key->len = 0;
+ /* Keys with no value default to true. */
+ *value = true_value;
+
+ if (conf->go == NULL)
+ conf->go = gostruct;
+
+ while (conf->cur < conf->end) {
+		switch (conf->go[(u_char)*conf->cur]) {
+ case A_LOOP:
+ break;
+
+ case A_BAD:
+ return (__config_err(
+ conf, "Unexpected character", EINVAL));
+
+ case A_DOWN:
+ --conf->depth;
+ CAP(0);
+ break;
+
+ case A_UP:
+ if (conf->top == -1)
+ conf->top = 1;
+ PUSH(0, WT_CONFIG_ITEM_STRUCT);
+ ++conf->depth;
+ break;
+
+ case A_VALUE:
+ if (conf->depth == conf->top) {
+ /*
+ * Special case: ':' is permitted in unquoted
+ * values.
+ */
+ if (out == value && *conf->cur != ':')
+ return (__config_err(conf,
+ "Value already complete", EINVAL));
+ out = value;
+ }
+ break;
+
+ case A_NEXT:
+ /*
+ * If we're at the top level and we have a complete
+ * key (and optional value), we're done.
+ */
+ if (conf->depth == conf->top && key->len > 0) {
+ ++conf->cur;
+ return (0);
+ } else
+ break;
+
+ case A_QDOWN:
+ CAP(-1);
+ conf->go = gostruct;
+ break;
+
+ case A_QUP:
+ PUSH(1, WT_CONFIG_ITEM_STRING);
+ conf->go = gostring;
+ break;
+
+ case A_ESC:
+ conf->go = goesc;
+ break;
+
+ case A_UNESC:
+ conf->go = gostring;
+ break;
+
+ case A_BARE:
+ PUSH(0, WT_CONFIG_ITEM_ID);
+ conf->go = gobare;
+ break;
+
+ case A_NUMBARE:
+ PUSH(0, WT_CONFIG_ITEM_NUM);
+ conf->go = gobare;
+ break;
+
+ case A_UNBARE:
+ CAP(-1);
+ conf->go = gostruct;
+ continue;
+
+ case A_UTF8_2:
+ conf->go = goutf8_continue;
+ utf8_remain = 1;
+ break;
+
+ case A_UTF8_3:
+ conf->go = goutf8_continue;
+ utf8_remain = 2;
+ break;
+
+ case A_UTF8_4:
+ conf->go = goutf8_continue;
+ utf8_remain = 3;
+ break;
+
+ case A_UTF_CONTINUE:
+ if (!--utf8_remain)
+ conf->go = gostring;
+ break;
+ }
+
+ conf->cur++;
+ }
+
+ /* Might have a trailing key/value without a closing brace */
+ if (conf->go == gobare) {
+ CAP(-1);
+ conf->go = gostruct;
+ }
+
+ /* Did we find something? */
+ if (conf->depth <= conf->top && key->len > 0)
+ return (0);
+
+ /* We're either at the end of the string or we failed to parse. */
+ if (conf->depth == 0)
+ return (WT_NOTFOUND);
+
+ return (__config_err(conf,
+ "Closing brackets missing from config string", EINVAL));
+}
+
+/*
+ * Arithmetic shift of a negative number is undefined by ISO/IEC 9899, and the
+ * WiredTiger API supports negative numbers. Check it's not a negative number,
+ * and then cast the shift out of paranoia.
+ */
+#define WT_SHIFT_INT64(v, s) do { \
+ if ((v) < 0) \
+ goto range; \
+ (v) = (int64_t)(((uint64_t)(v)) << (s)); \
+} while (0)
+
+/*
+ * __config_process_value --
+ * Deal with special config values like true / false.
+ */
+static int
+__config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
+{
+ char *endptr;
+
+ /* Empty values are okay: we can't do anything interesting with them. */
+ if (value->len == 0)
+ return (0);
+
+ if (value->type == WT_CONFIG_ITEM_ID) {
+ if (strncasecmp(value->str, "true", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 1;
+ } else if (strncasecmp(value->str, "false", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 0;
+ }
+ } else if (value->type == WT_CONFIG_ITEM_NUM) {
+ errno = 0;
+ value->val = strtoll(value->str, &endptr, 10);
+
+ /* Check any leftover characters. */
+ while (endptr < value->str + value->len)
+ switch (*endptr++) {
+ case 'b':
+ case 'B':
+ /* Byte: no change. */
+ break;
+ case 'k':
+ case 'K':
+ WT_SHIFT_INT64(value->val, 10);
+ break;
+ case 'm':
+ case 'M':
+ WT_SHIFT_INT64(value->val, 20);
+ break;
+ case 'g':
+ case 'G':
+ WT_SHIFT_INT64(value->val, 30);
+ break;
+ case 't':
+ case 'T':
+ WT_SHIFT_INT64(value->val, 40);
+ break;
+ case 'p':
+ case 'P':
+ WT_SHIFT_INT64(value->val, 50);
+ break;
+ default:
+ /*
+ * We didn't get a well-formed number. That
+ * might be okay, the required type will be
+ * checked by __wt_config_check.
+ */
+ value->type = WT_CONFIG_ITEM_ID;
+ break;
+ }
+
+ /*
+ * If we parsed the whole string but the number is out of range,
+ * report an error. Don't report an error for strings that
+ * aren't well-formed integers: if an integer is expected, that
+ * will be caught by __wt_config_check.
+ */
+ if (value->type == WT_CONFIG_ITEM_NUM && errno == ERANGE)
+ goto range;
+ }
+
+ return (0);
+
+range:
+ return (__config_err(conf, "Number out of range", ERANGE));
+}
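+
+/*
+ * Illustrative results of the processing above (assumed input strings):
+ *
+ *	"cache_size=512K"	NUM item, val 512 * 1024
+ *	"overwrite=true"	BOOL item, val 1
+ *	"name=midnight"		ID item, the value left as a string
+ *	"size=10Q"		ID item: not a well-formed number, left
+ *				for __wt_config_check to reject if an
+ *				integer is required
+ */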
+
+/*
+ * __wt_config_next --
+ * Get the next config item in the string and process the value.
+ */
+int
+__wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_RET(__config_next(conf, key, value));
+ return (__config_process_value(conf, value));
+}
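+
+/*
+ * An illustrative iteration using the functions above (error handling
+ * elided, hypothetical configuration string):
+ *
+ *	WT_CONFIG parser;
+ *	WT_CONFIG_ITEM k, v;
+ *
+ *	__wt_config_init(session, &parser,
+ *	    "cache_size=1GB,eviction=(threads_max=4)");
+ *	while (__wt_config_next(&parser, &k, &v) == 0) {
+ *		... first pass: k is "cache_size", v.val is 1GB in bytes;
+ *		... second pass: k is "eviction" and v is a STRUCT item
+ *		... whose string can be handed to __wt_config_subinit.
+ *	}
+ */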
+
+/*
+ * __config_getraw --
+ * Given a config parser, find the final value for a given key.
+ */
+static int
+__config_getraw(
+ WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top)
+{
+ WT_CONFIG sparser;
+ WT_CONFIG_ITEM k, v, subk;
+ WT_DECL_RET;
+ int found;
+
+ found = 0;
+ while ((ret = __config_next(cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ continue;
+ if (k.len == key->len &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ *value = v;
+ found = 1;
+ } else if (k.len < key->len && key->str[k.len] == '.' &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ subk.str = key->str + k.len + 1;
+ subk.len = (key->len - k.len) - 1;
+ WT_RET(__wt_config_initn(
+ cparser->session, &sparser, v.str, v.len));
+ if ((ret =
+ __config_getraw(&sparser, &subk, value, 0)) == 0)
+ found = 1;
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (!found)
+ return (WT_NOTFOUND);
+ return (top ? __config_process_value(cparser, value) : 0);
+}
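+
+/*
+ * A note on the dotted-key handling above: searching for the key
+ * "eviction.threads_max" in "eviction=(threads_max=4)" first matches the
+ * "eviction" prefix, then re-parses the nested value with a sub-parser to
+ * find "threads_max", returning the item for "4".
+ */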
+
+/*
+ * __wt_config_get --
+ * Given a NULL-terminated list of configuration strings, find
+ * the final value for a given key.
+ */
+int
+__wt_config_get(WT_SESSION_IMPL *session,
+ const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_DECL_RET;
+ int found;
+
+ for (found = 0; *cfg != NULL; cfg++) {
+ WT_RET(__wt_config_init(session, &cparser, *cfg));
+ if ((ret = __config_getraw(&cparser, key, value, 1)) == 0)
+ found = 1;
+ else if (ret != WT_NOTFOUND)
+ return (ret);
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __wt_config_gets --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key.
+ */
+int
+__wt_config_gets(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_get(session, cfg, &key_item, value));
+}
+
+/*
+ * __wt_config_getone --
+ * Get the value for a given key from a single config string.
+ */
+int
+__wt_config_getone(WT_SESSION_IMPL *session,
+ const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_getones --
+ * Get the value for a given string key from a single config string.
+ */
+int
+__wt_config_getones(WT_SESSION_IMPL *session,
+ const char *config, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, &key_item, value, 1));
+}
+
+/*
+ * __wt_config_gets_def --
+ * Performance hack: skip parsing config strings by hard-coding defaults.
+ *
+ * It's expensive to repeatedly parse configuration strings, so don't do
+ * it unless it's necessary in performance paths like cursor creation.
+ * Assume the second configuration string is the application's
+ * configuration string, and if it's not set (which is true most of the
+ * time), then use the supplied default value. This makes it faster to
+ * open cursors when checking for obscure open configuration strings like
+ * "next_random".
+ */
+int
+__wt_config_gets_def(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value)
+{
+ static const WT_CONFIG_ITEM false_value = {
+ "", 0, 0, WT_CONFIG_ITEM_NUM
+ };
+
+ *value = false_value;
+ value->val = def;
+ if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
+ return (0);
+	else if (cfg[2] == NULL) {
+		WT_RET_NOTFOUND_OK(
+		    __wt_config_getones(session, cfg[1], key, value));
+		return (0);
+	}
+ return (__wt_config_gets(session, cfg, key, value));
+}
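+
+/*
+ * An illustrative caller of the function above ("next_random" is the key
+ * named in the comment; the surrounding code is hypothetical):
+ *
+ *	WT_CONFIG_ITEM cval;
+ *
+ *	WT_RET(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ *	if (cval.val != 0)
+ *		... the cursor was opened with next_random=true ...
+ */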
+
+/*
+ * __wt_config_subgetraw --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgetraw(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_subgets --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgets(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_subgetraw(session, cfg, &key_item, value));
+}
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
new file mode 100644
index 00000000000..42f4c117b81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_parser_close --
+ * WT_CONFIG_PARSER->close method.
+ */
+static int
+__config_parser_close(WT_CONFIG_PARSER *wt_config_parser)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ __wt_free(config_parser->session, config_parser);
+ return (0);
+}
+
+/*
+ * __config_parser_get --
+ * WT_CONFIG_PARSER->search method.
+ */
+static int
+__config_parser_get(WT_CONFIG_PARSER *wt_config_parser,
+ const char *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_subgets(config_parser->session,
+ &config_parser->config_item, key, cval));
+}
+
+/*
+ * __config_parser_next --
+ * WT_CONFIG_PARSER->next method.
+ */
+static int
+__config_parser_next(WT_CONFIG_PARSER *wt_config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_next(&config_parser->config, key, cval));
+}
+
+/*
+ * wiredtiger_config_parser_open --
+ * Create a configuration parser.
+ */
+int
+wiredtiger_config_parser_open(WT_SESSION *wt_session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ static const WT_CONFIG_PARSER stds = {
+ __config_parser_close,
+ __config_parser_next,
+ __config_parser_get
+ };
+ WT_CONFIG_ITEM config_item =
+ { config, len, 0, WT_CONFIG_ITEM_STRING };
+ WT_CONFIG_PARSER_IMPL *config_parser;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ *config_parserp = NULL;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ WT_RET(__wt_calloc_def(session, 1, &config_parser));
+ config_parser->iface = stds;
+ config_parser->session = session;
+
+ /*
+	 * Set up a WT_CONFIG_ITEM to be used for get calls and a WT_CONFIG
+ * structure for iterations through the configuration string.
+ */
+ memcpy(&config_parser->config_item, &config_item, sizeof(config_item));
+ WT_ERR(__wt_config_initn(
+ session, &config_parser->config, config, len));
+
+ if (ret == 0)
+ *config_parserp = (WT_CONFIG_PARSER *)config_parser;
+ else
+err: __wt_free(session, config_parser);
+
+ return (ret);
+}
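+
+/*
+ * Illustrative application-level use of the parser created above, through
+ * the public WT_CONFIG_PARSER methods (an open WT_SESSION is assumed,
+ * error handling elided):
+ *
+ *	WT_CONFIG_PARSER *parser;
+ *	WT_CONFIG_ITEM k, v;
+ *
+ *	wiredtiger_config_parser_open(
+ *	    wt_session, config, strlen(config), &parser);
+ *	while (parser->next(parser, &k, &v) == 0)
+ *		... each key/value pair in config ...
+ *	parser->close(parser);
+ */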
diff --git a/src/third_party/wiredtiger/src/config/config_check.c b/src/third_party/wiredtiger/src/config/config_check.c
new file mode 100644
index 00000000000..310e54c3349
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_check.c
@@ -0,0 +1,370 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int config_check(
+ WT_SESSION_IMPL *, const WT_CONFIG_CHECK *, const char *, size_t);
+
+/*
+ * __conn_foc_add --
+ * Add a new entry into the connection's free-on-close list.
+ */
+static int
+__conn_foc_add(WT_SESSION_IMPL *session, const void *p)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * Our caller is expected to be holding any locks we need.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc));
+
+ conn->foc[conn->foc_cnt++] = (void *)p;
+ return (0);
+}
+
+/*
+ * __wt_conn_foc_discard --
+ * Discard any memory the connection accumulated.
+ */
+void
+__wt_conn_foc_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ size_t i;
+
+ conn = S2C(session);
+
+ /*
+ * If we have a list of chunks to free, run through the list, then
+ * free the list itself.
+ */
+ for (i = 0; i < conn->foc_cnt; ++i)
+ __wt_free(session, conn->foc[i]);
+ __wt_free(session, conn->foc);
+}
+
+/*
+ * __wt_configure_method --
+ * WT_CONNECTION.configure_method.
+ */
+int
+__wt_configure_method(WT_SESSION_IMPL *session,
+ const char *method, const char *uri,
+ const char *config, const char *type, const char *check)
+{
+ const WT_CONFIG_CHECK *cp;
+ WT_CONFIG_CHECK *checks, *newcheck;
+ const WT_CONFIG_ENTRY **epp;
+ WT_CONFIG_ENTRY *entry;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ size_t cnt;
+ char *newcheck_name, *p;
+
+ /*
+ * !!!
+ * We ignore the specified uri, that is, all new configuration options
+	 * will be valid for all data sources. That shouldn't be too bad, as
+	 * the worst that can happen is an application might specify some
+ * configuration option and not get an error -- the option should be
+ * ignored by the underlying implementation since it's unexpected, so
+ * there shouldn't be any real problems. Eventually I expect we will
+ * get the whole data-source thing sorted, at which time there may be
+ * configuration arrays for each data source, and that's when the uri
+ * will matter.
+ */
+ WT_UNUSED(uri);
+
+ conn = S2C(session);
+ checks = newcheck = NULL;
+ entry = NULL;
+ newcheck_name = NULL;
+
+ /* Argument checking; we only support a limited number of types. */
+ if (config == NULL)
+ WT_RET_MSG(session, EINVAL, "no configuration specified");
+ if (type == NULL)
+ WT_RET_MSG(session, EINVAL, "no configuration type specified");
+ if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 &&
+ strcmp(type, "list") != 0 && strcmp(type, "string") != 0)
+ WT_RET_MSG(session, EINVAL,
+ "type must be one of \"boolean\", \"int\", \"list\" or "
+ "\"string\"");
+
+ /* Find a match for the method name. */
+ for (epp = conn->config_entries; (*epp)->method != NULL; ++epp)
+ if (strcmp((*epp)->method, method) == 0)
+ break;
+ if ((*epp)->method == NULL)
+ WT_RET_MSG(session,
+ WT_NOTFOUND, "no method matching %s found", method);
+
+ /*
+ * Technically possible for threads to race, lock the connection while
+ * adding the new configuration information. We're holding the lock
+ * for an extended period of time, but configuration changes should be
+ * rare and only happen during startup.
+ */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /*
+ * Allocate new configuration entry and fill it in.
+ *
+ * The new base value is the previous base value, a separator and the
+ * new configuration string.
+ */
+ WT_ERR(__wt_calloc_def(session, 1, &entry));
+ entry->method = (*epp)->method;
+ WT_ERR(__wt_calloc_def(session,
+ strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
+ (void)strcpy(p, (*epp)->base);
+ (void)strcat(p, ",");
+ (void)strcat(p, config);
+ entry->base = p;
+
+ /*
+ * There may be a default value in the config argument passed in (for
+	 * example, "kvs_parallelism=64"). The default value isn't part of the
+	 * name, so build a new one.
+ */
+ WT_ERR(__wt_strdup(session, config, &newcheck_name));
+ if ((p = strchr(newcheck_name, '=')) != NULL)
+ *p = '\0';
+
+ /*
+ * The new configuration name may replace an existing check with new
+ * information, in that case skip the old version.
+ */
+ cnt = 0;
+ if ((*epp)->checks != NULL)
+ for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+ ++cnt;
+ WT_ERR(__wt_calloc_def(session, cnt + 2, &checks));
+ cnt = 0;
+ if ((*epp)->checks != NULL)
+ for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+ if (strcmp(newcheck_name, cp->name) != 0)
+ checks[cnt++] = *cp;
+ newcheck = &checks[cnt];
+ newcheck->name = newcheck_name;
+ WT_ERR(__wt_strdup(session, type, &newcheck->type));
+ if (check != NULL)
+ WT_ERR(__wt_strdup(session, check, &newcheck->checks));
+ entry->checks = checks;
+
+ /*
+ * Confirm the configuration string passes the new set of
+ * checks.
+ */
+ WT_ERR(config_check(session, entry->checks, config, 0));
+
+ /*
+ * The next time this configuration is updated, we don't want to figure
+ * out which of these pieces of memory were allocated and will need to
+ * be free'd on close (this isn't a heavily used API and it's too much
+ * work); add them all to the free-on-close list now. We don't check
+ * for errors deliberately, we'd have to figure out which elements have
+ * already been added to the free-on-close array and which have not in
+ * order to avoid freeing chunks of memory twice. Again, this isn't a
+ * commonly used API and it shouldn't ever happen, just leak it.
+ */
+ (void)__conn_foc_add(session, entry->base);
+ (void)__conn_foc_add(session, entry);
+ (void)__conn_foc_add(session, checks);
+ (void)__conn_foc_add(session, newcheck->type);
+ (void)__conn_foc_add(session, newcheck->checks);
+ (void)__conn_foc_add(session, newcheck_name);
+
+ /*
+ * Instead of using locks to protect configuration information, assume
+ * we can atomically update a pointer to a chunk of memory, and because
+ * a pointer is never partially written, readers will correctly see the
+ * original or new versions of the memory. Readers might be using the
+ * old version as it's being updated, though, which means we cannot free
+ * the old chunk of memory until all possible readers have finished.
+ * Currently, that's on connection close: in other words, we can use
+ * this because it's small amounts of memory, and we really, really do
+ * not want to acquire locks every time we access configuration strings,
+ * since that's done on every API call.
+ */
+ WT_PUBLISH(*epp, entry);
+
+ if (0) {
+err: if (entry != NULL) {
+ __wt_free(session, entry->base);
+ __wt_free(session, entry);
+ }
+ __wt_free(session, checks);
+ if (newcheck != NULL) {
+ __wt_free(session, newcheck->type);
+ __wt_free(session, newcheck->checks);
+ }
+ __wt_free(session, newcheck_name);
+ }
+
+ __wt_spin_unlock(session, &conn->api_lock);
+ return (ret);
+}
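+
+/*
+ * An illustrative application call reaching the function above through the
+ * public WT_CONNECTION.configure_method handle method (hypothetical option
+ * name; the check string uses the "min"/"max" keywords handled by
+ * config_check below):
+ *
+ *	conn->configure_method(conn,
+ *	    "WT_SESSION.open_cursor", NULL, "my_option=5", "int",
+ *	    "min=1,max=32");
+ */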
+
+/*
+ * __wt_config_check --
+ * Check the keys in an application-supplied config string match what is
+ * specified in an array of check strings.
+ */
+int
+__wt_config_check(WT_SESSION_IMPL *session,
+ const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len)
+{
+ /*
+	 * Callers don't check for a NULL configuration or check array: make
+	 * this a fast call in those cases.
+ */
+ return (config == NULL || entry->checks == NULL ?
+ 0 : config_check(session, entry->checks, config, config_len));
+}
+
+/*
+ * config_check --
+ * Check the keys in an application-supplied config string match what is
+ * specified in an array of check strings.
+ */
+static int
+config_check(WT_SESSION_IMPL *session,
+ const WT_CONFIG_CHECK *checks, const char *config, size_t config_len)
+{
+ WT_CONFIG parser, cparser, sparser;
+ WT_CONFIG_ITEM k, v, ck, cv, dummy;
+ WT_DECL_RET;
+ int badtype, found, i;
+
+ /*
+ * The config_len parameter is optional, and allows passing in strings
+ * that are not nul-terminated.
+ */
+ if (config_len == 0)
+ WT_RET(__wt_config_init(session, &parser, config));
+ else
+ WT_RET(__wt_config_initn(session, &parser, config, config_len));
+ while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid configuration key found: '%.*s'",
+ (int)k.len, k.str);
+
+ /* Search for a matching entry. */
+ for (i = 0; checks[i].name != NULL; i++)
+ if (WT_STRING_MATCH(checks[i].name, k.str, k.len))
+ break;
+ if (checks[i].name == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration key: '%.*s'",
+ (int)k.len, k.str);
+
+ if (strcmp(checks[i].type, "boolean") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_BOOL &&
+ (v.type != WT_CONFIG_ITEM_NUM ||
+ (v.val != 0 && v.val != 1)));
+ } else if (strcmp(checks[i].type, "category") == 0) {
+ /* Deal with categories of the form: XXX=(XXX=blah). */
+ ret = config_check(session,
+ checks[i].subconfigs,
+ k.str + strlen(checks[i].name) + 1, v.len);
+ if (ret != EINVAL)
+ badtype = 0;
+ else
+ badtype = 1;
+ } else if (strcmp(checks[i].type, "format") == 0) {
+ badtype = 0;
+ } else if (strcmp(checks[i].type, "int") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_NUM);
+ } else if (strcmp(checks[i].type, "list") == 0) {
+ badtype = (v.len > 0 &&
+ v.type != WT_CONFIG_ITEM_STRUCT);
+ } else if (strcmp(checks[i].type, "string") == 0) {
+ badtype = 0;
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration type: '%s'",
+ checks[i].type);
+
+ if (badtype)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid value for key '%.*s': expected a %s",
+ (int)k.len, k.str, checks[i].type);
+
+ if (checks[i].checks == NULL)
+ continue;
+
+		/* Set up an iterator for the check string. */
+ WT_RET(__wt_config_init(session, &cparser, checks[i].checks));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("min", ck.str, ck.len)) {
+ if (v.val < cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too small for key '%.*s' "
+ "the minimum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("max", ck.str, ck.len)) {
+ if (v.val > cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too large for key '%.*s' "
+ "the maximum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("choices", ck.str, ck.len)) {
+ if (v.len == 0)
+ WT_RET_MSG(session, EINVAL,
+ "Key '%.*s' requires a value",
+ (int)k.len, k.str);
+ if (v.type == WT_CONFIG_ITEM_STRUCT) {
+ /*
+ * Handle the 'verbose' case of a list
+ * containing restricted choices.
+ */
+ WT_RET(__wt_config_subinit(session,
+ &sparser, &v));
+ found = 1;
+ while (found &&
+ (ret = __wt_config_next(&sparser,
+ &v, &dummy)) == 0) {
+ ret = __wt_config_subgetraw(
+ session, &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+ } else {
+ ret = __wt_config_subgetraw(session,
+ &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ if (!found)
+ WT_RET_MSG(session, EINVAL,
+ "Value '%.*s' not a "
+ "permitted choice for key '%.*s'",
+ (int)v.len, v.str,
+ (int)k.len, k.str);
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unexpected configuration description "
+ "keyword %.*s", (int)ck.len, ck.str);
+ }
+ }
+
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
new file mode 100644
index 00000000000..3e4c539cbe9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_collapse --
+ * Collapse a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the first one contains all the defaults and the values are in order from
+ * least to most preferred, that is, the default values are least preferred),
+ * and collapses them into newly allocated memory. The algorithm is to walk
+ * the first of the configuration strings, and for each entry, search all of
+ * the configuration strings for a final value, keeping the last value found.
+ *
+ * Notes:
+ * Any key not appearing in the first configuration string is discarded
+ * from the final result, because we'll never search for it.
+ *
+ * Nested structures aren't parsed. For example, imagine a configuration
+ * string contains "key=(k2=v2,k3=v3)", and a subsequent string has
+ * "key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and
+ * use the final value of "key", regardless of field overlap or missing
+ * fields in the nested value.
+ */
+int
+__wt_config_collapse(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+ WT_ERR(__wt_config_init(session, &cparser, cfg[0]));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'",
+			    (int)k.len, k.str);
+ WT_ERR(__wt_config_get(session, cfg, &k, &v));
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
+ (int)k.len, k.str, (int)v.len, v.str));
+ }
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * If the caller passes us no valid configuration strings, we get here
+ * with no bytes to copy -- that's OK, the underlying string copy can
+ * handle empty strings.
+ *
+ * Strip any trailing comma.
+ */
+ if (tmp->size != 0)
+ --tmp->size;
+ ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
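+
+/*
+ * An example of the collapse semantics described above (illustrative
+ * inputs):
+ *
+ *	cfg[0] = "cache=10,style=plain"		the defaults
+ *	cfg[1] = "style=fancy"
+ *	cfg[2] = "cache=50,unknown=1"
+ *
+ * collapses to "cache=50,style=fancy"; the "unknown" key is dropped
+ * because it doesn't appear in the first (default) string.
+ */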
+
+/*
+ * We need a character that can't appear in a key as a separator.
+ */
+#undef SEP /* separator key, character */
+#define SEP "["
+#undef SEPC
+#define SEPC '['
+
+/*
+ * Individual configuration entries, including a generation number used to make
+ * the qsort stable.
+ */
+typedef struct {
+ char *k, *v; /* key, value */
+ size_t gen; /* generation */
+} WT_CONFIG_MERGE_ENTRY;
+
+/*
+ * The array of configuration entries.
+ */
+typedef struct {
+ size_t entries_allocated; /* allocated */
+ size_t entries_next; /* next slot */
+
+ WT_CONFIG_MERGE_ENTRY *entries; /* array of entries */
+} WT_CONFIG_MERGE;
+
+/*
+ * __config_merge_scan --
+ * Walk a configuration string, inserting entries into the merged array.
+ */
+static int
+__config_merge_scan(WT_SESSION_IMPL *session,
+ const char *key, const char *value, WT_CONFIG_MERGE *cp)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(kb);
+ WT_DECL_ITEM(vb);
+ WT_DECL_RET;
+ size_t len;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &kb));
+ WT_ERR(__wt_scr_alloc(session, 0, &vb));
+
+ WT_ERR(__wt_config_init(session, &cparser, value));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'",
+			    (int)k.len, k.str);
+
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+
+ /*
+ * !!!
+ * We're using a JSON quote character to separate the names we
+ * create for nested structures. That's not completely safe as
+ * it's possible to quote characters in JSON such that a quote
+ * character appears as a literal character in a key name. In
+ * a few cases, applications can create their own key namespace
+ * (for example, shared library extension names), and therefore
+ * it's possible for an application to confuse us. Error if we
+	 * ever see a key with a magic character.
+ */
+ for (len = 0; len < k.len; ++len)
+ if (k.str[len] == SEPC)
+ WT_ERR_MSG(session, EINVAL,
+ "key %.*s contains a '%c' separator "
+ "character",
+ (int)k.len, (char *)k.str, SEPC);
+
+ /* Build the key/value strings. */
+ WT_ERR(__wt_buf_fmt(session,
+ kb, "%s%s%.*s",
+ key == NULL ? "" : key,
+ key == NULL ? "" : SEP,
+ (int)k.len, k.str));
+ WT_ERR(__wt_buf_fmt(session,
+ vb, "%.*s", (int)v.len, v.str));
+
+ /*
+ * If the value is a structure, recursively parse it.
+ *
+ * !!!
+ * Don't merge unless the structure has field names. WiredTiger
+ * stores checkpoint LSNs in the metadata file using nested
+ * structures without field names: "checkpoint_lsn=(1,0)", not
+ * "checkpoint_lsn=(file=1,offset=0)". The value type is still
+ * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
+ * value.
+ */
+ if (v.type == WT_CONFIG_ITEM_STRUCT &&
+ strchr(vb->data, '=') != NULL) {
+ WT_ERR(__config_merge_scan(
+ session, kb->data, vb->data, cp));
+ continue;
+ }
+
+ /* Insert the value into the array. */
+ WT_ERR(__wt_realloc_def(session,
+ &cp->entries_allocated,
+ cp->entries_next + 1, &cp->entries));
+ WT_ERR(__wt_strndup(session,
+ kb->data, kb->size, &cp->entries[cp->entries_next].k));
+ WT_ERR(__wt_strndup(session,
+ vb->data, vb->size, &cp->entries[cp->entries_next].v));
+ cp->entries[cp->entries_next].gen = cp->entries_next;
+ ++cp->entries_next;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&kb);
+ __wt_scr_free(&vb);
+ return (ret);
+}
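+
+/*
+ * Editor's note, an illustrative trace, not in the original source:
+ * scanning the string "key=(k1=v1,k2=v2)" flattens the nested structure
+ * into two entries whose names are joined with the SEP character:
+ *
+ *	"key[k1" -> "v1"
+ *	"key[k2" -> "v2"
+ *
+ * The generation numbers record order of appearance, keeping the later
+ * qsort stable for identical keys.
+ */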
+
+/*
+ * __strip_comma --
+ * Strip a trailing comma.
+ */
+static void
+__strip_comma(WT_ITEM *buf)
+{
+ if (buf->size != 0 && ((char *)buf->data)[buf->size - 1] == ',')
+ --buf->size;
+}
+
+/*
+ * __config_merge_format_next --
+ * Walk the array, building entries.
+ */
+static int
+__config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix,
+ size_t plen, size_t *enp, WT_CONFIG_MERGE *cp, WT_ITEM *build)
+{
+ WT_CONFIG_MERGE_ENTRY *ep;
+ size_t len1, len2, next;
+ char *p;
+
+ for (; *enp < cp->entries_next; ++*enp) {
+ ep = &cp->entries[*enp];
+ len1 = strlen(ep->k);
+
+ /*
+ * The entries are in sorted order; take the last entry for any
+ * key.
+ */
+ if (*enp < (cp->entries_next - 1)) {
+ len2 = strlen((ep + 1)->k);
+
+ /* Choose the last of identical keys. */
+ if (len1 == len2 &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+
+ /*
+ * The test is complicated by matching empty entries
+ * "foo=" against nested structures "foo,bar=", where
+ * the latter is a replacement for the former.
+ */
+ if (len2 > len1 &&
+ (ep + 1)->k[len1] == SEPC &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+ }
+
+ /*
+ * If we're skipping a prefix and this entry doesn't match it,
+ * back off one entry and pop up a level.
+ */
+ if (plen != 0 &&
+ (plen > len1 || memcmp(ep->k, prefix, plen) != 0)) {
+ --*enp;
+ break;
+ }
+
+ /*
+ * If the entry introduces a new level, recurse through that
+ * new level.
+ */
+ if ((p = strchr(ep->k + plen, SEPC)) != NULL) {
+ next = WT_PTRDIFF(p, ep->k);
+ WT_RET(__wt_buf_catfmt(session,
+ build, "%.*s=(", (int)(next - plen), ep->k + plen));
+ WT_RET(__config_merge_format_next(
+ session, ep->k, next + 1, enp, cp, build));
+ __strip_comma(build);
+ WT_RET(__wt_buf_catfmt(session, build, "),"));
+ continue;
+ }
+
+ /* Append the entry to the buffer. */
+ WT_RET(__wt_buf_catfmt(
+ session, build, "%s=%s,", ep->k + plen, ep->v));
+ }
+
+ return (0);
+}
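+
+/*
+ * Editor's note, an illustrative trace, not in the original source:
+ * given the sorted entries "key[k1" -> "v2" and "key[k2" -> "v2", the
+ * first iteration finds the '[' separator, appends "key=(", then
+ * recurses with prefix "key[" to append "k1=v2,k2=v2,"; the recursion
+ * returns, the trailing comma is stripped and the level is closed,
+ * leaving "key=(k1=v2,k2=v2)," in the build buffer.
+ */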
+
+/*
+ * __config_merge_format --
+ * Take the sorted array of entries, and format them into allocated memory.
+ */
+static int
+__config_merge_format(
+ WT_SESSION_IMPL *session, WT_CONFIG_MERGE *cp, const char **config_ret)
+{
+ WT_DECL_ITEM(build);
+ WT_DECL_RET;
+ size_t entries;
+
+ WT_RET(__wt_scr_alloc(session, 4 * 1024, &build));
+
+ entries = 0;
+ WT_ERR(__config_merge_format_next(session, "", 0, &entries, cp, build));
+
+ __strip_comma(build);
+
+ ret = __wt_strndup(session, build->data, build->size, config_ret);
+
+err: __wt_scr_free(&build);
+ return (ret);
+}
+
+/*
+ * __config_merge_cmp --
+ * Qsort function: sort the config merge array.
+ */
+static int
+__config_merge_cmp(const void *a, const void *b)
+{
+ WT_CONFIG_MERGE_ENTRY *ae, *be;
+ int cmp;
+
+ ae = (WT_CONFIG_MERGE_ENTRY *)a;
+ be = (WT_CONFIG_MERGE_ENTRY *)b;
+
+ if ((cmp = strcmp(ae->k, be->k)) != 0)
+ return (cmp);
+ return (ae->gen > be->gen ? 1 : -1);
+}
+
+/*
+ * __wt_config_merge --
+ * Merge a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the values are in order from least to most preferred), and merges them into
+ * newly allocated memory. The algorithm is to walk the configuration strings
+ * and build a table of each key/value pair. The pairs are sorted based on the
+ * name and the configuration string in which they were found, and a final
+ * configuration string is built from the result.
+ *
+ * Note:
+ * Nested structures are parsed and merged. For example, if configuration
+ * strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will
+ * be "key=(k1=v2,k2=v2)" because the nested values are merged.
+ */
+int
+__wt_config_merge(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG_MERGE merge;
+ WT_DECL_RET;
+ size_t i;
+
+ /* Start out with a reasonable number of entries. */
+ WT_CLEAR(merge);
+
+ WT_RET(__wt_realloc_def(
+ session, &merge.entries_allocated, 100, &merge.entries));
+
+ /* Scan the configuration strings, entering them into the array. */
+ for (; *cfg != NULL; ++cfg)
+ WT_ERR(__config_merge_scan(session, NULL, *cfg, &merge));
+
+ /*
+ * Sort the array by key and, in the case of identical keys, by
+ * generation.
+ */
+ qsort(merge.entries, merge.entries_next,
+ sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp);
+
+ /* Convert the array of entries into a string. */
+ ret = __config_merge_format(session, &merge, config_ret);
+
+err: for (i = 0; i < merge.entries_next; ++i) {
+ __wt_free(session, merge.entries[i].k);
+ __wt_free(session, merge.entries[i].v);
+ }
+ __wt_free(session, merge.entries);
+ return (ret);
+}
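+
+/*
+ * Editor's illustrative sketch, not part of the original source,
+ * assuming an open WT_SESSION_IMPL *session:
+ *
+ *	const char *cfg[] =
+ *	    { "key=(k1=v1,k2=v2)", "key=(k1=v2)", NULL };
+ *	const char *result;
+ *
+ *	WT_RET(__wt_config_merge(session, cfg, &result));
+ *
+ * On success, result is "key=(k1=v2,k2=v2)" in newly allocated memory
+ * the caller releases with __wt_free(session, result).
+ */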
diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c
new file mode 100644
index 00000000000..99475ef6f47
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_concat.c
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_concat --
+ * Given a NULL-terminated list of configuration strings, concatenate them
+ * into newly allocated memory. Nothing special is assumed about any of
+ * the config strings; they are simply combined in order.
+ *
+ * This code deals with the case where some of the config strings are
+ * wrapped in brackets but others aren't: the resulting string does not
+ * have brackets.
+ */
+int
+__wt_config_concat(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const char **cp;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+ for (cp = cfg; *cp != NULL; ++cp) {
+ WT_ERR(__wt_config_init(session, &cparser, *cp));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ WT_ERR_MSG(session, EINVAL,
+ "Invalid configuration key found: '%s'\n",
+ k.str);
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,",
+ (int)k.len, k.str,
+ (v.len > 0) ? "=" : "",
+ (int)v.len, v.str));
+ }
+ if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ /*
+ * If the caller passes us no valid configuration strings, we get here
+ * with no bytes to copy -- that's OK, the underlying string copy can
+ * handle empty strings.
+ *
+ * Strip any trailing comma.
+ */
+ if (tmp->size != 0)
+ --tmp->size;
+ ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
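+
+/*
+ * Editor's illustrative sketch, not part of the original source,
+ * assuming an open WT_SESSION_IMPL *session:
+ *
+ *	const char *cfg[] = { "a=1,b=2", "b=3", NULL };
+ *	const char *result;
+ *
+ *	WT_RET(__wt_config_concat(session, cfg, &result));
+ *
+ * On success, result is "a=1,b=2,b=3"; later lookups of "b" in the
+ * concatenated string see 3, because configuration parsing prefers the
+ * final occurrence of a key.
+ */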
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
new file mode 100644
index 00000000000..0cd2d32df57
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -0,0 +1,744 @@
+/* DO NOT EDIT: automatically built by dist/config.py. */
+
+#include "wt_internal.h"
+
+static const WT_CONFIG_CHECK confchk_colgroup_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_async_new_op[] = {
+ { "append", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_close[] = {
+ { "leak_memory", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_load_extension[] = {
+ { "config", "string", NULL, NULL },
+ { "entry", "string", NULL, NULL },
+ { "terminate", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_open_session[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_async_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "ops_max", "int", "min=10,max=4096", NULL },
+ { "threads", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = {
+ { "log_size", "int", "min=0,max=2GB", NULL },
+ { "name", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_eviction_subconfigs[] = {
+ { "threads_max", "int", "min=1,max=20", NULL },
+ { "threads_min", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_manager_subconfigs[] = {
+ { "merge", "boolean", NULL, NULL },
+ { "worker_thread_max", "int", "min=3,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_shared_cache_subconfigs[] = {
+ { "chunk", "int", "min=1MB,max=10TB", NULL },
+ { "name", "string", NULL, NULL },
+ { "reserve", "int", NULL, NULL },
+ { "size", "int", "min=1MB,max=10TB", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_statistics_log_subconfigs[] = {
+ { "on_close", "boolean", NULL, NULL },
+ { "path", "string", NULL, NULL },
+ { "sources", "list", NULL, NULL },
+ { "timestamp", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_file_meta[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "checkpoint_lsn", "string", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "id", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "value_format", "format", NULL, NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_index_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_begin_transaction[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { "name", "string", NULL, NULL },
+ { "priority", "int", "min=-100,max=100", NULL },
+ { "sync", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_checkpoint[] = {
+ { "drop", "list", NULL, NULL },
+ { "force", "boolean", NULL, NULL },
+ { "name", "string", NULL, NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_compact[] = {
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_subconfigs[] = {
+ { "auto_throttle", "boolean", NULL, NULL },
+ { "bloom", "boolean", NULL, NULL },
+ { "bloom_bit_count", "int", "min=2,max=1000", NULL },
+ { "bloom_config", "string", NULL, NULL },
+ { "bloom_hash_count", "int", "min=2,max=100", NULL },
+ { "bloom_oldest", "boolean", NULL, NULL },
+ { "chunk_max", "int", "min=100MB,max=10TB", NULL },
+ { "chunk_size", "int", "min=512K,max=500MB", NULL },
+ { "merge_max", "int", "min=2,max=100", NULL },
+ { "merge_min", "int", "max=100", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_create[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "lsm", "category", NULL, confchk_lsm_subconfigs },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "source", "string", NULL, NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_drop[] = {
+ { "force", "boolean", NULL, NULL },
+ { "remove_files", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_open_cursor[] = {
+ { "append", "boolean", NULL, NULL },
+ { "bulk", "string", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "dump", "string",
+ "choices=[\"hex\",\"json\",\"print\"]",
+ NULL },
+ { "next_random", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "readonly", "boolean", NULL, NULL },
+ { "skip_sort_check", "boolean", NULL, NULL },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"clear\"]",
+ NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_reconfigure[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_salvage[] = {
+ { "force", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_verify[] = {
+ { "dump_address", "boolean", NULL, NULL },
+ { "dump_blocks", "boolean", NULL, NULL },
+ { "dump_offsets", "list", NULL, NULL },
+ { "dump_pages", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_table_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_log_subconfigs[] = {
+ { "archive", "boolean", NULL, NULL },
+ { "enabled", "boolean", NULL, NULL },
+ { "file_max", "int", "min=100KB,max=2GB", NULL },
+ { "path", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_transaction_sync_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "method", "string",
+ "choices=[\"dsync\",\"fsync\",\"none\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_ENTRY config_entries[] = {
+ { "colgroup.meta",
+ "app_metadata=,columns=,source=,type=file",
+ confchk_colgroup_meta
+ },
+ { "connection.add_collator",
+ "",
+ NULL
+ },
+ { "connection.add_compressor",
+ "",
+ NULL
+ },
+ { "connection.add_data_source",
+ "",
+ NULL
+ },
+ { "connection.add_extractor",
+ "",
+ NULL
+ },
+ { "connection.async_new_op",
+ "append=0,overwrite=,raw=0,timeout=1200",
+ confchk_connection_async_new_op
+ },
+ { "connection.close",
+ "leak_memory=0",
+ confchk_connection_close
+ },
+ { "connection.load_extension",
+ "config=,entry=wiredtiger_extension_init,"
+ "terminate=wiredtiger_extension_terminate",
+ confchk_connection_load_extension
+ },
+ { "connection.open_session",
+ "isolation=read-committed",
+ confchk_connection_open_session
+ },
+ { "connection.reconfigure",
+ "async=(enabled=0,ops_max=1024,threads=2),cache_size=100MB,"
+ "checkpoint=(log_size=0,name=\"WiredTigerCheckpoint\",wait=0),"
+ "error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=,"
+ "shared_cache=(chunk=10MB,name=,reserve=0,size=500MB),"
+ "statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
+ confchk_connection_reconfigure
+ },
+ { "cursor.close",
+ "",
+ NULL
+ },
+ { "file.meta",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0"
+ ",internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=0,prefix_compression_min=4,split_pct=75,"
+ "value_format=u,version=(major=0,minor=0)",
+ confchk_file_meta
+ },
+ { "index.meta",
+ "app_metadata=,columns=,key_format=u,source=,type=file,"
+ "value_format=u",
+ confchk_index_meta
+ },
+ { "session.begin_transaction",
+ "isolation=,name=,priority=0,sync=",
+ confchk_session_begin_transaction
+ },
+ { "session.checkpoint",
+ "drop=,force=0,name=,target=",
+ confchk_session_checkpoint
+ },
+ { "session.close",
+ "",
+ NULL
+ },
+ { "session.commit_transaction",
+ "",
+ NULL
+ },
+ { "session.compact",
+ "timeout=1200",
+ confchk_session_compact
+ },
+ { "session.create",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checksum=uncompressed,"
+ "colgroups=,collator=,columns=,dictionary=0,exclusive=0,"
+ "format=btree,huffman_key=,huffman_value=,internal_item_max=0,"
+ "internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=,"
+ "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB,"
+ "merge_max=15,merge_min=0),memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,"
+ "prefix_compression_min=4,source=,split_pct=75,type=file,"
+ "value_format=u",
+ confchk_session_create
+ },
+ { "session.drop",
+ "force=0,remove_files=",
+ confchk_session_drop
+ },
+ { "session.log_printf",
+ "",
+ NULL
+ },
+ { "session.open_cursor",
+ "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0"
+ ",readonly=0,skip_sort_check=0,statistics=,target=",
+ confchk_session_open_cursor
+ },
+ { "session.reconfigure",
+ "isolation=read-committed",
+ confchk_session_reconfigure
+ },
+ { "session.rename",
+ "",
+ NULL
+ },
+ { "session.rollback_transaction",
+ "",
+ NULL
+ },
+ { "session.salvage",
+ "force=0",
+ confchk_session_salvage
+ },
+ { "session.truncate",
+ "",
+ NULL
+ },
+ { "session.upgrade",
+ "",
+ NULL
+ },
+ { "session.verify",
+ "dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0",
+ confchk_session_verify
+ },
+ { "table.meta",
+ "app_metadata=,colgroups=,columns=,key_format=u,value_format=u",
+ confchk_table_meta
+ },
+ { "wiredtiger_open",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=",
+ confchk_wiredtiger_open
+ },
+ { "wiredtiger_open_all",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
+ "minor=0)",
+ confchk_wiredtiger_open_all
+ },
+ { "wiredtiger_open_basecfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=,version=(major=0,minor=0)",
+ confchk_wiredtiger_open_basecfg
+ },
+ { "wiredtiger_open_usercfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=",
+ confchk_wiredtiger_open_usercfg
+ },
+ { NULL, NULL, NULL }
+};
+
+int
+__wt_conn_config_init(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ const WT_CONFIG_ENTRY *ep, **epp;
+
+ conn = S2C(session);
+
+ /* Build a list of pointers to the configuration information. */
+ WT_RET(__wt_calloc_def(session,
+ sizeof(config_entries) / sizeof(config_entries[0]), &epp));
+ conn->config_entries = epp;
+
+ /* Fill in the list to reference the default information. */
+ for (ep = config_entries;;) {
+ *epp++ = ep++;
+ if (ep->method == NULL)
+ break;
+ }
+ return (0);
+}
+
+void
+__wt_conn_config_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ __wt_free(session, conn->config_entries);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_ext.c b/src/third_party/wiredtiger/src/config/config_ext.c
new file mode 100644
index 00000000000..26b3799d61c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_ext.c
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_config_parser_open --
+ * WT_EXTENSION_API->config_parser_open implementation.
+ */
+int
+__wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ WT_UNUSED(wt_ext);
+ return (wiredtiger_config_parser_open(
+ wt_session, config, len, config_parserp));
+}
+
+/*
+ * __wt_ext_config_get --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key (external API version).
+ */
+int
+__wt_ext_config_get(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key,
+ WT_CONFIG_ITEM *cval)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+ const char **cfg;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ if ((cfg = (const char **)cfg_arg) == NULL)
+ return (WT_NOTFOUND);
+ return (__wt_config_gets(session, cfg, key, cval));
+}
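+
+/*
+ * Editor's illustrative sketch, not part of the original source: how an
+ * extension might read its configuration; the "chunk_size" key and the
+ * config argument handed to the extension are hypothetical.
+ *
+ *	WT_CONFIG_ITEM v;
+ *
+ *	if (wt_api->config_get(
+ *	    wt_api, session, config, "chunk_size", &v) == 0)
+ *		chunk_size = (uint64_t)v.val;
+ */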
diff --git a/src/third_party/wiredtiger/src/config/config_upgrade.c b/src/third_party/wiredtiger/src/config/config_upgrade.c
new file mode 100644
index 00000000000..24297df839b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_upgrade.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_upgrade --
+ * Upgrade a configuration string by appending the replacement version.
+ */
+int
+__wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM v;
+ const char *config;
+
+ config = buf->data;
+
+ /*
+ * wiredtiger_open:
+ * lsm_merge=boolean -> lsm_manager=(merge=boolean)
+ */
+ if (__wt_config_getones(
+ session, config, "lsm_merge", &v) != WT_NOTFOUND)
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ",lsm_manager=(merge=%s)", v.val ? "true" : "false"));
+
+ return (0);
+}
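+
+/*
+ * Editor's note, a worked example, not in the original source: a buffer
+ * holding "create,lsm_merge=false" leaves this function holding
+ * "create,lsm_merge=false,lsm_manager=(merge=false)" -- the appended
+ * replacement wins because later values take precedence, so old strings
+ * keep working while new code reads the lsm_manager category.
+ */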
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
new file mode 100644
index 00000000000..1ad136eae12
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -0,0 +1,43 @@
+/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ * Return a string for any error value.
+ */
+const char *
+wiredtiger_strerror(int error)
+{
+ static char errbuf[64];
+ char *p;
+
+ if (error == 0)
+ return ("Successful return: 0");
+
+ switch (error) {
+ case WT_DUPLICATE_KEY:
+ return ("WT_DUPLICATE_KEY: attempt to insert an existing key");
+ case WT_ERROR:
+ return ("WT_ERROR: non-specific WiredTiger error");
+ case WT_NOTFOUND:
+ return ("WT_NOTFOUND: item not found");
+ case WT_PANIC:
+ return ("WT_PANIC: WiredTiger library panic");
+ case WT_RESTART:
+ return ("WT_RESTART: restart the operation (internal)");
+ case WT_ROLLBACK:
+ return ("WT_ROLLBACK: conflict between concurrent operations");
+ default:
+ if (error > 0 && (p = strerror(error)) != NULL)
+ return (p);
+ break;
+ }
+
+ /*
+ * !!!
+ * Not thread-safe, but this is never supposed to happen.
+ */
+ (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+ return (errbuf);
+}
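+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * function accepts WiredTiger and POSIX error values alike.
+ *
+ *	int ret;
+ *
+ *	if ((ret = cursor->search(cursor)) != 0)
+ *		(void)fprintf(stderr,
+ *		    "search: %s\n", wiredtiger_strerror(ret));
+ */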
diff --git a/src/third_party/wiredtiger/src/conn/api_version.c b/src/third_party/wiredtiger/src/conn/api_version.c
new file mode 100644
index 00000000000..1355220c585
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_version.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_version --
+ * Return library version information.
+ */
+const char *
+wiredtiger_version(int *majorp, int *minorp, int *patchp)
+{
+ if (majorp != NULL)
+ *majorp = WIREDTIGER_VERSION_MAJOR;
+ if (minorp != NULL)
+ *minorp = WIREDTIGER_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = WIREDTIGER_VERSION_PATCH;
+ return (WIREDTIGER_VERSION_STRING);
+}
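+
+/*
+ * Editor's illustrative sketch, not part of the original source:
+ *
+ *	int major, minor, patch;
+ *
+ *	(void)printf("%s\n", wiredtiger_version(&major, &minor, &patch));
+ *
+ * Any of the output pointers may be NULL when the caller only wants the
+ * version string.
+ */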
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
new file mode 100644
index 00000000000..c7562ab94c3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -0,0 +1,1573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]);
+
+/*
+ * ext_collate --
+ * Call the collation function (external API version).
+ */
+static int
+ext_collate(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmpp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ WT_RET(__wt_compare(session, collator, first, second, cmpp));
+
+ return (0);
+}
+
+/*
+ * ext_collator_config --
+ * Given a configuration, configure the collator (external API version).
+ */
+static int
+ext_collator_config(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ WT_CONFIG_ARG *cfg_arg, WT_COLLATOR **collatorp, int *ownp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+ const char **cfg;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ /* The default is a standard lexicographic comparison. */
+ if ((cfg = (const char **)cfg_arg) == NULL)
+ return (0);
+
+ return (__wt_collator_config(session, cfg, collatorp, ownp));
+}
+
+/*
+ * __wt_collator_config --
+ * Given a configuration, configure the collator.
+ */
+int
+__wt_collator_config(WT_SESSION_IMPL *session, const char **cfg,
+ WT_COLLATOR **collatorp, int *ownp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+
+ *collatorp = NULL;
+ *ownp = 0;
+
+ conn = S2C(session);
+
+ if ((ret = __wt_config_gets(session, cfg, "collator", &cval)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ if (cval.len > 0) {
+ TAILQ_FOREACH(ncoll, &conn->collqh, q)
+ if (WT_STRING_MATCH(ncoll->name, cval.str, cval.len))
+ break;
+
+ if (ncoll == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown collator '%.*s'", (int)cval.len, cval.str);
+
+ if (ncoll->collator->customize != NULL) {
+ WT_RET(__wt_config_gets(session,
+ session->dhandle->cfg, "app_metadata", &cval));
+ WT_RET(ncoll->collator->customize(
+ ncoll->collator, &session->iface,
+ session->dhandle->name, &cval, collatorp));
+ }
+ if (*collatorp == NULL)
+ *collatorp = ncoll->collator;
+ else
+ *ownp = 1;
+ }
+
+ return (0);
+}
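+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * application-side path that ends up here; the "reverse" name and
+ * rev_collator object are hypothetical (error handling omitted):
+ *
+ *	conn->add_collator(conn, "reverse", &rev_collator, NULL);
+ *	session->create(session, "table:mytable",
+ *	    "key_format=S,value_format=S,collator=reverse");
+ */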
+
+/*
+ * __conn_get_extension_api --
+ * WT_CONNECTION.get_extension_api method.
+ */
+static WT_EXTENSION_API *
+__conn_get_extension_api(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ conn->extension_api.conn = wt_conn;
+ conn->extension_api.err_printf = __wt_ext_err_printf;
+ conn->extension_api.msg_printf = __wt_ext_msg_printf;
+ conn->extension_api.strerror = wiredtiger_strerror;
+ conn->extension_api.scr_alloc = __wt_ext_scr_alloc;
+ conn->extension_api.scr_free = __wt_ext_scr_free;
+ conn->extension_api.collator_config = ext_collator_config;
+ conn->extension_api.collate = ext_collate;
+ conn->extension_api.config_parser_open = __wt_ext_config_parser_open;
+ conn->extension_api.config_get = __wt_ext_config_get;
+ conn->extension_api.metadata_insert = __wt_ext_metadata_insert;
+ conn->extension_api.metadata_remove = __wt_ext_metadata_remove;
+ conn->extension_api.metadata_search = __wt_ext_metadata_search;
+ conn->extension_api.metadata_update = __wt_ext_metadata_update;
+ conn->extension_api.struct_pack = __wt_ext_struct_pack;
+ conn->extension_api.struct_size = __wt_ext_struct_size;
+ conn->extension_api.struct_unpack = __wt_ext_struct_unpack;
+ conn->extension_api.transaction_id = __wt_ext_transaction_id;
+ conn->extension_api.transaction_isolation_level =
+ __wt_ext_transaction_isolation_level;
+ conn->extension_api.transaction_notify = __wt_ext_transaction_notify;
+ conn->extension_api.transaction_oldest = __wt_ext_transaction_oldest;
+ conn->extension_api.transaction_visible = __wt_ext_transaction_visible;
+ conn->extension_api.version = wiredtiger_version;
+
+ return (&conn->extension_api);
+}
+
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+
+/*
+ * __conn_load_default_extensions --
+ * Load extensions that are enabled via --with-builtins.
+ */
+static int
+__conn_load_default_extensions(WT_CONNECTION_IMPL *conn)
+{
+ WT_UNUSED(conn);
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ WT_RET(snappy_extension_init(&conn->iface, NULL));
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ WT_RET(zlib_extension_init(&conn->iface, NULL));
+#endif
+ return (0);
+}
+
+/*
+ * __conn_load_extension --
+ * WT_CONNECTION->load_extension method.
+ */
+static int
+__conn_load_extension(
+ WT_CONNECTION *wt_conn, const char *path, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_DLH *dlh;
+ WT_SESSION_IMPL *session;
+ int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *);
+ int is_local;
+ const char *init_name, *terminate_name;
+
+ dlh = NULL;
+ init_name = terminate_name = NULL;
+ is_local = (strcmp(path, "local") == 0);
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, load_extension, config, cfg);
+
+ /*
+ * This assumes the underlying shared libraries are reference counted,
+ * that is, that re-opening a shared library simply increments a ref
+ * count, and closing it simply decrements the ref count, and the last
+ * close discards the reference entirely -- in other words, we do not
+ * check to see if we've already opened this shared library.
+ */
+ WT_ERR(__wt_dlopen(session, is_local ? NULL : path, &dlh));
+
+ /*
+ * Find the load function, remember the unload function for when we
+ * close.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "entry", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &init_name));
+ WT_ERR(__wt_dlsym(session, dlh, init_name, 1, &load));
+
+ WT_ERR(__wt_config_gets(session, cfg, "terminate", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &terminate_name));
+ WT_ERR(__wt_dlsym(session, dlh, terminate_name, 0, &dlh->terminate));
+
+ /* Call the load function last, it simplifies error handling. */
+ WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg));
+
+ /* Link onto the environment's list of open libraries. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
+ __wt_spin_unlock(session, &conn->api_lock);
+ dlh = NULL;
+
+err: if (dlh != NULL)
+ WT_TRET(__wt_dlclose(session, dlh));
+ __wt_free(session, init_name);
+ __wt_free(session, terminate_name);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
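+
+/*
+ * Editor's illustrative sketch, not part of the original source; the
+ * library path and entry points are hypothetical (error handling
+ * omitted):
+ *
+ *	conn->load_extension(conn,
+ *	    "/usr/local/lib/libmy_compress.so",
+ *	    "entry=my_compress_init,terminate=my_compress_terminate");
+ */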
+
+/*
+ * __conn_load_extensions --
+ * Load the list of application-configured extensions.
+ */
+static int
+__conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG subconfig;
+ WT_CONFIG_ITEM cval, skey, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(exconfig);
+ WT_DECL_ITEM(expath);
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ERR(__conn_load_default_extensions(conn));
+
+ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
+ WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
+ while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
+ if (expath == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &expath));
+ WT_ERR(__wt_buf_fmt(
+ session, expath, "%.*s", (int)skey.len, skey.str));
+ if (sval.len > 0) {
+ if (exconfig == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &exconfig));
+ WT_ERR(__wt_buf_fmt(session,
+ exconfig, "%.*s", (int)sval.len, sval.str));
+ }
+ WT_ERR(conn->iface.load_extension(&conn->iface,
+ expath->data, (sval.len > 0) ? exconfig->data : NULL));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&expath);
+ __wt_scr_free(&exconfig);
+
+ return (ret);
+}
+
+/*
+ * __conn_add_collator --
+ * WT_CONNECTION->add_collator method.
+ */
+static int
+__conn_add_collator(WT_CONNECTION *wt_conn,
+ const char *name, WT_COLLATOR *collator, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+ WT_SESSION_IMPL *session;
+
+ ncoll = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_collator, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ncoll));
+ WT_ERR(__wt_strdup(session, name, &ncoll->name));
+ ncoll->collator = collator;
+
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->collqh, ncoll, q);
+ ncoll = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ncoll != NULL) {
+ __wt_free(session, ncoll->name);
+ __wt_free(session, ncoll);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_collator --
+ * Remove collator added by WT_CONNECTION->add_collator; only used
+ * internally.
+ */
+int
+__wt_conn_remove_collator(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+
+ conn = S2C(session);
+
+ while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL) {
+ /* Call any termination method. */
+ if (ncoll->collator->terminate != NULL)
+ WT_TRET(ncoll->collator->terminate(
+ ncoll->collator, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->collqh, ncoll, q);
+ __wt_free(session, ncoll->name);
+ __wt_free(session, ncoll);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_compressor --
+ * WT_CONNECTION->add_compressor method.
+ */
+static int
+__conn_add_compressor(WT_CONNECTION *wt_conn,
+ const char *name, WT_COMPRESSOR *compressor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COMPRESSOR *ncomp;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(compressor);
+ ncomp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_compressor, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ncomp));
+ WT_ERR(__wt_strdup(session, name, &ncomp->name));
+ ncomp->compressor = compressor;
+
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->compqh, ncomp, q);
+ ncomp = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ncomp != NULL) {
+ __wt_free(session, ncomp->name);
+ __wt_free(session, ncomp);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_compressor --
+ * Remove compressor added by WT_CONNECTION->add_compressor; only used
+ * internally.
+ */
+int
+__wt_conn_remove_compressor(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COMPRESSOR *ncomp;
+
+ conn = S2C(session);
+
+ while ((ncomp = TAILQ_FIRST(&conn->compqh)) != NULL) {
+ /* Call any termination method. */
+ if (ncomp->compressor->terminate != NULL)
+ WT_TRET(ncomp->compressor->terminate(
+ ncomp->compressor, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->compqh, ncomp, q);
+ __wt_free(session, ncomp->name);
+ __wt_free(session, ncomp);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_data_source --
+ * WT_CONNECTION->add_data_source method.
+ */
+static int
+__conn_add_data_source(WT_CONNECTION *wt_conn,
+ const char *prefix, WT_DATA_SOURCE *dsrc, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+ WT_SESSION_IMPL *session;
+
+ ndsrc = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_data_source, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ndsrc));
+ WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix));
+ ndsrc->dsrc = dsrc;
+
+ /* Link onto the environment's list of data sources. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dsrcqh, ndsrc, q);
+ ndsrc = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ndsrc != NULL) {
+ __wt_free(session, ndsrc->prefix);
+ __wt_free(session, ndsrc);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_data_source --
+ * Remove data source added by WT_CONNECTION->add_data_source.
+ */
+int
+__wt_conn_remove_data_source(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+
+ conn = S2C(session);
+
+ while ((ndsrc = TAILQ_FIRST(&conn->dsrcqh)) != NULL) {
+ /* Call any termination method. */
+ if (ndsrc->dsrc->terminate != NULL)
+ WT_TRET(ndsrc->dsrc->terminate(
+ ndsrc->dsrc, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->dsrcqh, ndsrc, q);
+ __wt_free(session, ndsrc->prefix);
+ __wt_free(session, ndsrc);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_extractor --
+ * WT_CONNECTION->add_extractor method.
+ */
+static int
+__conn_add_extractor(WT_CONNECTION *wt_conn,
+ const char *name, WT_EXTRACTOR *extractor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(extractor);
+ ret = ENOTSUP;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_extractor, config, cfg);
+ WT_UNUSED(cfg);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_flush --
+ * WT_CONNECTION.async_flush method.
+ */
+static int
+__conn_async_flush(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, async_flush);
+ WT_ERR(__wt_async_flush(session));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_new_op --
+ * WT_CONNECTION.async_new_op method.
+ */
+static int
+__conn_async_new_op(WT_CONNECTION *wt_conn, const char *uri, const char *config,
+ WT_ASYNC_CALLBACK *callback, WT_ASYNC_OP **asyncopp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, async_new_op, config, cfg);
+ WT_ERR(__wt_async_new_op(session, uri, config, cfg, callback, &op));
+
+ *asyncopp = &op->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_get_home --
+ * WT_CONNECTION.get_home method.
+ */
+static const char *
+__conn_get_home(WT_CONNECTION *wt_conn)
+{
+ return (((WT_CONNECTION_IMPL *)wt_conn)->home);
+}
+
+/*
+ * __conn_configure_method --
+ * WT_CONNECTION.configure_method method.
+ */
+static int
+__conn_configure_method(WT_CONNECTION *wt_conn, const char *method,
+ const char *uri, const char *config, const char *type, const char *check)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, configure_method);
+
+ ret = __wt_configure_method(session, method, uri, config, type, check);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_is_new --
+ * WT_CONNECTION->is_new method.
+ */
+static int
+__conn_is_new(WT_CONNECTION *wt_conn)
+{
+ return (((WT_CONNECTION_IMPL *)wt_conn)->is_new);
+}
+
+/*
+ * __conn_close --
+ * WT_CONNECTION->close method.
+ */
+static int
+__conn_close(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *s, *session;
+ uint32_t i;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(conn, session, close, config, cfg);
+
+ WT_TRET(__wt_config_gets(session, cfg, "leak_memory", &cval));
+ if (cval.val != 0)
+ F_SET(conn, WT_CONN_LEAK_MEMORY);
+
+err: /*
+ * Rollback all running transactions.
+ * We do this as a separate pass because an active transaction in one
+ * session could cause trouble when closing a file, even if that
+ * session never referenced that file.
+ */
+ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+ if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) &&
+ F_ISSET(&s->txn, TXN_RUNNING)) {
+ wt_session = &s->iface;
+ WT_TRET(wt_session->rollback_transaction(
+ wt_session, NULL));
+ }
+
+ /* Close open, external sessions. */
+ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+ if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) {
+ wt_session = &s->iface;
+ /*
+ * Notify the user that we are closing the session
+ * handle via the registered close callback.
+ */
+ if (s->event_handler->handle_close != NULL)
+ WT_TRET(s->event_handler->handle_close(
+ s->event_handler, wt_session, NULL));
+ WT_TRET(wt_session->close(wt_session, config));
+ }
+
+ WT_TRET(__wt_connection_close(conn));
+
+ /* We no longer have a session, don't try to update it. */
+ session = NULL;
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_reconfigure --
+ * WT_CONNECTION->reconfigure method.
+ */
+static int
+__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *p, *config_cfg[] = { NULL, NULL, NULL };
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Serialize reconfiguration. */
+ __wt_spin_lock(session, &conn->reconfig_lock);
+
+ /*
+	 * The configuration argument has been checked for validity; replace
+	 * the previous connection configuration.
+ *
+ * DO NOT merge the configuration before the reconfigure calls. Some
+ * of the underlying reconfiguration functions do explicit checks with
+ * the second element of the configuration array, knowing the defaults
+ * are in slot #1 and the application's modifications are in slot #2.
+ */
+ config_cfg[0] = conn->cfg;
+ config_cfg[1] = config;
+
+ WT_ERR(__conn_statistics_config(session, config_cfg));
+ WT_ERR(__wt_async_reconfig(session, config_cfg));
+ WT_ERR(__wt_cache_config(session, config_cfg));
+ WT_ERR(__wt_cache_pool_config(session, config_cfg));
+ WT_ERR(__wt_checkpoint_server_create(session, config_cfg));
+ WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg));
+ WT_ERR(__wt_statlog_create(session, config_cfg));
+ WT_ERR(__wt_verbose_config(session, config_cfg));
+
+ WT_ERR(__wt_config_merge(session, config_cfg, &p));
+ __wt_free(session, conn->cfg);
+ conn->cfg = p;
+
+err: __wt_spin_unlock(session, &conn->reconfig_lock);
+
+ API_END_RET(session, ret);
+}
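Only the subsystems reconfigured above (statistics, async, cache and cache pool, checkpoint server, LSM manager, statistics logging, verbose messages) can be changed at runtime; anything else requires reopening the connection. An illustrative call, with example keys and values:

	ret = conn->reconfigure(conn,
	    "cache_size=500MB,statistics=(fast),verbose=[checkpoint]");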
+
+/*
+ * __conn_open_session --
+ * WT_CONNECTION->open_session method.
+ */
+static int
+__conn_open_session(WT_CONNECTION *wt_conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION **wt_sessionp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+
+ *wt_sessionp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session_ret = NULL;
+
+ CONNECTION_API_CALL(conn, session, open_session, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret));
+
+ *wt_sessionp = &session_ret->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_config_append --
+ * Append an entry to a config stack.
+ */
+static void
+__conn_config_append(const char *cfg[], const char *config)
+{
+ while (*cfg != NULL)
+ ++cfg;
+ *cfg = config;
+}
+
+/*
+ * __conn_config_check_version --
+ *	Check that a configuration version is compatible with this release.
+ */
+static int
+__conn_config_check_version(WT_SESSION_IMPL *session, const char *config)
+{
+ WT_CONFIG_ITEM vmajor, vminor;
+
+ /*
+ * Version numbers aren't included in all configuration strings, but
+ * we check all of them just in case. Ignore configurations without
+ * a version.
+ */
+ if (__wt_config_getones(
+ session, config, "version.major", &vmajor) == WT_NOTFOUND)
+ return (0);
+ WT_RET(__wt_config_getones(session, config, "version.minor", &vminor));
+
+ if (vmajor.val > WIREDTIGER_VERSION_MAJOR ||
+ (vmajor.val == WIREDTIGER_VERSION_MAJOR &&
+ vminor.val > WIREDTIGER_VERSION_MINOR))
+ WT_RET_MSG(session, ENOTSUP,
+ "WiredTiger configuration is from an incompatible release "
+ "of the WiredTiger engine");
+
+ return (0);
+}
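For example, a base configuration file written by a 2.3 release begins with a line along the lines of:

	version=(major=2,minor=3)

and opening that database with an older 2.2 library fails with ENOTSUP before the rest of the configuration is parsed.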
+
+/*
+ * __conn_config_file --
+ * Read WiredTiger config files from the home directory.
+ */
+static int
+__conn_config_file(WT_SESSION_IMPL *session,
+ const char *filename, int is_user, const char **cfg, WT_ITEM *cbuf)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t len;
+ wt_off_t size;
+ int exist, quoted;
+ char *p, *t;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /* Configuration files are always optional. */
+ WT_RET(__wt_exist(session, filename, &exist));
+ if (!exist)
+ return (0);
+
+ /*
+ * The base configuration should not exist if we are creating this
+ * database.
+ */
+ if (!is_user && conn->is_new)
+ WT_RET_MSG(session, EINVAL,
+ "%s exists before database creation", filename);
+
+ /* Open the configuration file. */
+ WT_RET(__wt_open(session, filename, 0, 0, 0, &fh));
+ WT_ERR(__wt_filesize(session, fh, &size));
+ if (size == 0)
+ goto err;
+
+ /*
+ * Sanity test: a 100KB configuration file would be insane. (There's
+ * no practical reason to limit the file size, but I can either limit
+ * the file size to something rational, or add code to test if the
+ * wt_off_t size is larger than a uint32_t, which is more complicated
+ * and a waste of time.)
+ */
+ if (size > 100 * 1024)
+ WT_ERR_MSG(
+ session, EFBIG, "Configuration file too big: %s", filename);
+ len = (size_t)size;
+
+ /*
+	 * Copy the configuration file into memory, with a little slop; I'm not
+	 * interested in debugging off-by-ones.
+	 *
+	 * The beginning of a file is treated the same as an unquoted newline
+	 * character; simplify the parsing loop by pretending that's what we're
+	 * doing.
+ */
+ WT_ERR(__wt_buf_init(session, cbuf, len + 10));
+ WT_ERR(__wt_read(
+ session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
+ ((uint8_t *)cbuf->mem)[0] = '\n';
+ cbuf->size = len + 1;
+
+ /*
+ * Collapse the file's lines into a single string: newline characters
+ * are replaced with commas unless the newline is quoted or backslash
+ * escaped. Comment lines (an unescaped newline where the next non-
+	 * white-space character is a hash) are discarded.
+ */
+ for (quoted = 0, p = t = cbuf->mem; len > 0;) {
+ /*
+ * Backslash pairs pass through untouched, unless immediately
+ * preceding a newline, in which case both the backslash and
+ * the newline are discarded. Backslash characters escape
+ * quoted characters, too, that is, a backslash followed by a
+ * quote doesn't start or end a quoted string.
+ */
+ if (*p == '\\' && len > 1) {
+ if (p[1] != '\n') {
+ *t++ = p[0];
+ *t++ = p[1];
+ }
+ p += 2;
+ len -= 2;
+ continue;
+ }
+
+ /*
+ * If we're in a quoted string, or starting a quoted string,
+ * take all characters, including white-space and newlines.
+ */
+ if (quoted || *p == '"') {
+ if (*p == '"')
+ quoted = !quoted;
+ *t++ = *p++;
+ --len;
+ continue;
+ }
+
+ /* Everything else gets taken, except for newline characters. */
+ if (*p != '\n') {
+ *t++ = *p++;
+ --len;
+ continue;
+ }
+
+ /*
+ * Replace any newline characters with commas (and strings of
+ * commas are safe).
+ *
+ * After any newline, skip to a non-white-space character; if
+ * the next character is a hash mark, skip to the next newline.
+ */
+ for (;;) {
+ for (*t++ = ','; --len > 0 && isspace(*++p);)
+ ;
+ if (len == 0)
+ break;
+ if (*p != '#')
+ break;
+ while (--len > 0 && *++p != '\n')
+ ;
+ if (len == 0)
+ break;
+ }
+ }
+ *t = '\0';
+ cbuf->size = WT_PTRDIFF(t, cbuf->data);
+
+ /* Check any version. */
+ WT_ERR(__conn_config_check_version(session, cbuf->data));
+
+ /* Upgrade the configuration string. */
+ WT_ERR(__wt_config_upgrade(session, cbuf));
+
+ /* Check the configuration information. */
+ WT_ERR(__wt_config_check(session, is_user ?
+ WT_CONFIG_REF(session, wiredtiger_open_usercfg) :
+ WT_CONFIG_REF(session, wiredtiger_open_basecfg), cbuf->data, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, cbuf->data);
+
+err: if (fh != NULL)
+ WT_TRET(__wt_close(session, fh));
+ return (ret);
+}
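As a worked example of the collapsing loop, a user configuration file such as:

	# tuning overrides
	cache_size=1GB
	eviction=(threads_min=2,
	    threads_max=4)

is reduced to a single string along the lines of:

	,,cache_size=1GB,eviction=(threads_min=2,,threads_max=4),

with the comment discarded and each unquoted newline replaced by a comma; the runs of commas are harmless to the configuration parser.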
+
+/*
+ * __conn_config_env --
+ * Read configuration from an environment variable, if set.
+ */
+static int
+__conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf)
+{
+ WT_CONFIG_ITEM cval;
+ const char *env_config;
+ size_t len;
+
+ if ((env_config = getenv("WIREDTIGER_CONFIG")) == NULL)
+ return (0);
+ len = strlen(env_config);
+ if (len == 0)
+ return (0);
+ WT_RET(__wt_buf_set(session, cbuf, env_config, len + 1));
+
+ /*
+ * Security stuff:
+ *
+ * If the "use_environment_priv" configuration string is set, use the
+ * environment variable if the process has appropriate privileges.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+ if (cval.val == 0 && __wt_has_priv())
+ WT_RET_MSG(session, WT_ERROR, "%s",
+ "WIREDTIGER_CONFIG environment variable set but process "
+ "lacks privileges to use that environment variable");
+
+ /* Check any version. */
+ WT_RET(__conn_config_check_version(session, env_config));
+
+ /* Upgrade the configuration string. */
+ WT_RET(__wt_config_upgrade(session, cbuf));
+
+ /* Check the configuration information. */
+ WT_RET(__wt_config_check(session,
+ WT_CONFIG_REF(session, wiredtiger_open), env_config, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, env_config);
+
+ return (0);
+}
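A sketch of exercising this path from a test program (setenv is POSIX, from <stdlib.h>; the configuration value is illustrative):

	setenv("WIREDTIGER_CONFIG", "cache_size=1GB,verbose=[recovery]", 1);
	ret = wiredtiger_open(home, NULL, "create", &conn);

Note the privilege check above: a process running with special privileges must also pass "use_environment_priv=true" in its direct configuration, or the open fails.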
+
+/*
+ * __conn_home --
+ * Set the database home directory.
+ */
+static int
+__conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+
+ /* If the application specifies a home directory, use it. */
+ if (home != NULL)
+ goto copy;
+
+ /* If there's no WIREDTIGER_HOME environment variable, use ".". */
+ if ((home = getenv("WIREDTIGER_HOME")) == NULL || strlen(home) == 0) {
+ home = ".";
+ goto copy;
+ }
+
+ /*
+ * Security stuff:
+ *
+ * Unless the "use_environment_priv" configuration string is set,
+ * fail if the process is running with special privileges.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+ if (cval.val == 0 && __wt_has_priv())
+ WT_RET_MSG(session, WT_ERROR, "%s",
+ "WIREDTIGER_HOME environment variable set but process "
+ "lacks privileges to use that environment variable");
+
+copy: return (__wt_strdup(session, home, &S2C(session)->home));
+}
+
+/*
+ * __conn_single --
+ * Confirm that no other thread of control is using this database.
+ */
+static int
+__conn_single(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn, *t;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t len;
+ wt_off_t size;
+ char buf[256];
+
+ conn = S2C(session);
+ fh = NULL;
+
+ __wt_spin_lock(session, &__wt_process.spinlock);
+
+ /*
+ * We first check for other threads of control holding a lock on this
+ * database, because the byte-level locking functions are based on the
+	 * POSIX 1003.1 fcntl APIs, which require that all locks associated
+	 * with a file for a given process be removed when any file descriptor
+	 * for the file is closed by that process. In other words, we can't
+	 * open a file handle on the lock file until we are certain that
+	 * closing that handle won't discard the owning thread's lock.
+	 * Applications hopefully won't open a database in multiple threads,
+	 * but we don't want an open to fail the first time and then succeed
+	 * the second.
+ */
+ TAILQ_FOREACH(t, &__wt_process.connqh, q)
+ if (t->home != NULL &&
+ t != conn && strcmp(t->home, conn->home) == 0) {
+ ret = EBUSY;
+ break;
+ }
+ if (ret != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "thread in this process");
+
+ /*
+ * !!!
+ * Be careful changing this code.
+ *
+ * We locked the WiredTiger file before release 2.3.2; a separate lock
+ * file was added after 2.3.1 because hot backup has to copy the
+ * WiredTiger file and system utilities on Windows can't copy locked
+ * files.
+ *
+	 * For this reason, we don't use the lock file's existence to decide if
+	 * we're creating the database or not; we use the WiredTiger file
+	 * instead, since it has existed in every version of WiredTiger.
+ *
+ * Additionally, avoid an upgrade race: a 2.3.1 release process might
+ * have the WiredTiger file locked, and we're going to create the lock
+ * file and lock it instead. For this reason, first acquire a lock on
+ * the lock file and then a lock on the WiredTiger file, then release
+ * the latter so hot backups can proceed. (If someone were to run a
+ * current release and subsequently a historic release, we could still
+ * fail because the historic release will ignore our lock file and will
+ * then successfully lock the WiredTiger file, but I can't think of any
+ * way to fix that.)
+ *
+ * Open the WiredTiger lock file, creating it if it doesn't exist. (I'm
+ * not removing the lock file if we create it and subsequently fail, it
+ * isn't simple to detect that case, and there's no risk other than a
+ * useless file being left in the directory.)
+ */
+ WT_ERR(__wt_open(session, WT_SINGLETHREAD, 1, 0, 0, &conn->lock_fh));
+
+ /*
+ * Lock a byte of the file: if we don't get the lock, some other process
+ * is holding it, we're done. The file may be zero-length, and that's
+ * OK, the underlying call supports locking past the end-of-file.
+ */
+ if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, 1) != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+
+ /*
+ * If the size of the lock file is 0, we created it (or we won a locking
+ * race with the thread that created it, it doesn't matter).
+ *
+ * Write something into the file, zero-length files make me nervous.
+ */
+ WT_ERR(__wt_filesize(session, conn->lock_fh, &size));
+ if (size == 0) {
+#define WT_SINGLETHREAD_STRING "WiredTiger lock file\n"
+ WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0,
+ strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING));
+ }
+
+ /* We own the lock file, optionally create the WiredTiger file. */
+ WT_ERR(__wt_config_gets(session, cfg, "create", &cval));
+ WT_ERR(__wt_open(session,
+ WT_WIREDTIGER, cval.val == 0 ? 0 : 1, 0, 0, &fh));
+
+ /*
+ * Lock the WiredTiger file (for backward compatibility reasons as
+ * described above). Immediately release the lock, it's just a test.
+ */
+ if (__wt_bytelock(fh, (wt_off_t)0, 1) != 0) {
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+ }
+ WT_ERR(__wt_bytelock(fh, (wt_off_t)0, 0));
+
+ /*
+ * If the size of the file is zero, we created it, fill it in. If the
+ * size of the file is non-zero, fail if configured for exclusivity.
+ */
+ WT_ERR(__wt_filesize(session, fh, &size));
+ if (size == 0) {
+ len = (size_t)snprintf(buf, sizeof(buf),
+ "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING);
+ WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf));
+
+ conn->is_new = 1;
+ } else {
+ WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval));
+ if (cval.val != 0)
+ WT_ERR_MSG(session, EEXIST,
+ "WiredTiger database already exists and exclusive "
+ "option configured");
+
+ conn->is_new = 0;
+ }
+
+err: /*
+ * We ignore the connection's lock file handle on error, it will be
+ * closed when the connection structure is destroyed.
+ */
+ if (fh != NULL)
+ WT_TRET(__wt_close(session, fh));
+
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ return (ret);
+}
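The byte-lock calls above presumably reduce, on POSIX systems, to fcntl record locks; a minimal sketch of the mechanism (an illustration only, not WiredTiger's actual __wt_bytelock):

	#include <errno.h>
	#include <fcntl.h>
	#include <string.h>

	/* Lock (lock != 0) or unlock one byte at the given offset. */
	static int
	bytelock(int fd, off_t byte, int lock)
	{
		struct flock fl;

		memset(&fl, 0, sizeof(fl));
		fl.l_type = lock ? F_WRLCK : F_UNLCK;
		fl.l_whence = SEEK_SET;
		fl.l_start = byte;
		fl.l_len = 1;	/* Locking past end-of-file is legal. */
		return (fcntl(fd, F_SETLK, &fl) == -1 ? errno : 0);
	}

These semantics motivate the ordering in the function: closing any descriptor on a file drops every lock the process holds on it, which is why the in-process check precedes opening any handle on the lock file.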
+
+/*
+ * __conn_statistics_config --
+ * Set statistics configuration.
+ */
+static int
+__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint32_t flags;
+ int set;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics", &cval));
+
+ flags = 0;
+ set = 0;
+ if ((ret = __wt_config_subgets(
+ session, &cval, "none", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_NONE);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_FAST);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "all", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ LF_SET(WT_CONN_STAT_CLEAR);
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (set > 1)
+ WT_RET_MSG(session, EINVAL,
+ "only one statistics configuration value may be specified");
+
+ /* Configuring statistics clears any existing values. */
+ conn->stat_flags = flags;
+
+ return (0);
+}
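The net effect is that "none", "fast" and "all" are mutually exclusive, while "clear" composes with any of them. For example, this illustrative call configures fast statistics and resets the counters each time they are read:

	ret = conn->reconfigure(conn, "statistics=(fast,clear)");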
+
+/* Simple structure for name and flag configuration searches. */
+typedef struct {
+ const char *name;
+ uint32_t flag;
+} WT_NAME_FLAG;
+
+/*
+ * __wt_verbose_config --
+ * Set verbose configuration.
+ */
+int
+__wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ static const WT_NAME_FLAG verbtypes[] = {
+ { "api", WT_VERB_API },
+ { "block", WT_VERB_BLOCK },
+ { "checkpoint", WT_VERB_CHECKPOINT },
+ { "compact", WT_VERB_COMPACT },
+ { "evict", WT_VERB_EVICT },
+ { "evictserver", WT_VERB_EVICTSERVER },
+ { "fileops", WT_VERB_FILEOPS },
+ { "log", WT_VERB_LOG },
+ { "lsm", WT_VERB_LSM },
+ { "metadata", WT_VERB_METADATA },
+ { "mutex", WT_VERB_MUTEX },
+ { "overflow", WT_VERB_OVERFLOW },
+ { "read", WT_VERB_READ },
+ { "reconcile", WT_VERB_RECONCILE },
+ { "recovery", WT_VERB_RECOVERY },
+ { "salvage", WT_VERB_SALVAGE },
+ { "shared_cache", WT_VERB_SHARED_CACHE },
+ { "split", WT_VERB_SPLIT },
+ { "temporary", WT_VERB_TEMPORARY },
+ { "transaction", WT_VERB_TRANSACTION },
+ { "verify", WT_VERB_VERIFY },
+ { "version", WT_VERB_VERSION },
+ { "write", WT_VERB_WRITE },
+ { NULL, 0 }
+ };
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ const WT_NAME_FLAG *ft;
+ uint32_t flags;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "verbose", &cval));
+
+ flags = 0;
+ for (ft = verbtypes; ft->name != NULL; ft++) {
+ if ((ret = __wt_config_subgets(
+ session, &cval, ft->name, &sval)) == 0 && sval.val != 0) {
+#ifdef HAVE_VERBOSE
+ LF_SET(ft->flag);
+#else
+ WT_RET_MSG(session, EINVAL,
+ "Verbose option specified when WiredTiger built "
+ "without verbose support. Add --enable-verbose to "
+ "configure command and rebuild to include support "
+ "for verbose messages");
+#endif
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ }
+
+ conn->verbose = flags;
+ return (0);
+}
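For example, a build configured with --enable-verbose can be opened with categories drawn from the table above:

	ret = wiredtiger_open(home, NULL,
	    "create,verbose=[checkpoint,evictserver]", &conn);

On a build without verbose support, the same string fails with EINVAL, as coded above.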
+
+/*
+ * __conn_write_config --
+ * Save the configuration used to create a database.
+ */
+static int
+__conn_write_config(
+ WT_SESSION_IMPL *session, const char *filename, const char *cfg[])
+{
+ FILE *fp;
+ WT_CONFIG parser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ char *path;
+
+ /*
+	 * We were passed an array of configuration strings where slot 0 is
+	 * all possible values and the second and subsequent slots are changes
+ * specified by the application during open (using the wiredtiger_open
+ * configuration string, an environment variable, or user-configuration
+ * file). The base configuration file contains all changes to default
+ * settings made at create, and we include the user-configuration file
+ * in that list, even though we don't expect it to change. Of course,
+ * an application could leave that file as it is right now and not
+ * remove a configuration we need, but applications can also guarantee
+ * all database users specify consistent environment variables and
+ * wiredtiger_open configuration arguments, and if we protect against
+ * those problems, might as well include the application's configuration
+ * file as well.
+ *
+ * If there is no configuration, don't bother creating an empty file.
+ */
+ if (cfg[1] == NULL)
+ return (0);
+
+ WT_RET(__wt_filename(session, filename, &path));
+ if ((fp = fopen(path, "w")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (ret);
+
+ fprintf(fp, "%s\n\n",
+ "# Do not modify this file.\n"
+ "#\n"
+ "# WiredTiger created this file when the database was created,\n"
+ "# to store persistent database settings. Instead of changing\n"
+ "# these settings, set a WIREDTIGER_CONFIG environment variable\n"
+ "# or create a WiredTiger.config file to override them.");
+
+ fprintf(fp, "version=(major=%d,minor=%d)\n\n",
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+
+ /*
+ * We want the list of defaults that have been changed, that is, if the
+ * application didn't somehow configure a setting, we don't write out a
+ * default value, so future releases may silently migrate to new default
+ * values.
+ */
+ while (*++cfg != NULL) {
+		WT_ERR(__wt_config_init(session,
+ &parser, WT_CONFIG_BASE(session, wiredtiger_open_basecfg)));
+ while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+ if ((ret =
+ __wt_config_getone(session, *cfg, &k, &v)) == 0) {
+ /* Fix quoting for non-trivial settings. */
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ fprintf(fp, "%.*s=%.*s\n",
+ (int)k.len, k.str, (int)v.len, v.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+err: WT_TRET(fclose(fp));
+
+ /* Don't leave a damaged file in place. */
+ if (ret != 0)
+ (void)__wt_remove(session, filename);
+
+ return (ret);
+}
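The resulting WiredTiger.basecfg is therefore the comment header printed above, the version line, and only those settings that differ from the defaults. An illustrative file body:

	version=(major=2,minor=3)

	cache_size=1GB
	eviction=(threads_max=4)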
+
+/*
+ * wiredtiger_open --
+ * Main library entry point: open a new connection to a WiredTiger
+ * database.
+ */
+int
+wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
+ const char *config, WT_CONNECTION **wt_connp)
+{
+ static const WT_CONNECTION stdc = {
+ __conn_async_flush,
+ __conn_async_new_op,
+ __conn_close,
+ __conn_reconfigure,
+ __conn_get_home,
+ __conn_configure_method,
+ __conn_is_new,
+ __conn_open_session,
+ __conn_load_extension,
+ __conn_add_data_source,
+ __conn_add_collator,
+ __conn_add_compressor,
+ __conn_add_extractor,
+ __conn_get_extension_api
+ };
+ static const WT_NAME_FLAG file_types[] = {
+ { "checkpoint", WT_FILE_TYPE_CHECKPOINT },
+ { "data", WT_FILE_TYPE_DATA },
+ { "log", WT_FILE_TYPE_LOG },
+ { NULL, 0 }
+ };
+
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM i1, i2, i3;
+ const WT_NAME_FLAG *ft;
+ WT_SESSION_IMPL *session;
+
+ /* Leave space for optional additional configuration. */
+ const char *cfg[] = { NULL, NULL, NULL, NULL, NULL, NULL };
+
+ *wt_connp = NULL;
+
+ conn = NULL;
+ session = NULL;
+
+ /*
+ * We could use scratch buffers, but I'd rather the default session
+ * not tie down chunks of memory past the open call.
+ */
+ WT_CLEAR(i1);
+ WT_CLEAR(i2);
+ WT_CLEAR(i3);
+
+ WT_RET(__wt_library_init());
+
+ WT_RET(__wt_calloc_def(NULL, 1, &conn));
+ conn->iface = stdc;
+
+ /*
+ * Immediately link the structure into the connection structure list:
+ * the only thing ever looked at on that list is the database name,
+ * and a NULL value is fine.
+ */
+ __wt_spin_lock(NULL, &__wt_process.spinlock);
+ TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(NULL, &__wt_process.spinlock);
+
+ session = conn->default_session = &conn->dummy_session;
+ session->iface.connection = &conn->iface;
+ session->name = "wiredtiger_open";
+ __wt_random_init(session->rnd);
+ __wt_event_handler_set(session, event_handler);
+
+ /* Remaining basic initialization of the connection structure. */
+ WT_ERR(__wt_connection_init(conn));
+
+ /* Check/set the application-specified configuration string. */
+ WT_ERR(__wt_config_check(session,
+ WT_CONFIG_REF(session, wiredtiger_open), config, 0));
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
+ cfg[1] = config;
+
+ /* Configure error messages so we get them right early. */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0)
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+
+ /* Get the database home. */
+ WT_ERR(__conn_home(session, home, cfg));
+
+ /* Make sure no other thread of control already owns this database. */
+ WT_ERR(__conn_single(session, cfg));
+
+ /*
+ * Build the configuration stack, in the following order (where later
+ * entries override earlier entries):
+ *
+ * 1. all possible wiredtiger_open configurations
+ * 2. base configuration file, created with the database (optional)
+ * 3. the config passed in by the application.
+ * 4. user configuration file (optional)
+ * 5. environment variable settings (optional)
+ *
+	 * Clear the entries we added to the stack; we're going to build it in
+	 * order.
+ */
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all);
+ cfg[1] = NULL;
+ WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, &i1));
+ __conn_config_append(cfg, config);
+ WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, &i2));
+ WT_ERR(__conn_config_env(session, cfg, &i3));
+
+ /*
+ * Configuration ...
+ *
+ * We can't open sessions yet, so any configurations that cause
+ * sessions to be opened must be handled inside __wt_connection_open.
+ *
+ * The error message configuration might have changed (if set in a
+ * configuration file, and not in the application's configuration
+ * string), get it again. Do it first, make error messages correct.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0) {
+ __wt_free(session, conn->error_prefix);
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
+ conn->hazard_max = (uint32_t)cval.val;
+
+ WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
+ conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+
+ WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_CKPT_SYNC);
+
+ WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
+ if (cval.val == -1)
+ conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
+ else
+ conn->buffer_alignment = (size_t)cval.val;
+#ifndef HAVE_POSIX_MEMALIGN
+ if (conn->buffer_alignment != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "buffer_alignment requires posix_memalign");
+#endif
+
+ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ if (sval.val)
+ FLD_SET(conn->direct_io, ft->flag);
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ switch (ft->flag) {
+ case WT_FILE_TYPE_DATA:
+ conn->data_extend_len = sval.val;
+ break;
+ case WT_FILE_TYPE_LOG:
+ conn->log_extend_len = sval.val;
+ break;
+ }
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
+ conn->mmap = cval.val == 0 ? 0 : 1;
+
+ WT_ERR(__conn_statistics_config(session, cfg));
+ WT_ERR(__wt_lsm_manager_config(session, cfg));
+ WT_ERR(__wt_verbose_config(session, cfg));
+
+ /* Now that we know if verbose is configured, output the version. */
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
+
+ /*
+ * Open the connection, then reset the local session as the real one
+ * was allocated in __wt_connection_open.
+ */
+ WT_ERR(__wt_connection_open(conn, cfg));
+ session = conn->default_session;
+
+ /*
+ * Check on the turtle and metadata files, creating them if necessary
+ * (which avoids application threads racing to create the metadata file
+ * later). Once the metadata file exists, get a reference to it in
+ * the connection's session.
+ */
+ WT_ERR(__wt_turtle_init(session));
+ WT_ERR(__wt_metadata_open(session));
+
+ /*
+ * Load the extensions after initialization completes; extensions expect
+ * everything else to be in place, and the extensions call back into the
+ * library.
+ */
+ WT_ERR(__conn_load_extensions(session, cfg));
+
+ /*
+ * We've completed configuration, write the base configuration file if
+ * we're creating the database.
+ */
+ if (conn->is_new) {
+ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+ if (cval.val)
+ WT_ERR(
+ __conn_write_config(session, WT_BASECONFIG, cfg));
+ }
+
+ /*
+ * Start the worker threads last.
+ */
+ WT_ERR(__wt_connection_workers(session, cfg));
+
+ /* Merge the final configuration for later reconfiguration. */
+ WT_ERR(__wt_config_merge(session, cfg, &conn->cfg));
+
+ WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
+ *wt_connp = &conn->iface;
+
+err: /* Discard the configuration strings. */
+ __wt_buf_free(session, &i1);
+ __wt_buf_free(session, &i2);
+ __wt_buf_free(session, &i3);
+
+ if (ret != 0 && conn != NULL)
+ WT_TRET(__wt_connection_close(conn));
+
+ return (ret);
+}
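A minimal end-to-end sketch of the entry point; the home path and configuration are illustrative, and error handling is elided:

	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	ret = wiredtiger_open("/path/to/db", NULL,
	    "create,cache_size=500MB", &conn);
	ret = conn->open_session(conn, NULL, NULL, &session);
	/* ... create tables, open cursors, run transactions ... */
	ret = conn->close(conn, NULL);	/* Also closes open sessions. */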
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
new file mode 100644
index 00000000000..079bd05ff1e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_config --
+ * Configure the underlying cache.
+ */
+int
+__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CACHE *cache;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * If not using a shared cache configure the cache size, otherwise
+ * check for a reserved size.
+ */
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval));
+ conn->cache_size = (uint64_t)cval.val;
+ } else {
+ WT_RET(__wt_config_gets(
+ session, cfg, "shared_cache.reserve", &cval));
+ if (cval.val == 0)
+ WT_RET(__wt_config_gets(
+ session, cfg, "shared_cache.chunk", &cval));
+ cache->cp_reserved = (uint64_t)cval.val;
+ }
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_target", &cval));
+ cache->eviction_target = (u_int)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval));
+ cache->eviction_trigger = (u_int)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval));
+ cache->eviction_dirty_target = (u_int)cval.val;
+
+ /*
+ * The eviction thread configuration options include the main eviction
+ * thread and workers. Our implementation splits them out. Adjust for
+ * the difference when parsing the configuration.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval));
+ WT_ASSERT(session, cval.val > 0);
+ conn->evict_workers_max = (u_int)cval.val - 1;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval));
+ WT_ASSERT(session, cval.val > 0);
+ conn->evict_workers_min = (u_int)cval.val - 1;
+
+ if (conn->evict_workers_min > conn->evict_workers_max)
+ WT_RET_MSG(session, EINVAL,
+ "eviction=(threads_min) cannot be greater than "
+ "eviction=(threads_max)");
+
+ return (0);
+}
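Because the eviction server is counted separately from its workers, a configuration such as the following (values are arbitrary) yields evict_workers_min == 1 and evict_workers_max == 3 internally:

	ret = conn->reconfigure(conn,
	    "eviction=(threads_min=2,threads_max=4)");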
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache.
+ */
+int
+__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, conn->cache == NULL ||
+ (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));
+
+ WT_RET(__wt_calloc_def(session, 1, &conn->cache));
+
+ cache = conn->cache;
+
+ /* Use a common routine for run-time configuration options. */
+ WT_RET(__wt_cache_config(session, cfg));
+
+ /* Add the configured cache to the cache pool. */
+ if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+ WT_RET(__wt_conn_cache_pool_open(session));
+
+ /*
+ * The target size must be lower than the trigger size or we will never
+ * get any work done.
+ */
+ if (cache->eviction_target >= cache->eviction_trigger)
+ WT_ERR_MSG(session, EINVAL,
+ "eviction target must be lower than the eviction trigger");
+
+ WT_ERR(__wt_cond_alloc(session,
+ "cache eviction server", 0, &cache->evict_cond));
+ WT_ERR(__wt_cond_alloc(session,
+ "eviction waiters", 0, &cache->evict_waiter_cond));
+ WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
+ WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));
+
+ /* Allocate the LRU eviction queue. */
+ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
+ WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+
+ /*
+ * We get/set some values in the cache statistics (rather than have
+ * two copies), configure them.
+ */
+ __wt_cache_stats_update(session);
+ return (0);
+
+err: WT_RET(__wt_cache_destroy(session));
+ return (ret);
+}
+
+/*
+ * __wt_cache_stats_update --
+ * Update the cache statistics for return to the application.
+ */
+void
+__wt_cache_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ stats = &conn->stats;
+
+ WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
+ WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(stats, cache_bytes_dirty, cache->bytes_dirty);
+ WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache.
+ */
+int
+__wt_cache_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ if (cache == NULL)
+ return (0);
+
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond));
+ __wt_spin_destroy(session, &cache->evict_lock);
+ __wt_spin_destroy(session, &cache->evict_walk_lock);
+
+ __wt_free(session, cache->evict);
+ __wt_free(session, conn->cache);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
new file mode 100644
index 00000000000..ba80ac15267
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -0,0 +1,639 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning constants.
+ */
+/* Threshold when a connection is allocated more cache */
+#define WT_CACHE_POOL_BUMP_THRESHOLD 6
+/* Threshold when a connection is allocated less cache */
+#define WT_CACHE_POOL_REDUCE_THRESHOLD 2
+/* Balancing passes after a bump before a connection is a candidate. */
+#define WT_CACHE_POOL_BUMP_SKIPS 10
+/* Balancing passes after a reduction before a connection is a candidate. */
+#define WT_CACHE_POOL_REDUCE_SKIPS 5
+
+static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
+static int __cache_pool_balance(WT_SESSION_IMPL *);
+
+/*
+ * __wt_cache_pool_config --
+ * Parse and setup the cache pool options.
+ */
+int
+__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CACHE_POOL *cp;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn, *entry;
+ WT_DECL_RET;
+ char *pool_name;
+ int created, reconfiguring;
+ uint64_t chunk, reserve, size, used_cache;
+
+ conn = S2C(session);
+ created = reconfiguring = 0;
+ pool_name = NULL;
+ cp = NULL;
+ size = 0;
+
+ if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+ reconfiguring = 1;
+ else {
+ WT_RET(
+ __wt_config_gets(session, cfg, "shared_cache.name", &cval));
+ if (cval.len == 0) {
+ /*
+ * Tell the user if they configured some shared cache
+ * settings, but didn't enable it by naming it.
+ */
+ if (__wt_config_gets(session,
+ &cfg[1], "shared_cache", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Shared cache configuration requires a "
+ "pool name");
+ return (0);
+ }
+ if (__wt_config_gets(session,
+ &cfg[1], "cache_size", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Only one of cache_size and shared_cache can be "
+ "in the configuration");
+
+ /*
+ * NOTE: The allocations made when configuring and opening a
+ * cache pool don't really belong to the connection that
+ * allocates them. If a memory allocator becomes connection
+ * specific in the future we will need a way to allocate memory
+ * outside of the connection here.
+ */
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &pool_name));
+ }
+
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ if (__wt_process.cache_pool == NULL) {
+ WT_ASSERT(session, !reconfiguring);
+ /* Create a cache pool. */
+ WT_ERR(__wt_calloc_def(session, 1, &cp));
+ created = 1;
+ cp->name = pool_name;
+ pool_name = NULL; /* Belongs to the cache pool now. */
+ TAILQ_INIT(&cp->cache_pool_qh);
+ WT_ERR(__wt_spin_init(
+ session, &cp->cache_pool_lock, "cache shared pool"));
+ WT_ERR(__wt_cond_alloc(session,
+ "cache pool server", 0, &cp->cache_pool_cond));
+
+ __wt_process.cache_pool = cp;
+ WT_ERR(__wt_verbose(session,
+ WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name));
+ } else if (!reconfiguring && !WT_STRING_MATCH(
+ __wt_process.cache_pool->name, pool_name, strlen(pool_name)))
+ /* Only a single cache pool is supported. */
+ WT_ERR_MSG(session, WT_ERROR,
+ "Attempting to join a cache pool that does not exist: %s",
+ pool_name);
+
+ cp = __wt_process.cache_pool;
+
+ /*
+ * The cache pool requires a reference count to avoid a race between
+ * configuration/open and destroy.
+ */
+ if (!reconfiguring)
+ ++cp->refs;
+
+ /*
+ * Cache pool configurations are optional when not creating. If
+ * values aren't being changed, retrieve the current value so that
+ * validation of settings works.
+ */
+ if (!created) {
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.size", &cval) == 0 && cval.val != 0)
+ size = (uint64_t)cval.val;
+ else
+ size = cp->size;
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.chunk", &cval) == 0 && cval.val != 0)
+ chunk = (uint64_t)cval.val;
+ else
+ chunk = cp->chunk;
+ } else {
+ /*
+ * The only time shared cache configuration uses default
+ * values is when we are creating the pool.
+ */
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.size", &cval));
+ WT_ASSERT(session, cval.val != 0);
+ size = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.chunk", &cval));
+ WT_ASSERT(session, cval.val != 0);
+ chunk = (uint64_t)cval.val;
+ }
+
+ /*
+	 * Retrieve the reserve size here to validate the configuration.
+	 * Don't save it yet since the connection's cache is not created if
+	 * we are opening. Cache configuration is responsible for saving the
+	 * setting.
+	 * The reserve size is set in one of three ways:
+	 * - It's part of the user's configuration - use that value.
+ * - We are reconfiguring - keep the previous value.
+ * - We are joining a cache pool for the first time (including
+ * creating the pool) - use the chunk size; that's the default.
+ */
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.reserve", &cval) == 0 && cval.val != 0)
+ reserve = (uint64_t)cval.val;
+ else if (reconfiguring)
+ reserve = conn->cache->cp_reserved;
+ else
+ reserve = chunk;
+
+ /*
+ * Validate that size and reserve values don't cause the cache
+ * pool to be over subscribed.
+ */
+ used_cache = 0;
+ if (!created) {
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+ used_cache += entry->cache->cp_reserved;
+ }
+ if (used_cache + reserve > size)
+ WT_ERR_MSG(session, EINVAL,
+ "Shared cache unable to accommodate this configuration. "
+ "Shared cache size: %" PRIu64 ", reserved: %" PRIu64,
+ size, used_cache + reserve);
+
+ /* The configuration is verified - it's safe to update the pool. */
+ cp->size = size;
+ cp->chunk = chunk;
+
+ /* Wake up the cache pool server so any changes are noticed. */
+ if (reconfiguring)
+ WT_ERR(__wt_cond_signal(
+ session, __wt_process.cache_pool->cache_pool_cond));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Configured cache pool %s. Size: %" PRIu64
+ ", chunk size: %" PRIu64, cp->name, cp->size, cp->chunk));
+
+ F_SET(conn, WT_CONN_CACHE_POOL);
+err: __wt_spin_unlock(session, &__wt_process.spinlock);
+ if (!reconfiguring)
+ __wt_free(session, pool_name);
+ if (ret != 0 && created) {
+ __wt_free(session, cp->name);
+ WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_free(session, cp);
+ }
+ return (ret);
+}
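Two connections joining one pool might be opened as follows; the paths, pool name and sizes are illustrative, and note that cache_size cannot be combined with shared_cache, per the check above:

	ret = wiredtiger_open("/path/db1", NULL,
	    "create,shared_cache=(name=pool,size=2GB,chunk=10MB)", &conn1);
	ret = wiredtiger_open("/path/db2", NULL,
	    "create,shared_cache=(name=pool,reserve=100MB)", &conn2);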
+
+/*
+ * __wt_conn_cache_pool_open --
+ * Add a connection to the cache pool.
+ */
+int
+__wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp = __wt_process.cache_pool;
+
+ /*
+	 * Create a session that can be used by the cache pool thread; do
+	 * it in the main thread to avoid shutdown races.
+ */
+ if ((ret = __wt_open_internal_session(
+ conn, "cache-pool", 0, 0, &cache->cp_session)) != 0)
+ WT_RET_MSG(NULL, ret,
+ "Failed to create session for cache pool");
+
+ /*
+ * Add this connection into the cache pool connection queue. Figure
+ * out if a manager thread is needed while holding the lock. Don't
+ * start the thread until we have released the lock.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ TAILQ_INSERT_TAIL(&cp->cache_pool_qh, conn, cpq);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Added %s to cache pool %s", conn->home, cp->name));
+
+ /*
+ * Each connection participating in the cache pool starts a manager
+ * thread. Only one manager is active at a time, but having a thread
+ * in each connection saves having a complex election process when
+ * the active connection shuts down.
+ */
+ F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ F_SET(cache, WT_CACHE_POOL_RUN);
+ WT_RET(__wt_thread_create(session, &cache->cp_tid,
+ __wt_cache_pool_server, cache->cp_session));
+
+ /* Wake up the cache pool server to get our initial chunk. */
+ WT_RET(__wt_cond_signal(session, cp->cache_pool_cond));
+
+ return (0);
+}
+
+/*
+ * __wt_conn_cache_pool_destroy --
+ * Remove our resources from the shared cache pool. Remove the cache pool
+ * if we were the last connection.
+ */
+int
+__wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn, *entry;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cp_locked, found;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp_locked = found = 0;
+ cp = __wt_process.cache_pool;
+
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL))
+ return (0);
+
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+ if (entry == conn) {
+ found = 1;
+ break;
+ }
+
+ /*
+ * If there was an error during open, we may not have made it onto the
+ * queue. We did increment the reference count, so proceed regardless.
+ */
+ if (found) {
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Removing %s from cache pool", entry->home));
+ TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq);
+
+ /* Give the connection's resources back to the pool. */
+ WT_ASSERT(session, cp->currently_used >= conn->cache_size);
+ cp->currently_used -= conn->cache_size;
+
+ /*
+ * Stop our manager thread - release the cache pool lock while
+ * joining the thread to allow it to complete any balance
+ * operation.
+ */
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ F_CLR(cache, WT_CACHE_POOL_RUN);
+ WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond));
+ WT_TRET(__wt_thread_join(session, cache->cp_tid));
+
+ wt_session = &cache->cp_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ /*
+ * Grab the lock again now to stop other threads joining the
+ * pool while we are figuring out whether we were the last
+ * participant.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ }
+
+ /*
+ * If there are no references, we are cleaning up after a failed
+ * wiredtiger_open, there is nothing further to do.
+ */
+ if (cp->refs < 1) {
+ if (cp_locked)
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ return (0);
+ }
+
+ if (--cp->refs == 0) {
+ WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh));
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ }
+
+ if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) {
+ WT_TRET(__wt_verbose(
+ session, WT_VERB_SHARED_CACHE, "Destroying cache pool"));
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ /*
+ * We have been holding the pool lock - no connections could
+ * have been added.
+ */
+ WT_ASSERT(session,
+ cp == __wt_process.cache_pool &&
+ TAILQ_EMPTY(&cp->cache_pool_qh));
+ __wt_process.cache_pool = NULL;
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ /* Now free the pool. */
+ __wt_free(session, cp->name);
+
+ __wt_spin_destroy(session, &cp->cache_pool_lock);
+ WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_free(session, cp);
+ }
+
+ if (cp_locked) {
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ /* Notify other participants if we were managing */
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED);
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Shutting down shared cache manager connection"));
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __cache_pool_balance --
+ * Do a pass over the cache pool members and ensure the pool is being
+ * effectively used.
+ */
+static int
+__cache_pool_balance(WT_SESSION_IMPL *session)
+{
+ WT_CACHE_POOL *cp;
+ WT_DECL_RET;
+ int adjusted;
+ uint64_t bump_threshold, highest;
+
+ cp = __wt_process.cache_pool;
+ adjusted = 0;
+ highest = 0;
+
+ __wt_spin_lock(NULL, &cp->cache_pool_lock);
+
+ /* If the queue is empty there is nothing to do. */
+ if (TAILQ_FIRST(&cp->cache_pool_qh) == NULL)
+ goto err;
+
+ WT_ERR(__cache_pool_assess(session, &highest));
+ bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+ /*
+ * Actively attempt to:
+ * - Reduce the amount allocated, if we are over the budget
+ * - Increase the amount used if there is capacity and any pressure.
+ */
+ for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+ F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) {
+ WT_ERR(__cache_pool_adjust(
+ session, highest, bump_threshold, &adjusted));
+ /*
+ * Stop if the amount of cache being used is stable, and we
+ * aren't over capacity.
+ */
+ if (cp->currently_used <= cp->size && !adjusted)
+ break;
+ if (bump_threshold > 0)
+ --bump_threshold;
+ }
+
+err: __wt_spin_unlock(NULL, &cp->cache_pool_lock);
+ return (ret);
+}
+
+/*
+ * __cache_pool_assess --
+ * Assess the usage of the cache pool.
+ */
+static int
+__cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
+{
+ WT_CACHE_POOL *cp;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *entry;
+ uint64_t entries, highest, new;
+
+ cp = __wt_process.cache_pool;
+ entries = highest = 0;
+
+ /* Generate read pressure information. */
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ if (entry->cache_size == 0 ||
+ entry->cache == NULL)
+ continue;
+ cache = entry->cache;
+ ++entries;
+ new = cache->bytes_evict;
+ /* Handle wrapping of eviction requests. */
+ if (new >= cache->cp_saved_evict)
+ cache->cp_current_evict = new - cache->cp_saved_evict;
+ else
+ cache->cp_current_evict = new;
+ cache->cp_saved_evict = new;
+ if (cache->cp_current_evict > highest)
+ highest = cache->cp_current_evict;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Highest eviction count: %" PRIu64 ", entries: %" PRIu64,
+ highest, entries));
+ /* Normalize eviction information across connections. */
+ highest = highest / (entries + 1);
+ ++highest; /* Avoid divide by zero. */
+
+ *phighest = highest;
+ return (0);
+}
+
+/*
+ * __cache_pool_adjust --
+ *	Adjust the allocation of cache to each connection. If force is set,
+ *	ignore cache load information and reduce the allocation for every
+ *	connection allocated more than its reserved size.
+ */
+static int
+__cache_pool_adjust(WT_SESSION_IMPL *session,
+ uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+{
+ WT_CACHE_POOL *cp;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *entry;
+ uint64_t adjusted, reserved, read_pressure;
+ int force, grew;
+
+ *adjustedp = 0;
+ cp = __wt_process.cache_pool;
+ force = (cp->currently_used > cp->size);
+ grew = 0;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SHARED_CACHE)) {
+ WT_RET(__wt_verbose(session,
+ WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "\t" "cache_size, read_pressure, skips: "));
+ }
+
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ cache = entry->cache;
+ reserved = cache->cp_reserved;
+ adjusted = 0;
+
+ read_pressure = cache->cp_current_evict / highest;
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
+ entry->cache_size, read_pressure, cache->cp_skip_count));
+
+ /* Allow to stabilize after changes. */
+ if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
+ continue;
+ /*
+ * If the entry is currently allocated less than the reserved
+		 * size, increase its allocation. This should only happen if:
+ * - It's the first time we've seen this member
+ * - The reserved size has been adjusted
+ */
+ if (entry->cache_size < reserved) {
+ grew = 1;
+ adjusted = reserved - entry->cache_size;
+ /*
+ * Conditions for reducing the amount of resources for an
+ * entry:
+ * - If we are forcing and this entry has more than the
+ * minimum amount of space in use.
+ * - If the read pressure in this entry is below the
+ * threshold, other entries need more cache, the entry has
+ * more than the minimum space and there is no available
+ * space in the pool.
+ */
+ } else if ((force && entry->cache_size > reserved) ||
+ (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
+ highest > 1 && entry->cache_size > reserved &&
+ cp->currently_used >= cp->size)) {
+ grew = 0;
+ /*
+ * Shrink by a chunk size if that doesn't drop us
+ * below the reserved size.
+ */
+ if (entry->cache_size > cp->chunk + reserved)
+ adjusted = cp->chunk;
+ else
+ adjusted = entry->cache_size - reserved;
+ /*
+ * Conditions for increasing the amount of resources for an
+ * entry:
+ * - There was some activity across the pool
+ * - This entry is using less than the entire cache pool
+ * - The connection is using enough cache to require eviction
+ * - There is space available in the pool
+ * - Additional cache would benefit the connection
+ */
+ } else if (highest > 1 &&
+ entry->cache_size < cp->size &&
+ cache->bytes_inmem >=
+ (entry->cache_size * cache->eviction_target) / 100 &&
+ cp->currently_used < cp->size &&
+ read_pressure > bump_threshold) {
+ grew = 1;
+ adjusted = WT_MIN(cp->chunk,
+ cp->size - cp->currently_used);
+ }
+ if (adjusted > 0) {
+ *adjustedp = 1;
+ if (grew > 0) {
+ cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
+ entry->cache_size += adjusted;
+ cp->currently_used += adjusted;
+ } else {
+ cache->cp_skip_count =
+ WT_CACHE_POOL_REDUCE_SKIPS;
+ WT_ASSERT(session,
+ entry->cache_size >= adjusted &&
+ cp->currently_used >= adjusted);
+ entry->cache_size -= adjusted;
+ cp->currently_used -= adjusted;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Allocated %s%" PRId64 " to %s",
+ grew ? "" : "-", adjusted, entry->home));
+ /*
+ * TODO: Add a loop waiting for connection to give up
+ * cache.
+ */
+ }
+ }
+ return (0);
+}
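As a worked example of a reduction pass: take a pool with chunk=10MB whose currently_used has reached its configured size, and an entry holding 50MB above its reserve whose read_pressure is below WT_CACHE_POOL_REDUCE_THRESHOLD (2) while there is eviction activity elsewhere (highest > 1). The entry is shrunk by a full chunk, 10MB, because that still leaves it above its reserve, and its cp_skip_count is set so it is left alone for the next WT_CACHE_POOL_REDUCE_SKIPS (5) balancing passes.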
+
+/*
+ * __wt_cache_pool_server --
+ * Thread to manage cache pool among connections.
+ */
+void *
+__wt_cache_pool_server(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)arg;
+
+ cp = __wt_process.cache_pool;
+ cache = S2C(session)->cache;
+
+ while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(cache, WT_CACHE_POOL_RUN)) {
+ if (cp->currently_used <= cp->size)
+ WT_ERR(__wt_cond_wait(session,
+ cp->cache_pool_cond, 1000000));
+
+ /*
+		 * Re-check the pool's run flags, since we want to avoid taking
+		 * the lock during shutdown.
+ */
+ if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(cache, WT_CACHE_POOL_RUN))
+ break;
+
+ /* Try to become the managing thread */
+ F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret);
+ if (ret == 0) {
+ F_SET(cache, WT_CACHE_POOL_MANAGER);
+ WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Cache pool switched manager thread"));
+ }
+
+ /*
+ * Continue even if there was an error. Details of errors are
+ * reported in the balance function.
+ */
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
+ (void)__cache_pool_balance(session);
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "cache pool manager server error");
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
new file mode 100644
index 00000000000..ab97d4ead46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_server_start(WT_CONNECTION_IMPL *);
+
+/*
+ * __ckpt_server_config --
+ * Parse and setup the checkpoint server options.
+ */
+static int
+__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ char *p;
+
+ conn = S2C(session);
+
+ /*
+ * The checkpoint configuration requires a wait time and/or a log
+ * size -- if one is not set, we're not running at all.
+ * Checkpoints based on log size also require logging be enabled.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
+ conn->ckpt_usecs = (long)cval.val * 1000000;
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
+ conn->ckpt_logsize = (wt_off_t)cval.val;
+ __wt_log_written_reset(session);
+ if ((conn->ckpt_usecs == 0 && conn->ckpt_logsize == 0) ||
+ (conn->ckpt_logsize && !conn->logging && conn->ckpt_usecs == 0)) {
+ *startp = 0;
+ return (0);
+ }
+ *startp = 1;
+
+ /*
+ * The application can specify a checkpoint name, which we ignore if
+ * it's our default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));
+ if (cval.len != 0 &&
+ !WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+ WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "name=%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strdup(session, tmp->data, &p));
+
+ __wt_free(session, conn->ckpt_config);
+ conn->ckpt_config = p;
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __ckpt_server --
+ * The checkpoint server thread.
+ */
+static void *
+__ckpt_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ wt_session = (WT_SESSION *)session;
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
+ /* Checkpoint the database. */
+ WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));
+
+ /* Reset. */
+ if (conn->ckpt_logsize) {
+ __wt_log_written_reset(session);
+ conn->ckpt_signalled = 0;
+ }
+ /*
+ * Wait...
+		 * NOTE: If the user only configured log_size, then usecs
+ * will be 0 and this wait won't return until signalled.
+ */
+ WT_ERR(
+ __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "checkpoint server error");
+ }
+ return (NULL);
+}
+
+/*
+ * __ckpt_server_start --
+ * Start the checkpoint server thread.
+ */
+static int
+__ckpt_server_start(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+
+ /* Nothing to do if the server is already running. */
+ if (conn->ckpt_session != NULL)
+ return (0);
+
+ F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
+ /* The checkpoint server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "checkpoint-server", 1, 1, &conn->ckpt_session));
+ session = conn->ckpt_session;
+
+ /*
+ * Checkpoint does enough I/O it may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ WT_RET(
+ __wt_cond_alloc(session, "checkpoint server", 0, &conn->ckpt_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->ckpt_tid, __ckpt_server, session));
+ conn->ckpt_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_checkpoint_server_create --
+ * Configure and start the checkpoint server.
+ */
+int
+__wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int start;
+
+ conn = S2C(session);
+ start = 0;
+
+ /* If there is already a server running, shut it down. */
+ if (conn->ckpt_session != NULL)
+ WT_RET(__wt_checkpoint_server_destroy(session));
+
+ WT_RET(__ckpt_server_config(session, cfg, &start));
+ if (start)
+ WT_RET(__ckpt_server_start(conn));
+
+ return (0);
+}
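An illustrative open that runs the server on both a timer and a log threshold; the non-default checkpoint name is optional, and log-size triggering requires logging be enabled:

	ret = wiredtiger_open(home, NULL,
	    "create,log=(enabled=true),"
	    "checkpoint=(wait=60,log_size=2GB,name=demo)", &conn);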
+
+/*
+ * __wt_checkpoint_server_destroy --
+ * Destroy the checkpoint server thread.
+ */
+int
+__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
+ if (conn->ckpt_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
+ WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
+ conn->ckpt_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));
+
+ __wt_free(session, conn->ckpt_config);
+
+ /* Close the server thread's session. */
+ if (conn->ckpt_session != NULL) {
+ wt_session = &conn->ckpt_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /*
+	 * Ensure checkpoint settings are cleared, so that a subsequent
+	 * reconfigure doesn't get confused.
+ */
+ conn->ckpt_session = NULL;
+ conn->ckpt_tid_set = 0;
+ conn->ckpt_cond = NULL;
+ conn->ckpt_config = NULL;
+ conn->ckpt_usecs = 0;
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint_signal --
+ * Signal the checkpoint thread if sufficient log has been written.
+ */
+int
+__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ WT_ASSERT(session, WT_CKPT_LOGSIZE(conn));
+ if (logsize >= conn->ckpt_logsize && !conn->ckpt_signalled) {
+ WT_RET(__wt_cond_signal(session, conn->ckpt_cond));
+ conn->ckpt_signalled = 1;
+ }
+ return (0);
+}
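+
+/*
+ * Illustration (editor's sketch, not part of this change): a hypothetical
+ * caller in the logging code, after bumping its running total of log bytes
+ * written, might wake the checkpoint server like so:
+ *
+ *	if (WT_CKPT_LOGSIZE(conn))
+ *		WT_RET(__wt_checkpoint_signal(session, log_bytes_written));
+ *
+ * where log_bytes_written is an invented name for the logging subsystem's
+ * counter. The ckpt_signalled flag above means the condition variable is
+ * signalled at most once per checkpoint; the server clears the flag after
+ * each checkpoint completes (the "Reset" step in __ckpt_server).
+ */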
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
new file mode 100644
index 00000000000..f4f540e33c7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __conn_dhandle_open_lock --
+ * Spin on the current data handle until either (a) it is open, read
+ * locked; or (b) it is closed, write locked. If exclusive access is
+ * requested and cannot be granted immediately, fail with EBUSY.
+ */
+static int
+__conn_dhandle_open_lock(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+
+ btree = dhandle->handle;
+
+ /*
+ * Check that the handle is open. We've already incremented
+ * the reference count, so once the handle is open it won't be
+ * closed by another thread.
+ *
+ * If we can see the WT_DHANDLE_OPEN flag set while holding a
+ * lock on the handle, then it's really open and we can start
+ * using it. Alternatively, if we can get an exclusive lock
+ * and WT_DHANDLE_OPEN is still not set, we need to do the open.
+ */
+ for (;;) {
+ if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE) &&
+ F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
+
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ return (0);
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ }
+
+ /*
+ * It isn't open or we want it exclusive: try to get an
+ * exclusive lock. There is some subtlety here: if we race
+ * with another thread that successfully opens the file, we
+ * don't want to block waiting to get exclusive access.
+ */
+ if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
+ /*
+ * If it was opened while we waited, drop the write
+ * lock and get a read lock instead.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ WT_RET(
+ __wt_writeunlock(session, dhandle->rwlock));
+ continue;
+ }
+
+ /* We have an exclusive lock, we're done. */
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ return (0);
+ } else if (ret != EBUSY || LF_ISSET(WT_DHANDLE_EXCLUSIVE))
+ return (EBUSY);
+
+ /* Give other threads a chance to make progress. */
+ __wt_yield();
+ }
+}
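+
+/*
+ * Editor's note (summary, not part of this change): the loop above has three
+ * possible outcomes, and callers must be ready for each:
+ *
+ *	returns 0, read lock held:	the handle is open, shared access;
+ *	returns 0, write lock held:	the handle is closed (or exclusive
+ *					access was requested) and
+ *					WT_DHANDLE_EXCLUSIVE is now set;
+ *	returns EBUSY:			exclusive access couldn't be granted
+ *					without blocking, no lock is held.
+ */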
+
+/*
+ * __conn_dhandle_get --
+ * Find an open btree file handle, otherwise create a new one, lock it
+ * exclusively, and return it linked into the connection's list.
+ */
+static int
+__conn_dhandle_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ uint64_t hash;
+
+ conn = S2C(session);
+
+ /* We must be holding the schema lock at a higher level. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ !LF_ISSET(WT_DHANDLE_HAVE_REF));
+
+ /* Increment the reference count if we already have the btree open. */
+ hash = __wt_hash_city64(name, strlen(name));
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if ((hash == dhandle->name_hash &&
+ strcmp(name, dhandle->name) == 0) &&
+ ((ckpt == NULL && dhandle->checkpoint == NULL) ||
+ (ckpt != NULL && dhandle->checkpoint != NULL &&
+ strcmp(ckpt, dhandle->checkpoint) == 0))) {
+ WT_RET(__conn_dhandle_open_lock(
+ session, dhandle, flags));
+ (void)WT_ATOMIC_ADD4(dhandle->session_ref, 1);
+ session->dhandle = dhandle;
+ return (0);
+ }
+
+ /*
+ * Allocate the data source handle and underlying btree handle, then
+ * initialize the data source handle. Exclusively lock the data
+ * source handle before inserting it in the list.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &dhandle));
+
+ WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
+ dhandle->session_ref = 1;
+
+ dhandle->name_hash = hash;
+ WT_ERR(__wt_strdup(session, name, &dhandle->name));
+ if (ckpt != NULL)
+ WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));
+
+ WT_ERR(__wt_calloc_def(session, 1, &btree));
+ dhandle->handle = btree;
+ btree->dhandle = dhandle;
+
+ WT_ERR(__wt_spin_init(
+ session, &dhandle->close_lock, "data handle close"));
+
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writelock(session, dhandle->rwlock));
+
+ /*
+ * Prepend the handle to the connection list, assuming we're likely to
+ * need new files again soon, until they are cached by all sessions.
+ *
+ * !!!
+ * We hold only the schema lock here, not the dhandle lock. Eviction
+ * walks this list only holding the dhandle lock. This works because
+ * we're inserting at the beginning of the list, and we're only
+ * publishing this one entry per lock acquisition. Eviction either
+ * sees our newly added entry or the former head of the list, and it
+ * doesn't matter which (if eviction only sees a single element in the
+ * list because the insert races, it will return without finding enough
+ * candidates for eviction, and will then retry).
+ */
+ SLIST_INSERT_HEAD(&conn->dhlh, dhandle, l);
+
+ session->dhandle = dhandle;
+ return (0);
+
+err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __wt_free(session, dhandle->handle); /* btree free */
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ return (ret);
+}
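+
+/*
+ * Illustration (editor's sketch, not part of this change): the list-search
+ * predicate above treats a data handle as a (name, checkpoint) pair, where
+ * checkpoint may be NULL for the live tree. Pulled out as a helper, the
+ * match test is roughly:
+ *
+ *	static inline int
+ *	__dhandle_match(WT_DATA_HANDLE *dhandle,
+ *	    uint64_t hash, const char *name, const char *ckpt)
+ *	{
+ *		if (hash != dhandle->name_hash ||
+ *		    strcmp(name, dhandle->name) != 0)
+ *			return (0);
+ *		if (ckpt == NULL || dhandle->checkpoint == NULL)
+ *			return (ckpt == dhandle->checkpoint);
+ *		return (strcmp(ckpt, dhandle->checkpoint) == 0);
+ *	}
+ *
+ * The hash comparison is only a fast path, the strcmp still decides.
+ */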
+
+/*
+ * __wt_conn_btree_sync_and_close --
+ * Sync and close the underlying btree handle.
+ */
+int
+__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ int no_schema_lock;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ return (0);
+
+ /*
+ * If we don't already have the schema lock, make it an error to try
+ * to acquire it. The problem is that we are holding an exclusive
+ * lock on the handle, and if we attempt to acquire the schema lock
+ * we might deadlock with a thread that has the schema lock and wants
+ * a handle lock (specifically, checkpoint).
+ */
+ no_schema_lock = 0;
+ if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ no_schema_lock = 1;
+ F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
+ }
+
+ /*
+ * We may not be holding the schema lock, and threads may be walking
+ * the list of open handles (for example, checkpoint). Acquire the
+ * handle's close lock.
+ */
+ __wt_spin_lock(session, &dhandle->close_lock);
+
+ /*
+	 * The close can fail if an update cannot be written; return the EBUSY
+ * error to our caller for eventual retry.
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_ERR(__wt_checkpoint_close(session, force));
+
+ if (dhandle->checkpoint == NULL)
+ --S2C(session)->open_btree_count;
+
+ WT_TRET(__wt_btree_close(session));
+ F_CLR(dhandle, WT_DHANDLE_OPEN);
+ F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+
+err: __wt_spin_unlock(session, &dhandle->close_lock);
+
+ if (no_schema_lock)
+ F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);
+
+ return (ret);
+}
+
+/*
+ * __conn_btree_config_clear --
+ * Clear the underlying object's configuration information.
+ */
+static void
+__conn_btree_config_clear(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ const char **a;
+
+ dhandle = session->dhandle;
+
+ if (dhandle->cfg == NULL)
+ return;
+ for (a = dhandle->cfg; *a != NULL; ++a)
+ __wt_free(session, *a);
+ __wt_free(session, dhandle->cfg);
+}
+
+/*
+ * __conn_btree_config_set --
+ * Set up a btree handle's configuration information.
+ */
+static int
+__conn_btree_config_set(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ const char *metaconf;
+
+ dhandle = session->dhandle;
+
+ /*
+ * Read the object's entry from the metadata file, we're done if we
+ * don't find one.
+ */
+ if ((ret =
+ __wt_metadata_search(session, dhandle->name, &metaconf)) != 0) {
+ if (ret == WT_NOTFOUND)
+ ret = ENOENT;
+ WT_RET(ret);
+ }
+
+ /*
+ * The defaults are included because underlying objects have persistent
+ * configuration information stored in the metadata file. If defaults
+ * are included in the configuration, we can add new configuration
+ * strings without upgrading the metadata file or writing special code
+ * in case a configuration string isn't initialized, as long as the new
+ * configuration string has an appropriate default value.
+ *
+ * The error handling is a little odd, but be careful: we're holding a
+ * chunk of allocated memory in metaconf. If we fail before we copy a
+ * reference to it into the object's configuration array, we must free
+	 * it; after the copy, we must not.
+ */
+ WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg));
+ WT_ERR(__wt_strdup(
+ session, WT_CONFIG_BASE(session, file_meta), &dhandle->cfg[0]));
+ dhandle->cfg[1] = metaconf;
+ return (0);
+
+err: __wt_free(session, metaconf);
+ return (ret);
+}
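+
+/*
+ * Illustration (editor's note, not part of this change): after a successful
+ * call the handle's configuration stack looks like:
+ *
+ *	dhandle->cfg[0]: the shipped "file_meta" defaults (copied);
+ *	dhandle->cfg[1]: this object's entry from the metadata file;
+ *	dhandle->cfg[2]: NULL terminator (from the zeroed allocation).
+ *
+ * Configuration lookups scan the array with later entries overriding earlier
+ * ones, which is how the metadata entry shadows the shipped defaults.
+ */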
+
+/*
+ * __conn_btree_open --
+ * Open the current btree handle.
+ */
+static int
+__conn_btree_open(
+ WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
+ !LF_ISSET(WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * If the handle is already open, it has to be closed so it can be
+ * reopened with a new configuration. We don't need to check again:
+ * this function isn't called if the handle is already open in the
+ * required mode.
+ *
+ * This call can return EBUSY if there's an update in the object that's
+ * not yet globally visible. That's not a problem because it can only
+ * happen when we're switching from a normal handle to a "special" one,
+ * so we're returning EBUSY to an attempt to verify or do other special
+ * operations. The reverse won't happen because when the handle from a
+ * verify or other special operation is closed, there won't be updates
+ * in the tree that can block the close.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_RET(__wt_conn_btree_sync_and_close(session, 0));
+
+ /* Discard any previous configuration, set up the new configuration. */
+ __conn_btree_config_clear(session);
+ WT_RET(__conn_btree_config_set(session));
+
+ /* Set any special flags on the handle. */
+ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
+
+ do {
+ WT_ERR(__wt_btree_open(session, op_cfg));
+ F_SET(dhandle, WT_DHANDLE_OPEN);
+ /*
+		 * Checkpoint handles are read-only, so it's better for
+		 * eviction calculations based on the number of btrees to
+		 * ignore them.
+ */
+ if (dhandle->checkpoint == NULL)
+ ++S2C(session)->open_btree_count;
+
+ /* Drop back to a readlock if that is all that was needed. */
+ if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
+ WT_ERR(
+ __conn_dhandle_open_lock(session, dhandle, flags));
+ }
+ } while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));
+
+ if (0) {
+err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+ /*
+ * If the open failed, close the handle. If there was no
+ * reference to the handle in this session, we incremented the
+ * session reference count, so decrement it here. Otherwise,
+ * just close the handle without decrementing.
+ */
+ if (!LF_ISSET(WT_DHANDLE_HAVE_REF))
+ __wt_conn_btree_close(session);
+ else if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_get --
+ * Get an open btree file handle, otherwise open a new one.
+ */
+int
+__wt_conn_btree_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ if (LF_ISSET(WT_DHANDLE_HAVE_REF))
+ WT_RET(
+ __conn_dhandle_open_lock(session, session->dhandle, flags));
+ else
+ WT_RET(__conn_dhandle_get(session, name, ckpt, flags));
+ dhandle = session->dhandle;
+
+ if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
+ (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
+ LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
+ if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ }
+
+ WT_ASSERT(session, ret != 0 ||
+ LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_apply --
+ * Apply a function to all open btree handles apart from the metadata
+ * file.
+ */
+int
+__wt_conn_btree_apply(WT_SESSION_IMPL *session,
+ int apply_checkpoints,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ WT_PREFIX_MATCH(dhandle->name, "file:") &&
+ (apply_checkpoints || dhandle->checkpoint == NULL) &&
+ !WT_IS_METADATA(dhandle)) {
+ /*
+ * We need to pull the handle into the session handle
+ * cache and make sure it's referenced to stop other
+			 * internal code dropping the handle (e.g., in LSM when
+ * cleaning up obsolete chunks). Holding the metadata
+ * lock isn't enough.
+ */
+ ret = __wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ if (WT_META_TRACKING(session))
+ WT_TRET(__wt_meta_track_handle_lock(
+ session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(
+ session));
+ } else if (ret == EBUSY)
+ ret = __wt_conn_btree_apply_single(
+ session, dhandle->name,
+ dhandle->checkpoint, func, cfg);
+ WT_RET(ret);
+ }
+
+ return (0);
+}
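+
+/*
+ * Illustration (editor's sketch, not part of this change): a minimal callback
+ * matching the func signature; it runs with session->dhandle set to the
+ * handle being visited:
+ *
+ *	static int
+ *	__handle_visit(WT_SESSION_IMPL *session, const char *cfg[])
+ *	{
+ *		WT_UNUSED(cfg);
+ *		printf("visiting %s\n", session->dhandle->name);
+ *		return (0);
+ *	}
+ *
+ * __statlog_apply in conn_stat.c (later in this change) is a real callback
+ * following the same pattern.
+ */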
+
+/*
+ * __wt_conn_btree_apply_single --
+ * Apply a function to a single btree handle that couldn't be locked
+ * (attempting to get the handle returned EBUSY).
+ */
+int
+__wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ saved_dhandle = session->dhandle;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if (strcmp(dhandle->name, uri) == 0 &&
+ ((dhandle->checkpoint == NULL && checkpoint == NULL) ||
+ (dhandle->checkpoint != NULL && checkpoint != NULL &&
+ strcmp(dhandle->checkpoint, checkpoint) == 0))) {
+ /*
+			 * We're holding the schema lock, which locks out handle
+			 * open (which might change the state of the underlying
+			 * object). However, closing a handle doesn't require
+			 * the schema lock, so lock out closing the handle and
+			 * then confirm the handle is still open.
+ */
+ __wt_spin_lock(session, &dhandle->close_lock);
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ session->dhandle = dhandle;
+ ret = func(session, cfg);
+ }
+ __wt_spin_unlock(session, &dhandle->close_lock);
+ WT_ERR(ret);
+ }
+
+err: session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_close --
+ * Discard a reference to an open btree file handle.
+ */
+void
+__wt_conn_btree_close(WT_SESSION_IMPL *session)
+{
+ (void)WT_ATOMIC_SUB4(session->dhandle->session_ref, 1);
+}
+
+/*
+ * __wt_conn_dhandle_close_all --
+ *	Close all data handles with a matching name (including all
+ * checkpoint handles).
+ */
+int
+__wt_conn_dhandle_close_all(
+ WT_SESSION_IMPL *session, const char *name, int force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (strcmp(dhandle->name, name) != 0)
+ continue;
+
+ session->dhandle = dhandle;
+
+ /* Lock the handle exclusively. */
+ WT_ERR(__wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint,
+ NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /*
+ * We have an exclusive lock, which means there are no cursors
+ * open at this point. Close the handle, if necessary.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if ((ret = __wt_meta_track_sub_on(session)) == 0)
+ ret = __wt_conn_btree_sync_and_close(
+ session, force);
+
+ /*
+ * If the close succeeded, drop any locks it acquired.
+ * If there was a failure, this function will fail and
+ * the whole transaction will be rolled back.
+ */
+ if (ret == 0)
+ ret = __wt_meta_track_sub_off(session);
+ }
+
+ if (!WT_META_TRACKING(session))
+ WT_TRET(__wt_session_release_btree(session));
+
+ WT_ERR(ret);
+ }
+
+err: session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard_single --
+ * Close/discard a single data handle.
+ */
+int
+__wt_conn_dhandle_discard_single(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *save_dhandle;
+ WT_DECL_RET;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+
+ save_dhandle = session->dhandle;
+ session->dhandle = dhandle;
+
+ /*
+ * We're called from the periodic sweep function and the final close;
+ * the former wants to continue if the handle is suddenly found to be
+	 * busy, while the latter wants to shut things down.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if (!final)
+ WT_ERR(EBUSY);
+ WT_ERR(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ /*
+ * Get the schema lock (required to remove entries from the data handle
+	 * list), then get the dhandle lock to block the eviction server from
+ * walking the list.
+ */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ /*
+ * If the eviction server is running, don't block waiting for it while
+ * holding the schema lock. The sweep server will try again.
+ */
+ if (final)
+ __wt_spin_lock(session, &conn->dhandle_lock);
+ else if ((ret =
+ __wt_spin_trylock(session, &conn->dhandle_lock, &id)) != 0)
+ goto unlock;
+
+ /*
+ * Check if the handle was reacquired by a session while we waited;
+ * this should only happen when called from the periodic sweep code, of
+ * course.
+ */
+ if (!final && dhandle->session_ref != 0)
+ ret = EBUSY;
+ else
+ SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l);
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+unlock: __wt_spin_unlock(session, &conn->schema_lock);
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+
+ /*
+ * After successfully removing the handle, clean it up.
+ */
+ if (ret == 0) {
+ WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __conn_btree_config_clear(session);
+ __wt_free(session, dhandle->handle);
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ }
+
+err: session->dhandle = save_dhandle;
+ WT_ASSERT(session, !final || ret == 0);
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard --
+ * Close/discard all data handles.
+ */
+int
+__wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /*
+ * Close open data handles: first, everything but the metadata file
+ * (as closing a normal file may open and write the metadata file),
+ * then the metadata file. This function isn't called often, and I
+ * don't want to "know" anything about the metadata file's position on
+ * the list, so we do it the hard way.
+ */
+restart:
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle))
+ continue;
+
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+ goto restart;
+ }
+
+ /*
+ * Closing the files may have resulted in entries on our default
+ * session's list of open data handles, specifically, we added the
+ * metadata file if any of the files were dirty. Clean up that list
+ * before we shut down the metadata entry, for good.
+ */
+ __wt_session_close_cache(session);
+ F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+ /* Close the metadata file handle. */
+ while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
new file mode 100644
index 00000000000..e4f0a6ddd73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_init --
+ * Structure initialization for a just-created WT_CONNECTION_IMPL handle.
+ */
+int
+__wt_connection_init(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = conn->default_session;
+
+ SLIST_INIT(&conn->dhlh); /* Data handle list */
+ TAILQ_INIT(&conn->dlhqh); /* Library list */
+ TAILQ_INIT(&conn->dsrcqh); /* Data source list */
+ TAILQ_INIT(&conn->fhqh); /* File list */
+ TAILQ_INIT(&conn->collqh); /* Collator list */
+ TAILQ_INIT(&conn->compqh); /* Compressor list */
+
+ TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */
+
+ /* Setup the LSM work queues. */
+ TAILQ_INIT(&conn->lsm_manager.switchqh);
+ TAILQ_INIT(&conn->lsm_manager.appqh);
+ TAILQ_INIT(&conn->lsm_manager.managerqh);
+
+ /* Configuration. */
+ WT_RET(__wt_conn_config_init(session));
+
+ /* Statistics. */
+ __wt_stat_init_connection_stats(&conn->stats);
+
+ /* Locks. */
+ WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
+ WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint"));
+ WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle"));
+ WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
+ WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
+ WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
+ WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock));
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ WT_RET(
+ __wt_spin_init(session, &conn->page_lock[i], "btree page"));
+
+ /* Setup the spin locks for the LSM manager queues. */
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.app_lock, "LSM application queue lock"));
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.manager_lock, "LSM manager queue lock"));
+ WT_RET(__wt_spin_init(
+ session, &conn->lsm_manager.switch_lock, "LSM switch queue lock"));
+ WT_RET(__wt_cond_alloc(
+ session, "LSM worker cond", 0, &conn->lsm_manager.work_cond));
+
+ /*
+ * Generation numbers.
+ *
+ * Start split generations at one. Threads publish this generation
+ * number before examining tree structures, and zero when they leave.
+ * We need to distinguish between threads that are in a tree before the
+ * first split has happened, and threads that are not in a tree.
+ */
+ conn->split_gen = 1;
+
+ /*
+ * Block manager.
+ * XXX
+ * If there's ever a second block manager, we'll want to make this
+ * more opaque, but for now this is simpler.
+ */
+ WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager"));
+ TAILQ_INIT(&conn->blockqh); /* Block manager list */
+
+ return (0);
+}
+
+/*
+ * __wt_connection_destroy --
+ * Destroy the connection's underlying WT_CONNECTION_IMPL structure.
+ */
+int
+__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ /* Check there's something to destroy. */
+ if (conn == NULL)
+ return (0);
+
+ session = conn->default_session;
+
+ /*
+	 * Close remaining open files (before discarding the mutex: the
+	 * underlying file-close code uses the mutex to guard lists of
+	 * open files).
+ */
+ if (conn->lock_fh != NULL)
+ WT_TRET(__wt_close(session, conn->lock_fh));
+
+ /* Remove from the list of connections. */
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ TAILQ_REMOVE(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+
+	__wt_conn_config_discard(session);	/* configuration */
+
+ __wt_conn_foc_discard(session); /* free-on-close */
+
+ __wt_spin_destroy(session, &conn->api_lock);
+ __wt_spin_destroy(session, &conn->block_lock);
+ __wt_spin_destroy(session, &conn->checkpoint_lock);
+ __wt_spin_destroy(session, &conn->dhandle_lock);
+ __wt_spin_destroy(session, &conn->fh_lock);
+ __wt_spin_destroy(session, &conn->hot_backup_lock);
+ __wt_spin_destroy(session, &conn->reconfig_lock);
+ __wt_spin_destroy(session, &conn->schema_lock);
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ __wt_spin_destroy(session, &conn->page_lock[i]);
+ __wt_free(session, conn->page_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, conn->cfg);
+ __wt_free(session, conn->home);
+ __wt_free(session, conn->error_prefix);
+ __wt_free(session, conn->sessions);
+
+ __wt_free(NULL, conn);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
new file mode 100644
index 00000000000..e516fdc68d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -0,0 +1,284 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __logmgr_sync_cfg --
+ * Interpret the transaction_sync config.
+ */
+static int
+__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ WT_RET(
+ __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval));
+ if (cval.val)
+ FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
+ else
+ FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH);
+
+ WT_RET(
+ __wt_config_gets(session, cfg, "transaction_sync.method", &cval));
+ FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
+ if (WT_STRING_MATCH("dsync", cval.str, cval.len))
+ FLD_SET(conn->txn_logsync, WT_LOG_DSYNC);
+ else if (WT_STRING_MATCH("fsync", cval.str, cval.len))
+ FLD_SET(conn->txn_logsync, WT_LOG_FSYNC);
+ return (0);
+}
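+
+/*
+ * Illustration (editor's note, not part of this change): given a connection
+ * configuration string such as
+ *
+ *	transaction_sync=(enabled=true,method=fsync)
+ *
+ * the code above sets WT_LOG_FLUSH and WT_LOG_FSYNC in conn->txn_logsync;
+ * "method=dsync" would set WT_LOG_DSYNC instead, and any other method value
+ * leaves both sync-method flags clear.
+ */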
+
+/*
+ * __logmgr_config --
+ * Parse and setup the logging server options.
+ */
+static int
+__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * The logging configuration is off by default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ *runp = cval.val != 0;
+ if (*runp == 0)
+ return (0);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
+ conn->archive = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
+ conn->log_file_max = (wt_off_t)cval.val;
+ WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path));
+
+ WT_RET(__logmgr_sync_cfg(session, cfg));
+ return (0);
+}
+
+/*
+ * __log_archive_server --
+ * The log archiving server thread.
+ */
+static void *
+__log_archive_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN lsn;
+ WT_SESSION_IMPL *session;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ session = arg;
+ conn = S2C(session);
+ log = conn->log;
+ logcount = 0;
+ logfiles = NULL;
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ /*
+ * If archiving is reconfigured and turned off, wait until it
+ * gets turned back on and check again. Don't wait forever: if
+ * a notification gets lost during close, we want to find out
+ * eventually.
+ */
+ if (conn->archive == 0 ||
+ __wt_try_writelock(session, log->log_archive_lock) != 0) {
+ if (conn->archive != 0) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_archive: Blocked due to open log "
+ "cursor holding archive lock"));
+ }
+ WT_ERR(
+ __wt_cond_wait(session, conn->arch_cond, 1000000));
+ continue;
+ }
+
+ lsn = log->ckpt_lsn;
+ lsn.offset = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_archive: ckpt LSN %" PRIu32 ",%" PRIu64,
+ lsn.file, lsn.offset));
+ /*
+ * Main archive code. Get the list of all log files and
+ * remove any earlier than the checkpoint LSN.
+ */
+ WT_ERR(__wt_dirlist(session, conn->log_path,
+ WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));
+
+ /*
+ * We can only archive files if a hot backup is not in progress.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ for (i = 0; i < logcount; i++) {
+ if (conn->hot_backup == 0) {
+ WT_ERR(__wt_log_extract_lognum(
+ session, logfiles[i], &lognum));
+ if (lognum < lsn.file)
+ WT_ERR(
+ __wt_log_remove(session, lognum));
+ }
+ }
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+		 * Indicate our new earliest LSN: the start of the log file
+		 * containing the last checkpoint.
+ */
+ log->first_lsn = lsn;
+ log->first_lsn.offset = 0;
+ WT_ERR(__wt_writeunlock(session, log->log_archive_lock));
+
+ /* Wait until the next event. */
+ WT_ERR(__wt_cond_wait(session, conn->arch_cond, 1000000));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "log archive server error");
+ }
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ return (NULL);
+}
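+
+/*
+ * Illustration (editor's note, not part of this change): suppose the log
+ * directory holds log files numbered 1 through 4 and the last checkpoint's
+ * LSN is in file 3. The loop above extracts each file's number and removes
+ * files 1 and 2 (lognum < lsn.file); files 3 and 4 survive because they may
+ * be needed to recover from that checkpoint.
+ */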
+
+/*
+ * __wt_logmgr_create --
+ * Start the log subsystem and archive server thread.
+ */
+int
+__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int run;
+
+ conn = S2C(session);
+
+ /* Handle configuration. */
+ WT_RET(__logmgr_config(session, cfg, &run));
+
+ /* If logging is not configured, we're done. */
+ if (!run)
+ return (0);
+
+ conn->logging = 1;
+ /*
+	 * Logging is on: allocate the WT_LOG structure and open the log file.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log));
+ log = conn->log;
+ WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
+ WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
+ WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_rwlock_alloc(session,
+ &log->log_archive_lock, "log archive lock"));
+ if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
+ log->allocsize =
+ WT_MAX((uint32_t)conn->buffer_alignment, LOG_ALIGN);
+ else
+ log->allocsize = LOG_ALIGN;
+ INIT_LSN(&log->alloc_lsn);
+ INIT_LSN(&log->ckpt_lsn);
+ INIT_LSN(&log->first_lsn);
+ INIT_LSN(&log->sync_lsn);
+ INIT_LSN(&log->trunc_lsn);
+ INIT_LSN(&log->write_lsn);
+ log->fileid = 0;
+ WT_RET(__wt_cond_alloc(session, "log sync", 0, &log->log_sync_cond));
+ WT_RET(__wt_log_open(session));
+ WT_RET(__wt_log_slot_init(session));
+
+ /* If archiving is not configured, we're done. */
+ if (!conn->archive)
+ return (0);
+
+ /*
+	 * If an archive thread already exists, the user may have reconfigured
+	 * archiving; signal the thread. Otherwise, the user wants archiving
+	 * and we need to start up the thread.
+ */
+ if (conn->arch_session != NULL) {
+ WT_ASSERT(session, conn->arch_cond != NULL);
+ WT_ASSERT(session, conn->arch_tid_set != 0);
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ } else {
+ /* The log archive server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "archive-server", 0, 0, &conn->arch_session));
+ WT_RET(__wt_cond_alloc(conn->arch_session,
+ "log archiving server", 0, &conn->arch_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(conn->arch_session,
+ &conn->arch_tid, __log_archive_server, conn->arch_session));
+ conn->arch_tid_set = 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_logmgr_destroy --
+ * Destroy the log archiving server thread and logging subsystem.
+ */
+int
+__wt_logmgr_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+ if (conn->arch_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->arch_cond));
+ WT_TRET(__wt_thread_join(session, conn->arch_tid));
+ conn->arch_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->arch_cond));
+
+ WT_TRET(__wt_log_close(session));
+
+ __wt_free(session, conn->log_path);
+
+ /* Close the server thread's session. */
+ if (conn->arch_session != NULL) {
+ wt_session = &conn->arch_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ conn->arch_session = NULL;
+ }
+
+ WT_TRET(__wt_log_slot_destroy(session));
+ WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
+ WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
+ __wt_spin_destroy(session, &conn->log->log_lock);
+ __wt_spin_destroy(session, &conn->log->log_slot_lock);
+ __wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_free(session, conn->log);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
new file mode 100644
index 00000000000..41fc9809521
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_open --
+ * Open a connection.
+ */
+int
+__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ /* Default session. */
+ session = conn->default_session;
+ WT_ASSERT(session, session->iface.connection == &conn->iface);
+
+ /*
+ * Tell internal server threads to run: this must be set before opening
+ * any sessions.
+ */
+ F_SET(conn, WT_CONN_SERVER_RUN);
+
+ /* WT_SESSION_IMPL array. */
+ WT_RET(__wt_calloc(session,
+ conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
+
+ /*
+ * Open the default session. We open this before starting service
+ * threads because those may allocate and use session resources that
+ * need to get cleaned up on close.
+ */
+ WT_RET(__wt_open_internal_session(conn, "connection", 1, 0, &session));
+
+ /*
+ * The connection's default session is originally a static structure,
+ * swap that out for a more fully-functional session. It's necessary
+ * to have this step: the session allocation code uses the connection's
+ * session, and if we pass a reference to the default session as the
+ * place to store the allocated session, things get confused and error
+ * handling can be corrupted. So, we allocate into a stack variable
+ * and then assign it on success.
+ */
+ conn->default_session = session;
+
+ /*
+ * Publish: there must be a barrier to ensure the connection structure
+ * fields are set before other threads read from the pointer.
+ */
+ WT_WRITE_BARRIER();
+
+ /* Connect to a cache pool. */
+ WT_RET(__wt_cache_pool_config(session, cfg));
+
+ /* Create the cache. */
+ WT_RET(__wt_cache_create(session, cfg));
+
+ /* Initialize transaction support. */
+ WT_RET(__wt_txn_global_init(session, cfg));
+
+ return (0);
+}
+
+/*
+ * __wt_connection_close --
+ * Close a connection handle.
+ */
+int
+__wt_connection_close(WT_CONNECTION_IMPL *conn)
+{
+ WT_CONNECTION *wt_conn;
+ WT_DECL_RET;
+ WT_DLH *dlh;
+ WT_FH *fh;
+ WT_SESSION_IMPL *s, *session;
+ WT_TXN_GLOBAL *txn_global;
+ u_int i;
+
+ wt_conn = &conn->iface;
+ txn_global = &conn->txn_global;
+ session = conn->default_session;
+
+ /*
+ * We're shutting down. Make sure everything gets freed.
+ *
+ * It's possible that the eviction server is in the middle of a long
+ * operation, with a transaction ID pinned. In that case, we will loop
+ * here until the transaction ID is released, when the oldest
+ * transaction ID will catch up with the current ID.
+ */
+ for (;;) {
+ __wt_txn_update_oldest(session);
+ if (txn_global->oldest_id == txn_global->current)
+ break;
+ __wt_yield();
+ }
+
+ /* Clear any pending async ops. */
+ WT_TRET(__wt_async_flush(session));
+
+ /*
+ * Shut down server threads other than the eviction server, which is
+ * needed later to close btree handles. Some of these threads access
+ * btree handles, so take care in ordering shutdown to make sure they
+ * exit before files are closed.
+ */
+ F_CLR(conn, WT_CONN_SERVER_RUN);
+ WT_TRET(__wt_async_destroy(session));
+ WT_TRET(__wt_lsm_manager_destroy(session));
+ WT_TRET(__wt_checkpoint_server_destroy(session));
+ WT_TRET(__wt_statlog_destroy(session, 1));
+ WT_TRET(__wt_sweep_destroy(session));
+
+ /* Close open data handles. */
+ WT_TRET(__wt_conn_dhandle_discard(session));
+
+ /*
+ * Now that all data handles are closed, tell logging that a checkpoint
+ * has completed then shut down the log manager (only after closing
+ * data handles).
+ */
+ if (conn->logging) {
+ WT_TRET(__wt_txn_checkpoint_log(
+ session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+ WT_TRET(__wt_logmgr_destroy(session));
+ }
+
+ /* Free memory for collators, compressors, data sources. */
+ WT_TRET(__wt_conn_remove_collator(session));
+ WT_TRET(__wt_conn_remove_compressor(session));
+ WT_TRET(__wt_conn_remove_data_source(session));
+
+ /*
+	 * Complain if files weren't closed, ignoring the lock file; we'll
+ * close it in a minute.
+ */
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (fh == conn->lock_fh)
+ continue;
+
+ __wt_errx(session,
+ "Connection has open file handles: %s", fh->name);
+ WT_TRET(__wt_close(session, fh));
+ fh = TAILQ_FIRST(&conn->fhqh);
+ }
+
+ /* Shut down the eviction server thread. */
+ WT_TRET(__wt_evict_destroy(session));
+
+ /* Disconnect from shared cache - must be before cache destroy. */
+ WT_TRET(__wt_conn_cache_pool_destroy(session));
+
+ /* Discard the cache. */
+ WT_TRET(__wt_cache_destroy(session));
+
+ /* Discard transaction state. */
+ __wt_txn_global_destroy(session);
+
+ /* Close extensions, first calling any unload entry point. */
+ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
+ TAILQ_REMOVE(&conn->dlhqh, dlh, q);
+
+ if (dlh->terminate != NULL)
+ WT_TRET(dlh->terminate(wt_conn));
+ WT_TRET(__wt_dlclose(session, dlh));
+ }
+
+ /*
+ * Close the internal (default) session, and switch back to the dummy
+ * session in case of any error messages from the remaining operations
+ * while destroying the connection handle.
+ */
+ if (session != &conn->dummy_session) {
+ WT_TRET(session->iface.close(&session->iface, NULL));
+ session = conn->default_session = &conn->dummy_session;
+ }
+
+ /*
+ * The session's split stash isn't discarded during normal session close
+ * because it may persist past the life of the session. Discard it now.
+ */
+ if ((s = conn->sessions) != NULL)
+ for (i = 0; i < conn->session_size; ++s, ++i)
+ __wt_split_stash_discard_all(session, s);
+
+ /*
+ * The session's hazard pointer memory isn't discarded during normal
+ * session close because access to it isn't serialized. Discard it
+ * now.
+ */
+ if ((s = conn->sessions) != NULL)
+ for (i = 0; i < conn->session_size; ++s, ++i)
+ if (s != session)
+ __wt_free(session, s->hazard);
+
+ /* Destroy the handle. */
+ WT_TRET(__wt_connection_destroy(conn));
+
+ return (ret);
+}
+
+/*
+ * __wt_connection_workers --
+ * Start the worker threads.
+ */
+int
+__wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ /*
+ * Start the eviction thread.
+ */
+ WT_RET(__wt_evict_create(session));
+
+ /*
+ * Start the handle sweep thread.
+ */
+ WT_RET(__wt_sweep_create(session));
+
+ /*
+ * Start the optional statistics thread. Start statistics first so that
+ * other optional threads can know if statistics are enabled or not.
+ */
+ WT_RET(__wt_statlog_create(session, cfg));
+
+ /* Start the optional async threads. */
+ WT_RET(__wt_async_create(session, cfg));
+
+ /*
+ * Start the optional logging/archive thread.
+ * NOTE: The log manager must be started before checkpoints so that the
+ * checkpoint server knows if logging is enabled.
+ */
+ WT_RET(__wt_logmgr_create(session, cfg));
+
+ /* Start the optional checkpoint thread. */
+ WT_RET(__wt_checkpoint_server_create(session, cfg));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
new file mode 100644
index 00000000000..f7229504898
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __GNUC__
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1)
+/*
+ * !!!
+ * GCC with -Wformat-nonliteral complains about calls to strftime in this file.
+ * There's nothing wrong, this makes the warning go away.
+ */
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+#endif
+#endif
+
+/*
+ * __stat_sources_free --
+ * Free the array of statistics sources.
+ */
+static void
+__stat_sources_free(WT_SESSION_IMPL *session, char ***sources)
+{
+ char **p;
+
+ if ((p = (*sources)) != NULL) {
+ for (; *p != NULL; ++p)
+ __wt_free(session, *p);
+ __wt_free(session, *sources);
+ }
+}
+
+/*
+ * __wt_conn_stat_init --
+ * Initialize the per-connection statistics.
+ */
+void
+__wt_conn_stat_init(WT_SESSION_IMPL *session)
+{
+ __wt_async_stats_update(session);
+ __wt_cache_stats_update(session);
+ __wt_txn_stats_update(session);
+}
+
+/*
+ * __statlog_config --
+ * Parse and setup the statistics server options.
+ */
+static int
+__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG objectconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ int cnt;
+ char **sources;
+
+ conn = S2C(session);
+ sources = NULL;
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
+	/* Only start the server if the wait time is non-zero. */
+ *runp = (cval.val == 0) ? 0 : 1;
+ conn->stat_usecs = (long)cval.val * 1000000;
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "statistics_log.on_close", &cval));
+ if (cval.val != 0)
+ FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE);
+
+ /*
+ * Statistics logging configuration requires either a wait time or an
+ * on-close setting.
+ */
+ if (*runp == 0 && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+ return (0);
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval));
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+ if (cnt != 0) {
+ WT_RET(__wt_calloc_def(session, cnt + 1, &sources));
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0;
+ (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) {
+ /*
+ * XXX
+ * Only allow "file:" and "lsm:" for now: "file:" works
+ * because it's been converted to data handles, "lsm:"
+ * works because we can easily walk the list of open LSM
+ * objects, even though it hasn't been converted.
+ */
+ if (!WT_PREFIX_MATCH(k.str, "file:") &&
+ !WT_PREFIX_MATCH(k.str, "lsm:"))
+ WT_ERR_MSG(session, EINVAL,
+ "statistics_log sources configuration only "
+ "supports objects of type \"file\" or "
+ "\"lsm\"");
+ WT_ERR(
+ __wt_strndup(session, k.str, k.len, &sources[cnt]));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ conn->stat_sources = sources;
+ sources = NULL;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval));
+ WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path));
+
+ WT_ERR(__wt_config_gets(
+ session, cfg, "statistics_log.timestamp", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format));
+
+err: __stat_sources_free(session, &sources);
+ return (ret);
+}
+
+/*
+ * __statlog_dump --
+ * Dump out handle/connection statistics.
+ */
+static int
+__statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_STATS *stats;
+ u_int i;
+ uint64_t max;
+ const char *uri;
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor), NULL };
+
+ conn = S2C(session);
+
+ /* Build URI and configuration string. */
+ if (conn_stats)
+ uri = "statistics:";
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name));
+ uri = tmp->data;
+ }
+
+ /*
+ * Open the statistics cursor and dump the statistics.
+ *
+	 * If we don't find an underlying object, silently ignore it; the
+	 * object may exist only intermittently.
+ */
+ switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
+ case 0:
+ max = conn_stats ?
+ sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) :
+ sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+ for (i = 0,
+ stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats)
+ WT_ERR_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s %s\n",
+ conn->stat_stamp,
+ stats->v, name, stats->desc) < 0), __wt_errno());
+ WT_ERR(cursor->close(cursor));
+ break;
+ case EBUSY:
+ case ENOENT:
+ case WT_NOTFOUND:
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
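+
+/*
+ * Illustration (editor's note, not part of this change): each statistic is
+ * printed as one line, "<timestamp> <value> <object name> <description>",
+ * for example (values invented):
+ *
+ *	Mar 01 12:00:00 4096 file:example.wt btree: overflow pages
+ *
+ * so the output is trivially filtered by object name or description.
+ */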
+
+/*
+ * __statlog_apply --
+ * Review a single open handle and dump statistics on demand.
+ */
+static int
+__statlog_apply(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *dhandle;
+ char **p;
+
+ WT_UNUSED(cfg);
+
+ dhandle = session->dhandle;
+
+ /* Check for a match on the set of sources. */
+ for (p = S2C(session)->stat_sources; *p != NULL; ++p)
+ if (WT_PREFIX_MATCH(dhandle->name, *p))
+ return (__statlog_dump(session, dhandle->name, 0));
+ return (0);
+}
+
+/*
+ * __statlog_lsm_apply --
+ *	Review the list of open LSM trees, and dump statistics on demand.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to data handles.
+ */
+static int
+__statlog_lsm_apply(WT_SESSION_IMPL *session)
+{
+#define WT_LSM_TREE_LIST_SLOTS 100
+ WT_LSM_TREE *lsm_tree, *list[WT_LSM_TREE_LIST_SLOTS];
+ WT_DECL_RET;
+ int cnt, locked;
+ char **p;
+
+ cnt = locked = 0;
+
+ /*
+ * Walk the list of LSM trees, checking for a match on the set of
+ * sources.
+ *
+ * XXX
+ * We can't hold the schema lock for the traversal because the LSM
+ * statistics code acquires the tree lock, and the LSM cursor code
+	 * acquires the tree lock and then acquires the schema lock; it's a
+ * classic deadlock. This is temporary code so I'm not going to do
+ * anything fancy.
+ * It is OK to not keep holding the schema lock after populating
+ * the list of matching LSM trees, since the __wt_lsm_tree_get call
+ * will bump a reference count, so the tree won't go away.
+ */
+ __wt_spin_lock(session, &S2C(session)->schema_lock);
+ locked = 1;
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
+ if (cnt == WT_LSM_TREE_LIST_SLOTS)
+ break;
+ for (p = S2C(session)->stat_sources; *p != NULL; ++p)
+ if (WT_PREFIX_MATCH(lsm_tree->name, *p)) {
+ WT_ERR(__wt_lsm_tree_get(
+ session, lsm_tree->name, 0, &list[cnt++]));
+ break;
+ }
+ }
+ __wt_spin_unlock(session, &S2C(session)->schema_lock);
+ locked = 0;
+
+ while (cnt > 0) {
+ --cnt;
+ WT_TRET(__statlog_dump(session, list[cnt]->name, 0));
+ __wt_lsm_tree_release(session, list[cnt]);
+ }
+
+err: if (locked)
+ __wt_spin_unlock(session, &S2C(session)->schema_lock);
+ /* Release any LSM trees on error. */
+ while (cnt > 0) {
+ --cnt;
+ __wt_lsm_tree_release(session, list[cnt]);
+ }
+ return (ret);
+}
+
+/*
+ * __statlog_log_one --
+ * Output a set of statistics into the current log file.
+ */
+static int
+__statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
+{
+ FILE *log_file;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ struct timespec ts;
+ struct tm *tm, _tm;
+
+ conn = S2C(session);
+
+ /* Get the current local time of day. */
+ WT_RET(__wt_epoch(session, &ts));
+ tm = localtime_r(&ts.tv_sec, &_tm);
+
+ /* Create the logging path name for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_path, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime path conversion");
+
+ /* If the path has changed, cycle the log file. */
+ if ((log_file = conn->stat_fp) == NULL ||
+ path == NULL || strcmp(tmp->mem, path->mem) != 0) {
+ conn->stat_fp = NULL;
+ if (log_file != NULL)
+ WT_RET(fclose(log_file) == 0 ? 0 : __wt_errno());
+
+ if (path != NULL)
+ (void)strcpy(path->mem, tmp->mem);
+ WT_RET_TEST((log_file =
+ fopen(tmp->mem, "a")) == NULL, __wt_errno());
+ }
+ conn->stat_fp = log_file;
+
+ /* Create the entry prefix for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime timestamp conversion");
+ conn->stat_stamp = tmp->mem;
+
+ /* Dump the connection statistics. */
+ WT_RET(__statlog_dump(session, conn->home, 1));
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /* Dump the spinlock statistics. */
+ WT_RET(__wt_statlog_dump_spinlock(conn, conn->home));
+#endif
+
+ /*
+ * Lock the schema and walk the list of open handles, dumping
+ * any that match the list of object sources.
+ */
+ if (conn->stat_sources != NULL) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_apply(session, 0, __statlog_apply, NULL));
+ WT_RET(ret);
+ }
+
+ /*
+ * Walk the list of open LSM trees, dumping any that match the
+	 * list of object sources.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to
+ * data handles.
+ */
+ if (conn->stat_sources != NULL)
+ WT_RET(__statlog_lsm_apply(session));
+
+ /* Flush. */
+ WT_RET(fflush(conn->stat_fp) == 0 ? 0 : __wt_errno());
+
+ return (0);
+}
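+
+/*
+ * Illustration (editor's note, not part of this change): with a path template
+ * of, say, "WiredTigerStat.%d.%H", strftime expands to one file name per
+ * day-of-month and hour (for example "WiredTigerStat.01.12"). When the clock
+ * rolls into the next hour the expansion changes, the strcmp above notices,
+ * and the old file is closed and a new one opened, giving periodic log
+ * rotation without any explicit scheduling.
+ */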
+
+/*
+ * __wt_statlog_log_one --
+ * Log a set of statistics into the configured statistics log. Requires
+ * that the server is not currently running.
+ */
+int
+__wt_statlog_log_one(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+
+ conn = S2C(session);
+
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+ return (0);
+
+ if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+ WT_RET_MSG(session, EINVAL,
+ "Attempt to log statistics while a server is running");
+
+ WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp));
+ WT_ERR(__statlog_log_one(session, NULL, tmp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __statlog_server --
+ * The statistics server thread.
+ */
+static void *
+__statlog_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM path, tmp;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+
+ WT_CLEAR(path);
+ WT_CLEAR(tmp);
+
+ /*
+ * We need a temporary place to build a path and an entry prefix.
+ * The length of the path plus 128 should be more than enough.
+ *
+ * We also need a place to store the current path, because that's
+ * how we know when to close/re-open the file.
+ */
+ WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
+ WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ WT_ERR(__statlog_log_one(session, &path, &tmp));
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "statistics log server error");
+ }
+ __wt_buf_free(session, &path);
+ __wt_buf_free(session, &tmp);
+ return (NULL);
+}
+
+/*
+ * __statlog_start --
+ * Start the statistics server thread.
+ */
+static int
+__statlog_start(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+
+ /* Nothing to do if the server is already running. */
+ if (conn->stat_session != NULL)
+ return (0);
+
+ F_SET(conn, WT_CONN_SERVER_STATISTICS);
+ /* The statistics log server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "statlog-server", 1, 1, &conn->stat_session));
+ session = conn->stat_session;
+
+ WT_RET(__wt_cond_alloc(
+ session, "statistics log server", 0, &conn->stat_cond));
+
+ /*
+ * Start the thread.
+ *
+ * Statistics logging creates a thread per database, rather than using
+ * a single thread to do logging for all of the databases. If we ever
+	 * see lots of databases doing statistics logging at the same time and
+	 * want to reduce the number of threads, there's no reason we have to
+	 * have more than one thread; I just didn't feel like writing the code
+ * to figure out the scheduling.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->stat_tid, __statlog_server, session));
+ conn->stat_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_create --
+ *	Configure and start the statistics server thread.
+ */
+int
+__wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int start;
+
+ conn = S2C(session);
+ start = 0;
+
+ /*
+ * Stop any server that is already running. This means that each time
+ * reconfigure is called we'll bounce the server even if there are no
+	 * configuration changes, but that makes our lives easier.
+ */
+ if (conn->stat_session != NULL)
+ WT_RET(__wt_statlog_destroy(session, 0));
+
+ WT_RET(__statlog_config(session, cfg, &start));
+ if (start)
+ WT_RET(__statlog_start(conn));
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_destroy --
+ * Destroy the statistics server thread.
+ */
+int
+__wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_STATISTICS);
+ if (conn->stat_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->stat_cond));
+ WT_TRET(__wt_thread_join(session, conn->stat_tid));
+ conn->stat_tid_set = 0;
+ }
+
+ /* Log a set of statistics on shutdown if configured. */
+ if (is_close)
+ WT_TRET(__wt_statlog_log_one(session));
+
+ WT_TRET(__wt_cond_destroy(session, &conn->stat_cond));
+
+ __stat_sources_free(session, &conn->stat_sources);
+ __wt_free(session, conn->stat_path);
+ __wt_free(session, conn->stat_format);
+
+ /* Close the server thread's session. */
+ if (conn->stat_session != NULL) {
+ wt_session = &conn->stat_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /* Clear connection settings so reconfigure is reliable. */
+ conn->stat_session = NULL;
+ conn->stat_tid_set = 0;
+ conn->stat_format = NULL;
+ if (conn->stat_fp != NULL) {
+ WT_TRET(fclose(conn->stat_fp) == 0 ? 0 : __wt_errno());
+ conn->stat_fp = NULL;
+ }
+ conn->stat_path = NULL;
+ conn->stat_sources = NULL;
+ conn->stat_stamp = NULL;
+ conn->stat_usecs = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
new file mode 100644
index 00000000000..3bccc5814be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -0,0 +1,187 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sweep --
+ * Close unused dhandles on the connection dhandle list.
+ */
+static int
+__sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *dhandle_next;
+ WT_DECL_RET;
+ time_t now;
+
+ conn = S2C(session);
+
+ /*
+	 * Sessions cache handles unless the session itself is closed, at which
+ * time the handle reference counts are immediately decremented. Don't
+ * discard handles that have been open recently.
+ */
+ WT_RET(__wt_seconds(session, &now));
+
+ dhandle = SLIST_FIRST(&conn->dhlh);
+ for (; dhandle != NULL; dhandle = dhandle_next) {
+ dhandle_next = SLIST_NEXT(dhandle, l);
+ if (dhandle->session_ref != 0 ||
+ now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
+ continue;
+
+ /*
+ * We have a candidate for closing; if it's open, flush dirty
+ * leaf pages, then acquire an exclusive lock on the handle
+ * and close it. We might be blocking opens for a long time
+		 * (over disk I/O), but the handle was quiescent for a while.
+ *
+ * The close can fail if an update cannot be written (updates in
+ * a no-longer-referenced file might not yet be globally visible
+ * if sessions have disjoint sets of files open). If the handle
+		 * is busy, skip it; we'll retry the close next time, after
+ * the transaction state has progressed.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(
+ session, NULL, WT_SYNC_WRITE_LEAVES));
+ WT_RET(ret);
+
+ /*
+			 * We deliberately don't set WT_DHANDLE_EXCLUSIVE: we
+ * want opens to block on us rather than returning an
+ * EBUSY error to the application.
+ */
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret == EBUSY) {
+ ret = 0;
+ continue;
+ }
+ WT_RET(ret);
+
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_conn_btree_sync_and_close(session, 0));
+ if (ret == EBUSY)
+ ret = 0;
+
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_RET(ret);
+ }
+
+ /*
+		 * Attempt to discard the handle. The called function checks
+		 * the handle-open flag after acquiring the appropriate locks,
+		 * which is why we don't do any special handling of EBUSY
+		 * returns above: that path never cleared the handle-open flag.
+ */
+ ret = __wt_conn_dhandle_discard_single(session, dhandle, 0);
+ if (ret == EBUSY)
+ ret = 0;
+ WT_RET(ret);
+ }
+ return (0);
+}
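
The eligibility test above reduces to two conditions: no session holds a reference, and the handle has been idle longer than the sweep wait. A standalone sketch of that test; SWEEP_WAIT_SECS and the parameter names are illustrative, not the WiredTiger symbols:

	#include <time.h>

	#define	SWEEP_WAIT_SECS	30	/* illustrative idle threshold */

	/*
	 * Return non-zero if an unreferenced, long-idle handle is a
	 * candidate for closing.
	 */
	static int
	handle_expired(time_t now, time_t timeofdeath, unsigned session_ref)
	{
		return (session_ref == 0 &&
		    now - timeofdeath > SWEEP_WAIT_SECS);
	}
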
+
+/*
+ * __sweep_server --
+ * The handle sweep server thread.
+ */
+static void *
+__sweep_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+
+ /*
+ * Sweep for dead handles.
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION));
+
+ /* Sweep the handles. */
+ WT_ERR(__sweep(session));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "handle sweep server error");
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_sweep_create --
+ * Start the handle sweep thread.
+ */
+int
+__wt_sweep_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_SERVER_SWEEP);
+
+ WT_RET(__wt_open_internal_session(
+ conn, "sweep-server", 1, 1, &conn->sweep_session));
+ session = conn->sweep_session;
+
+ /*
+	 * Handle sweep does enough I/O that it may be called upon to perform
+	 * slow operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ WT_RET(__wt_cond_alloc(
+ session, "handle sweep server", 0, &conn->sweep_cond));
+
+ WT_RET(__wt_thread_create(
+ session, &conn->sweep_tid, __sweep_server, session));
+ conn->sweep_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_sweep_destroy --
+ * Destroy the handle-sweep thread.
+ */
+int
+__wt_sweep_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_SWEEP);
+ if (conn->sweep_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->sweep_cond));
+ WT_TRET(__wt_thread_join(session, conn->sweep_tid));
+ conn->sweep_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));
+
+ if (conn->sweep_session != NULL) {
+ wt_session = &conn->sweep_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ conn->sweep_session = NULL;
+ }
+ return (ret);
+}
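
The destroy path uses the same shutdown handshake as the statistics server above: clear the run flag so the loop condition fails, signal the condition so a sleeping server re-tests it, then join. A generic pthreads sketch of that ordering, with illustrative names:

	#include <pthread.h>

	static volatile int server_running = 1;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

	static void
	server_stop(pthread_t tid)
	{
		pthread_mutex_lock(&lock);
		server_running = 0;		/* loop test now fails */
		pthread_cond_signal(&cond);	/* wake a sleeping server */
		pthread_mutex_unlock(&lock);
		(void)pthread_join(tid, NULL);	/* wait for it to exit */
	}
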
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
new file mode 100644
index 00000000000..85a85521213
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_remove(WT_SESSION_IMPL *);
+static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]);
+static int __backup_list_append(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *);
+static int __backup_start(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]);
+static int __backup_stop(WT_SESSION_IMPL *);
+static int __backup_uri(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], int *);
+
+/*
+ * __curbackup_next --
+ * WT_CURSOR->next method for the backup cursor type.
+ */
+static int
+__curbackup_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ if (cb->list == NULL || cb->list[cb->next].name == NULL) {
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+
+ cb->iface.key.data = cb->list[cb->next].name;
+ cb->iface.key.size = strlen(cb->list[cb->next].name) + 1;
+ ++cb->next;
+
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_reset --
+ * WT_CURSOR->reset method for the backup cursor type.
+ */
+static int
+__curbackup_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ cb->next = 0;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_close --
+ * WT_CURSOR->close method for the backup cursor type.
+ */
+static int
+__curbackup_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int tret;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ WT_TRET(__backup_cleanup_handles(session, cb));
+ WT_TRET(__wt_cursor_close(cursor));
+ session->bkp_cursor = NULL;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __backup_stop(session)); /* Stop the backup. */
+ WT_TRET(tret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbackup_open --
+ * WT_SESSION->open_cursor method for the backup cursor type.
+ */
+int
+__wt_curbackup_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_notsup, /* get-value */
+ __wt_cursor_notsup, /* set-key */
+ __wt_cursor_notsup, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curbackup_next, /* next */
+ __wt_cursor_notsup, /* prev */
+ __curbackup_reset, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curbackup_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);
+
+ cb = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &cb));
+ cursor = &cb->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ session->bkp_cursor = cb;
+
+ cursor->key_format = "S"; /* Return the file names as the key. */
+ cursor->value_format = ""; /* No value. */
+
+ /*
+ * Start the backup and fill in the cursor's list. Acquire the schema
+	 * lock; we need a consistent view when creating a copy.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg));
+ WT_ERR(ret);
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cb);
+ }
+
+ return (ret);
+}
+
+/*
+ * __backup_start --
+ * Start a backup.
+ */
+static int
+__backup_start(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ u_int i, logcount;
+ int exist, target_list;
+ char **logfiles;
+
+ conn = S2C(session);
+
+ cb->next = 0;
+ cb->list = NULL;
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+ * Single thread hot backups: we're holding the schema lock, so we
+ * know we'll serialize with other attempts to start a hot backup.
+ */
+ if (conn->hot_backup)
+ WT_RET_MSG(
+ session, EINVAL, "there is already a backup cursor open");
+
+ /*
+ * The hot backup copy is done outside of WiredTiger, which means file
+ * blocks can't be freed and re-allocated until the backup completes.
+ * The checkpoint code checks the backup flag, and if a backup cursor
+	 * is open, checkpoints aren't discarded. We release the lock as soon
+	 * as we've set the flag; we don't want to block checkpoints, just
+	 * to make sure no checkpoints are deleted. The checkpoint code
+ * holds the lock until it's finished the checkpoint, otherwise we
+ * could start a hot backup that would race with an already-started
+ * checkpoint.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ conn->hot_backup = 1;
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ /* Create the hot backup file. */
+ WT_ERR(__backup_file_create(session, cb));
+
+ /*
+ * If a list of targets was specified, work our way through them.
+ * Else, generate a list of all database objects.
+ *
+ * Include log files if doing a full backup, and copy them before
+ * copying data files to avoid rolling the metadata forward across
+ * a checkpoint that completes during the backup.
+ */
+ target_list = 0;
+ WT_ERR(__backup_uri(session, cb, cfg, &target_list));
+ if (!target_list) {
+ if (conn->log) {
+ WT_ERR(__wt_log_get_active_files(
+ session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++)
+ WT_ERR(__backup_list_append(
+ session, cb, logfiles[i]));
+ }
+
+ WT_ERR(__backup_all(session, cb));
+ }
+
+ /* Add the hot backup and standard WiredTiger files to the list. */
+ WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
+ WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
+ if (exist)
+ WT_ERR(__backup_list_append(session, cb, WT_BASECONFIG));
+ WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
+ if (exist)
+ WT_ERR(__backup_list_append(session, cb, WT_USERCONFIG));
+ WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
+
+err: /* Close the hot backup file. */
+ if (cb->bfp != NULL) {
+ WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
+ cb->bfp = NULL;
+ }
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+
+ if (ret != 0) {
+ WT_TRET(__backup_cleanup_handles(session, cb));
+ WT_TRET(__backup_stop(session));
+ }
+
+ return (ret);
+}
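
From the application's side, all of the above hangs off a single cursor open. A hedged usage sketch (error handling is abbreviated, copy_file is an application-supplied placeholder, and the table is assumed to already exist):

	extern void copy_file(const char *);	/* application-supplied */

	static int
	take_backup(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		const char *filename;
		int ret;

		/* Opening the cursor runs __backup_start, pinning
		 * checkpoints until the backup completes. */
		if ((ret = session->open_cursor(
		    session, "backup:", NULL, NULL, &cursor)) != 0)
			return (ret);

		/* Each key is the name of a file to copy. */
		while ((ret = cursor->next(cursor)) == 0) {
			if ((ret = cursor->get_key(cursor, &filename)) != 0)
				break;
			copy_file(filename);
		}

		/* Closing the cursor runs __backup_stop, re-enabling
		 * checkpoint deletion and the next hot backup. */
		return (cursor->close(cursor));
	}
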
+
+/*
+ * __backup_cleanup_handles --
+ * Release and free all btree handles held by the backup. This is kept
+ * separate from __backup_stop because it can be called without the
+ * schema lock held.
+ */
+static int
+__backup_cleanup_handles(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_CURSOR_BACKUP_ENTRY *p;
+ WT_DECL_RET;
+
+ if (cb->list == NULL)
+ return (0);
+
+ /* Release the handles, free the file names, free the list itself. */
+ for (p = cb->list; p->name != NULL; ++p) {
+ if (p->handle != NULL)
+ WT_WITH_DHANDLE(session, p->handle,
+ WT_TRET(__wt_session_release_btree(session)));
+ __wt_free(session, p->name);
+ }
+
+ __wt_free(session, cb->list);
+ return (ret);
+}
+
+/*
+ * __backup_stop --
+ * Stop a backup.
+ */
+static int
+__backup_stop(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /* Remove any backup metadata file. */
+ ret = __backup_file_remove(session);
+
+ /* Checkpoint deletion can proceed, as can the next hot backup. */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ conn->hot_backup = 0;
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ return (ret);
+}
+
+/*
+ * __backup_all --
+ * Backup all objects in the database.
+ */
+static int
+__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *key, *value;
+
+ cursor = NULL;
+
+ /*
+ * Open a cursor on the metadata file and copy all of the entries to
+ * the hot backup file.
+ */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &key));
+ WT_ERR(cursor->get_value(cursor, &value));
+ WT_ERR_TEST((fprintf(
+ cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno());
+
+ /*
+ * While reading the metadata file, check there are no "sources"
+ * or "types" which can't support hot backup. This checks for
+ * a data source that's non-standard, which can't be backed up,
+ * but is also sanity checking: if there's an entry backed by
+ * anything other than a file or lsm entry, we're confused.
+ */
+ if ((ret = __wt_config_getones(
+ session, value, "type", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "type %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret);
+		if ((ret = __wt_config_getones(
+ session, value, "source", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "source %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Build a list of the file objects that need to be copied. */
+ WT_ERR(__wt_meta_btree_apply(session, __backup_list_all_append, NULL));
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __backup_uri --
+ * Backup a list of objects.
+ */
+static int
+__backup_uri(WT_SESSION_IMPL *session,
+ WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
+{
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ int target_list;
+ const char *uri;
+
+ *foundp = target_list = 0;
+
+ /*
+	 * If we find a non-empty target configuration string, we have a job;
+	 * otherwise it's not our problem.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "target", &cval));
+ WT_RET(__wt_config_subinit(session, &targetconf, &cval));
+ for (cb->list_next = 0;
+ (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) {
+ if (!target_list) {
+ target_list = *foundp = 1;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ }
+
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ uri = tmp->data;
+ if (v.len != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "%s: invalid backup target: URIs may need quoting",
+ uri);
+
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
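
__backup_uri is the target branch of the same API: a target list in the open_cursor configuration restricts the backup to the named URIs. A sketch, assuming an existing table:mytable object; note the quoting the error message above insists on:

	WT_CURSOR *cursor;
	int ret;

	/* Back up a single object rather than the whole database. */
	ret = session->open_cursor(session,
	    "backup:", NULL, "target=(\"table:mytable\")", &cursor);
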
+
+/*
+ * __backup_file_create --
+ * Create the meta-data backup file.
+ */
+static int
+__backup_file_create(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_DECL_RET;
+ char *path;
+
+ /* Open the hot backup file. */
+ WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path));
+ WT_ERR_TEST((cb->bfp = fopen(path, "w")) == NULL, __wt_errno());
+
+err: __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __backup_file_remove --
+ * Remove the meta-data backup file.
+ */
+static int
+__backup_file_remove(WT_SESSION_IMPL *session)
+{
+ return (__wt_remove(session, WT_METADATA_BACKUP));
+}
+
+/*
+ * __wt_backup_list_uri_append --
+ * Append a new file name to the list, allocate space as necessary.
+ * Called via the schema_worker function.
+ */
+int
+__wt_backup_list_uri_append(
+ WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+ WT_CURSOR_BACKUP *cb;
+ const char *value;
+
+ cb = session->bkp_cursor;
+ WT_UNUSED(skip);
+
+ /* Add the metadata entry to the backup file. */
+ WT_RET(__wt_metadata_search(session, name, &value));
+ WT_RET_TEST(
+ (fprintf(cb->bfp, "%s\n%s\n", name, value) < 0), __wt_errno());
+ __wt_free(session, value);
+
+ /* Add file type objects to the list of files to be copied. */
+ if (WT_PREFIX_MATCH(name, "file:"))
+ WT_RET(__backup_list_append(session, cb, name));
+
+ return (0);
+}
+
+/*
+ * __backup_list_all_append --
+ * Append a new file name to the list, allocate space as necessary.
+ * Called via the __wt_meta_btree_apply function.
+ */
+static int
+__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CURSOR_BACKUP *cb;
+
+ WT_UNUSED(cfg);
+
+ cb = session->bkp_cursor;
+
+ /* Ignore files in the process of being bulk-loaded. */
+ if (F_ISSET(S2BT(session), WT_BTREE_BULK))
+ return (0);
+
+ /* Add the file to the list of files to be copied. */
+ return (__backup_list_append(session, cb, session->dhandle->name));
+}
+
+/*
+ * __backup_list_append --
+ * Append a new file name to the list, allocate space as necessary.
+ */
+static int
+__backup_list_append(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri)
+{
+ WT_CURSOR_BACKUP_ENTRY *p;
+ WT_DATA_HANDLE *old_dhandle;
+ WT_DECL_RET;
+ const char *name;
+ int need_handle;
+
+ /* Leave a NULL at the end to mark the end of the list. */
+ WT_RET(__wt_realloc_def(session, &cb->list_allocated,
+ cb->list_next + 2, &cb->list));
+ p = &cb->list[cb->list_next];
+ p[0].name = p[1].name = NULL;
+ p[0].handle = p[1].handle = NULL;
+
+ need_handle = 0;
+ name = uri;
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ need_handle = 1;
+ name += strlen("file:");
+ }
+
+ /*
+ * !!!
+ * Assumes metadata file entries map one-to-one to physical files.
+ * To support a block manager where that's not the case, we'd need
+ * to call into the block manager and get a list of physical files
+ * that map to this logical "file". I'm not going to worry about
+	 * that for now; that block manager might not even support physical
+ * copying of files by applications.
+ */
+ WT_RET(__wt_strdup(session, name, &p->name));
+
+ /*
+ * If it's a file in the database, get a handle for the underlying
+ * object (this handle blocks schema level operations, for example
+ * WT_SESSION.drop or an LSM file discard after level merging).
+ */
+ if (need_handle) {
+ old_dhandle = session->dhandle;
+ if ((ret =
+ __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0)
+ p->handle = session->dhandle;
+ session->dhandle = old_dhandle;
+ WT_RET(ret);
+ }
+
+ ++cb->list_next;
+ return (0);
+}
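
The allocation above always grows the array by two slots so a NULL entry trails the list; __curbackup_next and __backup_cleanup_handles both stop on that sentinel. The same idiom in isolation, with plain realloc and illustrative names:

	#include <stdlib.h>
	#include <string.h>

	struct entry { char *name; };

	/* Append to a NULL-terminated array, keeping the sentinel. */
	static int
	list_append(struct entry **listp, size_t *nextp, const char *name)
	{
		struct entry *list;
		size_t next = *nextp;

		if ((list =
		    realloc(*listp, (next + 2) * sizeof(*list))) == NULL)
			return (-1);
		list[next + 1].name = NULL;	/* sentinel marks the end */
		*listp = list;
		if ((list[next].name = strdup(name)) == NULL)
			return (-1);
		*nextp = next + 1;
		return (0);
	}
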
diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
new file mode 100644
index 00000000000..96a45a7e629
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
@@ -0,0 +1,287 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curbulk_insert_fix --
+ * Fixed-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_fix(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_ERR(__wt_bulk_insert_fix(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_var --
+ * Variable-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_var(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int duplicate;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ /*
+ * If this isn't the first value inserted, compare it against the last
+ * value and increment the RLE count.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ duplicate = 0;
+ if (cbulk->rle != 0) {
+ if (cbulk->last.size == cursor->value.size &&
+ memcmp(cbulk->last.data, cursor->value.data,
+ cursor->value.size) == 0) {
+ ++cbulk->rle;
+ duplicate = 1;
+ } else
+ WT_ERR(__wt_bulk_insert_var(session, cbulk));
+ }
+
+ /*
+ * Save a copy of the value for the next comparison and reset the RLE
+ * counter.
+ */
+ if (!duplicate) {
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->value.data, cursor->value.size));
+ cbulk->rle = 1;
+ }
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
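
The run-length logic above defers each value until the next distinct one arrives, so equal consecutive values collapse into a single (value, count) pair. A standalone sketch of the same accumulation; emit_run stands in for __wt_bulk_insert_var, and the fixed-size buffer is purely illustrative:

	#include <stdint.h>
	#include <string.h>

	static void emit_run(const void *, size_t, uint64_t);

	static char last[64];		/* illustrative: assumes size <= 64 */
	static size_t last_size;
	static uint64_t rle;		/* 0 means no value seen yet */

	static void
	rle_insert(const void *data, size_t size)
	{
		if (rle != 0 &&
		    size == last_size && memcmp(last, data, size) == 0) {
			++rle;			/* duplicate: count it */
			return;
		}
		if (rle != 0)			/* emit the finished run */
			emit_run(last, last_size, rle);
		memcpy(last, data, size);	/* start a new run */
		last_size = size;
		rle = 1;
	}
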
+
+/*
+ * __bulk_row_keycmp_err --
+ * Error routine when keys inserted out-of-order.
+ */
+static int
+__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(a);
+ WT_DECL_ITEM(b);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ cursor = &cbulk->cbt.iface;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+
+ WT_ERR(__wt_buf_set_printable(
+ session, a, cursor->key.data, cursor->key.size));
+ WT_ERR(__wt_buf_set_printable(
+ session, b, cbulk->last.data, cbulk->last.size));
+
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load presented with out-of-order keys: %.*s compares smaller "
+ "than previously inserted key %.*s",
+ (int)a->size, (const char *)a->data,
+ (int)b->size, (const char *)b->data);
+
+err: __wt_scr_free(&a);
+ __wt_scr_free(&b);
+ return (ret);
+}
+
+/*
+ * __curbulk_insert_row --
+ * Row-store bulk cursor insert, with key-sort checks.
+ */
+static int
+__curbulk_insert_row(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int cmp;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_CHECKKEY(cursor);
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /*
+ * If this isn't the first key inserted, compare it against the last key
+ * to ensure the application doesn't accidentally corrupt the table.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ if (cbulk->rle != 0) {
+ WT_ERR(__wt_compare(session,
+ btree->collator, &cursor->key, &cbulk->last, &cmp));
+ if (cmp <= 0)
+ WT_ERR(__bulk_row_keycmp_err(cbulk));
+ }
+
+ /*
+ * Save a copy of the key for the next comparison and set the RLE
+ * counter.
+ */
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->key.data, cursor->key.size));
+ cbulk->rle = 1;
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_row_skip_check --
+ * Row-store bulk cursor insert, without key-sort checks.
+ */
+static int
+__curbulk_insert_row_skip_check(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_close --
+ * WT_CURSOR->close for the bulk cursor type.
+ */
+static int
+__curbulk_close(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ CURSOR_API_CALL(cursor, session, close, btree);
+
+ WT_TRET(__wt_bulk_wrapup(session, cbulk));
+ __wt_buf_free(session, &cbulk->last);
+
+ WT_TRET(__wt_session_release_btree(session));
+
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbulk_init --
+ * Initialize a bulk cursor.
+ */
+int
+__wt_curbulk_init(WT_SESSION_IMPL *session,
+ WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check)
+{
+ WT_CURSOR *c;
+ WT_CURSOR_BTREE *cbt;
+
+ c = &cbulk->cbt.iface;
+ cbt = &cbulk->cbt;
+
+ /* Bulk cursors only support insert and close (reset is a no-op). */
+ __wt_cursor_set_notsup(c);
+ switch (cbt->btree->type) {
+ case BTREE_COL_FIX:
+ c->insert = __curbulk_insert_fix;
+ break;
+ case BTREE_COL_VAR:
+ c->insert = __curbulk_insert_var;
+ break;
+ case BTREE_ROW:
+ c->insert = skip_sort_check ?
+ __curbulk_insert_row_skip_check : __curbulk_insert_row;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ c->close = __curbulk_close;
+
+ cbulk->bitmap = bitmap;
+ if (bitmap)
+ F_SET(c, WT_CURSTD_RAW);
+
+ return (__wt_bulk_init(session, cbulk));
+}
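
Applications reach these insert methods by opening a cursor with the bulk configuration on an empty object. A hedged sketch, assuming a table created with string key and value formats:

	static int
	bulk_load(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		int ret;

		/* "bulk" routes inserts through this single-threaded path;
		 * row-store keys must arrive in sorted order unless the
		 * sort check is skipped. */
		if ((ret = session->open_cursor(session,
		    "table:bulkload", NULL, "bulk", &cursor)) != 0)
			return (ret);

		cursor->set_key(cursor, "key1");
		cursor->set_value(cursor, "value1");
		if ((ret = cursor->insert(cursor)) != 0)
			return (ret);

		/* Closing the bulk cursor makes the loaded rows visible. */
		return (cursor->close(cursor));
	}
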
diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c
new file mode 100644
index 00000000000..868b144efc1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_config.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curconfig_close --
+ * WT_CURSOR->close method for the config cursor type.
+ */
+static int
+__curconfig_close(WT_CURSOR *cursor)
+{
+ return (__wt_cursor_close(cursor));
+}
+
+/*
+ * __wt_curconfig_open --
+ * WT_SESSION->open_cursor method for config cursors.
+ */
+int
+__wt_curconfig_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __wt_cursor_notsup, /* next */
+ __wt_cursor_notsup, /* prev */
+ __wt_cursor_noop, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curconfig_close);
+ WT_CURSOR_CONFIG *cconfig;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);
+
+ WT_UNUSED(uri);
+
+ WT_RET(__wt_calloc_def(session, 1, &cconfig));
+
+ cursor = &cconfig->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->key_format = cursor->value_format = "S";
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cconfig);
+ }
+ return (ret);
+}
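
The offsetof static assert is what makes casts like the one in __curconfig_close safe: every concrete cursor embeds WT_CURSOR as its first member, so a WT_CURSOR pointer and its containing struct share an address. The idiom with an illustrative type:

	typedef struct {
		WT_CURSOR iface;	/* must be first: shared address */
		int my_state;		/* implementation-private fields */
	} MY_CURSOR;

	static int
	my_close(WT_CURSOR *cursor)
	{
		/* Valid only because offsetof(MY_CURSOR, iface) == 0. */
		MY_CURSOR *mc = (MY_CURSOR *)cursor;

		mc->my_state = 0;
		return (0);
	}
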
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
new file mode 100644
index 00000000000..33e89764617
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -0,0 +1,524 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curds_txn_enter --
+ * Do transactional initialization when starting an operation.
+ */
+static int
+__curds_txn_enter(WT_SESSION_IMPL *session)
+{
+ session->ncursors++; /* XXX */
+ __wt_txn_cursor_op(session);
+
+ return (0);
+}
+
+/*
+ * __curds_txn_leave --
+ * Do transactional cleanup when ending an operation.
+ */
+static void
+__curds_txn_leave(WT_SESSION_IMPL *session)
+{
+ if (--session->ncursors == 0) /* XXX */
+ __wt_txn_read_last(session);
+}
+
+/*
+ * __curds_key_set --
+ * Set the key for the data-source.
+ */
+static int
+__curds_key_set(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ WT_CURSOR_NEEDKEY(cursor);
+
+ source->recno = cursor->recno;
+ source->key.data = cursor->key.data;
+ source->key.size = cursor->key.size;
+
+err: return (ret);
+}
+
+/*
+ * __curds_value_set --
+ * Set the value for the data-source.
+ */
+static int
+__curds_value_set(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ source->value.data = cursor->value.data;
+ source->value.size = cursor->value.size;
+
+err: return (ret);
+}
+
+/*
+ * __curds_cursor_resolve --
+ * Resolve cursor operation.
+ */
+static int
+__curds_cursor_resolve(WT_CURSOR *cursor, int ret)
+{
+ WT_CURSOR *source;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ /*
+ * Update the cursor's key, value and flags. (We use the _INT flags in
+ * the same way as file objects: there's some chance the underlying data
+	 * source is passing us a reference to data only pinned per
+	 * operation; we might as well be safe.)
+ *
+ * There's also a requirement the underlying data-source never returns
+ * with the cursor/source key referencing application memory: it'd be
+ * great to do a copy as necessary here so the data-source doesn't have
+ * to worry about copying the key, but we don't have enough information
+ * to know if a cursor is pointing at application or data-source memory.
+ */
+ if (ret == 0) {
+ cursor->key.data = source->key.data;
+ cursor->key.size = source->key.size;
+ cursor->value.data = source->value.data;
+ cursor->value.size = source->value.size;
+ cursor->recno = source->recno;
+
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ } else {
+ if (ret == WT_NOTFOUND)
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ /*
+ * Cursor operation failure implies a lost cursor position and
+ * a subsequent next/prev starting at the beginning/end of the
+ * table. We simplify underlying data source implementations
+ * by resetting the cursor explicitly here.
+ */
+ WT_TRET(source->reset(source));
+ }
+
+ return (ret);
+}
+
+/*
+ * __curds_compare --
+ * WT_CURSOR.compare method for the data-source cursor type.
+ */
+static int
+__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_COLLATOR *collator;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare them.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Cursors must reference the same object");
+
+ WT_CURSOR_NEEDKEY(a);
+ WT_CURSOR_NEEDKEY(b);
+
+ if (WT_CURSOR_RECNO(a)) {
+ if (a->recno < b->recno)
+ *cmpp = -1;
+ else if (a->recno == b->recno)
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ } else {
+ /*
+		 * The assumption is that data sources don't provide WiredTiger
+		 * with WT_CURSOR.compare methods; instead, we copy the
+		 * key/value out of the underlying data-source cursor and do
+		 * any comparison at this level.
+ */
+ collator = ((WT_CURSOR_DATA_SOURCE *)a)->collator;
+ WT_ERR(__wt_compare(
+ session, collator, &a->key, &b->key, cmpp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curds_next --
+ * WT_CURSOR.next method for the data-source cursor type.
+ */
+static int
+__curds_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ ret = __curds_cursor_resolve(cursor, source->next(source));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_prev --
+ * WT_CURSOR.prev method for the data-source cursor type.
+ */
+static int
+__curds_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ ret = __curds_cursor_resolve(cursor, source->prev(source));
+
+err: __curds_txn_leave(session);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_reset --
+ * WT_CURSOR.reset method for the data-source cursor type.
+ */
+static int
+__curds_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+ WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+ WT_ERR(source->reset(source));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search --
+ * WT_CURSOR.search method for the data-source cursor type.
+ */
+static int
+__curds_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->search(source));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search_near --
+ * WT_CURSOR.search_near method for the data-source cursor type.
+ */
+static int
+__curds_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret =
+ __curds_cursor_resolve(cursor, source->search_near(source, exact));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_insert --
+ * WT_CURSOR.insert method for the data-source cursor type.
+ */
+static int
+__curds_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCRV(session,
+ cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+ WT_ERR(__curds_key_set(cursor));
+ WT_ERR(__curds_value_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->insert(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_update --
+ * WT_CURSOR.update method for the data-source cursor type.
+ */
+static int
+__curds_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCRV(
+ session, cursor_update_bytes, cursor->value.size);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ WT_ERR(__curds_value_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->update(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_remove --
+ * WT_CURSOR.remove method for the data-source cursor type.
+ */
+static int
+__curds_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->remove(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_close --
+ * WT_CURSOR.close method for the data-source cursor type.
+ */
+static int
+__curds_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_DATA_SOURCE *cds;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cds = (WT_CURSOR_DATA_SOURCE *)cursor;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ if (cds->source != NULL)
+ ret = cds->source->close(cds->source);
+
+ if (cds->collator_owned) {
+ if (cds->collator->terminate != NULL)
+ WT_TRET(cds->collator->terminate(
+ cds->collator, &session->iface));
+ cds->collator_owned = 0;
+ }
+ cds->collator = NULL;
+
+ /*
+ * The key/value formats are in allocated memory, which isn't standard
+ * behavior.
+ */
+ __wt_free(session, cursor->key_format);
+ __wt_free(session, cursor->value_format);
+
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curds_open --
+ * Initialize a data-source cursor.
+ */
+int
+__wt_curds_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
+ const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curds_compare, /* compare */
+ __curds_next, /* next */
+ __curds_prev, /* prev */
+ __curds_reset, /* reset */
+ __curds_search, /* search */
+ __curds_search_near, /* search-near */
+ __curds_insert, /* insert */
+ __curds_update, /* update */
+ __curds_remove, /* remove */
+ __curds_close); /* close */
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor, *source;
+ WT_CURSOR_DATA_SOURCE *data_source;
+ WT_DECL_RET;
+ const char *metaconf;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);
+
+ data_source = NULL;
+ metaconf = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &data_source));
+ cursor = &data_source->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ F_SET(cursor, WT_CURSTD_DATA_SOURCE);
+
+ /*
+ * XXX
+ * The underlying data-source may require the object's key and value
+ * formats. This isn't a particularly elegant way of getting that
+	 * information to the data-source; this feels like a layering problem
+ * to me.
+ */
+ WT_ERR(__wt_metadata_search(session, uri, &metaconf));
+ WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
+ WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
+ WT_ERR(
+ __wt_strndup(session, cval.str, cval.len, &cursor->value_format));
+
+ WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+ /* Data-source cursors have a collator reference. */
+ WT_ERR(__wt_collator_config(session, cfg,
+ &data_source->collator, &data_source->collator_owned));
+
+ WT_ERR(dsrc->open_cursor(dsrc,
+ &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
+ source = data_source->source;
+ source->session = (WT_SESSION *)session;
+ memset(&source->q, 0, sizeof(source->q));
+ source->recno = 0;
+ memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
+ memset(&source->key, 0, sizeof(source->key));
+ memset(&source->value, 0, sizeof(source->value));
+ source->saved_err = 0;
+ source->flags = 0;
+
+ if (0) {
+err: if (F_ISSET(cursor, WT_CURSTD_OPEN))
+ WT_TRET(cursor->close(cursor));
+ else
+ __wt_free(session, data_source);
+ *cursorp = NULL;
+ }
+
+ __wt_free(session, metaconf);
+ return (ret);
+}
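
These methods only come into play for URIs owned by an application-registered data source. A hedged sketch of the registration side, assuming my_dsrc implements the WT_DATA_SOURCE interface (at minimum open_cursor) and that conn and session are already open:

	WT_CURSOR *cursor;
	int ret;

	/* Route "kvstore:" URIs through the custom implementation... */
	ret = conn->add_data_source(conn, "kvstore:", &my_dsrc, NULL);

	/* ...after which ordinary cursor opens on that prefix land in
	 * __wt_curds_open; "kvstore:telemetry" is an illustrative URI. */
	ret = session->open_cursor(session,
	    "kvstore:telemetry", NULL, NULL, &cursor);
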
diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c
new file mode 100644
index 00000000000..003b7e1f961
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c
@@ -0,0 +1,400 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __raw_to_dump --
+ *	We have a buffer where the data item contains a raw value;
+ *	convert it to a printable string.
+ */
+static int
+__raw_to_dump(
+ WT_SESSION_IMPL *session, WT_ITEM *from, WT_ITEM *to, int hexonly)
+{
+ if (hexonly)
+ WT_RET(__wt_raw_to_hex(session, from->data, from->size, to));
+ else
+ WT_RET(
+ __wt_raw_to_esc_hex(session, from->data, from->size, to));
+
+ return (0);
+}
+
+/*
+ * __dump_to_raw --
+ *	We have a buffer containing a dump string;
+ *	convert it to a raw value.
+ */
+static int
+__dump_to_raw(
+ WT_SESSION_IMPL *session, const char *src_arg, WT_ITEM *item, int hexonly)
+{
+ if (hexonly)
+ WT_RET(__wt_hex_to_raw(session, src_arg, item));
+ else
+ WT_RET(__wt_esc_hex_to_raw(session, src_arg, item));
+
+ return (0);
+}
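
The two encodings trade readability for uniformity: hexonly renders every byte as two hex digits, while the escaped form keeps printable characters and escapes the rest. Illustratively (the exact escape syntax is __wt_raw_to_esc_hex's, so treat this as a sketch), the three bytes 'a', 'b', 0x01 come out as:

	/*
	 * raw bytes:		0x61 0x62 0x01
	 * hexonly:		"616201"
	 * escaped (print):	"ab\01"
	 */
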
+
+/*
+ * __curdump_get_key --
+ * WT_CURSOR->get_key for dump cursors.
+ */
+static int
+__curdump_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ uint64_t recno;
+ const char *fmt;
+ const void *buffer;
+ va_list ap;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ if (WT_CURSOR_RECNO(cursor)) {
+ WT_ERR(child->get_key(child, &recno));
+ buffer = &recno;
+ size = sizeof(recno);
+ fmt = "R";
+ } else {
+ WT_ERR(__wt_cursor_get_raw_key(child, &item));
+ buffer = item.data;
+ size = item.size;
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ fmt = "u";
+ else
+ fmt = cursor->key_format;
+ }
+ ret = __wt_json_alloc_unpack(session, buffer, size, fmt,
+ json, 1, ap);
+ } else {
+ if (WT_CURSOR_RECNO(cursor) &&
+ !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(child->get_key(child, &recno));
+
+ WT_ERR(__wt_buf_fmt(session, &cursor->key, "%"
+ PRIu64, recno));
+ } else {
+ WT_ERR(child->get_key(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+ }
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->key.data;
+ itemp->size = cursor->key.size;
+ } else
+ *va_arg(ap, const char **) = cursor->key.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * str2recno --
+ * Convert a string to a record number.
+ */
+static int
+str2recno(WT_SESSION_IMPL *session, const char *p, uint64_t *recnop)
+{
+ uint64_t recno;
+ char *endptr;
+
+ /*
+	 * strtouq takes lots of things like hex values, signs and so on --
+	 * none of them are OK with us. Check that the string starts with a
+	 * digit; that turns off the special processing.
+ */
+ if (!isdigit(p[0]))
+ goto format;
+
+ errno = 0;
+ recno = __wt_strtouq(p, &endptr, 0);
+ if (recno == ULLONG_MAX && errno == ERANGE)
+ WT_RET_MSG(session, ERANGE, "%s: invalid record number", p);
+ if (endptr[0] != '\0')
+format: WT_RET_MSG(session, EINVAL, "%s: invalid record number", p);
+
+ *recnop = recno;
+ return (0);
+}
+
+/*
+ * __curdump_set_key --
+ * WT_CURSOR->set_key for dump cursors.
+ */
+static void
+__curdump_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(str2recno(session, p, &recno));
+
+ child->set_key(child, recno);
+ } else {
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->key_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 1,
+ &cursor->key));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_key(child, &cursor->key);
+ }
+
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __curdump_get_value --
+ * WT_CURSOR->get_value for dump cursors.
+ */
+static int
+__curdump_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *fmt;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ WT_ERR(__wt_cursor_get_raw_value(child, &item));
+ fmt = F_ISSET(cursor, WT_CURSTD_RAW) ?
+ "u" : cursor->value_format;
+ ret = __wt_json_alloc_unpack(session, item.data,
+ item.size, fmt, json, 0, ap);
+ } else {
+ WT_ERR(child->get_value(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->value.data;
+ itemp->size = cursor->value.size;
+ } else
+ *va_arg(ap, const char **) = cursor->value.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curdump_set_value --
+ * WT_CURSOR->set_value for dump cursors.
+ */
+static void
+__curdump_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->value_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 0, &cursor->value));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_value(child, &cursor->value);
+
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+ API_END(session, ret);
+}
+
+/* Pass through a call to the underlying cursor. */
+#define WT_CURDUMP_PASS(op) \
+static int \
+__curdump_##op(WT_CURSOR *cursor) \
+{ \
+ WT_CURSOR *child; \
+ \
+ child = ((WT_CURSOR_DUMP *)cursor)->child; \
+ return (child->op(child)); \
+}
+
+WT_CURDUMP_PASS(next)
+WT_CURDUMP_PASS(prev)
+WT_CURDUMP_PASS(reset)
+WT_CURDUMP_PASS(search)
+
+/*
+ * __curdump_search_near --
+ * WT_CURSOR::search_near for dump cursors.
+ */
+static int
+__curdump_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_DUMP *cdump;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ return (cdump->child->search_near(cdump->child, exact));
+}
+
+WT_CURDUMP_PASS(insert)
+WT_CURDUMP_PASS(update)
+WT_CURDUMP_PASS(remove)
+
+/*
+ * __curdump_close --
+ * WT_CURSOR::close for dump cursors.
+ */
+static int
+__curdump_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+	CURSOR_API_CALL(cursor, session, close, NULL);
+ if (child != NULL)
+ WT_TRET(child->close(child));
+ /* We shared the child's URI. */
+ cursor->internal_uri = NULL;
+ __wt_json_close(session, cursor);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curdump_create --
+ *	Initialize a dump cursor.
+ */
+int
+__wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __curdump_get_key, /* get-key */
+ __curdump_get_value, /* get-value */
+ __curdump_set_key, /* set-key */
+ __curdump_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curdump_next, /* next */
+ __curdump_prev, /* prev */
+ __curdump_reset, /* reset */
+ __curdump_search, /* search */
+ __curdump_search_near, /* search-near */
+ __curdump_insert, /* insert */
+ __curdump_update, /* update */
+ __curdump_remove, /* remove */
+ __curdump_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *cfg[2];
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_DUMP, iface) == 0);
+
+ session = (WT_SESSION_IMPL *)child->session;
+
+ WT_RET(__wt_calloc_def(session, 1, &cdump));
+ cursor = &cdump->iface;
+ *cursor = iface;
+ cursor->session = child->session;
+ cursor->internal_uri = child->internal_uri;
+ cursor->key_format = child->key_format;
+ cursor->value_format = child->value_format;
+ cdump->child = child;
+
+ /* Copy the dump flags from the child cursor. */
+ F_SET(cursor, F_ISSET(child,
+ WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT));
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ WT_ERR(__wt_calloc_def(session, 1, &json));
+ cursor->json_private = child->json_private = json;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_ERR(__wt_cursor_init(cursor, NULL, owner, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cursor);
+ }
+ return (ret);
+}
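
Dump cursors are created when a cursor is opened with a dump configuration; they wrap the real (child) cursor and translate keys and values through the routines above. A hedged usage sketch, assuming an existing table:mytable:

	#include <stdio.h>

	static int
	dump_table(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		const char *key, *value;
		int ret;

		/* "dump=print" selects the escaped format; "dump=hex" and
		 * "dump=json" select the other two handled above. */
		if ((ret = session->open_cursor(session,
		    "table:mytable", NULL, "dump=print", &cursor)) != 0)
			return (ret);

		while ((ret = cursor->next(cursor)) == 0) {
			if ((ret = cursor->get_key(cursor, &key)) != 0 ||
			    (ret = cursor->get_value(cursor, &value)) != 0)
				break;
			printf("%s\n%s\n", key, value);
		}
		return (cursor->close(cursor));
	}
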
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
new file mode 100644
index 00000000000..e5aaa19d0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -0,0 +1,471 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_BTREE_CURSOR_SAVE_AND_RESTORE
+ * Save the cursor's key/value data/size fields, call an underlying btree
+ * function, and then consistently handle failure and success.
+ */
+#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \
+ WT_ITEM __key_copy = (cursor)->key; \
+ uint64_t __recno = (cursor)->recno; \
+ WT_ITEM __value_copy = (cursor)->value; \
+ if (((ret) = (f)) == 0) { \
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } else { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \
+ (cursor)->recno = __recno; \
+ WT_ITEM_SET((cursor)->key, __key_copy); \
+ } \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \
+ WT_ITEM_SET((cursor)->value, __value_copy); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } \
+} while (0)
+
+/*
+ * __curfile_compare --
+ * WT_CURSOR->compare method for the btree cursor type.
+ */
+static int
+__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)a;
+ CURSOR_API_CALL(a, session, compare, cbt->btree);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * call the underlying object to compare them.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Cursors must reference the same object");
+
+ WT_CURSOR_CHECKKEY(a);
+ WT_CURSOR_CHECKKEY(b);
+
+ ret = __wt_btcur_compare(
+ (WT_CURSOR_BTREE *)a, (WT_CURSOR_BTREE *)b, cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next --
+ * WT_CURSOR->next method for the btree cursor type.
+ */
+static int
+__curfile_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_next(cbt, 0)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next_random --
+ * WT_CURSOR->next method for the btree cursor type when configured with
+ * next_random.
+ */
+static int
+__curfile_next_random(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_next_random(cbt)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
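
This variant is installed when the cursor is opened with the next_random configuration, turning next into a random-record sampler; the other positioning methods aren't meaningful on such a cursor. A sketch, with an illustrative file name:

	WT_CURSOR *cursor;
	int ret;

	/* Each next() call positions the cursor on a random record. */
	ret = session->open_cursor(session,
	    "file:access.wt", NULL, "next_random=true", &cursor);
	ret = cursor->next(cursor);
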
+
+/*
+ * __curfile_prev --
+ * WT_CURSOR->prev method for the btree cursor type.
+ */
+static int
+__curfile_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_prev(cbt, 0)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_reset --
+ * WT_CURSOR->reset method for the btree cursor type.
+ */
+static int
+__curfile_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, cbt->btree);
+
+ ret = __wt_btcur_reset(cbt);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search --
+ * WT_CURSOR->search method for the btree cursor type.
+ */
+static int
+__curfile_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, search, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_search(cbt), ret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search_near --
+ * WT_CURSOR->search_near method for the btree cursor type.
+ */
+static int
+__curfile_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+ cursor, __wt_btcur_search_near(cbt, exact), ret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_insert --
+ * WT_CURSOR->insert method for the btree cursor type.
+ */
+static int
+__curfile_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret);
+
+ /*
+ * Insert is the one cursor operation that doesn't end with the cursor
+ * pointing to an on-page item. The standard macro handles errors
+ * correctly, but we need to leave the application cursor unchanged in
+ * the case of success, except for column-store appends, where we are
+ * returning a key.
+ */
+ if (ret == 0) {
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND)) {
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_update --
+ * WT_CURSOR->update method for the btree cursor type.
+ */
+static int
+__curfile_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_update(cbt), ret);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __wt_curfile_update_check --
+ * WT_CURSOR->update_check method for the btree cursor type.
+ */
+int
+__wt_curfile_update_check(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+ cursor, __wt_btcur_update_check(cbt), ret);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_remove --
+ * WT_CURSOR->remove method for the btree cursor type.
+ */
+static int
+__curfile_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret);
+
+ /*
+ * After a successful remove, copy the key: the value is not available.
+ */
+ if (ret == 0) {
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT) &&
+ !WT_DATA_IN_ITEM(&(cursor)->key)) {
+ WT_ERR(__wt_buf_set(session, &cursor->key,
+ cursor->key.data, cursor->key.size));
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ }
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_close --
+ * WT_CURSOR->close method for the btree cursor type.
+ */
+static int
+__curfile_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, close, cbt->btree);
+ WT_TRET(__wt_btcur_close(cbt));
+ if (cbt->btree != NULL)
+ WT_TRET(__wt_session_release_btree(session));
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curfile_create --
+ * Open a cursor for a given btree handle.
+ */
+int
+__wt_curfile_create(WT_SESSION_IMPL *session,
+ WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
+ WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curfile_compare, /* compare */
+ __curfile_next, /* next */
+ __curfile_prev, /* prev */
+ __curfile_reset, /* reset */
+ __curfile_search, /* search */
+ __curfile_search_near, /* search-near */
+ __curfile_insert, /* insert */
+ __curfile_update, /* update */
+ __curfile_remove, /* remove */
+ __curfile_close); /* close */
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ size_t csize;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
+
+ cbt = NULL;
+
+ btree = S2BT(session);
+ WT_ASSERT(session, btree != NULL);
+
+ csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
+ WT_RET(__wt_calloc(session, 1, csize, &cbt));
+
+ cursor = &cbt->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->internal_uri = btree->dhandle->name;
+ cursor->key_format = btree->key_format;
+ cursor->value_format = btree->value_format;
+
+ cbt->btree = btree;
+ if (bulk) {
+ F_SET(cursor, WT_CURSTD_BULK);
+
+ cbulk = (WT_CURSOR_BULK *)cbt;
+
+ /* Optionally skip the validation of each bulk-loaded key. */
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "skip_sort_check", 0, &cval));
+ WT_ERR(__wt_curbulk_init(
+ session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
+ }
+
+ /*
+ * random_retrieval
+ * Random retrieval cursors only support next, reset and close.
+ */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __curfile_next_random;
+ cursor->reset = __curfile_reset;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_create);
+ WT_STAT_FAST_DATA_INCR(session, cursor_create);
+
+ if (0) {
+err: __wt_free(session, cbt);
+ }
+
+ return (ret);
+}
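+
+/*
+ * An illustrative sketch (hypothetical URI) of the next_random configuration
+ * handled above: such a cursor supports only next, reset and close, and each
+ * next call positions the cursor on a randomly chosen record.
+ *
+ * ret = session->open_cursor(
+ *     session, "file:example.wt", NULL, "next_random=true", &c);
+ * ret = c->next(c);
+ */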
+
+/*
+ * __wt_curfile_open --
+ * WT_SESSION->open_cursor method for the btree cursor type.
+ */
+int
+__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
+ WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int bitmap, bulk;
+ uint32_t flags;
+
+ flags = 0;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ if (cval.type == WT_CONFIG_ITEM_BOOL ||
+ (cval.type == WT_CONFIG_ITEM_NUM &&
+ (cval.val == 0 || cval.val == 1))) {
+ bitmap = 0;
+ bulk = (cval.val != 0);
+ } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
+ bitmap = bulk = 1;
+ else
+ WT_RET_MSG(session, EINVAL,
+ "Value for 'bulk' must be a boolean or 'bitmap'");
+
+ /* Bulk handles require exclusive access. */
+ if (bulk)
+ LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);
+
+ /* Get the handle and lock it while the cursor is using it. */
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags));
+ else
+ WT_RET(__wt_bad_object_type(session, uri));
+
+ WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp));
+
+ return (0);
+
+err: /* If the cursor could not be opened, release the handle. */
+ WT_TRET(__wt_session_release_btree(session));
+ return (ret);
+}
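+
+/*
+ * An illustrative sketch (hypothetical URI) of the bulk configuration parsed
+ * above: "bulk" accepts a boolean or the string "bitmap" (bitmap bulk loads
+ * of fixed-length column stores), and either form opens the handle
+ * exclusively.
+ *
+ * ret = session->open_cursor(
+ *     session, "file:example.wt", NULL, "bulk=true", &c);
+ */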
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
new file mode 100644
index 00000000000..936337047b8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curindex_get_value --
+ * WT_CURSOR->get_value implementation for index cursors.
+ */
+static int
+__curindex_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ ret = __wt_schema_project_merge(session,
+ cindex->cg_cursors, cindex->value_plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ cindex->cg_cursors, cindex->value_plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_set_value --
+ * WT_CURSOR->set_value implementation for index cursors.
+ */
+static void
+__curindex_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ ret = ENOTSUP;
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ API_END(session, ret);
+}
+
+/*
+ * __curindex_move --
+ * When an index cursor changes position, set the primary key in the
+ * associated column groups and update their positions to match.
+ */
+static int
+__curindex_move(WT_CURSOR_INDEX *cindex)
+{
+ WT_CURSOR **cp, *first;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)cindex->iface.session;
+ first = NULL;
+
+ /* Point the public cursor to the key in the child. */
+ __wt_cursor_set_raw_key(&cindex->iface, &cindex->child->key);
+ F_CLR(&cindex->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ if (first == NULL) {
+ /*
+ * Set the primary key -- note that we need the primary
+ * key columns, so we have to use the full key format,
+ * not just the public columns.
+ */
+ WT_RET(__wt_schema_project_slice(session,
+ cp, cindex->index->key_plan,
+ 1, cindex->index->key_format,
+ &cindex->iface.key));
+ first = *cp;
+ } else {
+ (*cp)->key.data = first->key.data;
+ (*cp)->key.size = first->key.size;
+ (*cp)->recno = first->recno;
+ }
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_RET((*cp)->search(*cp));
+ }
+
+ F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ return (0);
+}
+
+/*
+ * __curindex_next --
+ * WT_CURSOR->next method for index cursors.
+ */
+static int
+__curindex_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->next(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_prev --
+ * WT_CURSOR->prev method for index cursors.
+ */
+static int
+__curindex_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->prev(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_reset --
+ * WT_CURSOR->reset method for index cursors.
+ */
+static int
+__curindex_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ WT_TRET(cindex->child->reset(cindex->child));
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ WT_TRET((*cp)->reset(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_search --
+ * WT_CURSOR->search method for index cursors.
+ */
+static int
+__curindex_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int exact;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ child = cindex->child;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * We expect partial matches, but we want the smallest item that
+ * matches the prefix. Fail if there is no matching item.
+ */
+ __wt_cursor_set_raw_key(child, &cursor->key);
+ WT_ERR(child->search_near(child, &exact));
+
+ /*
+ * We expect partial matches, and want the smallest record with a key
+ * greater than or equal to the search key. The only way for the key
+ * to be equal is if there is an index on the primary key, because
+ * otherwise the primary key columns will be appended to the index key,
+ * but we don't disallow that (odd) case.
+ */
+ if (exact < 0)
+ WT_ERR(child->next(child));
+
+ if (child->key.size < cursor->key.size ||
+ memcmp(child->key.data, cursor->key.data, cursor->key.size) != 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ WT_ERR(__curindex_move(cindex));
+
+ if (0) {
+err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ }
+
+ API_END_RET(session, ret);
+}
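+
+/*
+ * An illustrative sketch (hypothetical names) of the matching described
+ * above: because the primary key columns are appended to the index key,
+ * search positions the cursor on the smallest index entry whose key begins
+ * with the application's key.
+ *
+ * ret = session->open_cursor(
+ *     session, "index:people:byname", NULL, NULL, &c);
+ * c->set_key(c, "smith");
+ * ret = c->search(c);
+ */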
+
+/*
+ * __curindex_search_near --
+ * WT_CURSOR->search_near method for index cursors.
+ */
+static int
+__curindex_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ __wt_cursor_set_raw_key(cindex->child, &cursor->key);
+ if ((ret = cindex->child->search_near(cindex->child, exact)) == 0)
+ ret = __curindex_move(cindex);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_close --
+ * WT_CURSOR->close method for index cursors.
+ */
+static int
+__curindex_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR **cp;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ idx = cindex->index;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ if ((cp = cindex->cg_cursors) != NULL)
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table); i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ __wt_free(session, cindex->cg_cursors);
+ if (cindex->key_plan != idx->key_plan)
+ __wt_free(session, cindex->key_plan);
+ if (cursor->value_format != cindex->table->value_format)
+ __wt_free(session, cursor->value_format);
+ if (cindex->value_plan != idx->value_plan)
+ __wt_free(session, cindex->value_plan);
+
+ if (cindex->child != NULL)
+ WT_TRET(cindex->child->close(cindex->child));
+
+ __wt_schema_release_table(session, cindex->table);
+ /* The URI is owned by the index. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_open_colgroups --
+ * Open cursors on the column groups required for an index cursor.
+ */
+static int
+__curindex_open_colgroups(
+ WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[])
+{
+ WT_TABLE *table;
+ WT_CURSOR **cp;
+ u_long arg;
+ /* Child cursors are opened with dump disabled. */
+ const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
+ char *proj;
+
+ table = cindex->table;
+ WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cindex->cg_cursors = cp;
+
+ /* Work out which column groups we need. */
+ for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
+ cp[arg] != NULL)
+ continue;
+ WT_RET(__wt_open_cursor(session,
+ table->cgroups[arg]->source,
+ &cindex->iface, cfg, &cp[arg]));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_curindex_open --
+ * WT_SESSION->open_cursor method for index cursors.
+ */
+int
+__wt_curindex_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __curindex_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __curindex_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curindex_next, /* next */
+ __curindex_prev, /* prev */
+ __curindex_reset, /* reset */
+ __curindex_search, /* search */
+ __curindex_search_near, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curindex_close); /* close */
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *columns, *idxname, *tablename;
+ size_t namesize;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (idxname = strchr(tablename, ':')) == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri);
+ namesize = (size_t)(idxname - tablename);
+ ++idxname;
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, namesize, 0, &table)) != 0) {
+ if (ret == WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open cursor '%s' on unknown table", uri);
+ return (ret);
+ }
+
+ columns = strchr(idxname, '(');
+ if (columns == NULL)
+ namesize = strlen(idxname);
+ else
+ namesize = (size_t)(columns - idxname);
+
+ WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx));
+ WT_RET(__wt_calloc_def(session, 1, &cindex));
+
+ cursor = &cindex->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ cindex->table = table;
+ cindex->index = idx;
+ cindex->key_plan = idx->key_plan;
+ cindex->value_plan = idx->value_plan;
+
+ cursor->internal_uri = idx->name;
+ cursor->key_format = idx->idxkey_format;
+ cursor->value_format = table->value_format;
+
+ /*
+ * XXX
+ * A very odd corner case is an index with a recno key.
+ * The only way to get here is by creating an index on a column store
+ * using only the primary's recno as the index key. Disallow that for
+ * now.
+ */
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, WT_ERROR,
+ "Column store indexes based on a record number primary "
+ "key are not supported.");
+
+ /* Handle projections. */
+ if (columns != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_struct_reformat(session, table,
+ columns, strlen(columns), NULL, 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cursor->value_format));
+
+ WT_ERR(__wt_buf_init(session, tmp, 0));
+ WT_ERR(__wt_struct_plan(session, table,
+ columns, strlen(columns), 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cindex->value_plan));
+ }
+
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_ERR(__wt_open_cursor(
+ session, idx->source, cursor, cfg, &cindex->child));
+
+ /* Open the column groups needed for this index cursor. */
+ WT_ERR(__curindex_open_colgroups(session, cindex, cfg));
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_column_init(cursor, table->key_format,
+ &idx->colconf, &table->colconf));
+
+ if (0) {
+err: WT_TRET(__curindex_close(cursor));
+ *cursorp = NULL;
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c
new file mode 100644
index 00000000000..f4459819259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_json.c
@@ -0,0 +1,931 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t,
+ WT_CONFIG_ITEM *);
+static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, int, size_t *);
+static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, u_char *, size_t, int, va_list);
+static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *);
+static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *);
+static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *);
+static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *,
+ const char *);
+static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
+ int, const char *, size_t *);
+
+#define WT_PACK_JSON_GET(session, pv, jstr) do { \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \
+ pv.type = pv.type == 's' ? 'j' : 'J'; \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ case 'l': \
+ case 'q': \
+ WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 'L': \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ case 't': \
+ WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __json_unpack_put --
+ * Calculate the size of a packed byte string as formatted for JSON.
+ */
+static size_t
+__json_unpack_put(WT_SESSION_IMPL *session, void *voidpv,
+ u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name)
+{
+ WT_PACK_VALUE *pv;
+ const char *p, *end;
+ size_t s, n;
+
+ pv = (WT_PACK_VALUE *)voidpv;
+ s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ",
+ (int)name->len, name->str);
+ if (s <= bufsz) {
+ bufsz -= s;
+ buf += s;
+ } else
+ bufsz = 0;
+
+ switch (pv->type) {
+ case 'x':
+ return (0);
+ case 's':
+ case 'S':
+ /* Account for '"' quote in front and back. */
+ s += 2;
+ p = (const char *)pv->u.s;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ if (pv->type == 's' || pv->havesize) {
+ end = p + pv->size;
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ } else
+ for (; *p; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'U':
+ case 'u':
+ s += 2;
+ p = (const char *)pv->u.item.data;
+ end = p + pv->u.item.size;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 1);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i));
+ case 'B':
+ case 't':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ case 'R':
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRIu64, pv->u.u));
+ }
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __json_struct_size --
+ * Calculate the size of a packed byte string as formatted for JSON.
+ */
+static inline int
+__json_struct_size(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CONFIG_ITEM *names, int iskey,
+ size_t *presult)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const uint8_t *p, *end;
+ size_t result;
+ int needcr;
+
+ p = buffer;
+ end = p + size;
+ result = 0;
+ needcr = 0;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr)
+ result += 2;
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ result += __json_unpack_put(session, &pv, NULL, 0, &name);
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ *presult = result;
+ return (ret);
+}
+
+/*
+ * __json_struct_unpackv --
+ * Unpack a byte string to JSON (va_list version).
+ */
+static inline int
+__json_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, WT_CONFIG_ITEM *names,
+ u_char *jbuf, size_t jbufsize, int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ int needcr;
+ size_t jsize;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ needcr = 0;
+
+ /* Unpacking a cursor marked as json implies a single arg. */
+ *va_arg(ap, const char **) = (char *)jbuf;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr) {
+ WT_ASSERT(session, jbufsize >= 3);
+ strncat((char *)jbuf, ",\n", jbufsize);
+ jbuf += 2;
+ jbufsize -= 2;
+ }
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ jsize = __json_unpack_put(session,
+ (u_char *)&pv, jbuf, jbufsize, &name);
+ WT_ASSERT(session, jsize <= jbufsize);
+ jbuf += jsize;
+ jbufsize -= jsize;
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ WT_ASSERT(session, jbufsize == 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_json_alloc_unpack --
+ * Allocate space for, and unpack an entry into JSON format.
+ */
+int
+__wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CURSOR_JSON *json,
+ int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM *names;
+ WT_DECL_RET;
+ size_t needed;
+ char **json_bufp;
+
+ if (iskey) {
+ names = &json->key_names;
+ json_bufp = &json->key_buf;
+ } else {
+ names = &json->value_names;
+ json_bufp = &json->value_buf;
+ }
+ needed = 0;
+ WT_RET(__json_struct_size(session, buffer, size, fmt, names,
+ iskey, &needed));
+ WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp));
+ WT_RET(__json_struct_unpackv(session, buffer, size, fmt,
+ names, (u_char *)*json_bufp, needed + 1, iskey, ap));
+
+ return (ret);
+}
+
+/*
+ * __wt_json_close --
+ * Release any json related resources.
+ */
+void
+__wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_JSON *json;
+
+ if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) {
+ __wt_free(session, json->key_buf);
+ __wt_free(session, json->value_buf);
+ __wt_free(session, json);
+ }
+ return;
+}
+
+/*
+ * __wt_json_unpack_char --
+ * Unpack a single character into JSON escaped format.
+ * Can be called with null buf for sizing.
+ */
+size_t
+__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode)
+{
+ char abbrev;
+ u_char h;
+
+ if (!force_unicode) {
+ if (isprint(ch) && ch != '\\' && ch != '"') {
+ if (bufsz >= 1)
+ *buf = (u_char)ch;
+ return (1);
+ } else {
+ abbrev = '\0';
+ switch (ch) {
+ case '\\':
+ case '"':
+ abbrev = ch;
+ break;
+ case '\f':
+ abbrev = 'f';
+ break;
+ case '\n':
+ abbrev = 'n';
+ break;
+ case '\r':
+ abbrev = 'r';
+ break;
+ case '\t':
+ abbrev = 't';
+ break;
+ }
+ if (abbrev != '\0') {
+ if (bufsz >= 2) {
+ *buf++ = '\\';
+ *buf = (u_char)abbrev;
+ }
+ return (2);
+ }
+ }
+ }
+ if (bufsz >= 6) {
+ *buf++ = '\\';
+ *buf++ = 'u';
+ *buf++ = '0';
+ *buf++ = '0';
+ h = (((u_char)ch) >> 4) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ h = ((u_char)ch) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ }
+ return (6);
+}
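+
+/*
+ * Illustrative sizing examples: a newline needs 2 output bytes (a backslash
+ * and 'n'), while an unprintable byte such as 0x01 needs 6 (the \u0001
+ * form), so a caller can size with an empty buffer and then fill:
+ *
+ * need = __wt_json_unpack_char('\n', NULL, 0, 0);
+ * (allocate at least need bytes, then)
+ * (void)__wt_json_unpack_char('\n', buf, need, 0);
+ */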
+
+/*
+ * __wt_json_column_init --
+ * Set json->key_names and json->value_names to comma-separated lists
+ * of column names.
+ */
+int
+__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat,
+ const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf)
+{
+ WT_CURSOR_JSON *json;
+ const char *p, *end, *beginkey;
+ uint32_t keycnt, nkeys;
+
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ beginkey = colconf->str;
+ end = beginkey + colconf->len;
+
+ if (idxconf != NULL) {
+ json->key_names.str = idxconf->str;
+ json->key_names.len = idxconf->len;
+ } else if (colconf->len > 0 && *beginkey == '(') {
+ beginkey++;
+ if (end[-1] == ')')
+ end--;
+ }
+
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
+
+ p = beginkey;
+ keycnt = 0;
+ while (p < end && keycnt < nkeys) {
+ if (*p == ',')
+ keycnt++;
+ p++;
+ }
+ json->value_names.str = p;
+ json->value_names.len = WT_PTRDIFF(end, p);
+ if (idxconf == NULL) {
+ if (p > beginkey)
+ p--;
+ json->key_names.str = beginkey;
+ json->key_names.len = WT_PTRDIFF(p, beginkey);
+ }
+ return (0);
+}
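+
+/*
+ * An illustrative example: for a hypothetical table created with
+ * columns=(id,dept,salary) and key_format "i", nkeys is 1, so key_names
+ * becomes "id" and value_names becomes "dept,salary".
+ */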
+
+#define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \
+ size_t _kwlen = strlen(keyword); \
+ if (strncmp(in, keyword, _kwlen) == 0 && !isalnum(in[_kwlen])) { \
+ in += _kwlen; \
+ result = matchval; \
+ } else { \
+ const char *_bad = in; \
+ while (isalnum(*in)) \
+ in++; \
+ __wt_errx(session, "unknown keyword \"%.*s\" in JSON", \
+ (int)(in - _bad), _bad); \
+ } \
+} while (0)
+
+/*
+ * __wt_json_token --
+ * Return the type, start position and length of the next JSON
+ * token in the input. String tokens include the quotes. JSON
+ * can be entirely parsed using calls to this tokenizer, each
+ * call using a src pointer that is the previously returned
+ * tokstart + toklen.
+ *
+ * The token type returned is one of:
+ * 0 : EOF
+ * 's' : string
+ * 'i' : intnum
+ * 'f' : floatnum
+ * ':' : colon
+ * ',' : comma
+ * '{' : lbrace
+ * '}' : rbrace
+ * '[' : lbracket
+ * ']' : rbracket
+ * 'N' : null
+ * 'T' : true
+ * 'F' : false
+ */
+int
+__wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype,
+ const char **tokstart, size_t *toklen)
+{
+ WT_SESSION_IMPL *session;
+ char ch;
+ const char *bad;
+ int backslash, isalph, isfloat, result;
+
+ result = -1;
+ session = (WT_SESSION_IMPL *)wt_session;
+ while (isspace(*src))
+ src++;
+ *tokstart = src;
+
+ if (*src == '\0') {
+ *toktype = 0;
+ *toklen = 0;
+ return (0);
+ }
+
+ /* JSON is specified in RFC 4627. */
+ switch (*src) {
+ case '"':
+ backslash = 0;
+ src++;
+ while ((ch = *src) != '\0') {
+ if (!backslash) {
+ if (ch == '"') {
+ src++;
+ result = 's';
+ break;
+ }
+ if (ch == '\\')
+ backslash = 1;
+ } else {
+ /* We validate Unicode on this pass. */
+ if (ch == 'u') {
+ u_char ignored;
+ const u_char *uc;
+
+ uc = (const u_char *)src;
+ if (__wt_hex2byte(&uc[1], &ignored) ||
+ __wt_hex2byte(&uc[3], &ignored)) {
+ __wt_errx(session,
+ "invalid Unicode within JSON string");
+ return (-1);
+ }
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 4;
+ }
+ backslash = 0;
+ }
+ src++;
+ }
+ if (result != 's')
+ __wt_errx(session, "unterminated string in JSON");
+ break;
+ case '-':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ isfloat = 0;
+ if (*src == '-')
+ src++;
+ while ((ch = *src) != '\0' && isdigit(ch))
+ src++;
+ if (*src == '.') {
+ isfloat = 1;
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit(ch))
+ src++;
+ }
+ if (*src == 'e' || *src == 'E') {
+ isfloat = 1;
+ src++;
+ if (*src == '+' || *src == '-')
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit(ch))
+ src++;
+ }
+ result = isfloat ? 'f' : 'i';
+ break;
+ case ':':
+ case ',':
+ case '{':
+ case '}':
+ case '[':
+ case ']':
+ result = *src++;
+ break;
+ case 'n':
+ MATCH_KEYWORD(session, src, result, "null", 'N');
+ break;
+ case 't':
+ MATCH_KEYWORD(session, src, result, "true", 'T');
+ break;
+ case 'f':
+ MATCH_KEYWORD(session, src, result, "false", 'F');
+ break;
+ default:
+ /* An illegal token, move past it anyway */
+ bad = src;
+ isalph = isalnum(*src);
+ src++;
+ if (isalph)
+ while (*src != '\0' && isalnum(*src))
+ src++;
+ __wt_errx(session, "unknown token \"%.*s\" in JSON",
+ (int)(src - bad), bad);
+ break;
+ }
+ *toklen = (size_t)(src - *tokstart);
+ *toktype = result;
+ return (result < 0 ? EINVAL : 0);
+}
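+
+/*
+ * An illustrative sketch of the tokenizing loop described above, feeding
+ * tokstart + toklen back in as the next src:
+ *
+ * for (src = json; ; src = tokstart + toklen) {
+ *     ret = __wt_json_token(
+ *         wt_session, src, &toktype, &tokstart, &toklen);
+ *     if (ret != 0 || toktype == 0)
+ *         break;
+ * }
+ */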
+
+/*
+ * __wt_json_tokname --
+ * Return a descriptive name from the token type returned by
+ * __wt_json_token.
+ */
+const char *
+__wt_json_tokname(int toktype)
+{
+ switch (toktype) {
+ case 0: return ("<EOF>");
+ case 's': return ("<string>");
+ case 'i': return ("<integer>");
+ case 'f': return ("<float>");
+ case ':': return ("':'");
+ case ',': return ("','");
+ case '{': return ("'{'");
+ case '}': return ("'}'");
+ case '[': return ("'['");
+ case ']': return ("']'");
+ case 'N': return ("'null'");
+ case 'T': return ("'true'");
+ case 'F': return ("'false'");
+ default: return ("<UNKNOWN>");
+ }
+}
+
+/*
+ * json_string_arg --
+ * Returns a first cut of the needed string in item.
+ * The result has not been stripped of escapes.
+ */
+static int
+json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item)
+{
+ const char *tokstart;
+ int tok;
+ WT_DECL_RET;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &item->size));
+ if (tok == 's') {
+ *jstr = tokstart + item->size;
+ /* The tokenizer includes the '"' chars */
+ item->data = tokstart + 1;
+ item->size -= 2;
+ ret = 0;
+ } else {
+ __wt_errx(session, "expected JSON <string>, got %s",
+ __wt_json_tokname(tok));
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * json_int_arg --
+ * Returns a signed integral value from the current position
+ * in the JSON string.
+ */
+static int
+json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i') {
+ /* JSON only allows decimal */
+ *ip = strtoll(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * json_uint_arg --
+ * Returns an unsigned integral value from the current position
+ * in the JSON string.
+ */
+static int
+json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i' && *tokstart != '-') {
+ /* JSON only allows decimal */
+ *up = strtoull(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected unsigned JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+#define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \
+ int __tok; \
+ WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\
+ if (__tok != tokval) { \
+ __wt_errx(session, "expected JSON %s, got %s", \
+ __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \
+ return (EINVAL); \
+ } \
+ jstr = start + sz; \
+} while (0)
+
+#define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \
+ const char *__start; \
+ size_t __sz; \
+ JSON_EXPECT_TOKEN_GET(session, jstr, tokval, __start, __sz); \
+} while (0)
+
+/*
+ * __json_pack_struct --
+ * Pack a byte string from a JSON string.
+ */
+static int
+__json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size,
+ const char *fmt, const char *jstr)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const char *tokstart;
+ int multi;
+ size_t toksize;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ multi = 0;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ pv.type = fmt[0];
+ WT_PACK_JSON_GET(session, pv, jstr);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ multi = 1;
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __json_pack_size --
+ * Calculate the size of a packed byte string from a JSON string.
+ * We verify that the names and value types provided in JSON match
+ * the column names and type from the schema format, returning error
+ * if not.
+ */
+static int
+__json_pack_size(
+ WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names,
+ int iskey, const char *jstr, size_t *sizep)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const char *tokstart;
+ int multi;
+ size_t toksize, total;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ multi = 0;
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (total = 0; __pack_next(&pack, &pv) == 0;) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ WT_RET(__pack_name_next(&packname, &name));
+ if (toksize - 2 != name.len ||
+ strncmp(tokstart + 1, name.str, toksize - 2) != 0) {
+ __wt_errx(session, "JSON expected %s name: \"%.*s\"",
+ iskey ? "key" : "value", (int)name.len, name.str);
+ return (EINVAL);
+ }
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ total += __pack_size(session, &pv);
+ multi = 1;
+ }
+ /* check end of string */
+ JSON_EXPECT_TOKEN(session, jstr, 0);
+
+ *sizep = total;
+ return (0);
+}
+
+/*
+ * __wt_json_to_item --
+ * Convert a JSON input string for either key/value to a raw WT_ITEM.
+ * Checks that the input matches the expected format.
+ */
+int
+__wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr,
+ const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item)
+{
+ size_t sz;
+ sz = 0; /* Initialize because GCC 4.1 is paranoid */
+
+ WT_RET(__json_pack_size(session, format,
+ iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz));
+ WT_RET(__wt_buf_initsize(session, item, sz));
+ WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr));
+ return (0);
+}
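+
+/*
+ * An illustrative example: for a hypothetical value format "iS" with value
+ * names "id,name", the JSON input
+ *
+ * "id" : 5,
+ * "name" : "abc"
+ *
+ * is sized and packed into the same raw bytes a set_value(cursor, 5, "abc")
+ * call would produce.
+ */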
+
+/*
+ * __wt_json_strlen --
+ * Return the number of bytes represented by a string in JSON format,
+ * or -1 if the format is incorrect.
+ */
+ssize_t
+__wt_json_strlen(const char *src, size_t srclen)
+{
+ const char *srcend;
+ size_t dstlen;
+ u_char hi, lo;
+
+ dstlen = 0;
+ srcend = src + srclen;
+ while (src < srcend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (-1);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (-1);
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 1;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ dstlen += 2;
+ } else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ dstlen++;
+ }
+ /* else 1 byte total */
+ }
+ }
+ dstlen++;
+ src++;
+ }
+ if (src != srcend)
+ return (-1); /* invalid input, e.g. final char is '\\' */
+ return ((ssize_t)dstlen);
+}
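+
+/*
+ * Illustrative examples of the RFC 3629 ranges tested above: the JSON
+ * sequence \u0041 counts as 1 output byte ('A'), \u00e9 as 2 bytes (UTF-8
+ * 0xc3 0xa9) and \u4e2d as 3 bytes.
+ */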
+
+/*
+ * __wt_json_strncpy --
+ * Copy bytes of string in JSON format to a destination,
+ * up to dstlen bytes. If dstlen is greater than the needed size,
+ * the result is zero-padded.
+ */
+int
+__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen)
+{
+ char *dst;
+ const char *dstend, *srcend;
+ u_char hi, lo;
+
+ dst = *pdst;
+ dstend = dst + dstlen;
+ srcend = src + srclen;
+ while (src < srcend && dst < dstend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (EINVAL);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (EINVAL);
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 1;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ /* byte 0: 1110HHHH */
+ /* byte 1: 10HHHHLL */
+ /* byte 2: 10LLLLLL */
+ *dst++ = (char)(0xe0 |
+ ((hi >> 4) & 0x0f));
+ *dst++ = (char)(0x80 |
+ ((hi << 2) & 0x3c) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ /* byte 0: 110HHHLL */
+ /* byte 1: 10LLLLLL */
+ *dst++ = (char)(0xc0 |
+ (hi << 2) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else
+ /* else 1 byte total */
+ /* byte 0: 0LLLLLLL */
+ *dst++ = (char)lo;
+ } else
+ *dst++ = *src;
+ } else
+ *dst++ = *src;
+ src++;
+ }
+ if (src != srcend)
+ return (ENOMEM);
+ *pdst = dst;
+ while (dst < dstend)
+ *dst++ = '\0';
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
new file mode 100644
index 00000000000..803d68e890c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curlog_logrec --
+ * Callback function from log_scan to get a log record.
+ */
+static int
+__curlog_logrec(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ WT_CURSOR_LOG *cl;
+
+ cl = cookie;
+
+ /* Set up the LSNs and take a copy of the log record for the cursor. */
+ *cl->cur_lsn = *lsnp;
+ *cl->next_lsn = *lsnp;
+ cl->next_lsn->offset += (wt_off_t)logrec->size;
+ WT_RET(__wt_buf_set(session, cl->logrec, logrec->data, logrec->size));
+
+ /*
+ * Read the log header. Set up the step pointers to walk the
+ * operations inside the record. Get the record type.
+ */
+ cl->stepp = LOG_SKIP_HEADER(cl->logrec->data);
+ cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size;
+ WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end,
+ &cl->rectype));
+
+ /* A step count of 0 means the entire record. */
+ cl->step_count = 0;
+
+ /*
+ * Unpack the txnid so that we can return each
+ * individual operation for this txnid.
+ */
+ if (cl->rectype == WT_LOGREC_COMMIT)
+ WT_RET(__wt_vunpack_uint(&cl->stepp,
+ WT_PTRDIFF(cl->stepp_end, cl->stepp), &cl->txnid));
+ else {
+ /*
+ * Step over anything else.
+ * Setting stepp to NULL causes the next()
+ * method to read a new record on the next call.
+ */
+ cl->stepp = NULL;
+ cl->txnid = 0;
+ }
+ return (0);
+}
+
+/*
+ * __curlog_compare --
+ * WT_CURSOR.compare method for the log cursor type.
+ */
+static int
+__curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LOG *acl, *bcl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ acl = (WT_CURSOR_LOG *)a;
+ bcl = (WT_CURSOR_LOG *)b;
+ WT_ASSERT(session, cmpp != NULL);
+ *cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ /*
+ * If both are on the same LSN, compare step counter.
+ */
+ if (*cmpp == 0)
+ *cmpp = (acl->step_count != bcl->step_count ?
+ (acl->step_count < bcl->step_count ? -1 : 1) : 0);
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_op_read --
+ * Read out any key/value from an individual operation record
+ * in the log. We're only interested in put and remove operations
+ * since truncate is not a cursor operation. All successful
+ * returns from this function will have set up the cursor copy of
+ * key and value to give the user.
+ */
+static int
+__curlog_op_read(WT_SESSION_IMPL *session,
+ WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid)
+{
+ WT_ITEM key, value;
+ uint64_t recno;
+ const uint8_t *end, *pp;
+
+ pp = cl->stepp;
+ end = pp + opsize;
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
+ fileid, &recno, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
+ case WT_LOGOP_COL_REMOVE:
+ WT_RET(__wt_logop_col_remove_unpack(session, &pp, end,
+ fileid, &recno));
+ WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
+ WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
+ break;
+ case WT_LOGOP_ROW_PUT:
+ WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
+ fileid, &key, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
+ case WT_LOGOP_ROW_REMOVE:
+ WT_RET(__wt_logop_row_remove_unpack(session, &pp, end,
+ fileid, &key));
+ WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
+ WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
+ break;
+ default:
+ /*
+ * Any other operations return the record in the value
+ * and an empty key.
+ */
+ *fileid = 0;
+ WT_RET(__wt_buf_set(session, cl->opkey, NULL, 0));
+ WT_RET(__wt_buf_set(session, cl->opvalue, cl->stepp, opsize));
+ }
+ return (0);
+}
+
+/*
+ * __curlog_kv --
+ * Set the key and value of the log cursor to return to the user.
+ */
+static int
+__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ uint32_t fileid, key_count, opsize, optype;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+ /*
+ * If it is a commit and we have stepped over the header, peek to get
+ * the size and optype and read out any key/value from this operation.
+ */
+ if ((key_count = cl->step_count++) > 0) {
+ WT_RET(__wt_logop_read(session,
+ &cl->stepp, cl->stepp_end, &optype, &opsize));
+ WT_RET(__curlog_op_read(session, cl, optype, opsize, &fileid));
+ /* Position on the beginning of the next record part. */
+ cl->stepp += opsize;
+ } else {
+ optype = WT_LOGOP_INVALID;
+ fileid = 0;
+ cl->opkey->data = NULL;
+ cl->opkey->size = 0;
+ /*
+ * For non-commit records, return the record without the header
+ * and with the size adjusted. Add one to skip over the type,
+ * which is normally consumed by __wt_logrec_read.
+ */
+ cl->opvalue->data = LOG_SKIP_HEADER(cl->logrec->data) + 1;
+ cl->opvalue->size = LOG_REC_SIZE(cl->logrec->size) - 1;
+ }
+ /*
+ * The log cursor sets the LSN and step count as the cursor key,
+ * and log record related data in the value. The data in the value
+ * contains any operation key/value that was in the log record.
+ */
+ __wt_cursor_set_key(cursor, cl->cur_lsn->file, cl->cur_lsn->offset,
+ key_count);
+ __wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype,
+ fileid, cl->opkey, cl->opvalue);
+ return (0);
+}
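+
+/*
+ * An illustrative sketch of reading back the key and value set above; the
+ * argument types follow LOGC_KEY_FORMAT and LOGC_VALUE_FORMAT:
+ *
+ * ret = cursor->get_key(cursor, &lsn_file, &lsn_offset, &counter);
+ * ret = cursor->get_value(cursor,
+ *     &txnid, &rectype, &optype, &fileid, &key, &value);
+ */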
+
+/*
+ * __curlog_next --
+ * WT_CURSOR.next method for the step log cursor type.
+ */
+static int
+__curlog_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /*
+ * If we don't have a record, or went to the end of the record we
+ * have, or we are in the zero-fill portion of the record, get a
+ * new one.
+ */
+ if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
+ cl->txnid = 0;
+ WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ }
+ WT_ASSERT(session, cl->logrec->data != NULL);
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_search --
+ * WT_CURSOR.search method for the log cursor type.
+ */
+static int
+__curlog_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LSN key;
+ WT_SESSION_IMPL *session;
+ uint32_t counter;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * !!! We are ignoring the counter and only searching based on the LSN.
+ */
+ WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl,
+ &key.file, &key.offset, &counter));
+ WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_reset --
+ * WT_CURSOR.reset method for the log cursor type.
+ */
+static int
+__curlog_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+ cl->stepp = cl->stepp_end = NULL;
+ cl->step_count = 0;
+ INIT_LSN(cl->cur_lsn);
+ INIT_LSN(cl->next_lsn);
+ return (0);
+}
+
+/*
+ * __curlog_close --
+ * WT_CURSOR.close method for the log cursor type.
+ */
+static int
+__curlog_close(WT_CURSOR *cursor)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ cl = (WT_CURSOR_LOG *)cursor;
+ conn = S2C(session);
+ WT_ASSERT(session, conn->logging);
+ log = conn->log;
+ WT_TRET(__wt_readunlock(session, log->log_archive_lock));
+ WT_TRET(__curlog_reset(cursor));
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curlog_open --
+ * Initialize a log cursor.
+ */
+int
+__wt_curlog_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curlog_compare, /* compare */
+ __curlog_next, /* next */
+ __wt_cursor_notsup, /* prev */
+ __curlog_reset, /* reset */
+ __curlog_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curlog_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
+ conn = S2C(session);
+ if (!conn->logging)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open a log cursor without logging enabled");
+
+ log = conn->log;
+ cl = NULL;
+ WT_RET(__wt_calloc_def(session, 1, &cl));
+ cursor = &cl->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn));
+ WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
+ cursor->key_format = LOGC_KEY_FORMAT;
+ cursor->value_format = LOGC_VALUE_FORMAT;
+
+ INIT_LSN(cl->cur_lsn);
+ INIT_LSN(cl->next_lsn);
+
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ /* Log cursors are read only. */
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+ /* Log cursors block archiving. */
+ WT_ERR(__wt_readlock(session, log->log_archive_lock));
+
+ if (0) {
+err: if (F_ISSET(cursor, WT_CURSTD_OPEN))
+ WT_TRET(cursor->close(cursor));
+ else {
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ /*
+ * NOTE: We cannot get on the error path with the
+ * readlock held. No need to unlock it unless that
+ * changes above.
+ */
+ __wt_free(session, cl);
+ }
+ *cursorp = NULL;
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
new file mode 100644
index 00000000000..30fe3b28625
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -0,0 +1,444 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Custom NEED macros for metadata cursors that copy the key or value into
+ * the backing metadata table cursor.
+ */
+#define WT_MD_CURSOR_NEEDKEY(cursor) do { \
+ WT_CURSOR_NEEDKEY(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \
+ cursor->key.data, cursor->key.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_KEY_EXT); \
+} while (0)
+
+#define WT_MD_CURSOR_NEEDVALUE(cursor) do { \
+ WT_CURSOR_NEEDVALUE(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \
+ cursor->value.data, cursor->value.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_VALUE_EXT); \
+} while (0)
+
+#define WT_MD_SET_KEY_VALUE(c, mc, fc) do { \
+ (c)->key.data = (fc)->key.data; \
+ (c)->key.size = (fc)->key.size; \
+ (c)->value.data = (fc)->value.data; \
+ (c)->value.size = (fc)->value.size; \
+ F_SET((c), WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_CLR((mc), WT_MDC_ONMETADATA); \
+ F_SET((mc), WT_MDC_POSITIONED); \
+} while (0)
+
+/*
+ * Check if a key matches the metadata. The public value is "metadata:",
+ * but also check for the internal version of the URI.
+ */
+#define WT_KEY_IS_METADATA(key) \
+ (WT_STRING_MATCH(WT_METADATA_URI, (key)->data, (key)->size - 1) ||\
+ WT_STRING_MATCH(WT_METAFILE_URI, (key)->data, (key)->size - 1))
+
+/*
+ * __curmetadata_metadata_search --
+ * Retrieve the metadata for the metadata table
+ */
+static int
+__curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ const char *value;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+
+ /* The metadata search interface allocates a new string in value. */
+ WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value));
+
+ /*
+ * Copy the value to the underlying btree cursor's tmp item which will
+ * be freed when the cursor is closed.
+ */
+ ret = __wt_buf_setstr(session, &cursor->value, value);
+ __wt_free(session, value);
+ WT_RET(ret);
+
+ WT_RET(__wt_buf_setstr(session, &cursor->key, WT_METADATA_URI));
+
+ F_SET(mdc, WT_MDC_ONMETADATA | WT_MDC_POSITIONED);
+ F_SET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ return (0);
+}
+
+/*
+ * __curmetadata_compare --
+ * WT_CURSOR->compare method for the metadata cursor type.
+ */
+static int
+__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR *a_file_cursor, *b_file_cursor;
+ WT_CURSOR_METADATA *a_mdc, *b_mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ a_mdc = ((WT_CURSOR_METADATA *)a);
+ b_mdc = ((WT_CURSOR_METADATA *)b);
+ a_file_cursor = a_mdc->file_cursor;
+ b_file_cursor = b_mdc->file_cursor;
+
+ CURSOR_API_CALL(a, session,
+ compare, ((WT_CURSOR_BTREE *)a_file_cursor)->btree);
+
+ if (b->compare != __curmetadata_compare)
+ WT_ERR_MSG(session, EINVAL,
+ "Can only compare cursors of the same type");
+
+ WT_MD_CURSOR_NEEDKEY(a);
+ WT_MD_CURSOR_NEEDKEY(b);
+
+ if (F_ISSET(a_mdc, WT_MDC_ONMETADATA)) {
+ if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ } else if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = -1;
+ else
+ ret = a_file_cursor->compare(
+ a_file_cursor, b_file_cursor, cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_next --
+ * WT_CURSOR->next method for the metadata cursor type.
+ */
+static int
+__curmetadata_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ next, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (!F_ISSET(mdc, WT_MDC_POSITIONED))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->next(mdc->file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
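+
+/*
+ * An illustrative sketch: iterating a metadata cursor returns the metadata
+ * file's own entry first, then the entries stored in the metadata file.
+ *
+ * ret = session->open_cursor(session, "metadata:", NULL, NULL, &c);
+ * while ((ret = c->next(c)) == 0) {
+ *     ret = c->get_key(c, &key);
+ *     ret = c->get_value(c, &value);
+ * }
+ */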
+
+/*
+ * __curmetadata_prev --
+ * WT_CURSOR->prev method for the metadata cursor type.
+ */
+static int
+__curmetadata_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ prev, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (F_ISSET(mdc, WT_MDC_ONMETADATA)) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ ret = file_cursor->prev(file_cursor);
+ if (ret == 0) {
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ } else if (ret == WT_NOTFOUND)
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_reset --
+ * WT_CURSOR->reset method for the metadata cursor type.
+ */
+static int
+__curmetadata_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ reset, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (F_ISSET(mdc, WT_MDC_POSITIONED) && !F_ISSET(mdc, WT_MDC_ONMETADATA))
+ ret = file_cursor->reset(file_cursor);
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search --
+ * WT_CURSOR->search method for the metadata cursor type.
+ */
+static int
+__curmetadata_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->search(file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search_near --
+ * WT_CURSOR->search_near method for the metadata cursor type.
+ */
+static int
+__curmetadata_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search_near, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key)) {
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ *exact = 1;
+ } else {
+ WT_ERR(file_cursor->search_near(file_cursor, exact));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_insert --
+ * WT_CURSOR->insert method for the metadata cursor type.
+ */
+static int
+__curmetadata_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ insert, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+	 * Since the key/value formats are 'S', the WT_ITEMs must contain
+	 * NUL-terminated strings.
+ */
+ ret =
+ __wt_metadata_insert(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_update --
+ * WT_CURSOR->update method for the metadata cursor type.
+ */
+static int
+__curmetadata_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ update, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+	 * Since the key/value formats are 'S', the WT_ITEMs must contain
+	 * NUL-terminated strings.
+ */
+ ret =
+ __wt_metadata_update(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_remove --
+ * WT_CURSOR->remove method for the metadata cursor type.
+ */
+static int
+__curmetadata_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ remove, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ /*
+	 * Since the key format is 'S', the WT_ITEM must contain a
+	 * NUL-terminated string.
+ */
+ ret = __wt_metadata_remove(session, cursor->key.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_close --
+ * WT_CURSOR->close method for the metadata cursor type.
+ */
+static int
+__curmetadata_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ close, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ ret = file_cursor->close(file_cursor);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curmetadata_open --
+ * WT_SESSION->open_cursor method for metadata cursors.
+ *
+ * Metadata cursors are similar to a file cursor on the special metadata
+ * table, except that the metadata for the metadata table itself (which is
+ * stored in the turtle file) can also be queried.
+ *
+ * Metadata cursors are read-only by default.
+ */
+int
+__wt_curmetadata_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curmetadata_compare, /* compare */
+ __curmetadata_next, /* next */
+ __curmetadata_prev, /* prev */
+ __curmetadata_reset, /* reset */
+ __curmetadata_search, /* search */
+ __curmetadata_search_near, /* search-near */
+ __curmetadata_insert, /* insert */
+ __curmetadata_update, /* update */
+ __curmetadata_remove, /* remove */
+ __curmetadata_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));
+
+ cursor = &mdc->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->key_format = "S";
+ cursor->value_format = "S";
+
+	/* Open the file cursor for operations on the regular metadata. */
+ WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));
+
+ WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+ /* Metadata cursors default to read only. */
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+
+ if (0) {
+err: __wt_free(session, mdc);
+ }
+ return (ret);
+}
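+
+/*
+ * Illustrative sketch, not part of the original change: walking the
+ * metadata through the public API with the cursor type implemented above.
+ * The session handle is assumed to be open; the WT_EXAMPLE_USAGE guard is
+ * hypothetical and keeps the sketch out of builds.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_metadata_scan(WT_SESSION *wt_session)
+{
+	WT_CURSOR *cursor;
+	const char *key, *value;
+	int ret;
+
+	if ((ret = wt_session->open_cursor(
+	    wt_session, "metadata:", NULL, NULL, &cursor)) != 0)
+		return (ret);
+
+	/* Both the key and value formats are NUL-terminated strings. */
+	while ((ret = cursor->next(cursor)) == 0) {
+		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
+		    (ret = cursor->get_value(cursor, &value)) != 0)
+			break;
+		printf("%s -> %s\n", key, value);
+	}
+	if (ret == WT_NOTFOUND)		/* End of the metadata. */
+		ret = 0;
+	(void)cursor->close(cursor);
+	return (ret);
+}
+#endif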
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
new file mode 100644
index 00000000000..c06efced369
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __curstat_next(WT_CURSOR *cursor);
+static int __curstat_prev(WT_CURSOR *cursor);
+
+/*
+ * The statistics identifier is an offset from a base, ensuring the integer ID
+ * values of different statistics sources don't overlap (if they overlapped,
+ * it would be easy for application writers to confuse them).
+ */
+#define WT_STAT_KEY_MAX(cst) (((cst)->stats_base + (cst)->stats_count) - 1)
+#define WT_STAT_KEY_MIN(cst) ((cst)->stats_base)
+#define WT_STAT_KEY_OFFSET(cst) ((cst)->key - (cst)->stats_base)
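+
+/*
+ * For example (numbers illustrative, not the real bases): with a base of
+ * 1000 and a count of 3, the valid keys are 1000 through 1002, and key
+ * 1001 maps to offset 1 in the statistics array.
+ */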
+
+/*
+ * __curstat_print_value --
+ * Convert statistics cursor value to printable format.
+ */
+static int
+__curstat_print_value(WT_SESSION_IMPL *session, uint64_t v, WT_ITEM *buf)
+{
+ if (v >= WT_BILLION)
+ WT_RET(__wt_buf_fmt(session, buf,
+ "%" PRIu64 "B (%" PRIu64 ")", v / WT_BILLION, v));
+ else if (v >= WT_MILLION)
+ WT_RET(__wt_buf_fmt(session, buf,
+ "%" PRIu64 "M (%" PRIu64 ")", v / WT_MILLION, v));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%" PRIu64, v));
+
+ return (0);
+}
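+
+/*
+ * For example, a value of 2500000000 prints as "2B (2500000000)", a value
+ * of 2500000 as "2M (2500000)" and a value of 2500 simply as "2500".
+ */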
+
+/*
+ * __curstat_get_key --
+ * WT_CURSOR->get_key for statistics cursors.
+ */
+static int
+__curstat_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ WT_CURSOR_NEEDKEY(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(__wt_struct_size(
+ session, &size, cursor->key_format, cst->key));
+ WT_ERR(__wt_buf_initsize(session, &cursor->key, size));
+ WT_ERR(__wt_struct_pack(session, cursor->key.mem, size,
+ cursor->key_format, cst->key));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->key.data;
+ item->size = cursor->key.size;
+ } else
+ *va_arg(ap, int *) = cst->key;
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_get_value --
+ * WT_CURSOR->get_value for statistics cursors.
+ */
+static int
+__curstat_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ size_t size;
+ uint64_t *v;
+ const char **p;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+ WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
+ WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
+ cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ } else {
+ /*
+ * Don't drop core if the statistics value isn't requested; NULL
+ * pointer support isn't documented, but it's a cheap test.
+ */
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc;
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->pv.data;
+ if ((v = va_arg(ap, uint64_t *)) != NULL)
+ *v = cst->v;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_set_key --
+ * WT_CURSOR->set_key for statistics cursors.
+ */
+static void
+__curstat_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ ret = __wt_struct_unpack(session, item->data, item->size,
+ cursor->key_format, &cst->key);
+ } else
+ cst->key = va_arg(ap, int);
+ va_end(ap);
+
+ if ((cursor->saved_err = ret) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curstat_set_value --
+ * WT_CURSOR->set_value for statistics cursors.
+ */
+static void
+__curstat_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_UNUSED(cursor);
+ return;
+}
+
+/*
+ * __curstat_next --
+ * WT_CURSOR->next method for the statistics cursor type.
+ */
+static int
+__curstat_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /* Move to the next item. */
+ if (cst->notpositioned) {
+ cst->notpositioned = 0;
+ cst->key = WT_STAT_KEY_MIN(cst);
+ } else if (cst->key < WT_STAT_KEY_MAX(cst))
+ ++cst->key;
+ else {
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_prev --
+ * WT_CURSOR->prev method for the statistics cursor type.
+ */
+static int
+__curstat_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+
+ /* Move to the previous item. */
+ if (cst->notpositioned) {
+ cst->notpositioned = 0;
+ cst->key = WT_STAT_KEY_MAX(cst);
+ } else if (cst->key > WT_STAT_KEY_MIN(cst))
+ --cst->key;
+ else {
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ WT_ERR(WT_NOTFOUND);
+ }
+
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_reset --
+ * WT_CURSOR->reset method for the statistics cursor type.
+ */
+static int
+__curstat_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ cst->notpositioned = 1;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_search --
+ * WT_CURSOR->search method for the statistics cursor type.
+ */
+static int
+__curstat_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ WT_CURSOR_NEEDKEY(cursor);
+	F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+ if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst))
+ WT_ERR(WT_NOTFOUND);
+
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_close --
+ * WT_CURSOR->close method for the statistics cursor type.
+ */
+static int
+__curstat_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ __wt_buf_free(session, &cst->pv);
+
+ WT_ERR(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_conn_init --
+ * Initialize the statistics for a connection.
+ */
+static void
+__curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * Fill in the connection statistics, and copy them to the cursor.
+ * Optionally clear the connection statistics.
+ */
+ __wt_conn_stat_init(session);
+ cst->u.conn_stats = conn->stats;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_connection_stats(&conn->stats);
+
+ cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats;
+ cst->stats_base = WT_CONNECTION_STATS_BASE;
+ cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * When returning the statistics for a file URI, we review open handles and
+ * aggregate checkpoint handle statistics with the file URI statistics. To
+ * make that work, we have to pass information to the function reviewing the
+ * handles; this structure is what we pass.
+ */
+struct __checkpoint_args {
+ const char *name; /* Data source handle name */
+ WT_DSRC_STATS *stats; /* Stat structure being filled */
+ int clear; /* WT_STATISTICS_CLEAR */
+};
+
+/*
+ * __curstat_checkpoint --
+ * Aggregate statistics from checkpoint handles.
+ */
+static int
+__curstat_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ struct __checkpoint_args *args;
+ WT_DATA_HANDLE *dhandle;
+
+ dhandle = session->dhandle;
+ args = (struct __checkpoint_args *)cfg[0];
+
+ /* Aggregate the flagged file's checkpoint handles. */
+ if (dhandle->checkpoint != NULL &&
+ strcmp(dhandle->name, args->name) == 0) {
+ __wt_stat_aggregate_dsrc_stats(&dhandle->stats, args->stats);
+ if (args->clear)
+ __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ }
+
+ return (0);
+}
+
+/*
+ * __curstat_file_init --
+ * Initialize the statistics for a file.
+ */
+static int
+__curstat_file_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ struct __checkpoint_args args;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg_arg[] = { NULL, NULL };
+
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
+ dhandle = session->dhandle;
+
+ /*
+ * Fill in the data source statistics, and copy them to the cursor.
+ * Optionally clear the data source statistics.
+ */
+ if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
+ cst->u.dsrc_stats = dhandle->stats;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ __wt_curstat_dsrc_final(cst);
+ }
+
+ /* Release the handle, we're done with it. */
+ WT_TRET(__wt_session_release_btree(session));
+ WT_RET(ret);
+
+ /*
+ * If no checkpoint was specified, review the open handles and aggregate
+ * the statistics from any checkpoint handles matching this file.
+ */
+ if (dhandle->checkpoint == NULL) {
+ args.name = dhandle->name;
+ args.stats = &cst->u.dsrc_stats;
+ args.clear = F_ISSET(cst, WT_CONN_STAT_CLEAR);
+ cfg_arg[0] = (char *)&args;
+
+ /*
+ * We're likely holding the schema lock inside the statistics
+ * logging thread, not to mention calling __wt_conn_btree_apply
+ * from there as well. Save/restore the handle.
+ */
+ saved_dhandle = dhandle;
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_conn_btree_apply(
+ session, 1, __curstat_checkpoint, cfg_arg));
+ session->dhandle = saved_dhandle;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_dsrc_final --
+ * Finalize a data-source statistics cursor.
+ */
+void
+__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
+{
+ cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+ cst->stats_base = WT_DSRC_STATS_BASE;
+ cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * __wt_curstat_init --
+ * Initialize a statistics cursor.
+ */
+int
+__wt_curstat_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ const char *dsrc_uri;
+
+ cst->notpositioned = 1;
+
+ if (strcmp(uri, "statistics:") == 0) {
+ __curstat_conn_init(session, cst);
+ return (0);
+ }
+
+ dsrc_uri = uri + strlen("statistics:");
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
+ return (
+ __wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "file:"))
+ return (__curstat_file_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "index:"))
+ return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "lsm:"))
+ return (__wt_curstat_lsm_init(session, dsrc_uri, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "table:"))
+ return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst));
+
+ return (__wt_bad_object_type(session, uri));
+}
+
+/*
+ * __wt_curstat_open --
+ * WT_SESSION->open_cursor method for the statistics cursor type.
+ */
+int
+__wt_curstat_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __curstat_get_key, /* get-key */
+ __curstat_get_value, /* get-value */
+ __curstat_set_key, /* set-key */
+ __curstat_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curstat_next, /* next */
+ __curstat_prev, /* prev */
+ __curstat_reset, /* reset */
+ __curstat_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curstat_close); /* close */
+ WT_CONFIG_ITEM cval, sval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_STAT, iface) == 0);
+
+ conn = S2C(session);
+
+ WT_ERR(__wt_calloc_def(session, 1, &cst));
+ cursor = &cst->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ /*
+	 * Statistics cursor configuration: must match (and defaults to) the
+ * database configuration.
+ */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ goto config_err;
+ if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) {
+ if ((ret = __wt_config_subgets(
+ session, &cval, "all", &sval)) == 0 && sval.val != 0) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ goto config_err;
+ F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
+ if (F_ISSET(cst, WT_CONN_STAT_ALL))
+ WT_ERR_MSG(session, EINVAL,
+ "only one statistics configuration value "
+ "may be specified");
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If no configuration, use the connection's configuration. */
+ if (cst->flags == 0) {
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ F_SET(cst, WT_CONN_STAT_ALL);
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST))
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+
+ /* If the connection configures clear, so do we. */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ }
+
+ /*
+ * We return the statistics field's offset as the key, and a string
+ * description, a string value, and a uint64_t value as the value
+ * columns.
+ */
+ cursor->key_format = "i";
+ cursor->value_format = "SSq";
+ WT_ERR(__wt_curstat_init(session, uri, cfg, cst));
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+config_err: WT_ERR_MSG(session, EINVAL,
+ "cursor's statistics configuration doesn't match the "
+ "database statistics configuration");
+ }
+
+ if (0) {
+err: __wt_free(session, cst);
+ }
+
+ return (ret);
+}
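+
+/*
+ * Illustrative sketch, not part of the original change: reading database
+ * statistics through the public API. Per the formats set in
+ * __wt_curstat_open, the key is an "i" integer and the value unpacks as
+ * "SSq": a description, a printable value string and a 64-bit count. The
+ * WT_EXAMPLE_USAGE guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_print_database_stats(WT_SESSION *wt_session)
+{
+	WT_CURSOR *cursor;
+	uint64_t value;
+	int ret;
+	const char *desc, *pvalue;
+
+	if ((ret = wt_session->open_cursor(
+	    wt_session, "statistics:", NULL, NULL, &cursor)) != 0)
+		return (ret);
+	while ((ret = cursor->next(cursor)) == 0) {
+		if ((ret = cursor->get_value(
+		    cursor, &desc, &pvalue, &value)) != 0)
+			break;
+		printf("%s=%s\n", desc, pvalue);
+	}
+	(void)cursor->close(cursor);
+	return (ret == WT_NOTFOUND ? 0 : ret);
+}
+#endif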
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
new file mode 100644
index 00000000000..21d676d943a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cursor_notsup --
+ * Unsupported cursor actions.
+ */
+int
+__wt_cursor_notsup(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (ENOTSUP);
+}
+
+/*
+ * __wt_cursor_noop --
+ * Cursor noop.
+ */
+int
+__wt_cursor_noop(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_set_notsup --
+ * Reset the cursor methods to not-supported.
+ */
+void
+__wt_cursor_set_notsup(WT_CURSOR *cursor)
+{
+ /*
+	 * Set all of the cursor methods (except for close and reset) to fail.
+	 * Close is unchanged so the cursor can be discarded; reset defaults to
+	 * a no-op because session transactional operations reset all of a
+	 * session's cursors, and random cursors shouldn't block transactions
+ * or checkpoints.
+ */
+ cursor->compare =
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->next = __wt_cursor_notsup;
+ cursor->prev = __wt_cursor_notsup;
+ cursor->reset = __wt_cursor_noop;
+ cursor->search = __wt_cursor_notsup;
+ cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+}
+
+/*
+ * __wt_cursor_config_readonly --
+ *	Parse the read-only configuration and set up the cursor appropriately.
+ */
+int
+__wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def)
+{
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "readonly", def, &cval));
+ if (cval.val != 0) {
+ /* Reset all cursor methods that could modify data. */
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ }
+ return (0);
+}
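+
+/*
+ * For example, a cursor opened with the "readonly=true" configuration
+ * string has its insert, update and remove methods replaced by
+ * __wt_cursor_notsup, so any attempt to modify data returns ENOTSUP.
+ */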
+
+/*
+ * __wt_cursor_kv_not_set --
+ * Standard error message for key/values not set.
+ */
+int
+__wt_cursor_kv_not_set(WT_CURSOR *cursor, int key)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_RET_MSG(session,
+ cursor->saved_err == 0 ? EINVAL : cursor->saved_err,
+ "requires %s be set", key ? "key" : "value");
+}
+
+/*
+ * __wt_cursor_get_key --
+ * WT_CURSOR->get_key default implementation.
+ */
+int
+__wt_cursor_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_key --
+ * WT_CURSOR->set_key default implementation.
+ */
+void
+__wt_cursor_set_key(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_get_raw_key --
+ * Temporarily force raw mode in a cursor to get a canonical copy of
+ * the key.
+ */
+int
+__wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ WT_DECL_RET;
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_key(cursor, key);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_key --
+ * Temporarily force raw mode in a cursor to set a canonical copy of
+ * the key.
+ */
+void
+__wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_key(cursor, key);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_raw_value --
+ * Temporarily force raw mode in a cursor to get a canonical copy of
+ * the value.
+ */
+int
+__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ WT_DECL_RET;
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_value --
+ * Temporarily force raw mode in a cursor to set a canonical copy of
+ * the value.
+ */
+void
+__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_keyv --
+ * WT_CURSOR->get_key worker function.
+ */
+int
+__wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *key;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1));
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->raw_recno_buf;
+ WT_ERR(__wt_struct_size(
+ session, &size, "q", cursor->recno));
+ key->size = size;
+ ret = __wt_struct_pack(session, cursor->raw_recno_buf,
+ sizeof(cursor->raw_recno_buf), "q", cursor->recno);
+ } else
+ *va_arg(ap, uint64_t *) = cursor->recno;
+ } else {
+ /* Fast path some common cases. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->key.data;
+ key->size = cursor->key.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->key.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->key.data, cursor->key.size, fmt, ap);
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_keyv --
+ * WT_CURSOR->set_key default implementation.
+ */
+void
+__wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_ITEM *buf, *item;
+ size_t sz;
+ va_list ap_copy;
+ const char *fmt, *str;
+
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ WT_ERR(__wt_struct_unpack(session,
+ item->data, item->size, "q", &cursor->recno));
+ } else
+ cursor->recno = va_arg(ap, uint64_t);
+ if (cursor->recno == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Record numbers must be greater than zero");
+ cursor->key.data = &cursor->recno;
+ sz = sizeof(cursor->recno);
+ } else {
+ /* Fast path some common cases and special case WT_ITEMs. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->key.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1;
+ cursor->key.data = (void *)str;
+ } else {
+ buf = &cursor->key;
+
+ va_copy(ap_copy, ap);
+ ret = __wt_struct_sizev(
+ session, &sz, cursor->key_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(
+ session, buf->mem, sz, cursor->key_format, ap));
+ }
+ }
+ if (sz == 0)
+ WT_ERR_MSG(session, EINVAL, "Empty keys not permitted");
+ else if ((uint32_t)sz != sz)
+ WT_ERR_MSG(session, EINVAL,
+ "Key size (%" PRIu64 ") out of range", (uint64_t)sz);
+ cursor->saved_err = 0;
+ cursor->key.size = sz;
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+
+ API_END(session, ret);
+}
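+
+/*
+ * For example, with key_format "S" an application passes a string to
+ * set_key, with key_format "r" (record-number cursors) it passes a
+ * uint64_t record number, and in raw mode it passes a WT_ITEM whose
+ * data/size fields describe an already-packed key.
+ */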
+
+/*
+ * __wt_cursor_get_value --
+ * WT_CURSOR->get_value default implementation.
+ */
+int
+__wt_cursor_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_valuev(cursor, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_get_valuev --
+ * WT_CURSOR->get_value worker implementation.
+ */
+int
+__wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *value;
+ WT_SESSION_IMPL *session;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0));
+
+ /* Fast path some common cases. */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ value = va_arg(ap, WT_ITEM *);
+ value->data = cursor->value.data;
+ value->size = cursor->value.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->value.data;
+ else if (WT_STREQ(fmt, "t") ||
+ (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t")))
+ *va_arg(ap, uint8_t *) = *(uint8_t *)cursor->value.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->value.data, cursor->value.size, fmt, ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_value --
+ * WT_CURSOR->set_value default implementation.
+ */
+void
+__wt_cursor_set_value(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_valuev(cursor, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_set_valuev --
+ * WT_CURSOR->set_value worker implementation.
+ */
+void
+__wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, *item;
+ WT_SESSION_IMPL *session;
+ const char *fmt, *str;
+ va_list ap_copy;
+ size_t sz;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+ /* Fast path some common cases. */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->value.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1;
+ cursor->value.data = str;
+ } else if (WT_STREQ(fmt, "t") ||
+ (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t"))) {
+ sz = 1;
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ *(uint8_t *)buf->mem = (uint8_t)va_arg(ap, int);
+ } else {
+ va_copy(ap_copy, ap);
+ ret = __wt_struct_sizev(session,
+ &sz, cursor->value_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(session, buf->mem, sz,
+ cursor->value_format, ap));
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ cursor->value.size = sz;
+
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __wt_cursor_close --
+ * WT_CURSOR->close default implementation.
+ */
+int
+__wt_cursor_close(WT_CURSOR *cursor)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_buf_free(session, &cursor->key);
+ __wt_buf_free(session, &cursor->value);
+
+ if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
+ TAILQ_REMOVE(&session->cursors, cursor, q);
+
+ WT_STAT_FAST_DATA_DECR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open);
+ }
+
+ __wt_free(session, cursor->internal_uri);
+ __wt_free(session, cursor->uri);
+ __wt_overwrite_and_free(session, cursor);
+ return (ret);
+}
+
+/*
+ * __cursor_runtime_config --
+ * Set runtime-configurable settings.
+ */
+static int
+__cursor_runtime_config(WT_CURSOR *cursor, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ /*
+ * !!!
+ * There's no way yet to reconfigure cursor flags at runtime; if, in
+	 * the future, there is a way to do that, similar support needs to be
+	 * added for data-source cursors, or this call needs to return an
+ * error in the case of a data-source cursor.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval));
+ if (cval.val)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+ else
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_dup_position --
+ * Set a cursor to another cursor's position.
+ */
+int
+__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
+{
+ WT_ITEM key;
+
+ /*
+ * Get a copy of the cursor's raw key, and set it in the new cursor,
+ * then search for that key to position the cursor.
+ *
+ * We don't clear the WT_ITEM structure: all that happens when getting
+ * and setting the key is the data/size fields are reset to reference
+ * the original cursor's key.
+ *
+ * That said, we're playing games with the cursor flags: setting the key
+ * sets the key/value application-set flags in the new cursor, which may
+ * or may not be correct, but there's nothing simple that fixes it. We
+ * depend on the subsequent cursor search to clean things up, as search
+ * is required to copy and/or reference private memory after success.
+ */
+ WT_RET(__wt_cursor_get_raw_key(to_dup, &key));
+ __wt_cursor_set_raw_key(cursor, &key);
+
+ /*
+ * We now have a reference to the raw key, but we don't know anything
+ * about the memory in which it's stored, it could be btree/file page
+ * memory in the cache, application memory or the original cursor's
+ * key/value WT_ITEMs. Memory allocated in support of another cursor
+ * could be discarded when that cursor is closed, so it's a problem.
+ * However, doing a search to position the cursor will fix the problem:
+ * cursors cannot reference application memory after cursor operations
+ * and that requirement will save the day.
+ */
+ WT_RET(cursor->search(cursor));
+
+ return (0);
+}
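+
+/*
+ * Illustrative sketch, not part of the original change: cursor duplication
+ * is exposed through WT_SESSION::open_cursor by passing the positioned
+ * cursor as the to_dup argument with a NULL URI. The WT_EXAMPLE_USAGE
+ * guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_duplicate_cursor(WT_SESSION *wt_session, WT_CURSOR *positioned)
+{
+	WT_CURSOR *dup;
+	int ret;
+
+	/* The new cursor comes back positioned on the same key. */
+	if ((ret = wt_session->open_cursor(
+	    wt_session, NULL, positioned, NULL, &dup)) != 0)
+		return (ret);
+	return (dup->close(dup));
+}
+#endif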
+
+/*
+ * __wt_cursor_init --
+ * Default cursor initialization.
+ */
+int
+__wt_cursor_init(WT_CURSOR *cursor,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cdump;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ if (cursor->internal_uri == NULL)
+ WT_RET(__wt_strdup(session, uri, &cursor->internal_uri));
+
+ /* Set runtime-configurable settings. */
+ WT_RET(__cursor_runtime_config(cursor, cfg));
+
+ /*
+ * append
+ * The append flag is only relevant to column stores.
+ */
+ if (WT_CURSOR_RECNO(cursor)) {
+ WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_APPEND);
+ }
+
+ /*
+ * checkpoint
+ * Checkpoint cursors are read-only.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ }
+
+ /*
+ * dump
+ * If an index cursor is opened with dump, then this
+ * function is called on the index files, with the dump
+ * config string, and with the index cursor as an owner.
+ * We don't want to create a dump cursor in that case, because
+ * we'll create the dump cursor on the index cursor itself.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "dump", 0, &cval));
+ if (cval.len != 0 && owner == NULL) {
+ F_SET(cursor,
+ WT_STRING_MATCH("json", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_JSON :
+ (WT_STRING_MATCH("print", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_PRINT : WT_CURSTD_DUMP_HEX));
+ /*
+ * Dump cursors should not have owners: only the
+ * top-level cursor should be wrapped in a dump cursor.
+ */
+ WT_RET(__wt_curdump_create(cursor, owner, &cdump));
+ owner = cdump;
+ } else
+ cdump = NULL;
+
+ /* raw */
+ WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_RAW);
+
+ /* readonly */
+ WT_RET(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ /*
+ * Cursors that are internal to some other cursor (such as file cursors
+ * inside a table cursor) should be closed after the containing cursor.
+ * Arrange for that to happen by putting internal cursors after their
+ * owners on the queue.
+ */
+ if (owner != NULL) {
+ WT_ASSERT(session, F_ISSET(owner, WT_CURSTD_OPEN));
+ TAILQ_INSERT_AFTER(&session->cursors, owner, cursor, q);
+ } else
+ TAILQ_INSERT_HEAD(&session->cursors, cursor, q);
+
+ F_SET(cursor, WT_CURSTD_OPEN);
+ WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);
+
+ *cursorp = (cdump != NULL) ? cdump : cursor;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
new file mode 100644
index 00000000000..ea267f96f9c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -0,0 +1,808 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __curtable_open_indices(WT_CURSOR_TABLE *ctable);
+static int __curtable_update(WT_CURSOR *cursor);
+
+#define APPLY_CG(ctable, f) do { \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ for (__i = 0, __cp = ctable->cg_cursors; \
+ __i < WT_COLGROUPS(ctable->table); \
+ __i++, __cp++) \
+ WT_TRET((*__cp)->f(*__cp)); \
+} while (0)
+
+#define APPLY_IDX(ctable, f) do { \
+ WT_INDEX *idx; \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ __cp = (ctable)->idx_cursors; \
+ for (__i = 0; __i < ctable->table->nindices; __i++, __cp++) { \
+ idx = ctable->table->indices[__i]; \
+ WT_ERR(__wt_schema_project_merge(session, \
+ ctable->cg_cursors, \
+ idx->key_plan, idx->key_format, &(*__cp)->key)); \
+ F_SET(*__cp, WT_CURSTD_KEY_EXT | \
+ WT_CURSTD_VALUE_EXT); \
+ WT_ERR((*__cp)->f(*__cp)); \
+ WT_ERR((*__cp)->reset(*__cp)); \
+ } \
+} while (0)
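+
+/*
+ * For example, APPLY_CG(ctable, reset) expands to a loop invoking reset on
+ * every column-group cursor, accumulating any failure in ret through
+ * WT_TRET; APPLY_IDX additionally projects the current row into each index
+ * cursor's key before invoking the method, then resets the index cursor.
+ */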
+
+/*
+ * __wt_curtable_get_key --
+ * WT_CURSOR->get_key implementation for tables.
+ */
+int
+__wt_curtable_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_curtable_get_value --
+ * WT_CURSOR->get_value implementation for tables.
+ */
+int
+__wt_curtable_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(primary);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
+ ret = __wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curtable_set_key --
+ * WT_CURSOR->set_key implementation for tables.
+ */
+void
+__wt_curtable_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp, *primary;
+ WT_CURSOR_TABLE *ctable;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ if (!F_ISSET(primary, WT_CURSTD_KEY_SET))
+ return;
+
+ /* Copy the primary key to the other cursors. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ }
+}
+
+/*
+ * __wt_curtable_set_value --
+ * WT_CURSOR->set_value implementation for tables.
+ */
+void
+__wt_curtable_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) {
+ item = va_arg(ap, WT_ITEM *);
+ cursor->value.data = item->data;
+ cursor->value.size = item->size;
+ ret = __wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value);
+ } else
+ ret = __wt_schema_project_in(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(ctable->table); i++, cp++)
+ if (ret == 0)
+ F_SET(*cp, WT_CURSTD_VALUE_EXT);
+ else {
+ (*cp)->saved_err = ret;
+ F_CLR(*cp, WT_CURSTD_VALUE_SET);
+ }
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curtable_compare --
+ * WT_CURSOR->compare implementation for tables.
+ */
+static int
+__curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * call the underlying object's comparison routine.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(a));
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(b));
+
+ ret = WT_CURSOR_PRIMARY(a)->compare(
+ WT_CURSOR_PRIMARY(a), WT_CURSOR_PRIMARY(b), cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next --
+ * WT_CURSOR->next method for the table cursor type.
+ */
+static int
+__curtable_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ APPLY_CG(ctable, next);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next_random --
+ * WT_CURSOR->next method for the table cursor type when configured with
+ * next_random.
+ */
+static int
+__curtable_next_random(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ cp = ctable->cg_cursors;
+
+	/* Split out the first next; it retrieves the random record. */
+ primary = *cp++;
+ WT_ERR(primary->next(primary));
+
+ /* Fill in the rest of the columns. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_prev --
+ * WT_CURSOR->prev method for the table cursor type.
+ */
+static int
+__curtable_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ APPLY_CG(ctable, prev);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_reset --
+ * WT_CURSOR->reset method for the table cursor type.
+ */
+static int
+__curtable_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ APPLY_CG(ctable, reset);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search --
+ * WT_CURSOR->search method for the table cursor type.
+ */
+static int
+__curtable_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ APPLY_CG(ctable, search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search_near --
+ * WT_CURSOR->search_near method for the table cursor type.
+ */
+static int
+__curtable_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_CURSOR *primary, **cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ cp = ctable->cg_cursors;
+ primary = *cp;
+ WT_ERR(primary->search_near(primary, exact));
+
+	for (i = 1, ++cp; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_insert --
+ * WT_CURSOR->insert method for the table cursor type.
+ */
+static int
+__curtable_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint32_t flag_orig;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+	 * Split out the first insert; it may be allocating a recno.
+ *
+ * If the table has indices, we also need to know whether this record
+ * is replacing an existing record so that the existing index entries
+ * can be removed. We discover if this is an overwrite by configuring
+ * the primary cursor for no-overwrite, and checking if the insert
+ * detects a duplicate key.
+ */
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ flag_orig = F_ISSET(primary, WT_CURSTD_OVERWRITE);
+ if (ctable->table->nindices > 0)
+ F_CLR(primary, WT_CURSTD_OVERWRITE);
+ ret = primary->insert(primary);
+ F_SET(primary, flag_orig);
+
+ if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ /*
+ * !!!
+ * The insert failure clears these flags, but does not touch the
+ * items. We could make a copy each time for overwrite cursors,
+ * but for now we just reset the flags.
+ */
+ F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ ret = __curtable_update(cursor);
+ goto err;
+ }
+ WT_ERR(ret);
+
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ WT_ERR((*cp)->insert(*cp));
+ }
+
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_update --
+ * WT_CURSOR->update method for the table cursor type.
+ */
+static int
+__curtable_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+ * If the table has indices, first delete any old index keys, then
+ * update the primary, then insert the new index keys. This is
+ * complicated by the fact that we need the old value to generate the
+ * old index keys, so we make a temporary copy of the new value.
+ */
+ if (ctable->table->nindices > 0) {
+ WT_ERR(__wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value));
+ APPLY_CG(ctable, search);
+ /*
+ * Remove only if the key exists.
+ */
+ if (ret == 0) {
+ APPLY_IDX(ctable, remove);
+ WT_ERR(__wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value));
+ } else if (ret == WT_NOTFOUND)
+ ret = 0;
+ else
+ WT_ERR(ret);
+ }
+ APPLY_CG(ctable, update);
+ WT_ERR(ret);
+ if (ctable->idx_cursors != NULL)
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_remove --
+ * WT_CURSOR->remove method for the table cursor type.
+ */
+static int
+__curtable_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+	/* Find the old record so it can be removed from indices. */
+ if (ctable->table->nindices > 0) {
+ APPLY_CG(ctable, search);
+ WT_ERR(ret);
+ APPLY_IDX(ctable, remove);
+ }
+
+ APPLY_CG(ctable, remove);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __wt_table_range_truncate --
+ * Truncate of a cursor range, table implementation.
+ */
+int
+__wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop)
+{
+ WT_CURSOR *wt_start, *wt_stop;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ITEM raw;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp;
+
+ ctable = (start != NULL) ? start : stop;
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ wt_start = &start->iface;
+ wt_stop = &stop->iface;
+
+ /* Open any indices. */
+ WT_RET(__curtable_open_indices(ctable));
+ WT_RET(__wt_scr_alloc(session, 128, &key));
+
+ /*
+ * Step through the cursor range, removing the index entries.
+ *
+ * If there are indices, copy the key we're using to step through the
+ * cursor range (so we can reset the cursor to its original position),
+ * then remove all of the index records in the truncated range. Copy
+ * the raw key because the memory is only valid until the cursor moves.
+ */
+ if (ctable->table->nindices > 0) {
+ if (start == NULL) {
+ WT_ERR(__wt_cursor_get_raw_key(wt_stop, &raw));
+ WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+ do {
+ APPLY_CG(stop, search);
+ WT_ERR(ret);
+ APPLY_IDX(stop, remove);
+ } while ((ret = wt_stop->prev(wt_stop)) == 0);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ __wt_cursor_set_raw_key(wt_stop, key);
+ APPLY_CG(stop, search);
+ } else {
+ WT_ERR(__wt_cursor_get_raw_key(wt_start, &raw));
+ WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+ cmp = -1;
+ do {
+ APPLY_CG(start, search);
+ WT_ERR(ret);
+ APPLY_IDX(start, remove);
+ if (stop != NULL)
+ WT_ERR(wt_start->compare(
+ wt_start, wt_stop,
+ &cmp));
+ } while (cmp < 0 &&
+ (ret = wt_start->next(wt_start)) == 0);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ __wt_cursor_set_raw_key(wt_start, key);
+ APPLY_CG(start, search);
+ }
+ }
+
+ /* Truncate the column groups. */
+ for (i = 0; i < WT_COLGROUPS(ctable->table); i++)
+ WT_ERR(__wt_range_truncate(
+ (start == NULL) ? NULL : start->cg_cursors[i],
+ (stop == NULL) ? NULL : stop->cg_cursors[i]));
+
+err: __wt_scr_free(&key);
+ return (ret);
+}
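+
+/*
+ * Illustrative sketch, not part of the original change: range truncation
+ * is normally driven through WT_SESSION::truncate, passing a pair of
+ * positioned cursors bounding the range (either bound may be NULL). The
+ * WT_EXAMPLE_USAGE guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_truncate_range(
+    WT_SESSION *wt_session, WT_CURSOR *start, WT_CURSOR *stop)
+{
+	/* Both cursors must be positioned within the same table. */
+	return (wt_session->truncate(wt_session, NULL, start, stop, NULL));
+}
+#endif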
+
+/*
+ * __curtable_close --
+ * WT_CURSOR->close method for the table cursor type.
+ */
+static int
+__curtable_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_CURSOR **cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(ctable->table); i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ if (ctable->idx_cursors != NULL)
+ for (i = 0, cp = ctable->idx_cursors;
+ i < ctable->table->nindices; i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ if (ctable->plan != ctable->table->plan)
+ __wt_free(session, ctable->plan);
+ for (i = 0; ctable->cfg[i] != NULL; ++i)
+ __wt_free(session, ctable->cfg[i]);
+ __wt_free(session, ctable->cfg);
+ if (cursor->value_format != ctable->table->value_format)
+ __wt_free(session, cursor->value_format);
+ __wt_free(session, ctable->cg_cursors);
+ __wt_free(session, ctable->idx_cursors);
+ __wt_schema_release_table(session, ctable->table);
+ /* The URI is owned by the table. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_open_colgroups --
+ * Open cursors on column groups for a table cursor.
+ */
+static int
+__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
+{
+ WT_SESSION_IMPL *session;
+ WT_TABLE *table;
+ WT_CURSOR **cp;
+ /*
+ * Underlying column groups are always opened without dump, and only
+ * the primary is opened with next_random.
+ */
+ const char *cfg[] = {
+ cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL, NULL
+ };
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ table = ctable->table;
+
+ if (!table->cg_complete)
+ WT_RET_MSG(session, EINVAL,
+ "Can't use '%s' until all column groups are created",
+ table->name);
+
+ WT_RET(__wt_calloc_def(session,
+ WT_COLGROUPS(table), &ctable->cg_cursors));
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(table);
+ i++, cp++) {
+ WT_RET(__wt_open_cursor(session, table->cgroups[i]->source,
+ &ctable->iface, cfg, cp));
+ cfg[3] = "next_random=false";
+ }
+ return (0);
+}
+
+/*
+ * __curtable_open_indices --
+ * Open cursors on indices for a table cursor.
+ */
+static int
+__curtable_open_indices(WT_CURSOR_TABLE *ctable)
+{
+ WT_CURSOR **cp, *primary;
+ WT_SESSION_IMPL *session;
+ WT_TABLE *table;
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ table = ctable->table;
+
+ WT_RET(__wt_schema_open_indices(session, table));
+ if (table->nindices == 0 || ctable->idx_cursors != NULL)
+ return (0);
+
+ /* Check for bulk cursors. */
+ primary = *ctable->cg_cursors;
+ if (F_ISSET(primary, WT_CURSTD_BULK))
+ WT_RET_MSG(session, ENOTSUP,
+ "Bulk load is not supported for tables with indices");
+
+ WT_RET(__wt_calloc_def(session, table->nindices, &ctable->idx_cursors));
+ for (i = 0, cp = ctable->idx_cursors; i < table->nindices; i++, cp++)
+ WT_RET(__wt_open_cursor(session, table->indices[i]->source,
+ &ctable->iface, ctable->cfg, cp));
+ return (0);
+}
+
+/*
+ * __wt_curtable_open --
+ * WT_SESSION->open_cursor method for table cursors.
+ */
+int
+__wt_curtable_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_curtable_get_key, /* get-key */
+ __wt_curtable_get_value, /* get-value */
+ __wt_curtable_set_key, /* set-key */
+ __wt_curtable_set_value, /* set-value */
+ __curtable_compare, /* compare */
+ __curtable_next, /* next */
+ __curtable_prev, /* prev */
+ __curtable_reset, /* reset */
+ __curtable_search, /* search */
+ __curtable_search_near, /* search-near */
+ __curtable_insert, /* insert */
+ __curtable_update, /* update */
+ __curtable_remove, /* remove */
+ __curtable_close); /* close */
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_TABLE *table;
+ size_t size;
+ int cfg_cnt;
+ const char *tablename, *columns;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_TABLE, iface) == 0);
+
+ ctable = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "table:"))
+ return (EINVAL);
+ columns = strchr(tablename, '(');
+ if (columns == NULL)
+ size = strlen(tablename);
+ else
+ size = WT_PTRDIFF(columns, tablename);
+ WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
+
+ if (table->is_simple) {
+ /* Just return a cursor on the underlying data source. */
+ ret = __wt_open_cursor(session,
+ table->cgroups[0]->source, NULL, cfg, cursorp);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+ }
+
+ WT_RET(__wt_calloc_def(session, 1, &ctable));
+
+ cursor = &ctable->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->internal_uri = table->name;
+ cursor->key_format = table->key_format;
+ cursor->value_format = table->value_format;
+
+ ctable->table = table;
+ ctable->plan = table->plan;
+
+ /* Handle projections. */
+ if (columns != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_struct_reformat(session, table,
+ columns, strlen(columns), NULL, 1, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cursor->value_format));
+
+ WT_ERR(__wt_buf_init(session, tmp, 0));
+ WT_ERR(__wt_struct_plan(session, table,
+ columns, strlen(columns), 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &ctable->plan));
+ }
+
+ /*
+ * random_retrieval
+ * Random retrieval cursors only support next, reset and close.
+ */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __curtable_next_random;
+ cursor->reset = __curtable_reset;
+ }
+
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, NULL, cfg, cursorp));
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_column_init(cursor, table->key_format,
+ NULL, &table->colconf));
+
+ /*
+ * Open the colgroup cursors immediately: we're going to need them for
+ * any operation. We defer opening index cursors until we need them
+ * for an update. Note that this must come after the call to
+ * __wt_cursor_init: the table cursor must already be on the list of
+ * session cursors or we can't work out where to put the colgroup
+ * cursor(s).
+ */
+ WT_ERR(__curtable_open_colgroups(ctable, cfg));
+
+ /*
+	 * We'll need to squirrel away a copy of the cursor configuration
+	 * in case we open indices later.
+ *
+ * cfg[0] is the baseline configuration for the cursor open and we can
+ * acquire another copy from the configuration structures, so it would
+	 * be reasonable not to copy it here, but I'd rather be safe than sorry.
+ *
+ * Underlying indices are always opened without dump.
+ */
+ for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+ ;
+ WT_ERR(__wt_calloc_def(session, cfg_cnt + 2, &ctable->cfg));
+ for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+ WT_ERR(
+ __wt_strdup(session, cfg[cfg_cnt], &ctable->cfg[cfg_cnt]));
+ WT_ERR(__wt_strdup(session, "dump=\"\"", &ctable->cfg[cfg_cnt]));
+
+ if (0) {
+err: WT_TRET(__curtable_close(cursor));
+ *cursorp = NULL;
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
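+
+/*
+ * Illustrative usage (a sketch, not part of this change): a projection
+ * cursor is requested by appending a parenthesized column list to the
+ * table URI, which is what the strchr(tablename, '(') check above parses.
+ * An application might open one as:
+ *
+ *	WT_CURSOR *c;
+ *	ret = session->open_cursor(
+ *	    session, "table:mytable(name,address)", NULL, NULL, &c);
+ *
+ * The table and column names here are hypothetical.
+ */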
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
new file mode 100644
index 00000000000..e358d22b278
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Standard entry points to the API: declares/initializes local variables. */
+#define API_SESSION_INIT(s, h, n, cur, dh) \
+ WT_DATA_HANDLE *__olddh = (s)->dhandle; \
+ const char *__oldname = (s)->name; \
+ (s)->cursor = (cur); \
+ (s)->dhandle = (dh); \
+	(s)->name = (s)->lastop = #h "." #n;
+
+#define API_CALL_NOCONF(s, h, n, cur, dh) do { \
+ API_SESSION_INIT(s, h, n, cur, dh); \
+ WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0); \
+ WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+#define API_CALL(s, h, n, cur, dh, config, cfg) do { \
+ const char *cfg[] = \
+ { WT_CONFIG_BASE(s, h##_##n), config, NULL }; \
+ API_SESSION_INIT(s, h, n, cur, dh); \
+ WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0); \
+ WT_ERR(((config) != NULL) ? \
+ __wt_config_check((s), \
+ WT_CONFIG_REF(session, h##_##n), (config), 0) : 0); \
+ WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+#define API_END(s, ret) \
+ if ((s) != NULL) { \
+ (s)->dhandle = __olddh; \
+ (s)->name = __oldname; \
+ if (F_ISSET(&(s)->txn, TXN_RUNNING) && \
+ (ret) != 0 && \
+ (ret) != WT_NOTFOUND && \
+ (ret) != WT_DUPLICATE_KEY) \
+ F_SET(&(s)->txn, TXN_ERROR); \
+ } \
+} while (0)
+
+/* An API call wrapped in a transaction if necessary. */
+#define TXN_API_CALL(s, h, n, cur, bt, config, cfg) do { \
+ int __autotxn = 0; \
+	API_CALL(s, h, n, cur, bt, config, cfg);			\
+ __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/* An API call with no configuration, wrapped in a transaction if necessary. */
+#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \
+ int __autotxn = 0; \
+ API_CALL_NOCONF(s, h, n, cur, bt); \
+ __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/* End a transactional API call, optional retry on deadlock. */
+#define TXN_API_END_RETRY(s, ret, retry) \
+ API_END(s, ret); \
+ if (__autotxn) { \
+ if (F_ISSET(&(s)->txn, TXN_AUTOCOMMIT)) \
+ F_CLR(&(s)->txn, TXN_AUTOCOMMIT); \
+ else if (ret == 0 && !F_ISSET(&(s)->txn, TXN_ERROR)) \
+ ret = __wt_txn_commit((s), NULL); \
+ else { \
+ WT_TRET(__wt_txn_rollback((s), NULL)); \
+ if ((ret == 0 || ret == WT_ROLLBACK) && \
+ (retry)) { \
+ ret = 0; \
+ continue; \
+ } \
+ WT_TRET(__wt_session_reset_cursors(s)); \
+ } \
+ } \
+ break; \
+} while (ret == 0)
+
+/* End a transactional API call, retry on deadlock. */
+#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1)
+
+/*
+ * In almost all cases, API_END is followed by an immediate return, so
+ * make that simple.  If a session or connection method is about to
+ * return WT_NOTFOUND (some underlying object was not found), map it to
+ * ENOENT: only cursor methods return WT_NOTFOUND.
+ */
+#define API_END_RET(s, ret) \
+ API_END(s, ret); \
+ return (ret)
+#define API_END_RET_NOTFOUND_MAP(s, ret) \
+ API_END(s, ret); \
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret))
+
+#define CONNECTION_API_CALL(conn, s, n, config, cfg) \
+ s = (conn)->default_session; \
+ API_CALL(s, connection, n, NULL, NULL, config, cfg)
+
+#define CONNECTION_API_CALL_NOCONF(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, connection, n, NULL, NULL)
+
+#define SESSION_API_CALL(s, n, config, cfg) \
+ API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define SESSION_API_CALL_NOCONF(s, n) \
+ API_CALL_NOCONF(s, session, n, NULL, NULL)
+
+#define SESSION_TXN_API_CALL(s, n, config, cfg) \
+ TXN_API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define CURSOR_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ TXN_API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define CURSOR_UPDATE_API_END(s, ret) \
+ TXN_API_END(s, ret)
+
+#define ASYNCOP_API_CALL(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, asyncop, n, NULL, NULL)
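+
+/*
+ * Illustrative expansion (a sketch, assuming a hypothetical session method
+ * named "example"): a method body brackets its work with a call/end pair,
+ * picking up configuration checking, panic checks and error mapping:
+ *
+ *	WT_DECL_RET;
+ *	SESSION_API_CALL(session, example, config, cfg);
+ *	... the method body, jumping to err on failure ...
+ * err:	API_END_RET_NOTFOUND_MAP(session, ret);
+ */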
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
new file mode 100644
index 00000000000..8565874c2f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+typedef enum {
+ WT_ASYNCOP_ENQUEUED, /* Placed on the work queue */
+ WT_ASYNCOP_FREE, /* Able to be allocated to user */
+ WT_ASYNCOP_READY, /* Allocated and ready for user to use */
+ WT_ASYNCOP_WORKING /* Operation in progress by worker */
+} WT_ASYNC_STATE;
+
+typedef enum {
+ WT_ASYNC_FLUSH_NONE=0, /* No flush in progress */
+ WT_ASYNC_FLUSH_COMPLETE, /* Notify flush caller it's done */
+ WT_ASYNC_FLUSH_IN_PROGRESS, /* Prevent other callers */
+ WT_ASYNC_FLUSHING /* Notify workers */
+} WT_ASYNC_FLUSH_STATE;
+
+#define MAX_ASYNC_SLEEP_USECS 100000 /* Maximum sleep waiting for work */
+#define MAX_ASYNC_YIELD 200 /* Maximum number of yields for work */
+
+#define O2C(op) ((WT_CONNECTION_IMPL *)(op)->iface.connection)
+#define O2S(op) \
+ (((WT_CONNECTION_IMPL *)(op)->iface.connection)->default_session)
+/*
+ * WT_ASYNC_FORMAT --
+ * The URI/config/format cache.
+ */
+struct __wt_async_format {
+ STAILQ_ENTRY(__wt_async_format) q;
+ const char *config;
+ uint64_t cfg_hash; /* Config hash */
+ const char *uri;
+ uint64_t uri_hash; /* URI hash */
+ const char *key_format;
+ const char *value_format;
+};
+
+/*
+ * WT_ASYNC_OP_IMPL --
+ * Implementation of the WT_ASYNC_OP.
+ */
+struct __wt_async_op_impl {
+ WT_ASYNC_OP iface;
+
+ WT_ASYNC_CALLBACK *cb;
+
+ uint32_t internal_id; /* Array position id. */
+ uint64_t unique_id; /* Unique identifier. */
+
+ WT_ASYNC_FORMAT *format; /* Format structure */
+ WT_ASYNC_STATE state; /* Op state */
+ WT_ASYNC_OPTYPE optype; /* Operation type */
+};
+
+/*
+ * Definition of the async subsystem.
+ */
+struct __wt_async {
+ /*
+ * Ops array protected by the ops_lock.
+ */
+ WT_SPINLOCK ops_lock; /* Locked: ops array */
+ WT_ASYNC_OP_IMPL *async_ops; /* Async ops */
+#define OPS_INVALID_INDEX 0xffffffff
+ uint32_t ops_index; /* Active slot index */
+ uint64_t op_id; /* Unique ID counter */
+ WT_ASYNC_OP_IMPL **async_queue; /* Async ops work queue */
+ uint32_t async_qsize; /* Async work queue size */
+ /*
+	 * We need two head values and two tail values.  All but one are
+	 * maintained as ever-increasing values to ease wraparound.
+	 *
+	 * alloc_head: the next slot to allocate for producers.
+	 * head: the current head visible to consumers.
+	 *	head is always <= alloc_head.
+	 * alloc_tail: the next slot for consumers to dequeue.
+	 *	alloc_tail is always <= head.
+	 * tail_slot: the last slot consumed.
+	 *	A producer may need to wait for tail_slot to advance.
+ */
+ uint64_t alloc_head; /* Next slot to enqueue */
+ uint64_t head; /* Head visible to worker */
+ uint64_t alloc_tail; /* Next slot to dequeue */
+ uint64_t tail_slot; /* Worker slot consumed */
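+
+	/*
+	 * Illustrative arithmetic (a sketch, not part of the original):
+	 * the ever-increasing values satisfy
+	 *	alloc_tail <= head <= alloc_head
+	 * and map to queue slots as, for example,
+	 *	slot = alloc_head % async_qsize
+	 * (the exact mapping in the implementation may differ); tail_slot
+	 * tracks a slot itself, which is why it is the one value that is
+	 * not ever-increasing.
+	 */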
+
+ STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+ int cur_queue; /* Currently enqueued */
+ int max_queue; /* Maximum enqueued */
+ WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */
+ /* Notify any waiting threads when flushing is done. */
+ WT_CONDVAR *flush_cond;
+ WT_ASYNC_OP_IMPL flush_op; /* Special flush op */
+ uint32_t flush_count; /* Worker count */
+ uint64_t flush_gen; /* Flush generation number */
+
+#define WT_ASYNC_MAX_WORKERS 20
+ WT_SESSION_IMPL *worker_sessions[WT_ASYNC_MAX_WORKERS];
+ /* Async worker threads */
+ wt_thread_t worker_tids[WT_ASYNC_MAX_WORKERS];
+
+ uint32_t flags; /* Currently unused. */
+};
+
+/*
+ * WT_ASYNC_CURSOR --
+ * Async container for a cursor. Each async worker thread
+ * has a cache of async cursors to reuse for operations.
+ */
+struct __wt_async_cursor {
+ STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
+ uint64_t cfg_hash; /* Config hash */
+ uint64_t uri_hash; /* URI hash */
+ WT_CURSOR *c; /* WT cursor */
+};
+
+/*
+ * WT_ASYNC_WORKER_STATE --
+ * State for an async worker thread.
+ */
+struct __wt_async_worker_state {
+ uint32_t id;
+ STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
+ uint32_t num_cursors;
+};
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
new file mode 100644
index 00000000000..95af6731bf9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -0,0 +1,316 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Paul Vixie.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/sys/bitstring.h,v 1.5 2005/01/07 02:29:23 imp Exp $
+ */
+
+	/* byte of the bitstring the bit is in */
+#define __bit_byte(bit) ((bit) >> 3)
+
+ /* mask for the bit within its byte */
+#define __bit_mask(bit) (1 << ((bit) & 0x7))
+
+	/* bytes in a bitstring of nbits */
+#define __bitstr_size(nbits) (((nbits) + 7) >> 3)
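+
+/*
+ * Worked example (illustrative): for bit 11, __bit_byte(11) is 11 >> 3 = 1
+ * (the second byte) and __bit_mask(11) is 1 << (11 & 0x7) = 0x08; a
+ * bitstring of 11 bits needs __bitstr_size(11) = (11 + 7) >> 3 = 2 bytes.
+ */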
+
+/*
+ * __bit_alloc --
+ * Allocate a bitstring.
+ */
+static inline int
+__bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
+{
+ return (__wt_calloc(
+ session, (size_t)__bitstr_size(nbits), sizeof(uint8_t), retp));
+}
+
+/*
+ * __bit_test --
+ *	Test one bit in the bitstring.
+ */
+static inline int
+__bit_test(uint8_t *bitf, uint64_t bit)
+{
+ return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+}
+
+/*
+ * __bit_set --
+ *	Set one bit in the bitstring.
+ */
+static inline void
+__bit_set(uint8_t *bitf, uint64_t bit)
+{
+ bitf[__bit_byte(bit)] |= __bit_mask(bit);
+}
+
+/*
+ * __bit_clear --
+ *	Clear one bit in the bitstring.
+ */
+static inline void
+__bit_clear(uint8_t *bitf, uint64_t bit)
+{
+ bitf[__bit_byte(bit)] &= ~__bit_mask(bit);
+}
+
+/*
+ * __bit_nclr --
+ *	Clear bits start-to-stop in the bitstring.
+ */
+static inline void
+__bit_nclr(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+ uint64_t startbyte, stopbyte;
+
+ startbyte = __bit_byte(start);
+ stopbyte = __bit_byte(stop);
+
+ if (startbyte == stopbyte)
+ bitf[startbyte] &=
+ ((0xff >> (8 - (start & 0x7))) |
+ (0xff << ((stop & 0x7) + 1)));
+ else {
+ bitf[startbyte] &= 0xff >> (8 - (start & 0x7));
+ while (++startbyte < stopbyte)
+ bitf[startbyte] = 0;
+ bitf[stopbyte] &= 0xff << ((stop & 0x7) + 1);
+ }
+}
+
+/*
+ * __bit_nset --
+ *	Set bits start-to-stop in the bitstring.
+ */
+static inline void
+__bit_nset(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+ uint64_t startbyte, stopbyte;
+
+ startbyte = __bit_byte(start);
+ stopbyte = __bit_byte(stop);
+ if (startbyte == stopbyte)
+ bitf[startbyte] |=
+ ((0xff << (start & 0x7)) & (0xff >> (7 - (stop & 0x7))));
+ else {
+ bitf[startbyte] |= 0xff << (start & 0x7);
+ while (++startbyte < stopbyte)
+ bitf[startbyte] = 0xff;
+ bitf[stopbyte] |= 0xff >> (7 - (stop & 0x7));
+ }
+}
+
+/*
+ * __bit_ffc --
+ *	Find first clear bit in the bitstring; return 0 on success, -1 if
+ * no bit is clear.
+ */
+static inline int
+__bit_ffc(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+ uint8_t lb;
+ uint64_t byte, stopbyte, value;
+
+ value = 0; /* -Wuninitialized */
+
+ if (nbits == 0)
+ return (-1);
+
+ for (byte = 0,
+ stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+ if (bitf[byte] != 0xff) {
+ value = byte << 3;
+ for (lb = bitf[byte]; lb & 0x01; ++value, lb >>= 1)
+ ;
+ break;
+ }
+
+ if (byte > stopbyte || value >= nbits)
+ return (-1);
+
+ *retp = value;
+ return (0);
+}
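+
+/*
+ * For example (illustrative): with bitf = { 0xff, 0x07 } and nbits = 16,
+ * byte 0 is fully set and byte 1 has bits 8-10 set, so __bit_ffc returns
+ * 0 and sets *retp to 11, the first clear bit.
+ */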
+
+/*
+ * __bit_ffs --
+ *	Find first set bit in the bitstring; return 0 on success, -1 if
+ * no bit is set.
+ */
+static inline int
+__bit_ffs(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+ uint8_t lb;
+ uint64_t byte, stopbyte, value;
+
+ value = 0;
+ if (nbits == 0)
+ return (-1);
+
+ for (byte = 0,
+ stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+ if (bitf[byte] != 0) {
+ value = byte << 3;
+ for (lb = bitf[byte]; !(lb & 0x01); ++value, lb >>= 1)
+ ;
+ break;
+ }
+
+ if (byte > stopbyte || value >= nbits)
+ return (-1);
+
+ *retp = value;
+ return (0);
+}
+
+/*
+ * __bit_getv --
+ * Return a fixed-length column store bit-field value.
+ */
+static inline uint8_t
+__bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width)
+{
+ uint8_t value;
+ uint64_t bit;
+
+#define __BIT_GET(len, mask) \
+ case len: \
+ if (__bit_test(bitf, bit)) \
+ value |= mask; \
+ ++bit \
+ /* FALLTHROUGH */
+
+ value = 0;
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ return (bitf[__bit_byte(bit)]);
+ __BIT_GET(7, 0x40);
+ __BIT_GET(6, 0x20);
+ __BIT_GET(5, 0x10);
+ __BIT_GET(4, 0x08);
+ __BIT_GET(3, 0x04);
+ __BIT_GET(2, 0x02);
+ __BIT_GET(1, 0x01);
+ }
+ return (value);
+}
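+
+/*
+ * For example (illustrative): with width = 2, entry 5 starts at bit
+ * 5 * 2 = 10, so __bit_getv tests bits 10 and 11 against the 0x02 and
+ * 0x01 masks; a full 8-bit width short-circuits to a whole-byte read.
+ */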
+
+/*
+ * __bit_getv_recno --
+ * Return a record number's bit-field value.
+ */
+static inline uint8_t
+__bit_getv_recno(WT_PAGE *page, uint64_t recno, uint8_t width)
+{
+ return (__bit_getv(
+ page->pg_fix_bitf, recno - page->pg_fix_recno, width));
+}
+
+/*
+ * __bit_setv --
+ * Set a fixed-length column store bit-field value.
+ */
+static inline void
+__bit_setv(uint8_t *bitf, uint64_t entry, uint8_t width, uint8_t value)
+{
+ uint64_t bit;
+
+#define __BIT_SET(len, mask) \
+ case len: \
+ if (value & (mask)) \
+ __bit_set(bitf, bit); \
+ else \
+ __bit_clear(bitf, bit); \
+ ++bit \
+ /* FALLTHROUGH */
+
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ bitf[__bit_byte(bit)] = value;
+ return;
+ __BIT_SET(7, 0x40);
+ __BIT_SET(6, 0x20);
+ __BIT_SET(5, 0x10);
+ __BIT_SET(4, 0x08);
+ __BIT_SET(3, 0x04);
+ __BIT_SET(2, 0x02);
+ __BIT_SET(1, 0x01);
+ }
+}
+
+/*
+ * __bit_setv_recno --
+ * Set a record number's bit-field value.
+ */
+static inline void
+__bit_setv_recno(WT_PAGE *page, uint64_t recno, uint8_t width, uint8_t value)
+{
+ __bit_setv(page->pg_fix_bitf, recno - page->pg_fix_recno, width, value);
+}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
new file mode 100644
index 00000000000..10fa51243ac
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WiredTiger's block manager interface.
+ */
+
+/*
+ * The file's description is written into the first block of the file, which
+ * means we can use an offset of 0 as an invalid offset.
+ */
+#define WT_BLOCK_INVALID_OFFSET 0
+
+/*
+ * The block manager maintains three per-checkpoint extent lists:
+ * alloc: the extents allocated in this checkpoint
+ * avail: the extents available for allocation
+ * discard: the extents freed in this checkpoint
+ *
+ * An extent list is based on two skiplists: first, a by-offset list linking
+ * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
+ * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *
+ * Additionally, each WT_SIZE element on the by-size list has a skiplist of
+ * its own, linking WT_EXT elements and sorted by file offset (low-to-high).
+ * This list has an entry for each extent of a particular size.
+ *
+ * The trickiness is that each individual WT_EXT element appears on two
+ * skiplists.
+ * In order to minimize allocation calls, we allocate a single array of WT_EXT
+ * pointers at the end of the WT_EXT structure, for both skiplists, and store
+ * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
+ * the offset skiplist start at WT_EXT.next[0] and the entries for the size
+ * skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * One final complication: we only maintain the per-size skiplist for the avail
+ * list, the alloc and discard extent lists are not searched based on size.
+ */
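+
+/*
+ * Illustrative layout (a sketch): for a WT_EXT element ext of depth d, the
+ * offset skiplist entries are ext->next[0] through ext->next[d - 1] and the
+ * size skiplist entries are ext->next[d] through ext->next[2 * d - 1], which
+ * is why WT_EXT_FOREACH_OFF below indexes next[(skip)->depth].
+ */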
+
+/*
+ * WT_EXTLIST --
+ * An extent list.
+ */
+struct __wt_extlist {
+ char *name; /* Name */
+
+ uint64_t bytes; /* Byte count */
+ uint32_t entries; /* Entry count */
+
+ wt_off_t offset; /* Written extent offset */
+ uint32_t cksum, size; /* Written extent cksum, size */
+
+ int track_size; /* Maintain per-size skiplist */
+
+ WT_EXT *last; /* Cached last element */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
+ WT_SIZE *sz[WT_SKIP_MAXDEPTH];
+};
+
+/*
+ * WT_EXT --
+ * Encapsulation of an extent, either allocated or freed within the
+ * checkpoint.
+ */
+struct __wt_ext {
+ wt_off_t off; /* Extent's file offset */
+	wt_off_t size; /* Extent's size */
+
+ uint8_t depth; /* Skip list depth */
+
+ /*
+ * Variable-length array, sized by the number of skiplist elements.
+ * The first depth array entries are the address skiplist elements,
+ * the second depth array entries are the size skiplist.
+ */
+ WT_EXT *next[0]; /* Offset, size skiplists */
+};
+
+/*
+ * WT_SIZE --
+ * Encapsulation of a block size skiplist entry.
+ */
+struct __wt_size {
+ wt_off_t size; /* Size */
+
+ uint8_t depth; /* Skip list depth */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
+
+ /*
+ * We don't use a variable-length array for the size skiplist, we want
+ * to be able to use any cached WT_SIZE structure as the head of a list,
+ * and we don't know the related WT_EXT structure's depth.
+ */
+ WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
+};
+
+/*
+ * WT_EXT_FOREACH --
+ * Walk a block manager skiplist.
+ * WT_EXT_FOREACH_OFF --
+ * Walk a block manager skiplist where the WT_EXT.next entries are offset
+ * by the depth.
+ */
+#define WT_EXT_FOREACH(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[0])
+#define WT_EXT_FOREACH_OFF(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
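+
+/*
+ * Illustrative usage (a sketch): walking an extent list el by offset:
+ *
+ *	WT_EXT *ext;
+ *	WT_EXT_FOREACH(ext, el->off)
+ *		printf("%" PRIdMAX ": %" PRIdMAX " bytes\n",
+ *		    (intmax_t)ext->off, (intmax_t)ext->size);
+ */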
+
+/*
+ * Checkpoint cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager checkpoint format change.
+ *
+ * Version #1 checkpoint cookie format:
+ * [1] [root addr] [alloc addr] [avail addr] [discard addr]
+ * [file size] [checkpoint size] [write generation]
+ */
+#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
+#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
+struct __wt_block_ckpt {
+ uint8_t version; /* Version */
+
+ wt_off_t root_offset; /* The root */
+ uint32_t root_cksum, root_size;
+
+ WT_EXTLIST alloc; /* Extents allocated */
+ WT_EXTLIST avail; /* Extents available */
+ WT_EXTLIST discard; /* Extents discarded */
+
+ wt_off_t file_size; /* Checkpoint file size */
+ uint64_t ckpt_size; /* Checkpoint byte count */
+
+ WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
+
+ /*
+	 * Checkpoint archive: the block manager may free a lot of memory
+	 * from the allocation and discard extent lists when a checkpoint
+	 * completes.  Put that work off until the checkpoint resolves;
+	 * that lets the upper btree layer continue eviction sooner.
+ */
+ WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
+ WT_EXTLIST ckpt_discard; /* Checkpoint archive */
+};
+
+/*
+ * WT_BM --
+ * Block manager handle, references a single checkpoint in a file.
+ */
+struct __wt_bm {
+ /* Methods */
+ int (*addr_string)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ u_int (*block_header)(WT_BM *);
+ int (*checkpoint)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int);
+ int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, uint8_t *, size_t *, int);
+ int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *);
+ int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
+ int (*close)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_page_skip)
+ (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *);
+ int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *);
+ int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*read)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_next)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *);
+ int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_valid)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int);
+ int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
+ int (*sync)(WT_BM *, WT_SESSION_IMPL *, int);
+ int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *);
+	int (*write)(WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int);
+ int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
+
+ WT_BLOCK *block; /* Underlying file */
+
+ void *map; /* Mapped region */
+ size_t maplen;
+ void *mappingcookie;
+
+ /*
+	 * There's only a single block manager handle that can be written;
+	 * all others are checkpoints.
+ * others are checkpoints.
+ */
+ int is_live; /* The live system */
+};
+
+/*
+ * WT_BLOCK --
+ * Block manager handle, references a single file.
+ */
+struct __wt_block {
+ const char *name; /* Name */
+
+ /* A list of block manager handles, sharing a file descriptor. */
+ uint32_t ref; /* References */
+ WT_FH *fh; /* Backing file handle */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+
+ /* Configuration information, set when the file is opened. */
+ int allocfirst; /* Allocation is first-fit */
+ int allocfirst_save; /* Allocation is first-fit, saved */
+ uint32_t allocsize; /* Allocation size */
+ size_t os_cache; /* System buffer cache flush max */
+ size_t os_cache_max;
+ size_t os_cache_dirty; /* System buffer cache write max */
+ size_t os_cache_dirty_max;
+
+ u_int block_header; /* Header length */
+
+ /*
+ * There is only a single checkpoint in a file that can be written. The
+ * information could logically live in the WT_BM structure, but then we
+ * would be re-creating it every time we opened a new checkpoint and I'd
+ * rather not do that. So, it's stored here, only accessed by one WT_BM
+ * handle.
+ */
+ WT_SPINLOCK live_lock; /* Live checkpoint lock */
+ WT_BLOCK_CKPT live; /* Live checkpoint */
+	int ckpt_inprogress; /* Live checkpoint in progress */
+
+ /* Salvage support */
+ wt_off_t slvg_off; /* Salvage file offset */
+
+ /* Verification support */
+ int verify; /* If performing verification */
+ wt_off_t verify_size; /* Checkpoint's file size */
+ WT_EXTLIST verify_alloc; /* Verification allocation list */
+ uint64_t frags; /* Maximum frags in the file */
+ uint8_t *fragfile; /* Per-file frag tracking list */
+ uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
+};
+
+/*
+ * WT_BLOCK_DESC --
+ * The file's description.
+ */
+struct __wt_block_desc {
+#define WT_BLOCK_MAGIC 120897
+ uint32_t magic; /* 00-03: Magic number */
+#define WT_BLOCK_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_BLOCK_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+
+ uint32_t cksum; /* 08-11: Description block checksum */
+
+ uint32_t unused; /* 12-15: Padding */
+};
+/*
+ * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
+ * ensure the compiler hasn't inserted padding (padding wouldn't cause a
+ * failure, since we reserve the first allocation-size block of the file
+ * for this information, but it would be worth investigating, regardless).
+ */
+#define WT_BLOCK_DESC_SIZE 16
+
+/*
+ * WT_BLOCK_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
+ */
+struct __wt_block_header {
+ /*
+ * We write the page size in the on-disk page header because it makes
+ * salvage easier. (If we don't know the expected page length, we'd
+ * have to read increasingly larger chunks from the file until we find
+ * one that checksums, and that's going to be harsh given WiredTiger's
+ * potentially large page sizes.)
+ */
+ uint32_t disk_size; /* 00-03: on-disk page size */
+
+ /*
+ * Page checksums are stored in two places. First, the page checksum
+ * is written within the internal page that references it as part of
+ * the address cookie. This is done to improve the chances of detecting
+ * not only disk corruption but other bugs (for example, overwriting a
+ * page with another valid page image). Second, a page's checksum is
+ * stored in the disk header. This is for salvage, so salvage knows it
+ * has found a page that may be useful.
+ */
+ uint32_t cksum; /* 04-07: checksum */
+
+#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
+ uint8_t flags; /* 08: flags */
+
+ /*
+ * End the structure with 3 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 09-11: unused padding */
+};
+/*
+ * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_BLOCK_HEADER_SIZE 12
+
+/*
+ * WT_BLOCK_HEADER_BYTE
+ * WT_BLOCK_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_BLOCK_HEADER_BYTE_SIZE \
+ (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
+#define WT_BLOCK_HEADER_BYTE(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
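+
+/*
+ * Worked arithmetic (illustrative): with WT_PAGE_HEADER_SIZE of 28 (see
+ * btmem.h) and WT_BLOCK_HEADER_SIZE of 12, the first usable byte of a
+ * default block is at offset 28 + 12 = 40.
+ */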
+
+/*
+ * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
+ * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
+ * and on-disk size to be immediately available without decompression. We use
+ * the on-disk size and checksum during salvage to figure out where the blocks
+ * are, and the in-memory size tells us how large a buffer we need to decompress
+ * the block. We could skip less than 64B, but a 64B boundary may offer better
+ * alignment for the underlying compression engine, and skipping 64B won't make
+ * a difference in terms of compression efficiency.
+ */
+#define WT_BLOCK_COMPRESS_SKIP 64
diff --git a/src/third_party/wiredtiger/src/include/bloom.h b/src/third_party/wiredtiger/src/include/bloom.h
new file mode 100644
index 00000000000..4ae6d96b935
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bloom.h
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+/*
+ * REFERENCES:
+ * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
+ * http://code.google.com/p/cityhash-c/
+ */
+
+struct __wt_bloom {
+ const char *uri;
+ char *config;
+ uint8_t *bitstring; /* For in memory representation. */
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *c;
+
+ uint32_t k; /* The number of hash functions used. */
+ uint32_t factor; /* The number of bits per item inserted. */
+ uint64_t m; /* The number of slots in the bit string. */
+ uint64_t n; /* The number of items to be inserted. */
+};
+
+struct __wt_bloom_hash {
+ uint64_t h1, h2; /* The two hashes used to calculate bits. */
+};
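+
+/*
+ * Illustrative arithmetic (standard Bloom filter math, not specific to this
+ * implementation): with m = n * factor bits and k hash functions, the
+ * expected false-positive rate is roughly
+ *	p = (1 - e^(-k * n / m))^k
+ * e.g. factor = 8 and k = 6 give p of roughly 2%.
+ */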
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
new file mode 100644
index 00000000000..0c4fe876e5e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -0,0 +1,1015 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_PAGE_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure.
+ */
+struct __wt_page_header {
+ /*
+ * The record number of the first record of the page is stored on disk
+ * so we can figure out where the column-store leaf page fits into the
+ * key space during salvage.
+ */
+ uint64_t recno; /* 00-07: column-store starting recno */
+
+ /*
+ * We maintain page write-generations in the non-transactional case
+ * as that's how salvage can determine the most recent page between
+ * pages overlapping the same key range.
+ */
+ uint64_t write_gen; /* 08-15: write generation */
+
+ /*
+ * The page's in-memory size isn't rounded or aligned, it's the actual
+ * number of bytes the disk-image consumes when instantiated in memory.
+ */
+ uint32_t mem_size; /* 16-19: in-memory page size */
+
+ union {
+ uint32_t entries; /* 20-23: number of cells on page */
+ uint32_t datalen; /* 20-23: overflow data length */
+ } u;
+
+ uint8_t type; /* 24: page type */
+
+#define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */
+#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
+#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
+ uint8_t flags; /* 25: flags */
+
+ /*
+ * End the structure with 2 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[2]; /* 26-27: unused padding */
+};
+/*
+ * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_PAGE_HEADER_SIZE 28
+
+/*
+ * The block-manager specific information immediately follows the WT_PAGE_HEADER
+ * structure.
+ */
+#define WT_BLOCK_HEADER_REF(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))
+
+/*
+ * WT_PAGE_HEADER_BYTE --
+ * WT_PAGE_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
+ ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
+#define WT_PAGE_HEADER_BYTE(btree, dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
+
+/*
+ * WT_ADDR --
+ * An in-memory structure to hold a block's location.
+ */
+struct __wt_addr {
+ uint8_t *addr; /* Block-manager's cookie */
+ uint8_t size; /* Block-manager's cookie length */
+
+#define WT_ADDR_INT 1 /* Internal page */
+#define WT_ADDR_LEAF 2 /* Leaf page */
+#define WT_ADDR_LEAF_NO 3 /* Leaf page, no overflow */
+ uint8_t type;
+
+ /*
+ * If an address is used by both the previous and the current
+ * multi-block reconciliations, that is, a block we're writing matches
+ * the block written the last time, it will appear in both the current
+ * boundary points as well as the page modification's list of previous
+ * blocks. The reuse flag is how we know that's happening so the block
+ * is treated correctly (not free'd on error, for example).
+ */
+ uint8_t reuse;
+};
+
+/*
+ * Overflow tracking for reuse: When a page is reconciled, we write new K/V
+ * overflow items. If pages are reconciled multiple times, we need to know
+ * if we've already written a particular overflow record (so we don't write
+ * it again), as well as if we've modified an overflow record previously
+ * written (in which case we want to write a new record and discard blocks
+ * used by the previously written record). Track overflow records written
+ * for the page, storing the values in a skiplist with the record's value as
+ * the "key".
+ */
+struct __wt_ovfl_reuse {
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * On each page reconciliation, we clear the entry's in-use flag, and
+ * reset it as the overflow record is re-used. After reconciliation
+ * completes, unused skiplist entries are discarded, along with their
+ * underlying blocks.
+ *
+ * On each page reconciliation, set the just-added flag for each new
+ * skiplist entry; if reconciliation fails for any reason, discard the
+ * newly added skiplist entries, along with their underlying blocks.
+ */
+#define WT_OVFL_REUSE_INUSE 0x01
+#define WT_OVFL_REUSE_JUST_ADDED 0x02
+ uint8_t flags;
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_REUSE structure,
+ * the untyped value immediately follows the address.
+ */
+#define WT_OVFL_REUSE_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_REUSE_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * Overflow tracking for cached values: When a page is reconciled, we write new
+ * K/V overflow items, and discard previous underlying blocks. If there's a
+ * transaction in the system that needs to read the previous value, we have to
+ * cache the old value until no running transaction needs it.
+ */
+struct __wt_ovfl_txnc {
+ uint64_t current; /* Maximum transaction ID at store */
+
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_TXNC
+ * structure, the untyped value immediately follows the address.
+ */
+#define WT_OVFL_TXNC_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_TXNC_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_TXNC *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * WT_PAGE_MODIFY --
+ * When a page is modified, there's additional information to maintain.
+ */
+struct __wt_page_modify {
+ /*
+ * Track the highest transaction ID at which the page was written to
+ * disk. This can be used to avoid trying to write the page multiple
+ * times if a snapshot is keeping old versions pinned (e.g., in a
+ * checkpoint).
+ */
+ uint64_t disk_snap_min;
+
+ /* The largest transaction ID seen on the page by reconciliation. */
+ uint64_t rec_max_txn;
+
+ /* The first unwritten transaction ID (approximate). */
+ uint64_t first_dirty_txn;
+
+ /* The largest update transaction ID (approximate). */
+ uint64_t update_txn;
+
+ /* Dirty bytes added to the cache. */
+ uint64_t bytes_dirty;
+
+ /*
+ * When pages are reconciled, the result is one or more replacement
+ * blocks. A replacement block can be in one of two states: it was
+ * written to disk, and so we have a block address, or it contained
+ * unresolved modifications and we have a disk image for it with a
+ * list of those unresolved modifications. The former is the common
+ * case: we only build lists of unresolved modifications when we're
+ * evicting a page, and we only expect to see unresolved modifications
+ * on a page being evicted in the case of a hot page that's too large
+ * to keep in memory as it is. In other words, checkpoints will skip
+ * unresolved modifications, and will write the blocks rather than
+ * build lists of unresolved modifications.
+ *
+ * Ugly union/struct layout to conserve memory, we never have both
+ * a replace address and multiple replacement blocks.
+ */
+ union {
+ WT_ADDR replace; /* Single, written replacement block */
+#define mod_replace u1.replace
+
+ struct { /* Multiple replacement blocks */
+ struct __wt_multi {
+ /*
+ * Block's key: either a column-store record number or a
+ * row-store variable length byte string.
+ */
+ union {
+ uint64_t recno;
+ WT_IKEY *ikey;
+ } key;
+
+ /*
+ * Eviction, but block wasn't written: unresolved updates and
+ * associated disk image.
+ *
+ * Skipped updates are either a WT_INSERT, or a row-store leaf
+ * page entry.
+ */
+ struct __wt_upd_skipped {
+ WT_INSERT *ins;
+ WT_ROW *rip;
+ } *skip;
+ uint32_t skip_entries;
+ void *skip_dsk;
+
+ /*
+ * Block was written: address, size and checksum.
+ * On subsequent reconciliations of this page, we avoid writing
+ * the block if it's unchanged by comparing size and checksum;
+ * the reuse flag is set when the block is unchanged and we're
+ * reusing a previous address.
+ */
+ WT_ADDR addr;
+ uint32_t size;
+ uint32_t cksum;
+ } *multi;
+ uint32_t multi_entries; /* Multiple blocks element count */
+ } m;
+#define mod_multi u1.m.multi
+#define mod_multi_entries u1.m.multi_entries
+ } u1;
+
+ /*
+ * Internal pages need to be able to chain root-page splits and have a
+ * special transactional eviction requirement. Column-store leaf pages
+ * need update and append lists.
+ *
+ * Ugly union/struct layout to conserve memory, a page is either a leaf
+ * page or an internal page.
+ */
+ union {
+ struct {
+ /*
+ * When a root page splits, we create a new page and write it;
+ * the new page can also split and so on, and we continue this
+ * process until we write a single replacement root page. We
+ * use the root split field to track the list of created pages
+ * so they can be discarded when no longer needed.
+ */
+ WT_PAGE *root_split; /* Linked list of root split pages */
+
+ /*
+ * When we deepen the tree, newly created internal pages cannot
+ * be evicted until all threads have exited the original page
+ * index structure. We set a transaction value during the split
+ * that's checked during eviction.
+ */
+ uint64_t split_txn; /* Split eviction transaction value */
+ } intl;
+#define mod_root_split u2.intl.root_split
+#define mod_split_txn u2.intl.split_txn
+ struct {
+ /*
+ * Appended items to column-stores: there is only a single one
+ * of these per column-store tree.
+ */
+ WT_INSERT_HEAD **append;
+
+ /*
+ * Updated items in column-stores: variable-length RLE entries
+ * can expand to multiple entries which requires some kind of
+ * list we can expand on demand. Updated items in fixed-length
+			 * files could be done based on a WT_UPDATE array as in
+ * row-stores, but there can be a very large number of bits on
+ * a single page, and the cost of the WT_UPDATE array would be
+ * huge.
+ */
+ WT_INSERT_HEAD **update;
+ } leaf;
+#define mod_append u2.leaf.append
+#define mod_update u2.leaf.update
+ } u2;
+
+ /*
+ * Overflow record tracking for reconciliation. We assume overflow
+ * records are relatively rare, so we don't allocate the structures
+ * to track them until we actually see them in the data.
+ */
+ struct __wt_ovfl_track {
+ /*
+ * Overflow key/value address/byte-string pairs we potentially
+ * reuse each time we reconcile the page.
+ */
+ WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow value address/byte-string pairs cached until no
+ * running transaction will possibly read them.
+ */
+ WT_OVFL_TXNC *ovfl_txnc[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow key/value addresses to be discarded from the block
+ * manager after reconciliation completes successfully.
+ */
+ WT_CELL **discard;
+ size_t discard_entries;
+ size_t discard_allocated;
+ } *ovfl_track;
+
+ /*
+ * The write generation is incremented when a page is modified, a page
+ * is clean if the write generation is 0.
+ *
+ * !!!
+ * 4B values are probably larger than required, but I'm more confident
+ * 4B types will always be backed by atomic writes to memory.
+ */
+ uint32_t write_gen;
+
+#define WT_PAGE_LOCK(s, p) \
+ __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+#define WT_PAGE_UNLOCK(s, p) \
+ __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+ uint8_t page_lock; /* Page's spinlock */
+
+#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */
+#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */
+#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */
+#define WT_PM_REC_MASK \
+ (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE)
+ uint8_t flags; /* Page flags */
+};
+
+/*
+ * WT_PAGE --
+ * The WT_PAGE structure describes the in-memory page information.
+ */
+struct __wt_page {
+ /* Per page-type information. */
+ union {
+ /*
+ * Internal pages (both column- and row-store).
+ *
+ * The page record number is only used by column-store, but it
+		 * makes some things simpler and it doesn't cost us any
+		 * memory: other structures in this union are at least as
+		 * large.
+ *
+ * In-memory internal pages have an array of pointers to child
+ * structures, maintained in collated order. When a page is
+ * read into memory, the initial list of children is stored in
+ * the "orig_index" field, and it and the collated order are
+ * the same. After a page splits, the collated order and the
+ * original order will differ.
+ *
+ * Multiple threads of control may be searching the in-memory
+ * internal page and a child page of the internal page may
+ * cause a split at any time. When a page splits, a new array
+ * is allocated and atomically swapped into place. Threads in
+ * the old array continue without interruption (the old array is
+ * still valid), but have to avoid racing. No barrier is needed
+ * because the array reference is updated atomically, but code
+ * reading the fields multiple times would be a very bad idea.
+ * Specifically, do not do this:
+		 *	WT_REF **refp = page->u.intl.__index->index;
+		 *	uint32_t entries = page->u.intl.__index->entries;
+ *
+ * The field is declared volatile (so the compiler knows not to
+ * read it multiple times), and we obscure the field name and
+ * use a copy macro in all references to the field (so the code
+ * doesn't read it multiple times).
+ */
+ struct {
+ uint64_t recno; /* Starting recno */
+ WT_REF *parent_ref; /* Parent reference */
+
+ struct __wt_page_index {
+ uint32_t entries;
+ WT_REF **index;
+ } * volatile __index; /* Collated children */
+ } intl;
+#undef pg_intl_recno
+#define pg_intl_recno u.intl.recno
+#define pg_intl_parent_ref u.intl.parent_ref
+
+ /*
+ * Macros to copy/set the index because the name is obscured to ensure
+ * the field isn't read multiple times.
+ */
+#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index)
+#define WT_INTL_INDEX_SET(page, v) do { \
+ WT_WRITE_BARRIER(); \
+ ((page)->u.intl.__index) = (v); \
+} while (0)
+
+ /*
+ * Macro to walk the list of references in an internal page.
+ */
+#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \
+ WT_PAGE_INDEX *__pindex; \
+ WT_REF **__refp; \
+ WT_SESSION_IMPL *__session = (session); \
+ uint32_t __entries; \
+ WT_ENTER_PAGE_INDEX(session); \
+ for (__pindex = WT_INTL_INDEX_COPY(page), \
+ __refp = __pindex->index, \
+ __entries = __pindex->entries; __entries > 0; --__entries) {\
+ (ref) = *__refp++;
+#define WT_INTL_FOREACH_END \
+ } \
+ WT_LEAVE_PAGE_INDEX(__session); \
+ } while (0)
+
+ /* Row-store leaf page. */
+ struct {
+ WT_ROW *d; /* Key/value pairs */
+
+ /*
+ * The column-store leaf page modification structures
+ * live in the WT_PAGE_MODIFY structure to keep the
+ * WT_PAGE structure as small as possible for read-only
+ * pages. For consistency, we could move the row-store
+ * modification structures into WT_PAGE_MODIFY too, but
+ * that doesn't shrink WT_PAGE any further and it would
+ * require really ugly naming inside of WT_PAGE_MODIFY
+ * to avoid growing that structure.
+ */
+ WT_INSERT_HEAD **ins; /* Inserts */
+ WT_UPDATE **upd; /* Updates */
+
+ uint32_t entries; /* Entries */
+ } row;
+#undef pg_row_d
+#define pg_row_d u.row.d
+#undef pg_row_ins
+#define pg_row_ins u.row.ins
+#undef pg_row_upd
+#define pg_row_upd u.row.upd
+#define pg_row_entries u.row.entries
+
+ /* Fixed-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ uint8_t *bitf; /* Values */
+ uint32_t entries; /* Entries */
+ } col_fix;
+#undef pg_fix_recno
+#define pg_fix_recno u.col_fix.recno
+#undef pg_fix_bitf
+#define pg_fix_bitf u.col_fix.bitf
+#undef pg_fix_entries
+#define pg_fix_entries u.col_fix.entries
+
+ /* Variable-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ WT_COL *d; /* Values */
+
+ /*
+ * Variable-length column-store files maintain a list of
+ * RLE entries on the page so it's unnecessary to walk
+ * the page counting records to find a specific entry.
+ */
+ WT_COL_RLE *repeats; /* RLE array for lookups */
+ uint32_t nrepeats; /* Number of repeat slots */
+
+ uint32_t entries; /* Entries */
+ } col_var;
+#undef pg_var_recno
+#define pg_var_recno u.col_var.recno
+#undef pg_var_d
+#define pg_var_d u.col_var.d
+#undef pg_var_repeats
+#define pg_var_repeats u.col_var.repeats
+#undef pg_var_nrepeats
+#define pg_var_nrepeats u.col_var.nrepeats
+#undef pg_var_entries
+#define pg_var_entries u.col_var.entries
+ } u;
+
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ const WT_PAGE_HEADER *dsk;
+
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
+
+ /*
+ * The page's read generation acts as an LRU value for each page in the
+ * tree; it is used by the eviction server thread to select pages to be
+ * discarded from the in-memory tree.
+ *
+	 * The read generation is a 64-bit value: if incremented frequently, a
+ * 32-bit value could overflow.
+ *
+ * The read generation is a piece of shared memory potentially read
+ * by many threads. We don't want to update page read generations for
+ * in-cache workloads and suffer the cache misses, so we don't simply
+ * increment the read generation value on every access. Instead, the
+ * read generation is incremented by the eviction server each time it
+ * becomes active. To avoid incrementing a page's read generation too
+ * frequently, it is set to a future point.
+ */
+#define WT_READGEN_NOTSET 0
+#define WT_READGEN_OLDEST 1
+#define WT_READGEN_STEP 100
+ uint64_t read_gen;
+
+ uint64_t memory_footprint; /* Memory attached to the page */
+
+#define WT_PAGE_IS_INTERNAL(page) \
+ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
+#define WT_PAGE_INVALID 0 /* Invalid page */
+#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
+#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 3 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 5 /* Overflow page */
+#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
+ uint8_t type; /* Page type */
+
+#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
+#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
+#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. */
+ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+};
+
+/*
+ * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
+ * Return the offset/pointer of a pointer/offset in a page disk image.
+ */
+#define WT_PAGE_DISK_OFFSET(page, p) \
+ WT_PTRDIFF32(p, (page)->dsk)
+#define WT_PAGE_REF_OFFSET(page, o) \
+ ((void *)((uint8_t *)((page)->dsk) + (o)))
+
+/*
+ * Page state.
+ *
+ * Synchronization is based on the WT_REF->state field, which has a number of
+ * possible states:
+ *
+ * WT_REF_DISK:
+ * The initial setting before a page is brought into memory, and set as a
+ * result of page eviction; the page is on disk, and must be read into
+ * memory before use. WT_REF_DISK has a value of 0 (the default state
+ * after allocating cleared memory).
+ *
+ * WT_REF_DELETED:
+ * The page is on disk, but has been deleted from the tree; we can delete
+ * row-store leaf pages without reading them if they don't reference
+ * overflow items.
+ *
+ * WT_REF_LOCKED:
+ * Locked for exclusive access. In eviction, this page or a parent has
+ * been selected for eviction; once hazard pointers are checked, the page
+ * will be evicted. When reading a page that was previously deleted, it
+ * is locked until the page is in memory with records marked deleted. The
+ * thread that set the page to WT_REF_LOCKED has exclusive access, no
+ * other thread may use the WT_REF until the state is changed.
+ *
+ * WT_REF_MEM:
+ * Set by a reading thread once the page has been read from disk; the page
+ * is in the cache and the page reference is OK.
+ *
+ * WT_REF_READING:
+ * Set by a reading thread before reading an ordinary page from disk;
+ * other readers of the page wait until the read completes. Sync can
+ * safely skip over such pages: they are clean by definition.
+ *
+ * WT_REF_SPLIT:
+ * Set when the page is split; the WT_REF is dead and can no longer be
+ * used.
+ *
+ * The life cycle of a typical page goes like this: pages are read into memory
+ * from disk and their state set to WT_REF_MEM. When the page is selected for
+ * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting
+ * threads reset the page's state when finished with the page: if eviction was
+ * successful (a clean page was discarded, and a dirty page was written to disk
+ * and then discarded), the page state is set to WT_REF_DISK; if eviction failed
+ * because the page was busy, page state is reset to WT_REF_MEM.
+ *
+ * Readers check the state field and if it's WT_REF_MEM, they set a hazard
+ * pointer to the page, flush memory and re-confirm the page state. If the
+ * page state is unchanged, the reader has a valid reference and can proceed.
+ *
+ * When an evicting thread wants to discard a page from the tree, it sets the
+ * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. If a
+ * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page
+ * to the readers. If the evicting thread does not find a hazard pointer,
+ * the page is evicted.
+ */
+typedef enum __wt_page_state {
+ WT_REF_DISK=0, /* Page is on disk */
+ WT_REF_DELETED, /* Page is on disk, but deleted */
+ WT_REF_LOCKED, /* Page locked for exclusive access */
+ WT_REF_MEM, /* Page is in cache and valid */
+ WT_REF_READING, /* Page being read */
+ WT_REF_SPLIT /* Page was split */
+} WT_PAGE_STATE;
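+
+/*
+ * Illustrative reader check (a sketch with hypothetical hazard_set and
+ * hazard_clear helpers, not the implementation):
+ *
+ *	if (ref->state == WT_REF_MEM) {
+ *		hazard_set(session, ref);
+ *		WT_FULL_BARRIER();
+ *		if (ref->state == WT_REF_MEM)
+ *			return (0);
+ *		hazard_clear(session, ref);
+ *	}
+ *	... otherwise retry, or read the page from disk ...
+ */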
+
+/*
+ * WT_PAGE_DELETED --
+ * Related information for fast-delete, on-disk pages.
+ */
+struct __wt_page_deleted {
+ uint64_t txnid; /* Transaction ID */
+
+ WT_UPDATE **update_list; /* List of updates for abort */
+};
+
+/*
+ * WT_REF --
+ * A single in-memory page and the state information used to determine if
+ * it's OK to dereference the pointer to the page.
+ */
+struct __wt_ref {
+ WT_PAGE *page; /* Page */
+
+ /*
+ * When the tree deepens as a result of a split, the home page value
+	 * changes. Don't cache it; we need to see that change when looking
+ * up our slot in the page's index structure.
+ */
+ WT_PAGE * volatile home; /* Reference page */
+ uint32_t ref_hint; /* Reference page index hint */
+
+ volatile WT_PAGE_STATE state; /* Page state */
+
+ /*
+ * Address: on-page cell if read from backing block, off-page WT_ADDR
+ * if instantiated in-memory, or NULL if page created in-memory.
+ */
+ void *addr;
+
+ /*
+ * The child page's key. Do NOT change this union without reviewing
+ * __wt_ref_key.
+ */
+ union {
+ uint64_t recno; /* Column-store: starting recno */
+ void *ikey; /* Row-store: key */
+ } key;
+
+ WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
+};
+/*
+ * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
+ * the compiler hasn't inserted padding which would break the world.
+ */
+#define WT_REF_SIZE 48
+
+/*
+ * WT_ROW --
+ * Each in-memory row-store leaf page has an array of WT_ROW structures:
+ * this is created from on-page data when a page is read from the file. It's
+ * sorted by key, fixed in size, and starts with a reference to on-page data.
+ *
+ * Multiple threads of control may be searching the in-memory row-store pages,
+ * and the key may be instantiated at any time. Code must be able to handle
+ * both when the key has not been instantiated (the key field points into the
+ * page's disk image), and when the key has been instantiated (the key field
+ * points outside the page's disk image). We don't need barriers because the
+ * key is updated atomically, but code that reads the key field multiple times
+ * is a very, very bad idea. Specifically, do not do this:
+ *
+ * key = rip->key;
+ * if (key_is_on_page(key)) {
+ * cell = rip->key;
+ * }
+ *
+ * The field is declared volatile (so the compiler knows it shouldn't read it
+ * multiple times), and we obscure the field name and use a copy macro in all
+ * references to the field (so the code doesn't read it multiple times), all
+ * to make sure we don't introduce this bug (again).
+ */
+struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
+ void * volatile __key;
+};
+#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
+#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (void *)(v)
+
+/*
+ * WT_ROW_FOREACH --
+ * Walk the entries of an in-memory row-store leaf page.
+ */
+#define WT_ROW_FOREACH(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i))
+#define WT_ROW_FOREACH_REVERSE(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \
+ (i) > 0; --(rip), --(i))
+
+/*
+ * WT_ROW_SLOT --
+ * Return the 0-based array offset based on a WT_ROW reference.
+ */
+#define WT_ROW_SLOT(page, rip) \
+ ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d))
+
+/*
+ * WT_COL --
+ * Each in-memory variable-length column-store leaf page has an array of WT_COL
+ * structures: this is created from on-page data when a page is read from the
+ * file. It's fixed in size, and references data on the page.
+ */
+struct __wt_col {
+ /*
+ * Variable-length column-store data references are page offsets, not
+ * pointers (we boldly re-invent short pointers). The trade-off is 4B
+ * per K/V pair on a 64-bit machine vs. a single cycle for the addition
+ * of a base pointer. The on-page data is a WT_CELL (same as row-store
+ * pages).
+ *
+ * If the value is 0, it's a single, deleted record.
+ *
+ * Obscure the field name, code shouldn't use WT_COL->__col_value, the
+ * public interface is WT_COL_PTR and WT_COL_PTR_SET.
+ */
+ uint32_t __col_value;
+};
+
+/*
+ * WT_COL_RLE --
+ * In variable-length column store leaf pages, we build an array of entries
+ * with RLE counts greater than 1 when reading the page. We can do a binary
+ * search in this array, then an offset calculation to find the cell.
+ */
+struct __wt_col_rle {
+ uint64_t recno; /* Record number of first repeat. */
+ uint64_t rle; /* Repeat count. */
+ uint32_t indx; /* Slot of entry in col_var.d */
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_COL_PTR, WT_COL_PTR_SET --
+ * Return/Set a pointer corresponding to the data offset. (If the item does
+ * not exist on the page, return a NULL.)
+ */
+#define WT_COL_PTR(page, cip) \
+ ((cip)->__col_value == 0 ? \
+ NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value))
+#define WT_COL_PTR_SET(cip, value) \
+ (cip)->__col_value = (value)
+
+/*
+ * WT_COL_FOREACH --
+ *	Walk the entries of a variable-length column-store leaf page.
+ */
+#define WT_COL_FOREACH(page, cip, i) \
+ for ((i) = (page)->pg_var_entries, \
+ (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i))
+
+/*
+ * WT_COL_SLOT --
+ * Return the 0-based array offset based on a WT_COL reference.
+ */
+#define WT_COL_SLOT(page, cip) \
+ ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d))
+
+/*
+ * WT_IKEY --
+ * Instantiated key: row-store keys are usually prefix compressed and sometimes
+ * Huffman encoded or overflow objects. Normally, a row-store page in-memory
+ * key points to the on-page WT_CELL, but in some cases, we instantiate the key
+ * in memory, in which case the row-store page in-memory key points to a WT_IKEY
+ * structure.
+ */
+struct __wt_ikey {
+ uint32_t size; /* Key length */
+
+ /*
+ * If we no longer point to the key's on-page WT_CELL, we can't find its
+ * related value. Save the offset of the key cell in the page.
+ *
+ * Row-store cell references are page offsets, not pointers (we boldly
+ * re-invent short pointers). The trade-off is 4B per K/V pair on a
+ * 64-bit machine vs. a single cycle for the addition of a base pointer.
+ */
+ uint32_t cell_offset;
+
+ /* The key bytes immediately follow the WT_IKEY structure. */
+#define WT_IKEY_DATA(ikey) \
+ ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
+};
+
+/*
+ * WT_UPDATE --
+ * Entries on leaf pages can be updated, either modified or deleted. Updates
+ * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
+ * page's WT_UPDATE array. When the first element on a page is updated, the
+ * WT_UPDATE array is allocated, with one slot for every existing element in
+ * the page. A slot points to a WT_UPDATE structure; if more than one update
+ * is done for an entry, WT_UPDATE structures are formed into a forward-linked
+ * list.
+ */
+struct __wt_update {
+ uint64_t txnid; /* update transaction */
+
+ WT_UPDATE *next; /* forward-linked list */
+
+ /*
+ * We use the maximum size as an is-deleted flag, which means we can't
+ * store 4GB objects; I'd rather do that than increase the size of this
+ * structure for a flag bit.
+ */
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+ uint32_t size; /* update length */
+
+ /* The untyped value immediately follows the WT_UPDATE structure. */
+#define WT_UPDATE_DATA(upd) \
+ ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE)))
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_INSERT --
+ * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
+ * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
+ * every existing element in the page, plus one additional slot. A slot points
+ * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
+ * element that references it and before the subsequent WT_ROW element; the
+ * skiplist structure has a randomly chosen depth of next pointers in each
+ * inserted node.
+ *
+ * The additional slot exists because it's possible to insert items that sort
+ * before any key on the page: the first slot of the insert array holds those
+ * keys.
+ *
+ * In column-store variable-length run-length encoded pages, a single indx
+ * entry may reference a large number of records, because there's a single
+ * on-page entry representing many identical records. (We don't expand those
+ * entries when the page comes into memory, as that would require resources as
+ * pages are moved to/from the cache, including read-only files.) Instead, a
+ * single indx entry represents all of the identical records originally found
+ * on the page.
+ *
+ * Modifying (or deleting) run-length encoded column-store records is hard
+ * because the page's entry no longer references a set of identical items. We
+ * handle this by "inserting" a new entry into the insert array, with its own
+ * record number. (This is the only case where it's possible to insert into a
+ * column-store: only appends are allowed, as insert requires re-numbering
+ * subsequent records. Berkeley DB did support mutable records, but it won't
+ * scale and it isn't useful enough to re-implement, IMNSHO.)
+ */
+struct __wt_insert {
+ WT_UPDATE *upd; /* value */
+
+ union {
+ uint64_t recno; /* column-store record number */
+ struct {
+ uint32_t offset; /* row-store key data start */
+ uint32_t size; /* row-store key data size */
+ } key;
+ } u;
+
+#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)ins)->u.key.size)
+#define WT_INSERT_KEY(ins) \
+ ((void *)((uint8_t *)(ins) + ((WT_INSERT *)ins)->u.key.offset))
+#define WT_INSERT_RECNO(ins) (((WT_INSERT *)ins)->u.recno)
+
+ WT_INSERT *next[0]; /* forward-linked skip list */
+};
+
+/*
+ * Skiplist helper macros.
+ */
+#define WT_SKIP_FIRST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->head[0])
+#define WT_SKIP_LAST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->tail[0])
+#define WT_SKIP_NEXT(ins) ((ins)->next[0])
+#define WT_SKIP_FOREACH(ins, ins_head) \
+ for ((ins) = WT_SKIP_FIRST(ins_head); \
+ (ins) != NULL; \
+ (ins) = WT_SKIP_NEXT(ins))
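+
+/*
+ * For example, visiting every inserted K/V pair on a row-store slot's
+ * skiplist might look like this sketch ("visit" is hypothetical; the
+ * macros handle a NULL insert head):
+ *
+ *	WT_INSERT *ins;
+ *	WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
+ *		visit(WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), ins->upd);
+ */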
+
+/*
+ * Atomically allocate and swap a structure or array into place.
+ */
+#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
+ if (((v) = (dest)) == NULL) { \
+ WT_ERR(__wt_calloc_def(s, count, &(v))); \
+ if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ __wt_cache_page_inmem_incr( \
+ s, page, (count) * sizeof(*(v))); \
+ else \
+ __wt_free(s, v); \
+ } \
+} while (0)
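+
+/*
+ * A typical (illustrative) use, lazily creating a row-store update array:
+ * racing threads may both allocate, but only one CAS wins and the loser
+ * frees its copy. Note the macro uses WT_ERR, so the caller needs an err
+ * label:
+ *
+ *	WT_UPDATE **upd_array;
+ *	WT_PAGE_ALLOC_AND_SWAP(session, page,
+ *	    page->pg_row_upd, upd_array, page->pg_row_entries);
+ */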
+
+/*
+ * WT_INSERT_HEAD --
+ * The head of a skiplist of WT_INSERT items.
+ */
+struct __wt_insert_head {
+ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
+ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
+};
+
+/*
+ * The row-store leaf page insert lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_ROW_INSERT_SLOT(page, slot) \
+ ((page)->pg_row_ins == NULL ? NULL : (page)->pg_row_ins[slot])
+#define WT_ROW_INSERT(page, ip) \
+ WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
+#define WT_ROW_UPDATE(page, ip) \
+ ((page)->pg_row_upd == NULL ? \
+ NULL : (page)->pg_row_upd[WT_ROW_SLOT(page, ip)])
+/*
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of
+ * the "one per WT_ROW slot" insert array. That's because the insert array
+ * requires an extra slot to hold keys that sort before any key found on the
+ * original page.
+ */
+#define WT_ROW_INSERT_SMALLEST(page) \
+ ((page)->pg_row_ins == NULL ? \
+ NULL : (page)->pg_row_ins[(page)->pg_row_entries])
+
+/*
+ * The column-store leaf page update lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_COL_UPDATE_SLOT(page, slot) \
+ ((page)->modify == NULL || (page)->modify->mod_update == NULL ? \
+ NULL : (page)->modify->mod_update[slot])
+#define WT_COL_UPDATE(page, ip) \
+ WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
+
+/*
+ * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
+ * column-store updates for a page.
+ */
+#define WT_COL_UPDATE_SINGLE(page) \
+ WT_COL_UPDATE_SLOT(page, 0)
+
+/*
+ * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length
+ * appends.
+ */
+#define WT_COL_APPEND(page) \
+ ((page)->modify != NULL && (page)->modify->mod_append != NULL ? \
+ (page)->modify->mod_append[0] : NULL)
+
+/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
+#define WT_FIX_FOREACH(btree, dsk, v, i) \
+ for ((i) = 0, \
+ (v) = (i) < (dsk)->u.entries ? \
+ __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \
+ (i) < (dsk)->u.entries; ++(i), \
+ (v) = __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
+
+/*
+ * Manage split generation numbers. Splits walk the list of sessions to check
+ * when it is safe to free structures that have been replaced. We also check
+ * that list periodically (e.g., when wrapping up a transaction) to free any
+ * memory we can.
+ *
+ * Before a thread enters code that will examine page indexes (which are
+ * swapped out by splits), it publishes a copy of the current split generation
+ * into its session. Don't assume that threads never re-enter this code: if we
+ * already have a split generation, leave it alone. If our caller is examining
+ * an index, we don't want the oldest split generation to move forward and
+ * potentially free it.
+ */
+#define WT_ENTER_PAGE_INDEX(session) do { \
+ uint64_t __prev_split_gen = (session)->split_gen; \
+ if (__prev_split_gen == 0) \
+ WT_PUBLISH((session)->split_gen, S2C(session)->split_gen)
+
+#define WT_LEAVE_PAGE_INDEX(session) \
+ if (__prev_split_gen == 0) \
+ (session)->split_gen = 0; \
+ } while (0)
+
+#define WT_WITH_PAGE_INDEX(session, e) \
+ WT_ENTER_PAGE_INDEX(session); \
+ (e); \
+ WT_LEAVE_PAGE_INDEX(session)
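+
+/*
+ * For example, a caller about to walk an internal page's index might wrap
+ * the traversal as follows ("__ref_walk" is a hypothetical worker):
+ *
+ *	WT_WITH_PAGE_INDEX(session,
+ *	    ret = __ref_walk(session, ref));
+ */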
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
new file mode 100644
index 00000000000..05250951a65
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Supported btree formats: the "current" version is the maximum supported
+ * major/minor versions.
+ */
+#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
+#define WT_BTREE_MINOR_VERSION_MIN 1
+
+#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
+#define WT_BTREE_MINOR_VERSION_MAX 1
+
+/*
+ * The maximum btree leaf and internal page size is 512MB (2^29). The limit
+ * is enforced in software; it could be larger (specifically, the underlying
+ * default block manager can support 4GB, 2^32). Currently, the maximum page
+ * size must accommodate our dependence on the maximum page size fitting into
+ * a number of bits less than 32; see the row-store page key-lookup functions
+ * for the magic.
+ */
+#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
+
+/*
+ * The lengths of variable-length column-store values and row-store
+ * keys/values are stored in a 4B type, so the largest theoretical key/value
+ * item is 4GB. However, there are two constraints: first, the WT_UPDATE
+ * structure uses the UINT32_MAX size as a "deleted" flag, and second, the
+ * size of an overflow object is constrained by what an
+ * underlying block manager can actually write. (For example, in the default
+ * block manager, writing an overflow item includes the underlying block's page
+ * header and block manager specific structure, aligned to an allocation-sized
+ * unit). The btree engine limits the size of a single object to (4GB - 1KB);
+ * that gives us additional bytes if we ever want to store a structure length
+ * plus the object size in 4B, or if we need additional flag values. Attempts
+ * to store large key/value items in the tree trigger an immediate check to the
+ * block manager, to make sure it can write the item. Storing 4GB objects in a
+ * btree borders on clinical insanity, anyway.
+ *
+ * Record numbers are stored in 64-bit unsigned integers, meaning the largest
+ * record number is "really, really big".
+ */
+#define WT_BTREE_MAX_OBJECT_SIZE (UINT32_MAX - 1024)
+
+/*
+ * A location in a file is a variable-length cookie, but it has a maximum size
+ * so it's easy to create temporary space in which to store them. (Locations
+ * can't be much larger than this anyway; they must fit onto the minimum size
+ * page because a reference to an overflow page is itself a location.)
+ */
+#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
+
+/*
+ * WT_BTREE --
+ * A btree handle.
+ */
+struct __wt_btree {
+ WT_DATA_HANDLE *dhandle;
+
+ WT_CKPT *ckpt; /* Checkpoint information */
+
+ enum { BTREE_COL_FIX=1, /* Fixed-length column store */
+ BTREE_COL_VAR=2, /* Variable-length column store */
+ BTREE_ROW=3 /* Row-store */
+ } type; /* Type */
+
+ const char *key_format; /* Key format */
+ const char *value_format; /* Value format */
+ uint8_t bitcnt; /* Fixed-length field size in bits */
+
+ WT_COLLATOR *collator; /* Row-store comparator */
+ int collator_owned; /* The collator needs to be freed */
+
+ uint32_t id; /* File ID, for logging */
+
+ uint32_t key_gap; /* Row-store prefix key gap */
+
+ uint32_t allocsize; /* Allocation size */
+ uint32_t maxintlpage; /* Internal page max size */
+ uint32_t maxintlitem; /* Internal page max item size */
+ uint32_t maxleafpage; /* Leaf page max size */
+ uint32_t maxleafitem; /* Leaf page max item size */
+ uint64_t maxmempage; /* In memory page max size */
+
+ void *huffman_key; /* Key huffman encoding */
+ void *huffman_value; /* Value huffman encoding */
+
+ enum { CKSUM_ON=1, /* On */
+ CKSUM_OFF=2, /* Off */
+ CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */
+ } checksum; /* Checksum configuration */
+
+ u_int dictionary; /* Reconcile: dictionary slots */
+ int internal_key_truncate; /* Reconcile: internal key truncate */
+ int maximum_depth; /* Reconcile: maximum tree depth */
+ int prefix_compression; /* Reconcile: prefix compression */
+ u_int prefix_compression_min; /* Reconcile: prefix compression min */
+ int split_pct; /* Reconcile: split page percent */
+ WT_COMPRESSOR *compressor; /* Reconcile: page compressor */
+ WT_RWLOCK *ovfl_lock; /* Reconcile: overflow lock */
+
+ uint64_t last_recno; /* Column-store last record number */
+
+ WT_REF root; /* Root page reference */
+ int modified; /* If the tree ever modified */
+ int bulk_load_ok; /* Bulk-load is a possibility */
+
+ WT_BM *bm; /* Block manager reference */
+ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
+
+ uint64_t write_gen; /* Write generation */
+
+ WT_REF *evict_ref; /* Eviction thread's location */
+ uint64_t evict_priority; /* Relative priority of cached pages */
+ u_int evict_walk_period; /* Skip this many LRU walks */
+ u_int evict_walk_skips; /* Number of walks skipped */
+ volatile uint32_t evict_busy; /* Count of threads in eviction */
+
+ int checkpointing; /* Checkpoint in progress */
+
+ /*
+	 * We flush pages from the tree (in order to make checkpoints faster)
+	 * without holding a high-level lock. To avoid multiple threads flushing at
+ * the same time, lock the tree.
+ */
+ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
+
+ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */
+#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
+#define WT_BTREE_NO_EVICTION 0x00200 /* Disable eviction */
+#define WT_BTREE_NO_HAZARD 0x00400 /* Disable hazard pointers */
+#define WT_BTREE_SALVAGE 0x00800 /* Handle is for salvage */
+#define WT_BTREE_UPGRADE 0x01000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x02000 /* Handle is for verify */
+ uint32_t flags;
+};
+
+/* Flags that make a btree handle special (not for normal use). */
+#define WT_BTREE_SPECIAL_FLAGS \
+ (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+
+/*
+ * WT_SALVAGE_COOKIE --
+ * Encapsulation of salvage information for reconciliation.
+ */
+struct __wt_salvage_cookie {
+ uint64_t missing; /* Initial items to create */
+ uint64_t skip; /* Initial items to skip */
+ uint64_t take; /* Items to take */
+
+ int done; /* Ignore the rest */
+};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
new file mode 100644
index 00000000000..b7957e6647f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -0,0 +1,1216 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_ref_is_root --
+ * Return if the page reference is for the root page.
+ */
+static inline int
+__wt_ref_is_root(WT_REF *ref)
+{
+ return (ref->home == NULL ? 1 : 0);
+}
+
+/*
+ * __wt_page_is_modified --
+ * Return if the page is dirty.
+ */
+static inline int
+__wt_page_is_modified(WT_PAGE *page)
+{
+ return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0);
+}
+
+/*
+ * Estimate the per-allocation overhead. All implementations of malloc / free
+ * have some kind of header and pad for alignment. We can't know for sure what
+ * that adds up to, but this is an estimate based on some measurements of heap
+ * size versus bytes in use.
+ */
+#define WT_ALLOC_OVERHEAD 32U
+
+/*
+ * __wt_cache_page_inmem_incr --
+ * Increment a page's memory footprint in the cache.
+ */
+static inline void
+__wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_ADD8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_page_inmem_decr --
+ * Decrement a page's memory footprint in the cache.
+ */
+static inline void
+__wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_SUB8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_SUB8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_dirty_incr --
+ * Increment the cache dirty page/byte counts.
+ */
+static inline void
+__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1);
+
+ /*
+ * Take care to read the memory_footprint once in case we are racing
+ * with updates.
+ */
+ size = page->memory_footprint;
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_dirty_decr --
+ * Decrement the cache dirty page/byte counts.
+ */
+static inline void
+__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+
+ if (cache->pages_dirty < 1) {
+ (void)__wt_errx(session,
+ "cache dirty decrement failed: cache dirty page count went "
+ "negative");
+ cache->pages_dirty = 0;
+ } else
+ (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1);
+
+ /*
+ * It is possible to decrement the footprint of the page without making
+ * the page dirty (for example when freeing an obsolete update list),
+ * so the footprint could change between read and decrement, and we
+ * might attempt to decrement by a different amount than the bytes held
+ * by the page.
+ *
+ * We catch that by maintaining a per-page dirty size, and fixing the
+ * cache stats if that is non-zero when the page is discarded.
+ *
+ * Also take care that the global size doesn't go negative. This may
+ * lead to small accounting errors (particularly on the last page of the
+ * last file in a checkpoint), but that will come out in the wash when
+ * the page is evicted.
+ */
+ size = WT_MIN(page->memory_footprint, cache->bytes_dirty);
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_page_evict --
+ * Evict pages from the cache.
+ */
+static inline void
+__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ WT_PAGE_MODIFY *mod;
+
+ cache = S2C(session)->cache;
+ mod = page->modify;
+
+ /*
+ * In rare cases, we may race tracking a page's dirty footprint.
+ * If so, we will get here with a non-zero dirty_size in the page, and
+ * we can fix the global stats.
+ */
+ if (mod != NULL && mod->bytes_dirty != 0)
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, mod->bytes_dirty);
+
+ WT_ASSERT(session, page->memory_footprint != 0);
+ (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint);
+ page->memory_footprint = 0;
+
+ (void)WT_ATOMIC_ADD8(cache->pages_evict, 1);
+}
+
+/*
+ * __wt_cache_read_gen --
+ * Get the current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Increment the current read generation number.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ ++S2C(session)->cache->read_gen;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Get the read generation to store in a page.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
+ * __wt_page_refp --
+ * Return the page's index and slot for a reference.
+ */
+static inline void
+__wt_page_refp(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t i;
+
+ WT_ASSERT(session,
+ WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);
+
+ /*
+ * Copy the parent page's index value: the page can split at any time,
+ * but the index's value is always valid, even if it's not up-to-date.
+ */
+retry: pindex = WT_INTL_INDEX_COPY(ref->home);
+
+ /*
+ * Use the page's reference hint: it should be correct unless the page
+ * split before our slot. If the page splits after our slot, the hint
+ * will point earlier in the array than our actual slot, so the first
+ * loop is from the hint to the end of the list, and the second loop
+ * is from the start of the list to the end of the list. (The second
+	 * loop overlaps the first, but that only happens in cases where we've
+	 * deepened the tree and aren't going to find our slot at all; that's
+ * not worth optimizing.)
+ *
+ * It's not an error for the reference hint to be wrong, it just means
+ * the first retrieval (which sets the hint for subsequent retrievals),
+ * is slower.
+ */
+ for (i = ref->ref_hint; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+ for (i = 0; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+
+ /*
+ * If we don't find our reference, the page split into a new level and
+ * our home pointer references the wrong page. After internal pages
+	 * deepen, their reference structures' home values are updated; yield and
+ * wait for that to happen.
+ */
+ __wt_yield();
+ goto retry;
+}
+
+/*
+ * __wt_page_modify_init --
+ * A page is about to be modified, allocate the modification structure.
+ */
+static inline int
+__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (page->modify == NULL ?
+ __wt_page_modify_alloc(session, page) : 0);
+}
+
+/*
+ * __wt_page_only_modify_set --
+ * Mark the page (but only the page) dirty.
+ */
+static inline void
+__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ uint64_t last_running;
+
+ last_running = 0;
+ if (page->modify->write_gen == 0)
+ last_running = S2C(session)->txn_global.last_running;
+
+ /*
+ * We depend on atomic-add being a write barrier, that is, a barrier to
+ * ensure all changes to the page are flushed before updating the page
+ * write generation and/or marking the tree dirty, otherwise checkpoints
+ * and/or page reconciliation might be looking at a clean page/tree.
+ *
+ * Every time the page transitions from clean to dirty, update the cache
+ * and transactional information.
+ */
+ if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ __wt_cache_dirty_incr(session, page);
+
+ /*
+ * The page can never end up with changes older than the oldest
+ * running transaction.
+ */
+ if (F_ISSET(&session->txn, TXN_HAS_SNAPSHOT))
+ page->modify->disk_snap_min = session->txn.snap_min;
+
+ /*
+ * We won the race to dirty the page, but another thread could
+		 * have committed in the meantime, and the last_running field may
+		 * have been updated past it. That is all very unlikely, but not
+ * impossible, so we take care to read the global state before
+ * the atomic increment. If we raced with reconciliation, just
+ * leave the previous value here: at worst, we will write a
+ * page in a checkpoint when not absolutely necessary.
+ */
+ if (last_running != 0)
+ page->modify->first_dirty_txn = last_running;
+ }
+
+ /* Check if this is the largest transaction ID to update the page. */
+ if (TXNID_LT(page->modify->update_txn, session->txn.id))
+ page->modify->update_txn = session->txn.id;
+}
+
+/*
+ * __wt_page_modify_set --
+ * Mark the page and tree dirty.
+ */
+static inline void
+__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+	 * Mark the tree dirty (even if the page is already marked dirty):
+	 * newly created pages to support "empty" files are dirty, but the file
+	 * isn't marked dirty until there's a real change needing to be written.
+	 * Test before setting the dirty flag; it's a hot cache line.
+ *
+ * The tree's modified flag is cleared by the checkpoint thread: set it
+ * and insert a barrier before dirtying the page. (I don't think it's
+	 * a problem if the tree is marked dirty with all the pages clean; it
+	 * might result in an extra checkpoint that doesn't do any work, but it
+ * shouldn't cause problems; regardless, let's play it safe.)
+ */
+ if (S2BT(session)->modified == 0) {
+ S2BT(session)->modified = 1;
+ WT_FULL_BARRIER();
+ }
+
+ __wt_page_only_modify_set(session, page);
+}
+
+/*
+ * __wt_page_parent_modify_set --
+ * Mark the parent page and tree dirty.
+ */
+static inline int
+__wt_page_parent_modify_set(
+ WT_SESSION_IMPL *session, WT_REF *ref, int page_only)
+{
+ WT_PAGE *parent;
+
+ /*
+ * This function exists as a place to stash this comment. There are a
+ * few places where we need to dirty a page's parent. The trick is the
+ * page's parent might split at any point, and the page parent might be
+ * the wrong parent at any particular time. We ignore this and dirty
+ * whatever page the page's reference structure points to. This is safe
+ * because if we're pointing to the wrong parent, that parent must have
+ * split, deepening the tree, which implies marking the original parent
+ * and all of the newly-created children as dirty. In other words, if
+ * we have the wrong parent page, everything was marked dirty already.
+ */
+ parent = ref->home;
+ WT_RET(__wt_page_modify_init(session, parent));
+ if (page_only)
+ __wt_page_only_modify_set(session, parent);
+ else
+ __wt_page_modify_set(session, parent);
+ return (0);
+}
+
+/*
+ * __wt_off_page --
+ * Return if a pointer references off-page data.
+ */
+static inline int
+__wt_off_page(WT_PAGE *page, const void *p)
+{
+ /*
+ * There may be no underlying page, in which case the reference is
+ * off-page by definition.
+ */
+ return (page->dsk == NULL ||
+ p < (void *)page->dsk ||
+ p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
+}
+
+/*
+ * __wt_ref_key --
+ * Return a reference to a row-store internal page key as cheaply as
+ * possible.
+ */
+static inline void
+__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
+{
+ uintptr_t v;
+
+ /*
+ * An internal page key is in one of two places: if we instantiated the
+ * key (for example, when reading the page), WT_REF.key.ikey references
+ * a WT_IKEY structure, otherwise WT_REF.key.ikey references an on-page
+ * key offset/length pair.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page key, else
+ * it's a WT_IKEY reference. The bit pattern for internal row-store
+ * on-page keys is:
+ * 32 bits key length
+ * 31 bits page offset of the key's bytes,
+ *	 1 bit		flags
+ */
+#define WT_IK_FLAG 0x01
+#define WT_IK_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32)
+#define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1)
+#define WT_IK_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 1)
+ v = (uintptr_t)ref->key.ikey;
+ if (v & WT_IK_FLAG) {
+ *(void **)keyp =
+ WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
+ *sizep = WT_IK_DECODE_KEY_LEN(v);
+ } else {
+ *(void **)keyp = WT_IKEY_DATA(ref->key.ikey);
+ *sizep = ((WT_IKEY *)ref->key.ikey)->size;
+ }
+}
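+
+/*
+ * To make the encoding concrete: a 7-byte key at page offset 100 is stored
+ * as the word (7 << 32) | (100 << 1) | 0x01, and decoding reverses the
+ * shifts. An illustrative self-check:
+ *
+ *	uintptr_t v = WT_IK_ENCODE_KEY_LEN(7) |
+ *	    WT_IK_ENCODE_KEY_OFFSET(100) | WT_IK_FLAG;
+ *	WT_ASSERT(session, WT_IK_DECODE_KEY_LEN(v) == 7);
+ *	WT_ASSERT(session, WT_IK_DECODE_KEY_OFFSET(v) == 100);
+ */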
+
+/*
+ * __wt_ref_key_onpage_set --
+ * Set a WT_REF to reference an on-page key.
+ */
+static inline void
+__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
+ WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_IK_FLAG;
+ ref->key.ikey = (void *)v;
+}
+
+/*
+ * __wt_ref_key_instantiated --
+ * Return if a WT_REF key is instantiated.
+ */
+static inline WT_IKEY *
+__wt_ref_key_instantiated(WT_REF *ref)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = (uintptr_t)ref->key.ikey;
+ return (v & WT_IK_FLAG ? NULL : ref->key.ikey);
+}
+
+/*
+ * __wt_ref_key_clear --
+ * Clear a WT_REF key.
+ */
+static inline void
+__wt_ref_key_clear(WT_REF *ref)
+{
+ /* The key union has 2 fields, both of which are 8B. */
+ ref->key.recno = 0;
+}
+
+/*
+ * __wt_row_leaf_key_info --
+ * Return a row-store leaf page key referenced by a WT_ROW if it can be
+ * had without unpacking a cell, and information about the cell, if the key
+ * isn't cheaply available.
+ */
+static inline int
+__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
+ WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
+{
+ WT_IKEY *ikey;
+ uintptr_t v;
+
+ v = (uintptr_t)copy;
+
+ /*
+ * A row-store leaf page key is in one of two places: if instantiated,
+ * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
+ * references an on-page offset. Further, on-page keys are in one of
+ * two states: if the key is a simple key (not an overflow key, prefix
+ * compressed or Huffman encoded, all of which are likely), the key's
+ * offset/size is encoded in the pointer. Otherwise, the offset is to
+ * the key's on-page cell.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page cell, bit
+ * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
+ * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
+ * is:
+ * 29 bits page offset of the key's cell,
+ * 2 bits flags
+ *
+ * The bit pattern for on-page keys is:
+ * 32 bits key length,
+ * 29 bits page offset of the key's bytes,
+ * 2 bits flags
+ *
+ * But, while that allows us to skip decoding simple key cells, we also
+ * want to skip decoding the value cell in the case where the value cell
+ * is also simple/short. We use bit 0x03 to mark an encoded on-page key
+ * and value pair. The bit pattern for on-page key/value pairs is:
+ * 9 bits key length,
+ * 13 bits value length,
+ * 20 bits page offset of the key's bytes,
+ * 20 bits page offset of the value's bytes,
+ * 2 bits flags
+ *
+ * These bit patterns are in-memory only, of course, so can be modified
+ * (we could even tune for specific workloads). Generally, the fields
+ * are larger than the anticipated values being stored (512B keys, 8KB
+ * values, 1MB pages), hopefully that won't be necessary.
+ *
+ * This function returns a list of things about the key (instantiation
+ * reference, cell reference and key/length pair). Our callers know
+ * the order in which we look things up and the information returned;
+ * for example, the cell will never be returned if we are working with
+ * an on-page key.
+ */
+#define WT_CELL_FLAG 0x01
+#define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_CELL_DECODE_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_K_FLAG 0x02
+#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32)
+#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_K_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_KV_FLAG 0x03
+#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55)
+#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55)
+#define WT_KV_MAX_KEY_LEN (0x200 - 1)
+#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42)
+#define WT_KV_DECODE_VALUE_LEN(v) (((v) & 0x007FFC0000000000) >> 42)
+#define WT_KV_MAX_VALUE_LEN (0x2000 - 1)
+#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22)
+#define WT_KV_DECODE_KEY_OFFSET(v) (((v) & 0x000003FFFFC00000) >> 22)
+#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1)
+#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_KV_DECODE_VALUE_OFFSET(v) (((v) & 0x00000000003FFFFC) >> 2)
+#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1)
+ switch (v & 0x03) {
+ case WT_CELL_FLAG:
+ /* On-page cell: no instantiated key. */
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (cellp != NULL)
+ *cellp =
+ WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
+ return (0);
+ case WT_K_FLAG:
+ /* Encoded key: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap =
+ WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
+ *sizep = WT_K_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+ case WT_KV_FLAG:
+ /* Encoded key/value pair: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap = WT_PAGE_REF_OFFSET(
+ page, WT_KV_DECODE_KEY_OFFSET(v));
+ *sizep = WT_KV_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+	}
+
+ /* Instantiated key. */
+ ikey = copy;
+ if (ikeyp != NULL)
+ *ikeyp = copy;
+ if (cellp != NULL)
+ *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ if (datap != NULL) {
+ *(void **)datap = WT_IKEY_DATA(ikey);
+ *sizep = ikey->size;
+ return (1);
+ }
+ return (0);
+}
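+
+/*
+ * Again to make the magic concrete, an illustrative round-trip through the
+ * key/value encoding: a 5-byte key at page offset 64 and a 10-byte value
+ * at page offset 80 pack into a single word and decode back unchanged:
+ *
+ *	uintptr_t v = WT_KV_ENCODE_KEY_LEN(5) | WT_KV_ENCODE_VALUE_LEN(10) |
+ *	    WT_KV_ENCODE_KEY_OFFSET(64) | WT_KV_ENCODE_VALUE_OFFSET(80) |
+ *	    WT_KV_FLAG;
+ *	WT_ASSERT(session, WT_KV_DECODE_KEY_LEN(v) == 5);
+ *	WT_ASSERT(session, WT_KV_DECODE_VALUE_OFFSET(v) == 80);
+ */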
+
+/*
+ * __wt_row_leaf_key_set_cell --
+ * Set a WT_ROW to reference an on-page row-store leaf cell.
+ */
+static inline void
+__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
+ WT_CELL_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key_set --
+ * Set a WT_ROW to reference an on-page row-store leaf key.
+ */
+static inline void
+__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ v = WT_K_ENCODE_KEY_LEN(unpack->size) |
+ WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_K_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_value_set --
+ * Set a WT_ROW to reference an on-page row-store leaf value.
+ */
+static inline void
+__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t key_len, key_offset, value_offset, v;
+
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+	if (!(v & WT_K_FLAG))			/* Not a simple on-page key */
+ return;
+
+ key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */
+ if (key_len > WT_KV_MAX_KEY_LEN)
+ return;
+ if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */
+ return;
+
+ key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */
+ if (key_offset > WT_KV_MAX_KEY_OFFSET)
+ return;
+ value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
+ if (value_offset > WT_KV_MAX_VALUE_OFFSET)
+ return;
+
+ v = WT_KV_ENCODE_KEY_LEN(key_len) |
+ WT_KV_ENCODE_VALUE_LEN(unpack->size) |
+ WT_KV_ENCODE_KEY_OFFSET(key_offset) |
+ WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key --
+ * Set a buffer to reference a row-store leaf page key as cheaply as
+ * possible.
+ */
+static inline int
+__wt_row_leaf_key(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate)
+{
+ void *copy;
+
+ /*
+ * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
+ *
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * All we handle here are on-page keys (which should be a common case),
+ * and instantiated keys (which start out rare, but become more common
+ * as a leaf page is searched, instantiating prefix-compressed keys).
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, NULL, &key->data, &key->size))
+ return (0);
+
+ /*
+ * The alternative is an on-page cell with some kind of compressed or
+ * overflow key that's never been instantiated. Call the underlying
+ * worker function to figure it out.
+ */
+ return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
+}
+
+/*
+ * __wt_cursor_row_leaf_key --
+ * Set a buffer to reference a cursor-referenced row-store leaf page key.
+ */
+static inline int
+__wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key)
+{
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * If the cursor references a WT_INSERT item, take the key from there,
+ * else take the key from the original page.
+ */
+ if (cbt->ins == NULL) {
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ rip = &page->u.row.d[cbt->slot];
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
+ } else {
+ key->data = WT_INSERT_KEY(cbt->ins);
+ key->size = WT_INSERT_KEY_SIZE(cbt->ins);
+ }
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_value_cell --
+ * Return a pointer to the value cell for a row-store leaf page key, or
+ * NULL if there isn't one.
+ */
+static inline WT_CELL *
+__wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
+{
+ WT_CELL *kcell, *vcell;
+ WT_CELL_UNPACK unpack;
+ void *copy, *key;
+ size_t size;
+
+ /* If we already have an unpacked key cell, use it. */
+ if (kpack != NULL)
+ vcell = (WT_CELL *)
+ ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
+ else {
+ /*
+ * The row-store key can change underfoot; explicitly take a
+ * copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out where the key is, step past it to the value cell.
+ * The test for a cell not being set tells us that we have an
+ * on-page key, otherwise we're looking at an instantiated key
+ * or on-page cell, both of which require an unpack of the key's
+ * cell to find the value cell that follows.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
+ vcell = (WT_CELL *)((uint8_t *)key + size);
+ else {
+ __wt_cell_unpack(kcell, &unpack);
+ vcell = (WT_CELL *)((uint8_t *)
+ unpack.cell + __wt_cell_total_len(&unpack));
+ }
+ }
+
+ return (__wt_cell_leaf_value_parse(page, vcell));
+}
+
+/*
+ * __wt_row_leaf_value --
+ * Return the value for a row-store leaf page encoded key/value pair.
+ */
+static inline int
+__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
+{
+ uintptr_t v;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ if ((v & 0x03) == WT_KV_FLAG) {
+ value->data =
+ WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
+ value->size = WT_KV_DECODE_VALUE_LEN(v);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __wt_ref_info --
+ * Return the addr/size and type triplet for a reference.
+ */
+static inline int
+__wt_ref_info(WT_SESSION_IMPL *session,
+ WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK *unpack, _unpack;
+
+ addr = ref->addr;
+ unpack = &_unpack;
+
+ /*
+ * If NULL, there is no location.
+ * If off-page, the pointer references a WT_ADDR structure.
+ * If on-page, the pointer references a cell.
+ *
+ * The type is of a limited set: internal, leaf or no-overflow leaf.
+ */
+ if (addr == NULL) {
+ *addrp = NULL;
+ *sizep = 0;
+ if (typep != NULL)
+ *typep = 0;
+ } else if (__wt_off_page(ref->home, addr)) {
+ *addrp = addr->addr;
+ *sizep = addr->size;
+ if (typep != NULL)
+ switch (addr->type) {
+ case WT_ADDR_INT:
+ *typep = WT_CELL_ADDR_INT;
+ break;
+ case WT_ADDR_LEAF:
+ *typep = WT_CELL_ADDR_LEAF;
+ break;
+ case WT_ADDR_LEAF_NO:
+ *typep = WT_CELL_ADDR_LEAF_NO;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ } else {
+ __wt_cell_unpack((WT_CELL *)addr, unpack);
+ *addrp = unpack->data;
+ *sizep = unpack->size;
+ if (typep != NULL)
+ *typep = unpack->type;
+ }
+ return (0);
+}
+
+/*
+ * __wt_page_release --
+ * Release a reference to a page.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int locked;
+
+ btree = S2BT(session);
+
+ /*
+ * Discard our hazard pointer. Ignore pages we don't have and the root
+ * page, which sticks in memory, regardless.
+ */
+ if (ref == NULL || __wt_ref_is_root(ref))
+ return (0);
+ page = ref->page;
+
+ /*
+ * Attempt to evict pages with the special "oldest" read generation.
+ *
+ * This is set for pages that grow larger than the configured
+ * memory_page_max setting, and when we are attempting to scan without
+ * trashing the cache.
+ *
+ * Skip this if eviction is disabled for this operation or this tree,
+ * or if there is no chance of eviction succeeding for dirty pages due
+ * to a checkpoint or because we've already tried writing this page and
+ * it contains an update that isn't stable.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ page->read_gen != WT_READGEN_OLDEST ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
+ (__wt_page_is_modified(page) && (btree->checkpointing ||
+ !__wt_txn_visible_all(session, page->modify->first_dirty_txn))))
+ return (__wt_hazard_clear(session, page));
+
+ /*
+ * Take some care with order of operations: if we release the hazard
+ * reference without first locking the page, it could be evicted in
+ * between.
+ */
+ locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ WT_TRET(__wt_hazard_clear(session, page));
+ if (!locked)
+ return (ret);
+
+ (void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+ if ((ret = __wt_evict_page(session, ref)) == 0)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
+ if (ret == EBUSY)
+ ret = 0;
+ }
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_page_swap_func --
+ * Swap one page's hazard pointer for another one when hazard pointer
+ * coupling up/down the tree.
+ */
+static inline int
+__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
+ WT_REF *want, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ int acquired;
+
+ /*
+ * This function is here to simplify the error handling during hazard
+ * pointer coupling so we never leave a hazard pointer dangling. The
+ * assumption is we're holding a hazard pointer on "held", and want to
+ * acquire a hazard pointer on "want", releasing the hazard pointer on
+ * "held" when we're done.
+ */
+ ret = __wt_page_in_func(session, want, flags
+#ifdef HAVE_DIAGNOSTIC
+ , file, line
+#endif
+ );
+
+ /* An expected failure: WT_NOTFOUND when doing a cache-only read. */
+ if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
+ return (WT_NOTFOUND);
+
+ /* An expected failure: WT_RESTART */
+ if (ret == WT_RESTART)
+ return (WT_RESTART);
+
+ /* Discard the original held page. */
+ acquired = ret == 0;
+ WT_TRET(__wt_page_release(session, held, flags));
+
+ /*
+ * If there was an error discarding the original held page, discard
+ * the acquired page too, keeping it is never useful.
+ */
+ if (acquired && ret != 0)
+ WT_TRET(__wt_page_release(session, want, flags));
+ return (ret);
+}
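+
+/*
+ * An illustrative descent loop using hazard-pointer coupling, running until
+ * a leaf page is reached; the child selection ("pick_child") and the
+ * __wt_page_swap wrapper that supplies the diagnostic file/line arguments
+ * are sketched, not exact:
+ *
+ *	for (ref = &btree->root;;) {
+ *		child = pick_child(ref->page);
+ *		WT_RET(__wt_page_swap(session, ref, child, 0));
+ *		ref = child;
+ *	}
+ */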
+
+/*
+ * __wt_page_hazard_check --
+ * Return if there's a hazard pointer to the page in the system.
+ */
+static inline WT_HAZARD *
+__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_HAZARD *hp;
+ WT_SESSION_IMPL *s;
+ uint32_t i, hazard_size, session_cnt;
+
+ conn = S2C(session);
+
+ /*
+ * No lock is required because the session array is fixed size, but it
+ * may contain inactive entries. We must review any active session
+ * that might contain a hazard pointer, so insert a barrier before
+ * reading the active session count. That way, no matter what sessions
+ * come or go, we'll check the slots for all of the sessions that could
+ * have been active when we started our check.
+ */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
+ if (!s->active)
+ continue;
+ WT_ORDERED_READ(hazard_size, s->hazard_size);
+ for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp)
+ if (hp->page == page)
+ return (hp);
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_skip_choose_depth --
+ * Randomly choose a depth for a skiplist insert.
+ */
+static inline u_int
+__wt_skip_choose_depth(WT_SESSION_IMPL *session)
+{
+ u_int d;
+
+ for (d = 1; d < WT_SKIP_MAXDEPTH &&
+ __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++)
+ ;
+ return (d);
+}
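+
+/*
+ * Assuming WT_SKIP_PROBABILITY is a quarter of the random range (the usual
+ * configuration), depth d is chosen with probability (3/4) * (1/4)^(d-1):
+ * the expected depth is 4/3 and deep towers are exponentially rare.
+ */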
+
+/*
+ * __wt_btree_size_overflow --
+ * Check if the size of an in-memory tree with a single leaf page is over
+ * a specified maximum. If called on anything other than a simple tree with a
+ * single leaf page, returns true so the calling code will switch to a new tree.
+ */
+static inline int
+__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize)
+{
+ WT_BTREE *btree;
+ WT_PAGE *child, *root;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *first;
+
+ btree = S2BT(session);
+ root = btree->root.page;
+
+ /* Check for a non-existent tree. */
+ if (root == NULL)
+ return (0);
+
+ /* A tree that can be evicted always requires a switch. */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (1);
+
+ /* Check for a tree with a single leaf page. */
+ pindex = WT_INTL_INDEX_COPY(root);
+ if (pindex->entries != 1) /* > 1 child page, switch */
+ return (1);
+
+ first = pindex->index[0];
+ if (first->state != WT_REF_MEM) /* no child page, ignore */
+ return (0);
+
+ /*
+ * We're reaching down into the page without a hazard pointer, but
+ * that's OK because we know that no-eviction is set and so the page
+ * cannot disappear.
+ */
+ child = first->page;
+ if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
+ return (1);
+
+ return (child->memory_footprint > maxsize);
+}
+
+/*
+ * __wt_lex_compare --
+ * Lexicographic comparison routine.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz);
+
+ for (userp = user_item->data, treep = tree_item->data;
+ len > 0;
+ --len, ++userp, ++treep)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
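+
+/*
+ * For example, comparing user item "abc" with tree item "abd" returns -1 at
+ * the third byte; comparing "ab" with "abc" exhausts the shorter item and
+ * returns -1 because the shorter item sorts first.
+ */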
+
+/*
+ * __wt_compare --
+ * The same as __wt_lex_compare, but using the application's collator
+ * function when configured.
+ */
+static inline int
+__wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp)
+{
+ if (collator == NULL) {
+ *cmpp = __wt_lex_compare(user_item, tree_item);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
+
+/*
+ * __wt_lex_compare_skip --
+ * Lexicographic comparison routine, skipping leading bytes.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare_skip(
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, size_t *matchp)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz) - *matchp;
+
+ for (userp = (uint8_t *)user_item->data + *matchp,
+ treep = (uint8_t *)tree_item->data + *matchp;
+ len > 0;
+ --len, ++userp, ++treep, ++*matchp)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
+
+/*
+ * __wt_compare_skip --
+ * The same as __wt_lex_compare_skip, but using the application's collator
+ * function when configured.
+ */
+static inline int
+__wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp,
+ size_t *matchp)
+{
+ if (collator == NULL) {
+ *cmpp = __wt_lex_compare_skip(user_item, tree_item, matchp);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
diff --git a/src/third_party/wiredtiger/src/include/buf.i b/src/third_party/wiredtiger/src/include/buf.i
new file mode 100644
index 00000000000..09bee9ff831
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/buf.i
@@ -0,0 +1,133 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_buf_grow --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer.
+ */
+static inline int
+__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ return (size > buf->memsize || !WT_DATA_IN_ITEM(buf) ?
+ __wt_buf_grow_worker(session, buf, size) : 0);
+}
+
+/*
+ * __wt_buf_extend --
+ * Grow a buffer that's currently in-use.
+ */
+static inline int
+__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ /*
+ * The difference between __wt_buf_grow and __wt_buf_extend is that the
+ * latter is expected to be called repeatedly for the same buffer, and
+ * so grows the buffer exponentially to avoid repeated costly calls to
+ * realloc.
+ */
+ return (size > buf->memsize ?
+ __wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize)) : 0);
+}
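+
+/*
+ * For example, appending one byte at a time to a full 4KB buffer grows it
+ * to 8KB on the first overflow rather than reallocating per byte: N
+ * appends cost O(log N) reallocations instead of O(N).
+ */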
+
+/*
+ * __wt_buf_init --
+ * Initialize a buffer at a specific size.
+ */
+static inline int
+__wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem;
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+
+ return (0);
+}
+
+/*
+ * __wt_buf_initsize --
+ * Initialize a buffer at a specific size, and set the data length.
+ */
+static inline int
+__wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem;
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+ buf->size = size; /* Set the data length. */
+
+ return (0);
+}
+
+/*
+ * __wt_buf_set --
+ * Set the contents of the buffer.
+ */
+static inline int
+__wt_buf_set(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size)
+{
+ /* Ensure the buffer is large enough. */
+ WT_RET(__wt_buf_initsize(session, buf, size));
+
+ /* Copy the data, allowing for overlapping strings. */
+ memmove(buf->mem, data, size);
+
+ return (0);
+}
+
+/*
+ * __wt_buf_setstr --
+ * Set the contents of the buffer to a NUL-terminated string.
+ */
+static inline int
+__wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s)
+{
+ return (__wt_buf_set(session, buf, s, strlen(s) + 1));
+}
+
+/*
+ * __wt_buf_set_printable --
+ * Set the contents of the buffer to a printable representation of a
+ * byte string.
+ */
+static inline int
+__wt_buf_set_printable(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size)
+{
+ return (__wt_raw_to_esc_hex(session, from_arg, size, buf));
+}
+
+/*
+ * __wt_buf_free --
+ * Free a buffer.
+ */
+static inline void
+__wt_buf_free(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ __wt_free(session, buf->mem);
+
+ memset(buf, 0, sizeof(WT_ITEM));
+}
+
+/*
+ * __wt_scr_free --
+ * Release a scratch buffer.
+ */
+static inline void
+__wt_scr_free(WT_ITEM **bufp)
+{
+ WT_ITEM *buf;
+
+ if ((buf = *bufp) != NULL) {
+ *bufp = NULL;
+
+ buf->data = NULL;
+ buf->size = 0;
+ F_CLR(buf, WT_ITEM_INUSE);
+ }
+}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
new file mode 100644
index 00000000000..b7dbd8401a9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Tuning constants: I hesitate to call this tuning, but we want to review some
+ * number of pages from each file's in-memory tree for each page we evict.
+ */
+#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal
+ pages by this many increments of the
+ read generation. */
+#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
+#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
+
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+
+/*
+ * WT_EVICT_ENTRY --
+ * Encapsulation of an eviction candidate.
+ */
+struct __wt_evict_entry {
+ WT_BTREE *btree; /* Enclosing btree object */
+ WT_REF *ref; /* Page to flush/evict */
+};
+
+/*
+ * WT_EVICT_WORKER --
+ * Encapsulation of an eviction worker thread.
+ */
+
+struct __wt_evict_worker {
+ WT_SESSION_IMPL *session;
+ u_int id;
+ wt_thread_t tid;
+#define WT_EVICT_WORKER_RUN 0x01
+ uint32_t flags;
+};
+
+/*
+ * WiredTiger cache structure.
+ */
+struct __wt_cache {
+ /*
+ * Different threads read/write pages to/from the cache and create pages
+ * in the cache, so we cannot know precisely how much memory is in use
+ * at any specific time. However, even though the values don't have to
+	 * be exact, they can't be garbage: we track what comes in and what goes
+ * out and calculate the difference as needed.
+ */
+ uint64_t bytes_inmem; /* Bytes/pages in memory */
+ uint64_t pages_inmem;
+ uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
+ uint64_t pages_evict;
+ uint64_t bytes_dirty; /* Bytes/pages currently dirty */
+ uint64_t pages_dirty;
+
+ /*
+ * Read information.
+ */
+ uint64_t read_gen; /* Page read generation (LRU) */
+
+ /*
+ * Eviction thread information.
+ */
+ WT_CONDVAR *evict_cond; /* Eviction server condition */
+ WT_SPINLOCK evict_lock; /* Eviction LRU queue */
+ WT_SPINLOCK evict_walk_lock; /* Eviction walk location */
+ /* Condition signalled when the eviction server populates the queue */
+ WT_CONDVAR *evict_waiter_cond;
+
+ u_int eviction_trigger; /* Percent to trigger eviction */
+ u_int eviction_target; /* Percent to end eviction */
+ u_int eviction_dirty_target; /* Percent to allow dirty */
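+	/*
+	 * For example (illustrative numbers, not defaults taken from this
+	 * change): with eviction_trigger 95 and eviction_target 80,
+	 * application threads wake the server once the cache passes 95%
+	 * full, and the server keeps evicting until usage drops to 80%.
+	 */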
+
+ /*
+ * LRU eviction list information.
+ */
+ WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
+ WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
+ uint32_t evict_candidates; /* LRU list pages to evict */
+ uint32_t evict_entries; /* LRU entries in the queue */
+ volatile uint32_t evict_max; /* LRU maximum eviction slot used */
+ uint32_t evict_slots; /* LRU list eviction slots */
+ WT_DATA_HANDLE
+ *evict_file_next; /* LRU next file to search */
+
+ /*
+ * Sync/flush request information.
+ */
+ volatile uint64_t sync_request; /* File sync requests */
+ volatile uint64_t sync_complete;/* File sync requests completed */
+
+ /*
+ * Cache pool information.
+ */
+ uint64_t cp_saved_evict; /* Evict count from last pass */
+ uint64_t cp_current_evict; /* Evict count from current pass */
+ uint32_t cp_skip_count; /* Post change stabilization */
+ uint64_t cp_reserved; /* Base size for this cache */
+ WT_SESSION_IMPL *cp_session; /* May be used for cache management */
+ wt_thread_t cp_tid; /* Thread ID for cache pool manager */
+
+ /*
+ * Flags.
+ */
+#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
+#define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */
+#define WT_EVICT_ACTIVE 0x04 /* Eviction server is active */
+#define WT_EVICT_CLEAR_WALKS 0x08 /* Clear eviction walks */
+#define WT_EVICT_NO_PROGRESS 0x10 /* Check if pages are being evicted */
+#define WT_EVICT_STUCK 0x20 /* Eviction server is stuck */
+ uint32_t flags;
+};
+
+/*
+ * WT_CACHE_POOL --
+ * A structure that represents a shared cache.
+ */
+struct __wt_cache_pool {
+ WT_SPINLOCK cache_pool_lock;
+ WT_CONDVAR *cache_pool_cond;
+ const char *name;
+ uint64_t size;
+ uint64_t chunk;
+ uint64_t currently_used;
+ uint32_t refs; /* Reference count for structure. */
+ /* Locked: List of connections participating in the cache pool. */
+ TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh;
+
+#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */
+#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */
+ uint8_t flags_atomic;
+};
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
new file mode 100644
index 00000000000..fdb7302f4a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_eviction_check --
+ * Wake the eviction server if necessary.
+ */
+static inline int
+__wt_eviction_check(WT_SESSION_IMPL *session, int *fullp, int wake)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * If we're over the maximum cache, shut out reads (which include page
+	 * allocations) until we evict back under the maximum cache. Eviction
+	 * will keep pushing out pages so we don't run on the edge all the
+	 * time. Avoid division by zero if the cache size has not yet been set
+	 * in a shared cache.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size + 1;
+
+ /* Calculate the cache full percentage. */
+ *fullp = (int)((100 * bytes_inuse) / bytes_max);
+
+ /* Wake eviction when we're over the trigger cache size. */
+ if (wake &&
+ (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100 ||
+ dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100))
+ WT_RET(__wt_evict_server_wake(session));
+ return (0);
+}
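+
+/*
+ * A worked example of the arithmetic above, with illustrative numbers not
+ * taken from this change: a 100MB cache with 90MB in use yields
+ * *fullp == (100 * 90MB) / (100MB + 1) == 89; with a 95% eviction trigger,
+ * the eviction server stays asleep until usage crosses 95MB.
+ */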
+
+/*
+ * __wt_session_can_wait --
+ *	Return if a session is available for a potentially slow operation.
+ */
+static inline int
+__wt_session_can_wait(WT_SESSION_IMPL *session)
+{
+ /*
+	 * Return if a session is available for a potentially slow operation;
+ * for example, used by the block manager in the case of flushing
+ * the system cache.
+ */
+ if (!F_ISSET(session, WT_SESSION_CAN_WAIT))
+ return (0);
+
+ /*
+	 * LSM sets the no-cache-check flag when holding the LSM tree lock;
+	 * in that case, or when holding the schema lock, we don't want to
+	 * hijack the thread for eviction.
+ */
+ if (F_ISSET(session,
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __wt_cache_full_check --
+ * Wait for there to be space in the cache before a read or update.
+ */
+static inline int
+__wt_cache_full_check(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+ int busy, count, full;
+
+ /*
+	 * LSM sets the no-cache-check flag when holding the LSM tree lock;
+	 * in that case, or when holding the schema lock, we don't want to
+	 * hijack the thread for eviction.
+ */
+ if (F_ISSET(session,
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ return (0);
+
+ /*
+ * Threads operating on trees that cannot be evicted are ignored,
+ * mostly because they're not contributing to the problem.
+ */
+ if ((btree = S2BT_SAFE(session)) != NULL &&
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * Only wake the eviction server the first time through here (if the
+ * cache is too full).
+ *
+	 * If the cache is less than 95% full, there's no work to be done.
+ */
+ WT_RET(__wt_eviction_check(session, &full, 1));
+ if (full < 95)
+ return (0);
+
+ /*
+ * If we are at the API boundary and the cache is more than 95% full,
+ * try to evict at least one page before we start an operation. This
+ * helps with some eviction-dominated workloads.
+ *
+ * If the current transaction is keeping the oldest ID pinned, it is in
+ * the middle of an operation. This may prevent the oldest ID from
+ * moving forward, leading to deadlock, so only evict what we can.
+ * Otherwise, we are at a transaction boundary and we can work harder
+ * to make sure there is free space in the cache.
+ */
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+ busy = txn_state->id != WT_TXN_NONE ||
+ session->nhazard > 0 ||
+ (txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id);
+ if (busy && full < 100)
+ return (0);
+ count = busy ? 1 : 10;
+
+ for (;;) {
+ switch (ret = __wt_evict_lru_page(session, 1)) {
+ case 0:
+ if (--count == 0)
+ return (0);
+ break;
+ case EBUSY:
+ continue;
+ case WT_NOTFOUND:
+ break;
+ default:
+ return (ret);
+ }
+
+ WT_RET(__wt_eviction_check(session, &full, 0));
+ if (full < 100)
+ return (0);
+ else if (ret == 0)
+ continue;
+
+ /*
+ * The cache is still full and no pages were found in the queue
+ * to evict. If this transaction is the one holding back the
+ * oldest ID, we can't wait forever. We'll block next time we
+ * are not busy.
+ */
+ if (busy) {
+ __wt_txn_update_oldest(session);
+ if (txn_state->id == txn_global->oldest_id ||
+ txn_state->snap_min == txn_global->oldest_id)
+ return (0);
+ }
+
+ /* Wait for the queue to re-populate before trying again. */
+ WT_RET(__wt_cond_wait(session,
+ S2C(session)->cache->evict_waiter_cond, 100000));
+
+ /* Check if things have changed so that we are busy. */
+ if (!busy && txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id)
+ busy = count = 1;
+ }
+}
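+
+/*
+ * A worked example of the policy above, not part of the original change: a
+ * thread inside a transaction (busy) with the cache 97% full returns
+ * immediately; at 100% it evicts a single page and re-checks, while a
+ * thread at a transaction boundary works through up to ten pages before
+ * waiting on the eviction queue.
+ */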
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
new file mode 100644
index 00000000000..42c7c07a30c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -0,0 +1,816 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_CELL --
+ * Variable-length cell type.
+ *
+ * Pages containing variable-length keys or value data (the WT_PAGE_ROW_INT,
+ * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types) have
+ * cells after the page header.
+ *
+ * There are 4 basic cell types: keys and data (each of which has an overflow
+ * form), deleted cells and off-page references. The cell is usually followed
+ * by additional data, varying by type: a key or data cell is followed by a set
+ * of bytes; an address cookie follows overflow or off-page cells.
+ *
+ * Deleted cells are place-holders for column-store files, where entries cannot
+ * be removed in order to preserve the record count.
+ *
+ * Here's the cell usage by page type:
+ *
+ * WT_PAGE_ROW_INT (row-store internal page):
+ * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
+ * cell followed by a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_ROW_LEAF (row-store leaf page):
+ * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
+ * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
+ *
+ * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
+ * byte count immediately following the cell.
+ *
+ * WT_PAGE_COL_INT (Column-store internal page):
+ * Off-page references (a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
+ * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
+ * cells (a WT_CELL_DEL cell).
+ *
+ * Each cell starts with a descriptor byte:
+ *
+ * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
+ * carrying data less than 64B, where we can store the data length in the cell
+ * descriptor byte):
+ * 0x00 Not a short key/data cell
+ * 0x01 Short key cell
+ *	0x02	Short key cell, with a following prefix-compression byte
+ *	0x03	Short value cell
+ * In these cases, the other 6 bits of the descriptor byte are the data length.
+ *
+ * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
+ * (A run-length counter or a record number for variable-length column store.)
+ *
+ * Bit 4 is unused.
+ *
+ * Bits 5-8 are cell "types".
+ */
+#define WT_CELL_KEY_SHORT 0x01 /* Short key */
+#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
+#define WT_CELL_VALUE_SHORT 0x03 /* Short data */
+#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
+
+#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
+#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
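+
+/*
+ * Worked example, not part of the original header: a 20-byte value stored
+ * as a short cell has descriptor byte
+ * (20 << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT == 0x53; readers
+ * recover the type as WT_CELL_SHORT_TYPE(0x53) == WT_CELL_VALUE_SHORT and
+ * the length as 0x53 >> WT_CELL_SHORT_SHIFT == 20.
+ */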
+
+#define WT_CELL_64V 0x04 /* Associated value */
+
+/*
+ * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
+ * backward compatible way by adding bit 4 to the type mask and adding new types
+ * that incorporate it.
+ */
+#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
+
+/*
+ * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
+ * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
+ * page has no overflow items. (The goal is to speed up truncation as we don't
+ * have to read pages without overflow items in order to delete them. Note,
+ * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
+ * overflow items, the only guarantee is that if set, the page has no overflow
+ * items.)
+ *
+ * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
+ * value dictionaries: if the two values are the same, we only store them once
+ * and have the second and subsequent use reference the original.
+ */
+#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
+#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
+#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
+#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
+#define WT_CELL_DEL (4 << 4) /* Deleted value */
+#define WT_CELL_KEY (5 << 4) /* Key */
+#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
+#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
+#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
+#define WT_CELL_VALUE (8 << 4) /* Value */
+#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
+#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
+#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
+
+#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
+#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
+
+/*
+ * When we aren't able to create a short key or value (and, in the case of a
+ * value, there's no associated RLE), the key or value is at least 64B, else
+ * we'd have been able to store it as a short cell. Decrement/Increment the
+ * size before storing it, in the hopes that relatively small key/value sizes
+ * will pack into a single byte instead of two bytes.
+ */
+#define WT_CELL_SIZE_ADJUST 64
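+
+/*
+ * Worked example, with illustrative numbers not taken from this change: a
+ * 100-byte key is stored with length 100 - WT_CELL_SIZE_ADJUST == 36, which
+ * fits the single-byte range of the packed integer encoding used here,
+ * where the unadjusted 100 would need two bytes.
+ */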
+
+/*
+ * WT_CELL --
+ * Variable-length, on-page cell header.
+ */
+struct __wt_cell {
+ /*
+ * Maximum of 16 bytes:
+ * 1: cell descriptor byte
+ * 1: prefix compression count
+ * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
+ * 5: data length (uint32_t encoding, max 5 bytes)
+ *
+ * This calculation is pessimistic: the prefix compression count and
+ * 64V value overlap, the 64V value and data length are optional.
+ */
+ uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
+};
+
+/*
+ * WT_CELL_UNPACK --
+ * Unpacked cell.
+ */
+struct __wt_cell_unpack {
+ WT_CELL *cell; /* Cell's disk image address */
+
+ uint64_t v; /* RLE count or recno */
+
+ /*
+ * !!!
+	 * The size and __len fields would reasonably be type size_t; don't
+	 * change the type, performance drops significantly if they're size_t.
+ */
+ const void *data; /* Data */
+ uint32_t size; /* Data size */
+
+ uint32_t __len; /* Cell + data length (usually) */
+
+ uint8_t prefix; /* Cell prefix length */
+
+	uint8_t raw; /* Raw cell type (includes "shorts") */
+ uint8_t type; /* Cell type */
+
+ uint8_t ovfl; /* boolean: cell is an overflow */
+};
+
+/*
+ * WT_CELL_FOREACH --
+ * Walk the cells on a page.
+ */
+#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \
+ for ((cell) = \
+ WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \
+ (i) > 0; \
+ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i))
+
+/*
+ * __wt_cell_pack_addr --
+ * Pack an address cell.
+ */
+static inline size_t
+__wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+
+ if (recno == 0)
+ cell->__chunk[0] = cell_type; /* Type */
+ else {
+ cell->__chunk[0] = cell_type | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, recno); /* Record number */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_data --
+ * Set a data item's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size)
+{
+ uint8_t byte, *p;
+
+ /*
+ * Short data cells without run-length encoding have 6 bits of data
+ * length in the descriptor byte.
+ */
+ if (rle < 2 && size <= WT_CELL_SHORT_MAX) {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT;
+ return (1);
+ }
+
+ p = cell->__chunk + 1;
+ if (rle < 2) {
+ size -= WT_CELL_SIZE_ADJUST;
+ cell->__chunk[0] = WT_CELL_VALUE; /* Type */
+ } else {
+ cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
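+
+/*
+ * Worked example, not part of the original change: packing a 10-byte value
+ * with no RLE takes the short-cell path above, writing the single
+ * descriptor byte (10 << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT ==
+ * 0x2b and returning 1; the caller copies the 10 data bytes immediately
+ * after the cell.
+ */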
+
+/*
+ * __wt_cell_pack_data_match --
+ * Return if two items would have identical WT_CELLs (except for any RLE).
+ */
+static inline int
+__wt_cell_pack_data_match(
+ WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data, int *matchp)
+{
+ const uint8_t *a, *b;
+ uint64_t av, bv;
+ int rle;
+
+ *matchp = 0; /* Default to no-match */
+
+ /*
+ * This is a special-purpose function used by reconciliation to support
+ * dictionary lookups. We're passed an on-page cell and a created cell
+ * plus a chunk of data we're about to write on the page, and we return
+ * if they would match on the page. The column-store comparison ignores
+ * the RLE because the copied cell will have its own RLE.
+ */
+ a = (uint8_t *)page_cell;
+ b = (uint8_t *)val_cell;
+
+ if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
+ av = a[0] >> WT_CELL_SHORT_SHIFT;
+ ++a;
+ } else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
+ rle = a[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++a;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&a, 0, &av));
+ WT_RET(__wt_vunpack_uint(&a, 0, &av)); /* Length */
+ } else
+ return (0);
+
+ if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
+ bv = b[0] >> WT_CELL_SHORT_SHIFT;
+ ++b;
+ } else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
+ rle = b[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++b;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv));
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv)); /* Length */
+ } else
+ return (0);
+
+ if (av == bv)
+ *matchp = memcmp(a, val_data, av) == 0 ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __wt_cell_pack_copy --
+ * Write a copy value cell.
+ */
+static inline size_t
+__wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+
+ if (rle < 2) /* Type */
+ cell->__chunk[0] = WT_CELL_VALUE_COPY;
+ else { /* Type */
+ cell->__chunk[0] = WT_CELL_VALUE_COPY | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, v); /* Copy offset */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_del --
+ * Write a deleted value cell.
+ */
+static inline size_t
+__wt_cell_pack_del(WT_CELL *cell, uint64_t rle)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+ if (rle < 2) { /* Type */
+ cell->__chunk[0] = WT_CELL_DEL;
+ return (1);
+ }
+ /* Type */
+ cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_int_key --
+ * Set a row-store internal page key's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_int_key(WT_CELL *cell, size_t size)
+{
+ uint8_t byte, *p;
+
+ /* Short keys have 6 bits of data length in the descriptor byte. */
+ if (size <= WT_CELL_SHORT_MAX) {
+ byte = (uint8_t)size;
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT;
+ return (1);
+ }
+
+ cell->__chunk[0] = WT_CELL_KEY; /* Type */
+ p = cell->__chunk + 1;
+
+ size -= WT_CELL_SIZE_ADJUST;
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_leaf_key --
+ * Set a row-store leaf page key's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
+{
+ uint8_t byte, *p;
+
+ /* Short keys have 6 bits of data length in the descriptor byte. */
+ if (size <= WT_CELL_SHORT_MAX) {
+ if (prefix == 0) {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT;
+ return (1);
+ } else {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) |
+ WT_CELL_KEY_SHORT_PFX;
+ cell->__chunk[1] = prefix; /* Prefix */
+ return (2);
+ }
+ }
+
+ if (prefix == 0) {
+ cell->__chunk[0] = WT_CELL_KEY; /* Type */
+ p = cell->__chunk + 1;
+ } else {
+ cell->__chunk[0] = WT_CELL_KEY_PFX; /* Type */
+ cell->__chunk[1] = prefix; /* Prefix */
+ p = cell->__chunk + 2;
+ }
+
+ size -= WT_CELL_SIZE_ADJUST;
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_ovfl --
+ * Pack an overflow cell.
+ */
+static inline size_t
+__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+ if (rle < 2) /* Type */
+ cell->__chunk[0] = type;
+ else {
+ cell->__chunk[0] = type | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_rle --
+ * Return the cell's RLE value.
+ */
+static inline uint64_t
+__wt_cell_rle(WT_CELL_UNPACK *unpack)
+{
+ /*
+ * Any item with only 1 occurrence is stored with an RLE of 0, that is,
+ * without any RLE at all. This code is a single place to handle that
+ * correction, for simplicity.
+ */
+ return (unpack->v < 2 ? 1 : unpack->v);
+}
+
+/*
+ * __wt_cell_total_len --
+ * Return the cell's total length, including data.
+ */
+static inline size_t
+__wt_cell_total_len(WT_CELL_UNPACK *unpack)
+{
+ /*
+ * The length field is specially named because it's dangerous to use it:
+ * it represents the length of the current cell (normally used for the
+ * loop that walks through cells on the page), but occasionally we want
+ * to copy a cell directly from the page, and what we need is the cell's
+ * total length. The problem is dictionary-copy cells, because in that
+ * case, the __len field is the length of the current cell, not the cell
+ * for which we're returning data. To use the __len field, you must be
+ * sure you're not looking at a copy cell.
+ */
+ return (unpack->__len);
+}
+
+/*
+ * __wt_cell_type --
+ * Return the cell's type (collapsing special types).
+ */
+static inline u_int
+__wt_cell_type(WT_CELL *cell)
+{
+ u_int type;
+
+ switch (WT_CELL_SHORT_TYPE(cell->__chunk[0])) {
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ return (WT_CELL_KEY);
+ case WT_CELL_VALUE_SHORT:
+ return (WT_CELL_VALUE);
+ }
+
+ switch (type = WT_CELL_TYPE(cell->__chunk[0])) {
+ case WT_CELL_KEY_PFX:
+ return (WT_CELL_KEY);
+ case WT_CELL_KEY_OVFL_RM:
+ return (WT_CELL_KEY_OVFL);
+ case WT_CELL_VALUE_OVFL_RM:
+ return (WT_CELL_VALUE_OVFL);
+ }
+ return (type);
+}
+
+/*
+ * __wt_cell_type_raw --
+ * Return the cell's type.
+ */
+static inline u_int
+__wt_cell_type_raw(WT_CELL *cell)
+{
+ return (WT_CELL_SHORT_TYPE(cell->__chunk[0]) == 0 ?
+ WT_CELL_TYPE(cell->__chunk[0]) :
+ WT_CELL_SHORT_TYPE(cell->__chunk[0]));
+}
+
+/*
+ * __wt_cell_type_reset --
+ * Reset the cell's type.
+ */
+static inline void
+__wt_cell_type_reset(
+ WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type)
+{
+ /*
+ * For all current callers of this function, this should happen once
+ * and only once, assert we're setting what we think we're setting.
+ */
+ WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell));
+ WT_UNUSED(old_type);
+
+ cell->__chunk[0] =
+ (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type);
+}
+
+/*
+ * __wt_cell_leaf_value_parse --
+ * Return the cell if it's a row-store leaf page value, otherwise return
+ * NULL.
+ */
+static inline WT_CELL *
+__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
+{
+ /*
+ * This function exists so there's a place for this comment.
+ *
+ * Row-store leaf pages may have a single data cell between each key, or
+ * keys may be adjacent (when the data cell is empty).
+ *
+ * One special case: if the last key on a page is a key without a value,
+ * don't walk off the end of the page: the size of the underlying disk
+ * image is exact, which means the end of the last cell on the page plus
+ * the length of the cell should be the byte immediately after the page
+ * disk image.
+ *
+ * !!!
+ * This line of code is really a call to __wt_off_page, but we know the
+ * cell we're given will either be on the page or past the end of page,
+ * so it's a simpler check. (I wouldn't bother, but the real problem is
+ * we can't call __wt_off_page directly, it's in btree.i which requires
+ * this file be included first.)
+ */
+ if (cell >= (WT_CELL *)((uint8_t *)page->dsk + page->dsk->mem_size))
+ return (NULL);
+
+ switch (__wt_cell_type_raw(cell)) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ return (NULL);
+ default:
+ return (cell);
+ }
+}
+
+/*
+ * __wt_cell_unpack_safe --
+ * Unpack a WT_CELL into a structure during verification.
+ */
+static inline int
+__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
+{
+ uint64_t saved_v, v;
+ uint32_t saved_len;
+ int copied;
+ const uint8_t *p;
+
+ copied = 0;
+ saved_len = 0;
+ saved_v = 0;
+
+ /*
+ * The verification code specifies an end argument, a pointer to 1 past
+ * the end-of-page. In that case, make sure we don't go past the end
+ * of the page when reading. If an error occurs, we simply return the
+ * error code, the verification code takes care of complaining (and, in
+ * the case of salvage, it won't complain at all; it's OK to fail).
+ */
+#define WT_CELL_LEN_CHK(p, len) do { \
+ if (end != NULL && (((uint8_t *)p) + (len)) > end) \
+ return (WT_ERROR); \
+} while (0)
+
+restart:
+ /*
+ * This code is performance critical for scans through read-only trees.
+ * Avoid WT_CLEAR here: it makes this code run significantly slower.
+ */
+ WT_CLEAR_INLINE(WT_CELL_UNPACK, *unpack);
+ WT_CELL_LEN_CHK(cell, 0);
+ unpack->cell = cell;
+ unpack->type = __wt_cell_type(cell);
+ unpack->raw = __wt_cell_type_raw(cell);
+
+ /*
+	 * Handle cells with neither an RLE count nor a data length: short
+ * cells have 6 bits of data length in the descriptor byte.
+ */
+ switch (unpack->raw) {
+ case WT_CELL_KEY_SHORT_PFX:
+ WT_CELL_LEN_CHK(cell, 1); /* skip prefix */
+ unpack->prefix = cell->__chunk[1];
+
+ unpack->data = cell->__chunk + 2;
+ unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+ unpack->__len = 2 + unpack->size;
+ goto done;
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_VALUE_SHORT:
+ unpack->data = cell->__chunk + 1;
+ unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+ unpack->__len = 1 + unpack->size;
+ goto done;
+ }
+
+ p = (uint8_t *)cell + 1; /* skip cell */
+
+ /*
+ * Check for a prefix byte that optionally follows the cell descriptor
+ * byte on row-store leaf pages.
+ */
+ if (unpack->raw == WT_CELL_KEY_PFX) {
+ ++p; /* skip prefix */
+ WT_CELL_LEN_CHK(p, 0);
+ unpack->prefix = cell->__chunk[1];
+ }
+
+ /*
+ * Check for an RLE count or record number that optionally follows the
+ * cell descriptor byte on column-store variable-length pages.
+ */
+ if (cell->__chunk[0] & WT_CELL_64V) /* skip value */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v));
+
+ /*
+ * Handle special actions for a few different cell types and set the
+ * data length (deleted cells are fixed-size without length bytes,
+ * almost everything else has data length bytes).
+ */
+ switch (unpack->raw) {
+ case WT_CELL_VALUE_COPY:
+ /*
+ * The cell is followed by an offset to a cell written earlier
+ * in the page. Save/restore the length and RLE of this cell,
+ * we need the length to step through the set of cells on the
+ * page and this RLE is probably different from the RLE of the
+ * earlier cell.
+ */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ saved_len = WT_PTRDIFF32(p, cell);
+ saved_v = unpack->v;
+ cell = (WT_CELL *)((uint8_t *)cell - v);
+ copied = 1;
+ goto restart;
+
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ /*
+ * Set overflow flag.
+ */
+ unpack->ovfl = 1;
+ /* FALLTHROUGH */
+
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_VALUE:
+ /*
+ * The cell is followed by a 4B data length and a chunk of
+ * data.
+ */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &v));
+
+ if (unpack->raw == WT_CELL_KEY ||
+ unpack->raw == WT_CELL_KEY_PFX ||
+ (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
+ v += WT_CELL_SIZE_ADJUST;
+
+ unpack->data = p;
+ unpack->size = (uint32_t)v;
+ unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
+ break;
+
+ case WT_CELL_DEL:
+ unpack->__len = WT_PTRDIFF32(p, cell);
+ break;
+ default:
+ return (WT_ERROR); /* Unknown cell type. */
+ }
+
+ /*
+ * Check the original cell against the full cell length (this is a
+	 * diagnostic as well: we may be copying the cell from the page and
+	 * need the right length).
+ */
+done: WT_CELL_LEN_CHK(cell, unpack->__len);
+ if (copied) {
+ unpack->raw = WT_CELL_VALUE_COPY;
+ unpack->__len = saved_len;
+ unpack->v = saved_v;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_cell_unpack --
+ * Unpack a WT_CELL into a structure.
+ */
+static inline void
+__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
+{
+ (void)__wt_cell_unpack_safe(cell, unpack, NULL);
+}
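+
+/*
+ * __cell_count_values_example --
+ *	An illustrative sketch of WT_CELL_FOREACH usage; the function is
+ * hypothetical and not part of the original change. The loop body must
+ * unpack each cell, because the macro steps to the next cell using the
+ * unpacked cell's __len field.
+ */
+static inline uint32_t
+__cell_count_values_example(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK unpack;
+	uint32_t i, nvalues;
+
+	nvalues = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+		__wt_cell_unpack(cell, &unpack);
+		if (unpack.type == WT_CELL_VALUE)
+			++nvalues;
+	}
+	return (nvalues);
+}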
+
+/*
+ * __cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ */
+static inline int
+__cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_BTREE *btree;
+ void *huffman;
+
+ btree = S2BT(session);
+
+ /* Reference the cell's data, optionally decode it. */
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ huffman = btree->huffman_value;
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ huffman = btree->huffman_value;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (huffman == NULL ? 0 :
+ __wt_huffman_decode(
+ session, huffman, store->data, store->size, store));
+}
+
+/*
+ * __wt_dsk_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ *
+ * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an
+ * overflow item is deleted, its backing blocks are removed; if there are still
+ * running transactions that might need to see the overflow item, we cache a
+ * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we
+ * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
+ * page reference to look aside into the cache. So, calling the "dsk" version
+ * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
+ * and calling the "page" version means it might be.
+ */
+static inline int
+__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_ASSERT(session,
+ __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
+ return (__cell_data_ref(session, NULL, page_type, unpack, store));
+}
+
+/*
+ * __wt_page_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ */
+static inline int
+__wt_page_cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ return (__cell_data_ref(session, page, page->type, unpack, store));
+}
+
+/*
+ * __wt_cell_data_copy --
+ * Copy the data from an unpacked cell into a buffer.
+ */
+static inline int
+__wt_cell_data_copy(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ /*
+ * We have routines to both copy and reference a cell's information. In
+ * most cases, all we need is a reference and we prefer that, especially
+	 * when returning key/value items. In a few cases we need a real copy:
+	 * call the standard reference function and get a reference. Sometimes
+	 * a copy will be made (for example, when reading an overflow item from
+	 * the underlying object). If that happens, we're done; otherwise, make
+ * a copy.
+ *
+ * We don't require two versions of this function, no callers need to
+ * handle WT_CELL_VALUE_OVFL_RM cells.
+ */
+ WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
+ if (!WT_DATA_IN_ITEM(store))
+ WT_RET(__wt_buf_set(session, store, store->data, store->size));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i
new file mode 100644
index 00000000000..42c3664323d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/column.i
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __col_insert_search_match --
+ *	Search a column-store insert list for an exact match.
+ */
+static inline WT_INSERT *
+__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path the check for values at the end of the skiplist. */
+ if (recno > WT_INSERT_RECNO(ret_ins))
+ return (NULL);
+ else if (recno == WT_INSERT_RECNO(ret_ins))
+ return (ret_ins);
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if (*insp == NULL) {
+ --i;
+ --insp;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(*insp);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp == 0) /* Exact match: return */
+ return (*insp);
+ else if (cmp > 0) /* Keep going at this level */
+ insp = &(*insp)->next[i];
+ else { /* Drop down a level */
+ --i;
+ --insp;
+ }
+ }
+
+ return (NULL);
+}
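+
+/*
+ * A worked example of the descent above, with illustrative records not
+ * taken from this change: searching for recno 25 in a list holding 10, 20
+ * and 30 walks right at each level while 25 is greater than the next
+ * entry's recno (past 10 and 20), drops a level whenever the next entry is
+ * NULL or holds 30, and finally returns NULL from level 0 because there is
+ * no exact match.
+ */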
+
+/*
+ * __col_insert_search --
+ * Search a column-store insert list, creating a skiplist stack as we go.
+ */
+static inline WT_INSERT *
+__col_insert_search(WT_INSERT_HEAD *inshead,
+ WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path appends. */
+ if (recno >= WT_INSERT_RECNO(ret_ins)) {
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ ins_stack[i] = (i == 0) ? &ret_ins->next[0] :
+ (inshead->tail[i] != NULL) ?
+ &inshead->tail[i]->next[i] : &inshead->head[i];
+ next_stack[i] = NULL;
+ }
+ return (ret_ins);
+ }
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if ((ret_ins = *insp) == NULL) {
+ next_stack[i] = NULL;
+ ins_stack[i--] = insp--;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(ret_ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp > 0) /* Keep going at this level */
+ insp = &ret_ins->next[i];
+ else if (cmp == 0) /* Exact match: return */
+ for (; i >= 0; i--) {
+ next_stack[i] = ret_ins->next[i];
+ ins_stack[i] = &ret_ins->next[i];
+ }
+ else { /* Drop down a level */
+ next_stack[i] = ret_ins;
+ ins_stack[i--] = insp--;
+ }
+ }
+ return (ret_ins);
+}
+
+/*
+ * __col_var_last_recno --
+ * Return the last record number for a variable-length column-store page.
+ */
+static inline uint64_t
+__col_var_last_recno(WT_PAGE *page)
+{
+ WT_COL_RLE *repeat;
+
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ if (page->pg_var_nrepeats == 0)
+ return (page->pg_var_entries == 0 ? 0 :
+ page->pg_var_recno + (page->pg_var_entries - 1));
+
+ repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1];
+ return ((repeat->recno + repeat->rle) - 1 +
+ (page->pg_var_entries - (repeat->indx + 1)));
+}
+
+/*
+ * __col_fix_last_recno --
+ * Return the last record number for a fixed-length column-store page.
+ */
+static inline uint64_t
+__col_fix_last_recno(WT_PAGE *page)
+{
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ return (page->pg_fix_entries == 0 ? 0 :
+ page->pg_fix_recno + (page->pg_fix_entries - 1));
+}
+
+/*
+ * __col_var_search --
+ * Search a variable-length column-store page for a record.
+ */
+static inline WT_COL *
+__col_var_search(WT_PAGE *page, uint64_t recno)
+{
+ WT_COL_RLE *repeat;
+ uint64_t start_recno;
+ uint32_t base, indx, limit, start_indx;
+
+ /*
+ * Find the matching slot.
+ *
+ * This is done in two stages: first, we do a binary search among any
+	 * repeating records to find the largest repeat less than the search
+	 * key.
+ * Once there, we can do a simple offset calculation to find the correct
+ * slot for this record number, because we know any intervening records
+ * have repeat counts of 1.
+ */
+ for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ repeat = page->pg_var_repeats + indx;
+ if (recno >= repeat->recno &&
+ recno < repeat->recno + repeat->rle)
+ return (page->pg_var_d + repeat->indx);
+ if (recno < repeat->recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+	 * We didn't find an exact match; move forward from the largest repeat
+ * less than the search key.
+ */
+ if (base == 0) {
+ start_indx = 0;
+ start_recno = page->pg_var_recno;
+ } else {
+ repeat = page->pg_var_repeats + (base - 1);
+ start_indx = repeat->indx + 1;
+ start_recno = repeat->recno + repeat->rle;
+ }
+
+ if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ return (NULL);
+
+ return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
+}
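+
+/*
+ * A worked example of the search above, with illustrative numbers not
+ * taken from this change: on a page whose first recno is 100, with a
+ * single repeat entry {recno 110, rle 5, indx 10}, searching for recno 112
+ * lands inside the repeat and returns slot 10; searching for recno 117
+ * falls past it, so start_indx == 11, start_recno == 115, and the function
+ * returns slot 11 + (117 - 115) == 13.
+ */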
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
new file mode 100644
index 00000000000..aa34eab4d24
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -0,0 +1,12 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_compact {
+ uint32_t lsm_count; /* Number of LSM trees seen */
+ uint32_t file_count; /* Number of files seen */
+ uint64_t max_time; /* Configured timeout */
+};
diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h
new file mode 100644
index 00000000000..b9c4c97fa00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/config.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_config {
+ WT_SESSION_IMPL *session;
+ const char *orig;
+ const char *end;
+ const char *cur;
+
+ int depth, top;
+ const int8_t *go;
+};
+
+struct __wt_config_check {
+ const char *name;
+ const char *type;
+ const char *checks;
+ const WT_CONFIG_CHECK *subconfigs;
+};
+
+#define WT_CONFIG_REF(session, n) \
+ (S2C(session)->config_entries[WT_CONFIG_ENTRY_##n])
+struct __wt_config_entry {
+ const char *method; /* method name */
+
+#define WT_CONFIG_BASE(session, n) (WT_CONFIG_REF(session, n)->base)
+ const char *base; /* configuration base */
+
+ const WT_CONFIG_CHECK *checks; /* check array */
+};
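+
+/*
+ * Illustrative usage, with a hypothetical caller not part of the original
+ * change: WT_CONFIG_BASE(session, session_create) supplies the default
+ * configuration string for WT_SESSION::create, and the matching entry's
+ * checks array drives validation of application-supplied overrides.
+ */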
+
+struct __wt_config_parser_impl {
+ WT_CONFIG_PARSER iface;
+
+ WT_SESSION_IMPL *session;
+ WT_CONFIG config;
+ WT_CONFIG_ITEM config_item;
+};
+
+/*
+ * DO NOT EDIT: automatically built by dist/api_config.py.
+ * configuration section: BEGIN
+ */
+#define WT_CONFIG_ENTRY_colgroup_meta 0
+#define WT_CONFIG_ENTRY_connection_add_collator 1
+#define WT_CONFIG_ENTRY_connection_add_compressor 2
+#define WT_CONFIG_ENTRY_connection_add_data_source 3
+#define WT_CONFIG_ENTRY_connection_add_extractor 4
+#define WT_CONFIG_ENTRY_connection_async_new_op 5
+#define WT_CONFIG_ENTRY_connection_close 6
+#define WT_CONFIG_ENTRY_connection_load_extension 7
+#define WT_CONFIG_ENTRY_connection_open_session 8
+#define WT_CONFIG_ENTRY_connection_reconfigure 9
+#define WT_CONFIG_ENTRY_cursor_close 10
+#define WT_CONFIG_ENTRY_file_meta 11
+#define WT_CONFIG_ENTRY_index_meta 12
+#define WT_CONFIG_ENTRY_session_begin_transaction 13
+#define WT_CONFIG_ENTRY_session_checkpoint 14
+#define WT_CONFIG_ENTRY_session_close 15
+#define WT_CONFIG_ENTRY_session_commit_transaction 16
+#define WT_CONFIG_ENTRY_session_compact 17
+#define WT_CONFIG_ENTRY_session_create 18
+#define WT_CONFIG_ENTRY_session_drop 19
+#define WT_CONFIG_ENTRY_session_log_printf 20
+#define WT_CONFIG_ENTRY_session_open_cursor 21
+#define WT_CONFIG_ENTRY_session_reconfigure 22
+#define WT_CONFIG_ENTRY_session_rename 23
+#define WT_CONFIG_ENTRY_session_rollback_transaction 24
+#define WT_CONFIG_ENTRY_session_salvage 25
+#define WT_CONFIG_ENTRY_session_truncate 26
+#define WT_CONFIG_ENTRY_session_upgrade 27
+#define WT_CONFIG_ENTRY_session_verify 28
+#define WT_CONFIG_ENTRY_table_meta 29
+#define WT_CONFIG_ENTRY_wiredtiger_open 30
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 31
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 32
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 33
+/*
+ * configuration section: END
+ * DO NOT EDIT: automatically built by dist/api_config.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
new file mode 100644
index 00000000000..81866e39df9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*******************************************
+ * Global per-process structure.
+ *******************************************/
+/*
+ * WT_PROCESS --
+ * Per-process information for the library.
+ */
+struct __wt_process {
+ WT_SPINLOCK spinlock; /* Per-process spinlock */
+
+ /* Locked: connection queue */
+ TAILQ_HEAD(__wt_connection_impl_qh, __wt_connection_impl) connqh;
+ WT_CACHE_POOL *cache_pool;
+};
+extern WT_PROCESS __wt_process;
+
+/*
+ * WT_NAMED_COLLATOR --
+ * A collator list entry
+ */
+struct __wt_named_collator {
+ const char *name; /* Name of collator */
+ WT_COLLATOR *collator; /* User supplied object */
+ TAILQ_ENTRY(__wt_named_collator) q; /* Linked list of collators */
+};
+
+/*
+ * WT_NAMED_COMPRESSOR --
+ * A compressor list entry
+ */
+struct __wt_named_compressor {
+ const char *name; /* Name of compressor */
+ WT_COMPRESSOR *compressor; /* User supplied callbacks */
+ /* Linked list of compressors */
+ TAILQ_ENTRY(__wt_named_compressor) q;
+};
+
+/*
+ * WT_NAMED_DATA_SOURCE --
+ * A data source list entry
+ */
+struct __wt_named_data_source {
+ const char *prefix; /* Name of data source */
+ WT_DATA_SOURCE *dsrc; /* User supplied callbacks */
+ /* Linked list of data sources */
+ TAILQ_ENTRY(__wt_named_data_source) q;
+};
+
+/*
+ * Allocate some additional slots for internal sessions. There is a default
+ * session for each connection, plus a session for each server thread.
+ */
+#define WT_NUM_INTERNAL_SESSIONS 10
+
+/*
+ * WT_CONNECTION_IMPL --
+ * Implementation of WT_CONNECTION
+ */
+struct __wt_connection_impl {
+ WT_CONNECTION iface;
+
+ /* For operations without an application-supplied session */
+ WT_SESSION_IMPL *default_session;
+ WT_SESSION_IMPL dummy_session;
+
+ const char *cfg; /* Connection configuration */
+
+ WT_SPINLOCK api_lock; /* Connection API spinlock */
+ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */
+ WT_SPINLOCK fh_lock; /* File handle queue spinlock */
+ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
+ WT_SPINLOCK schema_lock; /* Schema operation spinlock */
+
+ /*
+ * We distribute the btree page locks across a set of spin locks; it
+	 * can't be an array: we impose cache-line alignment and gcc doesn't
+ * support that for arrays. Don't use too many: they are only held for
+ * very short operations, each one is 64 bytes, so 256 will fill the L1
+ * cache on most CPUs.
+ */
+#define WT_PAGE_LOCKS(conn) 16
+ WT_SPINLOCK *page_lock; /* Btree page spinlocks */
+ u_int page_lock_cnt; /* Next spinlock to use */
+
+ /* Connection queue */
+ TAILQ_ENTRY(__wt_connection_impl) q;
+ /* Cache pool queue */
+ TAILQ_ENTRY(__wt_connection_impl) cpq;
+
+ const char *home; /* Database home */
+ const char *error_prefix; /* Database error prefix */
+ int is_new; /* Connection created database */
+
+ WT_EXTENSION_API extension_api; /* Extension API */
+
+ /* Configuration */
+ const WT_CONFIG_ENTRY **config_entries;
+
+ void **foc; /* Free-on-close array */
+ size_t foc_cnt; /* Array entries */
+ size_t foc_size; /* Array size */
+
+ WT_FH *lock_fh; /* Lock file handle */
+
+ uint64_t split_gen; /* Generation number for splits */
+
+ WT_SPINLOCK dhandle_lock; /* Locked: dhandle sweep */
+ /* Locked: data handle list */
+ SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
+ /* Locked: LSM handle list. */
+ TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
+ /* Locked: file list */
+ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
+ /* Locked: library list */
+ TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh;
+
+ WT_SPINLOCK block_lock; /* Locked: block manager list */
+ TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
+
+ u_int open_btree_count; /* Locked: open writable btree count */
+ uint32_t next_file_id; /* Locked: file ID counter */
+
+ /*
+ * WiredTiger allocates space for 50 simultaneous sessions (threads of
+ * control) by default. Growing the number of threads dynamically is
+ * possible, but tricky since server threads are walking the array
+ * without locking it.
+ *
+ * There's an array of WT_SESSION_IMPL pointers that reference the
+ * allocated array; we do it that way because we want an easy way for
+ * the server thread code to avoid walking the entire array when only a
+ * few threads are running.
+ */
+ WT_SESSION_IMPL *sessions; /* Session reference */
+ uint32_t session_size; /* Session array size */
+ uint32_t session_cnt; /* Session count */
+
+ /*
+ * WiredTiger allocates space for a fixed number of hazard pointers
+ * in each thread of control.
+ */
+ uint32_t hazard_max; /* Hazard array size */
+
+ WT_CACHE *cache; /* Page cache */
+ uint64_t cache_size;
+
+ WT_TXN_GLOBAL txn_global; /* Global transaction state */
+
+ WT_SPINLOCK hot_backup_lock; /* Hot backup serialization */
+ int hot_backup;
+
+ WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */
+ wt_thread_t ckpt_tid; /* Checkpoint thread */
+ int ckpt_tid_set; /* Checkpoint thread set */
+ WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */
+ const char *ckpt_config; /* Checkpoint configuration */
+#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0)
+ wt_off_t ckpt_logsize; /* Checkpoint log size period */
+ uint32_t ckpt_signalled; /* Checkpoint signalled */
+ long ckpt_usecs; /* Checkpoint period */
+
+ int compact_in_memory_pass; /* Compaction serialization */
+
+#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
+#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
+#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
+#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */
+#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */
+ uint32_t stat_flags;
+
+ WT_CONNECTION_STATS stats; /* Connection statistics */
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /*
+ * Spinlock registration, so we can track which spinlocks are heavily
+ * used, which are blocking and where.
+ *
+ * There's an array of spinlocks, and an array of blocking IDs.
+ */
+#define WT_SPINLOCK_MAX 1024
+#define WT_SPINLOCK_MAX_LOCATION_ID 60
+ WT_SPINLOCK *spinlock_list[WT_SPINLOCK_MAX];
+
+ /* Spinlock blocking matrix */
+ struct __wt_connection_stats_spinlock {
+ const char *name; /* Mutex name */
+
+ const char *file; /* Caller's file/line, ID location */
+ int line;
+
+ u_int total; /* Count of total, blocked calls */
+ u_int blocked[WT_SPINLOCK_MAX_LOCATION_ID];
+ } spinlock_block[WT_SPINLOCK_MAX_LOCATION_ID];
+#endif
+
+ WT_ASYNC *async; /* Async structure */
+ int async_cfg; /* Global async configuration */
+ uint32_t async_size; /* Async op array size */
+ uint32_t async_workers; /* Number of async workers */
+
+ WT_LSM_MANAGER lsm_manager; /* LSM worker thread information */
+
+ WT_SESSION_IMPL *evict_session; /* Eviction server sessions */
+ wt_thread_t evict_tid; /* Eviction server thread ID */
+ int evict_tid_set; /* Eviction server thread ID set */
+
+ uint32_t evict_workers_max;/* Max eviction workers */
+ uint32_t evict_workers_min;/* Min eviction workers */
+ uint32_t evict_workers; /* Number of eviction workers */
+ WT_EVICT_WORKER *evict_workctx; /* Eviction worker context */
+
+ WT_SESSION_IMPL *stat_session; /* Statistics log session */
+ wt_thread_t stat_tid; /* Statistics log thread */
+ int stat_tid_set; /* Statistics log thread set */
+ WT_CONDVAR *stat_cond; /* Statistics log wait mutex */
+ const char *stat_format; /* Statistics log timestamp format */
+ FILE *stat_fp; /* Statistics log file handle */
+ char *stat_path; /* Statistics log path format */
+ char **stat_sources; /* Statistics log list of objects */
+ const char *stat_stamp; /* Statistics log entry timestamp */
+ long stat_usecs; /* Statistics log period */
+
+ int logging; /* Global logging configuration */
+ int archive; /* Global archive configuration */
+ WT_CONDVAR *arch_cond; /* Log archive wait mutex */
+ WT_SESSION_IMPL *arch_session; /* Log archive session */
+ wt_thread_t arch_tid; /* Log archive thread */
+ int arch_tid_set; /* Log archive thread set */
+ WT_LOG *log; /* Logging structure */
+ wt_off_t log_file_max; /* Log file max size */
+ const char *log_path; /* Logging path format */
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
+ wt_thread_t sweep_tid; /* Handle sweep thread */
+ int sweep_tid_set; /* Handle sweep thread set */
+ WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
+
+ /* Locked: collator list */
+ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
+
+ /* Locked: compressor list */
+ TAILQ_HEAD(__wt_comp_qh, __wt_named_compressor) compqh;
+
+ /* Locked: data source list */
+ TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh;
+
+ void *lang_private; /* Language specific private storage */
+
+ /* If non-zero, all buffers used for I/O will be aligned to this. */
+ size_t buffer_alignment;
+
+ uint32_t schema_gen; /* Schema generation number */
+
+ wt_off_t data_extend_len; /* file_extend data length */
+ wt_off_t log_extend_len; /* file_extend log length */
+
+ uint32_t direct_io; /* O_DIRECT file type flags */
+ int mmap; /* mmap configuration */
+ uint32_t verbose;
+
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
new file mode 100644
index 00000000000..17185499b88
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Initialize a static WT_CURSOR structure.
+ */
+#define WT_CURSOR_STATIC_INIT(n, \
+ get_key, \
+ get_value, \
+ set_key, \
+ set_value, \
+ compare, \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ search_near, \
+ insert, \
+ update, \
+ remove, \
+ close) \
+ static const WT_CURSOR n = { \
+ NULL, /* session */ \
+ NULL, /* uri */ \
+ NULL, /* key_format */ \
+ NULL, /* value_format */ \
+ (int (*)(WT_CURSOR *, ...))(get_key), \
+ (int (*)(WT_CURSOR *, ...))(get_value), \
+ (void (*)(WT_CURSOR *, ...))(set_key), \
+ (void (*)(WT_CURSOR *, ...))(set_value), \
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ (int (*)(WT_CURSOR *, int *))(search_near), \
+ insert, \
+ update, \
+ remove, \
+ close, \
+ { NULL, NULL }, /* TAILQ_ENTRY q */ \
+ 0, /* recno key */ \
+ { 0 }, /* recno raw buffer */ \
+ NULL, /* json_private */ \
+ NULL, /* lang_private */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \
+ 0, /* int saved_err */ \
+ NULL, /* internal_uri */ \
+ 0 /* uint32_t flags */ \
+}
+
+struct __wt_cursor_backup_entry {
+ char *name; /* File name */
+ WT_DATA_HANDLE *handle; /* Handle */
+};
+struct __wt_cursor_backup {
+ WT_CURSOR iface;
+
+ size_t next; /* Cursor position */
+ FILE *bfp; /* Backup file */
+
+ WT_CURSOR_BACKUP_ENTRY *list; /* List of files to be copied. */
+ size_t list_allocated;
+ size_t list_next;
+};
+
+struct __wt_cursor_btree {
+ WT_CURSOR iface;
+
+ WT_BTREE *btree; /* Enclosing btree */
+
+ /*
+ * The following fields are set by the search functions as a precursor
+ * to page modification: we have a page, a WT_COL/WT_ROW slot on the
+ * page, an insert head, insert list and a skiplist stack (the stack of
+ * skiplist entries leading to the insert point). The search functions
+ * also return the relationship of the search key to the found key.
+ */
+ WT_REF *ref; /* Current page */
+ uint32_t slot; /* WT_COL/WT_ROW 0-based slot */
+
+ WT_INSERT_HEAD *ins_head; /* Insert chain head */
+ WT_INSERT *ins; /* Current insert node */
+ /* Search stack */
+ WT_INSERT **ins_stack[WT_SKIP_MAXDEPTH];
+
+ /* Next item(s) found during search */
+ WT_INSERT *next_stack[WT_SKIP_MAXDEPTH];
+
+ uint64_t recno; /* Record number */
+
+ /*
+ * The search function sets compare to:
+	 *	< 0 if the found key is less than the specified key
+	 *	  0 if the found key matches the specified key
+	 *	> 0 if the found key is larger than the specified key
+ */
+ int compare;
+
+ /*
+	 * The key value from a binary search of a row-store file; we keep a
+	 * copy of the last key we retrieved in the search, avoiding the
+	 * additional work of getting the key again to return it to the
+	 * application.
+ */
+ WT_ITEM search_key;
+
+ /*
+ * It's relatively expensive to calculate the last record on a variable-
+ * length column-store page because of the repeat values. Calculate it
+ * once per page and cache it. This value doesn't include the skiplist
+ * of appended entries on the last page.
+ */
+ uint64_t last_standard_recno;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part of
+ * the page we're walking (otherwise switching from next to prev and
+ * vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
+	 * odd-numbered slots, and WT_ROW array slots are even-numbered
+	 * slots (see the mapping sketch following this structure).
+ */
+ uint32_t row_iteration_slot; /* Row-store iteration slot */
+
+ /*
+ * Variable-length column-store values are run-length encoded and may
+ * be overflow values or Huffman encoded. To avoid repeatedly reading
+ * overflow values or decompressing encoded values, process it once and
+ * store the result in a temporary buffer. The cip_saved field is used
+ * to determine if we've switched columns since our last cursor call.
+ */
+ WT_COL *cip_saved; /* Last iteration reference */
+
+ /*
+ * We don't instantiate prefix-compressed keys on pages where there's no
+ * Huffman encoding because we don't want to waste memory if only moving
+ * a cursor through the page, and it's faster to build keys while moving
+ * through the page than to roll-forward from a previously instantiated
+ * key (we don't instantiate all of the keys, just the ones at binary
+ * search points). We can't use the application's WT_CURSOR key field
+ * as a copy of the last-returned key because it may have been altered
+ * by the API layer, for example, dump cursors. Instead we store the
+ * last-returned key in a temporary buffer. The rip_saved field is used
+ * to determine if the key in the temporary buffer has the prefix needed
+ * for building the current key.
+ */
+ WT_ROW *rip_saved; /* Last-returned key reference */
+
+ /*
+ * A temporary buffer for caching RLE values for column-store files.
+ */
+ WT_ITEM tmp;
+
+ /*
+ * The update structure allocated by the row- and column-store modify
+ * functions, used to avoid a data copy in the WT_CURSOR.update call.
+ */
+ WT_UPDATE *modify_update;
+
+ /*
+ * Fixed-length column-store items are a single byte, and it's simpler
+ * and cheaper to allocate the space for it now than keep checking to
+ * see if we need to grow the buffer.
+ */
+ uint8_t v; /* Fixed-length return value */
+
+ uint8_t append_tree; /* Cursor appended to the tree */
+
+#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
+#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
+#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
+#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
+#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
+ uint8_t flags;
+};
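
To make the slot numbering concrete, the mapping can be inverted with two
small helpers. These are illustrative only (they don't exist in this tree);
slot 1, the smallest-key insert list, is the special case handled before
either applies:

    /* Even slots name WT_ROW entries: slot 2 -> WT_ROW[0]. */
    static inline uint32_t
    __iter_slot_to_row(uint32_t slot)
    {
        return (slot / 2 - 1);
    }

    /* Odd slots >= 3 name insert lists: slot 3 -> WT_INSERT_HEAD[0]. */
    static inline uint32_t
    __iter_slot_to_ins_head(uint32_t slot)
    {
        return ((slot - 3) / 2);
    }
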
+
+struct __wt_cursor_bulk {
+ WT_CURSOR_BTREE cbt;
+
+ WT_REF *ref; /* The leaf page */
+ WT_PAGE *leaf;
+
+ /*
+	 * Variable-length column stores compare values during bulk load as
+	 * part of RLE compression; row stores compare keys during bulk load
+	 * to avoid corruption.
+ */
+ WT_ITEM last; /* Last key/value seen */
+
+ /*
+	 * Variable-length column-store RLE counter (also overloaded to mean
+	 * the first time through the bulk-load insert routine, when set to
+	 * 0; see the sketch following this structure).
+ */
+ uint64_t rle;
+
+ /*
+ * Fixed-length column-store current entry in memory chunk count, and
+ * the maximum number of records per chunk.
+ */
+ uint32_t entry; /* Entry count */
+ uint32_t nrecs; /* Max records per chunk */
+
+ /* Special bitmap bulk load for fixed-length column stores. */
+ int bitmap;
+
+ void *reconcile; /* Reconciliation information */
+};
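
For illustration, the rle counter drives a compare-against-last pattern in
the variable-length bulk-load path: a value equal to the previous one
extends the current run, anything else flushes the run and starts a new
one. A simplified sketch under those assumptions (the helper is
hypothetical and the actual page output is elided):

    static int
    __bulk_rle_insert(WT_CURSOR_BULK *cbulk, WT_ITEM *value)
    {
        if (cbulk->rle != 0 &&          /* not the first call */
            value->size == cbulk->last.size &&
            memcmp(value->data, cbulk->last.data, value->size) == 0) {
            ++cbulk->rle;               /* extend the current run */
            return (0);
        }

        /* ... write out the previous run, save value in last ... */
        cbulk->rle = 1;                 /* start a new run */
        return (0);
    }
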
+
+struct __wt_cursor_config {
+ WT_CURSOR iface;
+};
+
+struct __wt_cursor_data_source {
+ WT_CURSOR iface;
+
+ WT_COLLATOR *collator; /* Configured collator */
+ int collator_owned; /* Collator needs to be terminated */
+
+ WT_CURSOR *source; /* Application-owned cursor */
+};
+
+struct __wt_cursor_dump {
+ WT_CURSOR iface;
+
+ WT_CURSOR *child;
+};
+
+struct __wt_cursor_index {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ WT_INDEX *index;
+ const char *key_plan, *value_plan;
+
+ WT_CURSOR *child;
+ WT_CURSOR **cg_cursors;
+};
+
+struct __wt_cursor_json {
+ char *key_buf; /* JSON formatted string */
+ char *value_buf; /* JSON formatted string */
+ WT_CONFIG_ITEM key_names; /* Names of key columns */
+ WT_CONFIG_ITEM value_names; /* Names of value columns */
+};
+
+struct __wt_cursor_log {
+ WT_CURSOR iface;
+
+ WT_LSN *cur_lsn; /* LSN of current record */
+ WT_LSN *next_lsn; /* LSN of next record */
+ WT_ITEM *logrec; /* Copy of record for cursor */
+ WT_ITEM *opkey, *opvalue; /* Op key/value copy */
+ const uint8_t *stepp, *stepp_end; /* Pointer within record */
+ uint32_t step_count; /* Intra-record count */
+ uint32_t rectype; /* Record type */
+ uint64_t txnid; /* Record txnid */
+ uint32_t flags;
+};
+
+struct __wt_cursor_metadata {
+ WT_CURSOR iface;
+
+ WT_CURSOR *file_cursor; /* Queries of regular metadata */
+
+#define WT_MDC_POSITIONED 0x01
+#define WT_MDC_ONMETADATA 0x02
+ uint32_t flags;
+};
+
+struct __wt_cursor_stat {
+ WT_CURSOR iface;
+
+ int notpositioned; /* Cursor not positioned */
+
+ WT_STATS *stats; /* Stats owned by the cursor */
+ WT_STATS *stats_first; /* First stats reference */
+ int stats_base; /* Base statistics value */
+ int stats_count; /* Count of stats elements */
+
+ union { /* Copies of the statistics */
+ WT_DSRC_STATS dsrc_stats;
+ WT_CONNECTION_STATS conn_stats;
+ } u;
+
+ int key; /* Current stats key */
+ uint64_t v; /* Current stats value */
+ WT_ITEM pv; /* Current stats value (string) */
+
+ /* Uses the same values as WT_CONNECTION::stat_flags field */
+ uint32_t flags;
+};
+
+/*
+ * WT_CURSOR_STATS --
+ *	Return a reference to a statistics cursor's stats structures; use the
+ * WT_CURSOR_STAT.stats_first field instead of WT_CURSOR_STAT.stats because
+ * the latter is NULL when non-cursor memory is used to hold the statistics.
+ */
+#define WT_CURSOR_STATS(cursor) \
+ (((WT_CURSOR_STAT *)cursor)->stats_first)
+
+struct __wt_cursor_table {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ const char *plan;
+
+ const char **cfg; /* Saved configuration string */
+
+ WT_CURSOR **cg_cursors;
+ WT_CURSOR **idx_cursors;
+};
+
+#define WT_CURSOR_PRIMARY(cursor) \
+ (((WT_CURSOR_TABLE *)cursor)->cg_cursors[0])
+
+#define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r")
+
+/*
+ * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
+ *	Check if we have a key/value set.  There's an additional semantic
+ * implemented here: if we're pointing into the tree and about to perform
+ * a cursor operation, get a local copy of whatever we're referencing in
+ * the tree; there's an obvious race between the cursor moving and the key
+ * or value reference, and it's better to solve it here than in the
+ * underlying data-source layers.
+ *
+ * WT_CURSOR_CHECKKEY, WT_CURSOR_CHECKVALUE --
+ *	Check if a key/value is set without making a copy.
+ *
+ * WT_CURSOR_NOVALUE --
+ * Release any cached value before an operation that could update the
+ * transaction context and free data a value is pointing to.
+ */
+#define WT_CURSOR_CHECKKEY(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1)); \
+} while (0)
+#define WT_CURSOR_CHECKVALUE(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \
+} while (0)
+#define WT_CURSOR_NEEDKEY(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->key)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->key, \
+ (cursor)->key.data, (cursor)->key.size)); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT); \
+ F_SET(cursor, WT_CURSTD_KEY_EXT); \
+ } \
+ WT_CURSOR_CHECKKEY(cursor); \
+} while (0)
+#define WT_CURSOR_NEEDVALUE(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->value)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->value, \
+ (cursor)->value.data, (cursor)->value.size));\
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+ F_SET(cursor, WT_CURSTD_VALUE_EXT); \
+ } \
+ WT_CURSOR_CHECKVALUE(cursor); \
+} while (0)
+#define WT_CURSOR_NOVALUE(cursor) do { \
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+} while (0)
+
+#define WT_CURSOR_RAW_OK \
+	(WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)
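
A sketch of how the key/value macros are meant to be used inside a cursor
operation: the function is hypothetical, but the shape is fixed by the
macros themselves, which expand to WT_ERR and therefore require a local
"ret" (WT_DECL_RET) and an "err" label:

    static int
    __curext_update(WT_CURSOR *cursor)  /* hypothetical */
    {
        WT_DECL_RET;

        WT_CURSOR_NEEDKEY(cursor);      /* copy any tree key reference */
        WT_CURSOR_NEEDVALUE(cursor);    /* copy any tree value reference */

        /* ... perform the update ... */

    err:    return (ret);
    }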
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
new file mode 100644
index 00000000000..7f8e83643c5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __cursor_set_recno --
+ *	The record number in the interface has to track the value in the
+ * underlying cursor; update them in parallel.
+ */
+static inline void
+__cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
+{
+ cbt->iface.recno = cbt->recno = v;
+}
+
+/*
+ * __cursor_pos_clear --
+ * Reset the cursor's location.
+ */
+static inline void
+__cursor_pos_clear(WT_CURSOR_BTREE *cbt)
+{
+ /*
+	 * Most of the cursor's location information is set by every
+	 * successful operation; for example, we don't initialize the compare
+	 * return value because the row-store search always sets it.  The
+	 * rest is cleared here, and it's a minimal set of things we need to
+	 * clear.  It would be a lot simpler to clear everything, but we call
+	 * this function a lot.
+ */
+ cbt->recno = 0;
+
+ cbt->ins = NULL;
+ cbt->ins_head = NULL;
+ cbt->ins_stack[0] = NULL;
+
+ cbt->cip_saved = NULL;
+ cbt->rip_saved = NULL;
+
+ /*
+	 * Don't clear the active flag; it's owned by the cursor enter/leave
+	 * functions.
+ */
+ F_CLR(cbt, ~WT_CBT_ACTIVE);
+}
+
+/*
+ * __cursor_enter --
+ * Activate a cursor.
+ */
+static inline int
+__cursor_enter(WT_SESSION_IMPL *session)
+{
+ /*
+ * If there are no other cursors positioned in the session, check
+ * whether the cache is full.
+ */
+ if (session->ncursors == 0)
+ WT_RET(__wt_cache_full_check(session));
+ ++session->ncursors;
+ return (0);
+}
+
+/*
+ * __cursor_leave --
+ * Deactivate a cursor.
+ */
+static inline int
+__cursor_leave(WT_SESSION_IMPL *session)
+{
+ /*
+ * Decrement the count of active cursors in the session. When that
+ * goes to zero, there are no active cursors, and we can release any
+ * snapshot we're holding for read committed isolation.
+ */
+ WT_ASSERT(session, session->ncursors > 0);
+ if (--session->ncursors == 0)
+ __wt_txn_read_last(session);
+
+ return (0);
+}
+
+/*
+ * __curfile_enter --
+ * Activate a file cursor.
+ */
+static inline int
+__curfile_enter(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_RET(__cursor_enter(session));
+ F_SET(cbt, WT_CBT_ACTIVE);
+ return (0);
+}
+
+/*
+ * __curfile_leave --
+ * Clear a file cursor's position.
+ */
+static inline int
+__curfile_leave(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ /* If the cursor was active, deactivate it. */
+ if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
+ WT_RET(__cursor_leave(session));
+ F_CLR(cbt, WT_CBT_ACTIVE);
+ }
+
+ /*
+ * Release any page references we're holding. This can trigger
+ * eviction (e.g., forced eviction of big pages), so it is important to
+ * do it after releasing our snapshot above.
+ */
+ WT_RET(__wt_page_release(session, cbt->ref, 0));
+ cbt->ref = NULL;
+ return (0);
+}
+
+/*
+ * __cursor_func_init --
+ * Cursor call setup.
+ */
+static inline int
+__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ if (reenter)
+ WT_RET(__curfile_leave(cbt));
+ if (!F_ISSET(cbt, WT_CBT_ACTIVE))
+ WT_RET(__curfile_enter(cbt));
+ __wt_txn_cursor_op(session);
+ return (0);
+}
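
For context, a btree cursor operation brackets its work with these
helpers; an illustrative sketch of the calling pattern (the operation body
is elided, and WT_TRET is defined in error.h):

    static int
    __btcur_example_op(WT_CURSOR_BTREE *cbt)
    {
        WT_DECL_RET;

        /* Discard any prior position, then activate the cursor. */
        WT_RET(__cursor_func_init(cbt, 1));

        /* ... search the tree and perform the operation ... */

        /* On error, release whatever position the operation took. */
        if (ret != 0)
            WT_TRET(__cursor_reset(cbt));
        return (ret);
    }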
+
+/*
+ * __cursor_reset --
+ * Reset the cursor.
+ */
+static inline int
+__cursor_reset(WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+
+ /*
+	 * The cursor is leaving the API and no longer holds any position;
+	 * this is generally called to clean up the cursor after an error.
+ */
+ ret = __curfile_leave(cbt);
+ __cursor_pos_clear(cbt);
+ return (ret);
+}
+
+/*
+ * __cursor_row_slot_return --
+ * Return a row-store leaf page slot's K/V pair.
+ */
+static inline int
+__cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_ITEM *kb, *vb;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ void *copy;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+
+ unpack = NULL;
+
+ kb = &cbt->iface.key;
+ vb = &cbt->iface.value;
+
+ /*
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Get a key: we could just call __wt_row_leaf_key, but as a cursor
+ * is running through the tree, we may have additional information
+ * here (we may have the fully-built key that's immediately before
+ * the prefix-compressed key we want, so it's a faster construction).
+ *
+ * First, check for an immediately available key.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &cell, &kb->data, &kb->size))
+ goto value;
+
+ /* Huffman encoded keys are a slow path in all cases. */
+ if (btree->huffman_key != NULL)
+ goto slow;
+
+ /*
+ * Unpack the cell and deal with overflow and prefix-compressed keys.
+ * Inline building simple prefix-compressed keys from a previous key,
+ * otherwise build from scratch.
+ */
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->type == WT_CELL_KEY &&
+ cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
+ WT_ASSERT(session, cbt->tmp.size >= unpack->prefix);
+
+ /*
+		 * Grow the buffer as necessary and ensure the data has been
+		 * copied into local buffer space, then append the suffix to
+		 * the prefix already in the buffer.
+		 *
+		 * Don't grow the buffer unnecessarily or copy data we don't
+		 * need: truncate the item's data length to the prefix bytes.
+ */
+ cbt->tmp.size = unpack->prefix;
+ WT_RET(__wt_buf_grow(
+ session, &cbt->tmp, cbt->tmp.size + unpack->size));
+ memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size,
+ unpack->data, unpack->size);
+ cbt->tmp.size += unpack->size;
+ } else {
+ /*
+ * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we
+ * already did __wt_row_leaf_key's fast-path checks inline.
+ */
+slow: WT_RET(
+ __wt_row_leaf_key_work(session, page, rip, &cbt->tmp, 0));
+ }
+ kb->data = cbt->tmp.data;
+ kb->size = cbt->tmp.size;
+ cbt->rip_saved = rip;
+
+value:
+ /*
+ * If the item was ever modified, use the WT_UPDATE data. Note the
+ * caller passes us the update: it has already resolved which one
+ * (if any) is visible.
+ */
+ if (upd != NULL) {
+ vb->data = WT_UPDATE_DATA(upd);
+ vb->size = upd->size;
+ return (0);
+ }
+
+ /* Else, simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, vb))
+ return (0);
+
+ /*
+ * Else, take the value from the original page cell (which may be
+ * empty).
+ */
+ if ((cell = __wt_row_leaf_value_cell(page, rip, unpack)) == NULL) {
+ vb->data = "";
+ vb->size = 0;
+ return (0);
+ }
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ return (__wt_page_cell_data_ref(session, cbt->ref->page, unpack, vb));
+}
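
The prefix-compressed fast path above reduces to simple buffer arithmetic;
a self-contained sketch, independent of the WiredTiger types:

    #include <string.h>

    /*
     * Rebuild the current key from the previous fully-built key already
     * in buf: keep the first "prefix" bytes, append the suffix, return
     * the new length.  The caller guarantees buf can hold prefix +
     * suffix_len bytes -- the same steps as the cbt->tmp code above.
     */
    static size_t
    prefix_rebuild(char *buf, size_t prefix,
        const char *suffix, size_t suffix_len)
    {
        memcpy(buf + prefix, suffix, suffix_len);
        return (prefix + suffix_len);
    }

For example, with "applesauce" in the buffer, a prefix of 5 and the suffix
"pie" yield "applepie".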
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
new file mode 100644
index 00000000000..5556627c74d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * XXX
+ * The server threads use their own WT_SESSION_IMPL handles because they may
+ * want to block (for example, the eviction server calls reconciliation, and
+ * some of the reconciliation diagnostic code reads pages), and the user's
+ * session handle is already blocking on a server thread. The problem is the
+ * server thread needs to reference the correct btree handle, and that's
+ * hanging off the application's thread of control. For now, I'm just making
+ * it obvious where that's getting done.
+ */
+#define WT_SET_BTREE_IN_SESSION(s, b) ((s)->dhandle = b->dhandle)
+#define WT_CLEAR_BTREE_IN_SESSION(s) ((s)->dhandle = NULL)
+
+#define WT_WITH_DHANDLE(s, d, e) do { \
+ WT_DATA_HANDLE *__saved_dhandle = (s)->dhandle; \
+ (s)->dhandle = (d); \
+ e; \
+ (s)->dhandle = __saved_dhandle; \
+} while (0)
+
+#define WT_WITH_BTREE(s, b, e) WT_WITH_DHANDLE(s, (b)->dhandle, e)
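
A usage sketch: the macro saves the session's current handle, points it at
the target for the duration of the expression, and restores it afterward
(the callee here is hypothetical):

    /* Run one operation with the session pointing at btree's handle. */
    WT_WITH_BTREE(session, btree, ret = __hypothetical_btree_op(session));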
+
+/*
+ * WT_DATA_HANDLE --
+ * A handle for a generic named data source.
+ */
+struct __wt_data_handle {
+ WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */
+ SLIST_ENTRY(__wt_data_handle) l;/* Linked list of handles */
+
+ /*
+ * Sessions caching a connection's data handle will have a non-zero
+ * reference count; sessions using a connection's data handle will
+ * have a non-zero in-use count.
+ */
+ uint32_t session_ref; /* Sessions referencing this handle */
+ int32_t session_inuse; /* Sessions using this handle */
+ time_t timeofdeath; /* Use count went to 0 */
+
+ uint64_t name_hash; /* Hash of name */
+ const char *name; /* Object name as a URI */
+ const char *checkpoint; /* Checkpoint name (or NULL) */
+ const char **cfg; /* Configuration information */
+
+ WT_DATA_SOURCE *dsrc; /* Data source for this handle */
+ void *handle; /* Generic handle */
+
+ /*
+ * Data handles can be closed without holding the schema lock; threads
+ * walk the list of open handles, operating on them (checkpoint is the
+ * best example). To avoid sources disappearing underneath checkpoint,
+ * lock the data handle when closing it.
+ */
+ WT_SPINLOCK close_lock; /* Lock to close the handle */
+
+ WT_DSRC_STATS stats; /* Data-source statistics */
+
+	/* Flag values over 0xff are reserved for WT_BTREE_* */
+#define WT_DHANDLE_DISCARD 0x01 /* Discard on release */
+#define WT_DHANDLE_DISCARD_CLOSE 0x02 /* Close on release */
+#define WT_DHANDLE_EXCLUSIVE 0x04 /* Need exclusive access */
+#define WT_DHANDLE_HAVE_REF 0x08 /* Already have ref */
+#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */
+#define WT_DHANDLE_OPEN 0x20 /* Handle is open */
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/dlh.h b/src/third_party/wiredtiger/src/include/dlh.h
new file mode 100644
index 00000000000..3974ae2792c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dlh.h
@@ -0,0 +1,15 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_dlh {
+ TAILQ_ENTRY(__wt_dlh) q; /* List of open libraries. */
+
+ void *handle; /* Handle returned by dlopen. */
+ char *name;
+
+ int (*terminate)(WT_CONNECTION *); /* Terminate function. */
+};
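
For context, the fields map onto the standard POSIX dynamic-loader calls;
a minimal standalone sketch (dlfcn.h, independent of this tree):

    #include <dlfcn.h>
    #include <stdio.h>

    /* Open a library and look up an entry point, WT_DLH-style. */
    static int
    load_example(const char *path, const char *sym)
    {
        void *handle, *func;

        if ((handle = dlopen(path, RTLD_LAZY)) == NULL) {
            (void)fprintf(stderr, "dlopen: %s\n", dlerror());
            return (1);
        }
        if ((func = dlsym(handle, sym)) == NULL)
            (void)fprintf(stderr, "dlsym: %s\n", dlerror());

        /* ... call through func before closing ... */
        return (dlclose(handle));
    }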
diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h
new file mode 100644
index 00000000000..9bccc80faec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/error.h
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_DEBUG_POINT ((void *)0xdeadbeef)
+#define WT_DEBUG_BYTE (0xab)
+
+/* In DIAGNOSTIC mode, yield in places where we want to encourage races. */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_HAVE_DIAGNOSTIC_YIELD do { \
+ __wt_yield(); \
+} while (0)
+#else
+#define WT_HAVE_DIAGNOSTIC_YIELD
+#endif
+
+/* Set "ret" and branch-to-err-label tests. */
+#define WT_ERR(a) do { \
+ if ((ret = (a)) != 0) \
+ goto err; \
+} while (0)
+#define WT_ERR_MSG(session, v, ...) do { \
+ ret = (v); \
+ __wt_err(session, ret, __VA_ARGS__); \
+ goto err; \
+} while (0)
+#define WT_ERR_BUSY_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == EBUSY) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_NOTFOUND_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == WT_NOTFOUND) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_TEST(a, v) do { \
+ if (a) { \
+ ret = (v); \
+ goto err; \
+ } \
+} while (0)
+
+/* Return tests. */
+#define WT_RET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0) \
+ return (__ret); \
+} while (0)
+#define WT_RET_TEST(a, v) do { \
+ if (a) \
+ return (v); \
+} while (0)
+#define WT_RET_MSG(session, v, ...) do { \
+ int __ret = (v); \
+ __wt_err(session, __ret, __VA_ARGS__); \
+ return (__ret); \
+} while (0)
+#define WT_RET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY) \
+ return (__ret); \
+} while (0)
+#define WT_RET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \
+ return (__ret); \
+} while (0)
+/* Set "ret" if not already set. */
+#define WT_TRET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
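
The intended division of labor: WT_RET for failures before any cleanup is
needed, WT_ERR once cleanup is required, WT_TRET so the cleanup itself
can't hide the first error. A sketch with hypothetical helpers:

    static int
    __example_op(WT_SESSION_IMPL *session)
    {
        WT_DECL_RET;

        /* Nothing to clean up yet: return any failure directly. */
        WT_RET(__step_one(session));        /* hypothetical */

        /* Cleanup now required on failure: branch to the label. */
        WT_ERR(__step_two(session));        /* hypothetical */

    err:    /* WT_TRET keeps the first error (or a panic) set in ret. */
        WT_TRET(__cleanup(session));        /* hypothetical */
        return (ret);
    }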
+
+/* Return and branch-to-err-label cases for switch statements. */
+#define WT_ILLEGAL_VALUE(session) \
+ default: \
+ return (__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_ERR(session) \
+ default: \
+ WT_ERR(__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_SET(session) \
+ default: \
+ ret = __wt_illegal_value(session, NULL); \
+ break
+
+#define WT_PANIC_MSG(session, v, ...) do { \
+ __wt_err(session, v, __VA_ARGS__); \
+ (void)__wt_panic(session); \
+} while (0)
+#define WT_PANIC_ERR(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ WT_ERR(WT_PANIC); \
+} while (0)
+#define WT_PANIC_RET(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ /* Return WT_PANIC regardless of earlier return codes. */ \
+ return (WT_PANIC); \
+} while (0)
+
+/*
+ * WT_ASSERT
+ * Assert an expression, aborting in diagnostic mode. Otherwise,
+ * "use" the session to keep the compiler quiet and don't evaluate the
+ * expression.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_ASSERT(session, exp) do { \
+ if (!(exp)) \
+ __wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
+} while (0)
+#else
+#define WT_ASSERT(session, exp) \
+ WT_UNUSED(session)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
new file mode 100644
index 00000000000..2ab964475d8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -0,0 +1,650 @@
+/* DO NOT EDIT: automatically built by dist/s_prototypes. */
+
+extern void __wt_async_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_destroy(WT_SESSION_IMPL *session);
+extern int __wt_async_flush(WT_SESSION_IMPL *session);
+extern int __wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp);
+extern int __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op);
+extern int __wt_async_op_init(WT_SESSION_IMPL *session);
+extern void *__wt_async_worker(void *arg);
+extern int __wt_block_addr_to_buffer(WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump);
+extern int __wt_block_addr_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live);
+extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name);
+extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint);
+extern int __wt_block_checkpoint_unload( WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint);
+extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci);
+extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum);
+extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp);
+extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp);
+extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live);
+extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size);
+extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size);
+extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl);
+extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b);
+extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional);
+extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el);
+extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, WT_EXTLIST *el, const char *name, const char *extname, int track_size);
+extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie);
+extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie);
+extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp);
+extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp);
+extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
+extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
+extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
+extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp);
+extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext);
+extern int __wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp);
+extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz);
+extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
+extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
+extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase);
+extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern u_int __wt_block_header(WT_BLOCK *block);
+extern int __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep);
+extern int __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum);
+extern int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked);
+extern int __wt_bloom_create( WT_SESSION_IMPL *session, const char *uri, const char *config, uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp);
+extern int __wt_bloom_open(WT_SESSION_IMPL *session, const char *uri, uint32_t factor, uint32_t k, WT_CURSOR *owner, WT_BLOOM **bloomp);
+extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_finalize(WT_BLOOM *bloom);
+extern int __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_close(WT_BLOOM *bloom);
+extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
+extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next);
+extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp);
+extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
+extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
+extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
+extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile);
+extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile);
+extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile);
+extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
+extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
+extern int __wt_evict_create(WT_SESSION_IMPL *session);
+extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
+extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session);
+extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
+extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
+extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
+extern int __wt_btree_close(WT_SESSION_IMPL *session);
+extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
+extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on);
+extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
+extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
+extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
+extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed);
+extern const char *__wt_page_type_string(u_int type);
+extern const char *__wt_cell_type_string(uint8_t type);
+extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
+extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
+extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
+extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
+extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
+extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
+extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op);
+extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size);
+extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf);
+extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
+extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
+extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
+extern int __wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell);
+extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size);
+extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_txnc_search( WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store);
+extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_rec_write(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags);
+extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
+extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate);
+extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep);
+extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep);
+extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd);
+extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
+extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
+extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
+extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
+extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
+extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
+extern int __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_get(WT_SESSION_IMPL *session, const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets(WT_SESSION_IMPL *session, const char **cfg, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getone(WT_SESSION_IMPL *session, const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getones(WT_SESSION_IMPL *session, const char *config, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value);
+extern void __wt_conn_foc_discard(WT_SESSION_IMPL *session);
+extern int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check);
+extern int __wt_config_check(WT_SESSION_IMPL *session, const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len);
+extern int __wt_config_collapse( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_merge( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_concat( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_conn_config_init(WT_SESSION_IMPL *session);
+extern void __wt_conn_config_discard(WT_SESSION_IMPL *session);
+extern int __wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session, const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+extern int __wt_ext_config_get(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key, WT_CONFIG_ITEM *cval);
+extern int __wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf);
+extern int __wt_collator_config(WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp, int *ownp);
+extern int __wt_conn_remove_collator(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_compressor(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session);
+extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_cache_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_cache_destroy(WT_SESSION_IMPL *session);
+extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session);
+extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session);
+extern void *__wt_cache_pool_server(void *arg);
+extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
+extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force);
+extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags);
+extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern void __wt_conn_btree_close(WT_SESSION_IMPL *session);
+extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force);
+extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final);
+extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
+extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
+extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
+extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern int __wt_connection_close(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_conn_stat_init(WT_SESSION_IMPL *session);
+extern int __wt_statlog_log_one(WT_SESSION_IMPL *session);
+extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close);
+extern int __wt_sweep_create(WT_SESSION_IMPL *session);
+extern int __wt_sweep_destroy(WT_SESSION_IMPL *session);
+extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check);
+extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp);
+extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp);
+extern int __wt_curfile_update_check(WT_CURSOR *cursor);
+extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, WT_CURSOR **cursorp);
+extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, int iskey, va_list ap);
+extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
+extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode);
+extern int __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf);
+extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen);
+extern const char *__wt_json_tokname(int toktype);
+extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item);
+extern ssize_t __wt_json_strlen(const char *src, size_t srclen);
+extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen);
+extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
+extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_cursor_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_noop(WT_CURSOR *cursor);
+extern void __wt_cursor_set_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def);
+extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, int key);
+extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap);
+extern void __wt_cursor_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap);
+extern int __wt_cursor_close(WT_CURSOR *cursor);
+extern int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor);
+extern int __wt_cursor_init(WT_CURSOR *cursor, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curtable_get_key(WT_CURSOR *cursor, ...);
+extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop);
+extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
+extern int __wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern int __wt_log_get_active_files( WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
+extern int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf);
+extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum);
+extern int __wt_log_open(WT_SESSION_IMPL *session);
+extern int __wt_log_close(WT_SESSION_IMPL *session);
+extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create);
+extern int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie);
+extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
+extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
+extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
+extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *rectypep);
+extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
+extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
+extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
+extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
+extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
+extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
+extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
+extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
+extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
+extern int __wt_log_slot_free(WT_LOGSLOT *slot);
+extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
+extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
+extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session);
+extern void __wt_lsm_manager_free_work_unit( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry);
+extern int __wt_lsm_manager_destroy(WT_SESSION_IMPL *session);
+extern int __wt_lsm_manager_clear_tree( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_manager_pop_entry( WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp);
+extern int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id);
+extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_curstat_lsm_init( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst);
+extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session);
+extern int __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_set_chunk_size( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config);
+extern int __wt_lsm_tree_get(WT_SESSION_IMPL *session, const char *uri, int exclusive, WT_LSM_TREE **treep);
+extern void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern void __wt_lsm_tree_throttle( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only);
+extern int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session, const char *olduri, const char *newuri, const char *cfg[]);
+extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp);
+extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran);
+extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args);
+extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt);
+extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep);
+extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname);
+extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep);
+extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn);
+extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase);
+extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
+extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key);
+extern int __wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char **valuep);
+extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep);
+extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase);
+extern int __wt_metadata_open(WT_SESSION_IMPL *session);
+extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp);
+extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_off(WT_SESSION_IMPL *session, int unroll);
+extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri);
+extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created);
+extern int __wt_turtle_init(WT_SESSION_IMPL *session);
+extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_ATTRIBUTE((noreturn));
+extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
+extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
+extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp);
+extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
+extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
+extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp);
+extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, int fail, void *sym_ret);
+extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh);
+extern int __wt_errno(void);
+extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp);
+extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len);
+extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep);
+extern int __wt_filesize_name( WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep);
+extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock);
+extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len);
+extern int __wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp);
+extern int __wt_getopt( const char *progname, int nargc, char *const *nargv, const char *ostr);
+extern int __wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie);
+extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size);
+extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
+extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
+extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs);
+extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
+extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
+extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
+extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
+extern int __wt_once(void (*init_routine)(void));
+extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
+extern int __wt_close(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_absolute_path(const char *path);
+extern const char *__wt_path_separator(void);
+extern int __wt_has_priv(void);
+extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to);
+extern int __wt_read( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf);
+extern int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf);
+extern void __wt_sleep(long seconds, long micro_seconds);
+extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
+extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, void *(*func)(void *), void *arg);
+extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
+extern void __wt_thread_id(char *buf, size_t buflen);
+extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
+extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_yield(void);
+extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t *sizep, const char *fmt, ...);
+extern int __wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_check(WT_SESSION_IMPL *session, const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp);
+extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...);
+extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
+extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_schema_get_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep);
+extern void __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup);
+extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx);
+extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_close_tables(WT_SESSION_IMPL *session);
+extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf);
+extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp);
+extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, WT_TABLE **tablep);
+extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp);
+extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_INDEX **indexp);
+extern int __wt_schema_colcheck(WT_SESSION_IMPL *session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp);
+extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, int value_only, WT_ITEM *plan);
+extern int __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, const char *extra_cols, int value_only, WT_ITEM *format);
+extern int __wt_struct_truncate(WT_SESSION_IMPL *session, const char *input_fmt, u_int ncols, WT_ITEM *format);
+extern int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_out(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri, const char *cfg[]);
+extern int __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_index_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_table_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_schema_truncate( WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop);
+extern int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop);
+extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str);
+extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len);
+extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session);
+extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
+extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, const char **value_ret);
+extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
+extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp);
+extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
+extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config);
+extern void __wt_session_dhandle_incr_use(WT_SESSION_IMPL *session);
+extern int __wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags);
+extern int __wt_session_release_btree(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags);
+extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags);
+extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint);
+extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
+extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern void __wt_cksum_init(void);
+extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler);
+extern int __wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, const char *file_name, int line_number, const char *fmt, va_list ap);
+extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v);
+extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 5, 6)));
+extern int __wt_panic(WT_SESSION_IMPL *session);
+extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
+extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
+extern int __wt_library_init(void);
+extern int __wt_breakpoint(void);
+extern void __wt_attach(WT_SESSION_IMPL *session);
+extern uint64_t __wt_hash_city64(const void *s, size_t len);
+extern uint64_t __wt_hash_fnv64(const void *string, size_t len);
+extern int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_hex2byte(const u_char *from, u_char *to);
+extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to);
+extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_huffman_open(WT_SESSION_IMPL *session, void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp);
+extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
+extern int __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
+extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern void __wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern int __wt_spin_lock_register_caller(WT_SESSION_IMPL *session, const char *name, const char *file, int line, int *idp);
+extern int __wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag);
+extern uint32_t __wt_nlpo2_round(uint32_t v);
+extern uint32_t __wt_nlpo2(uint32_t v);
+extern uint32_t __wt_log2_int(uint32_t n);
+extern int __wt_ispo2(uint32_t v);
+extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
+extern void __wt_random_init(uint32_t *rnd);
+extern uint32_t __wt_random(uint32_t *rnd);
+extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
+extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern void __wt_scr_discard(WT_SESSION_IMPL *session);
+extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
+extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
+extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
+extern void __wt_stat_refresh_dsrc_stats(void *stats_arg);
+extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent);
+extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_refresh_connection_stats(void *stats_arg);
+extern int __wt_txnid_cmp(const void *v1, const void *v2);
+extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
+extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session);
+extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot);
+extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_release(WT_SESSION_IMPL *session);
+extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_init(WT_SESSION_IMPL *session);
+extern void __wt_txn_stats_update(WT_SESSION_IMPL *session);
+extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
+extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
+extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force);
+extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_isolation_level( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_notify( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify);
+extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api);
+extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id);
+extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
+extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn);
+extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp);
+extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_recover(WT_CONNECTION_IMPL *conn);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
new file mode 100644
index 00000000000..3aac7193407
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -0,0 +1,88 @@
+/*
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ * flags section: BEGIN
+ */
+#define WT_CONN_CACHE_POOL 0x00001000
+#define WT_CONN_CKPT_SYNC 0x00000800
+#define WT_CONN_EVICTION_RUN 0x00000400
+#define WT_CONN_LEAK_MEMORY 0x00000200
+#define WT_CONN_LSM_MERGE 0x00000100
+#define WT_CONN_PANIC 0x00000080
+#define WT_CONN_SERVER_ASYNC 0x00000040
+#define WT_CONN_SERVER_CHECKPOINT 0x00000020
+#define WT_CONN_SERVER_LSM 0x00000010
+#define WT_CONN_SERVER_RUN 0x00000008
+#define WT_CONN_SERVER_STATISTICS 0x00000004
+#define WT_CONN_SERVER_SWEEP 0x00000002
+#define WT_CONN_WAS_BACKUP 0x00000001
+#define WT_EVICTING 0x00000004
+#define WT_FILE_TYPE_CHECKPOINT 0x00000004
+#define WT_FILE_TYPE_DATA 0x00000002
+#define WT_FILE_TYPE_LOG 0x00000001
+#define WT_LOGSCAN_FIRST 0x00000008
+#define WT_LOGSCAN_FROM_CKP 0x00000004
+#define WT_LOGSCAN_ONE 0x00000002
+#define WT_LOGSCAN_RECOVER 0x00000001
+#define WT_LOG_DSYNC 0x00000004
+#define WT_LOG_FLUSH 0x00000002
+#define WT_LOG_FSYNC 0x00000001
+#define WT_READ_CACHE 0x00000200
+#define WT_READ_COMPACT 0x00000100
+#define WT_READ_NO_EVICT 0x00000080
+#define WT_READ_NO_GEN 0x00000040
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000010
+#define WT_READ_SKIP_INTL 0x00000008
+#define WT_READ_SKIP_LEAF 0x00000004
+#define WT_READ_TRUNCATE 0x00000002
+#define WT_READ_WONT_NEED 0x00000001
+#define WT_SESSION_CAN_WAIT 0x00000800
+#define WT_SESSION_DISCARD_FORCE 0x00000400
+#define WT_SESSION_INTERNAL 0x00000200
+#define WT_SESSION_LOGGING_INMEM 0x00000100
+#define WT_SESSION_NO_CACHE 0x00000080
+#define WT_SESSION_NO_CACHE_CHECK 0x00000040
+#define WT_SESSION_NO_DATA_HANDLES 0x00000020
+#define WT_SESSION_NO_LOGGING 0x00000010
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00000008
+#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000004
+#define WT_SESSION_SCHEMA_LOCKED 0x00000002
+#define WT_SESSION_SERVER_ASYNC 0x00000001
+#define WT_SKIP_UPDATE_ERR 0x00000002
+#define WT_SKIP_UPDATE_RESTORE 0x00000001
+#define WT_SYNC_CHECKPOINT 0x00000010
+#define WT_SYNC_CLOSE 0x00000008
+#define WT_SYNC_DISCARD 0x00000004
+#define WT_SYNC_DISCARD_FORCE 0x00000002
+#define WT_SYNC_WRITE_LEAVES 0x00000001
+#define WT_TXN_LOG_CKPT_FAIL 0x00000008
+#define WT_TXN_LOG_CKPT_PREPARE 0x00000004
+#define WT_TXN_LOG_CKPT_START 0x00000002
+#define WT_TXN_LOG_CKPT_STOP 0x00000001
+#define WT_VERB_API 0x00400000
+#define WT_VERB_BLOCK 0x00200000
+#define WT_VERB_CHECKPOINT 0x00100000
+#define WT_VERB_COMPACT 0x00080000
+#define WT_VERB_EVICT 0x00040000
+#define WT_VERB_EVICTSERVER 0x00020000
+#define WT_VERB_FILEOPS 0x00010000
+#define WT_VERB_LOG 0x00008000
+#define WT_VERB_LSM 0x00004000
+#define WT_VERB_METADATA 0x00002000
+#define WT_VERB_MUTEX 0x00001000
+#define WT_VERB_OVERFLOW 0x00000800
+#define WT_VERB_READ 0x00000400
+#define WT_VERB_RECONCILE 0x00000200
+#define WT_VERB_RECOVERY 0x00000100
+#define WT_VERB_SALVAGE 0x00000080
+#define WT_VERB_SHARED_CACHE 0x00000040
+#define WT_VERB_SPLIT 0x00000020
+#define WT_VERB_TEMPORARY 0x00000010
+#define WT_VERB_TRANSACTION 0x00000008
+#define WT_VERB_VERIFY 0x00000004
+#define WT_VERB_VERSION 0x00000002
+#define WT_VERB_WRITE 0x00000001
+/*
+ * flags section: END
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ */
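Editor's note: the values above are distinct powers of two within each flag
namespace, so several flags share a single word and are tested with mask
arithmetic. A minimal sketch of that pattern, assuming only this header is
included; the EX_* macros and the helper function are illustrative stand-ins,
not WiredTiger's own F_SET/F_ISSET family (which lives elsewhere in the tree):

#include <stdint.h>

#define EX_SET(word, mask)   ((word) |= (mask))
#define EX_CLR(word, mask)   ((word) &= ~(mask))
#define EX_ISSET(word, mask) (((word) & (mask)) != 0)

/* True if the servers are running and the connection has not panicked. */
static int
ex_conn_running(uint32_t conn_flags)
{
        return (EX_ISSET(conn_flags, WT_CONN_SERVER_RUN) &&
            !EX_ISSET(conn_flags, WT_CONN_PANIC));
}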
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
new file mode 100644
index 00000000000..50e237a1fed
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Add GCC-specific attributes to types and function declarations. */
+#define WT_GCC_ATTRIBUTE(x) __attribute__(x)
+
+/*
+ * Attributes are only permitted on function declarations, not definitions.
+ * This macro is a marker for function definitions that is rewritten by
+ * dist/s_prototypes to create extern.h.
+ */
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+/*
+ * Atomic writes:
+ *
+ * WiredTiger requires pointers (void *) and some variables to be read/written
+ * atomically, that is, in a single cycle. This is not write ordering -- to be
+ * clear, the requirement is that no partial value can ever be read or written.
+ * For example, if 8 bits of a 32-bit quantity were written, then the rest of
+ * the 32 bits were written, and another thread of control was able to read the
+ * memory location after the first 8 bits were written and before the subsequent
+ * 24 bits were written, WiredTiger would break. Or, if two threads of control
+ * attempt to write the same location simultaneously, the result must be one or
+ * the other of the two values, not some combination of both.
+ *
+ * To reduce memory requirements, we use a 32-bit type on 64-bit machines, which
+ * is OK if the compiler doesn't accumulate two adjacent 32-bit variables into a
+ * single 64-bit write, that is, there needs to be a single load/store of the 32
+ * bits, not a load/store of 64 bits, where the 64 bits are composed of two
+ * adjacent 32-bit locations. The problem is when two threads are cooperating
+ * (thread X finds 32-bits set to 0, writes in a new value, flushes memory;
+ * thread Y reads 32-bits that are non-zero, does some operation, resets the
+ * memory location to 0 and flushes). If thread X were to read the 32 bits
+ * adjacent to a different 32 bits, and write them both, the two threads could
+ * race. If that can happen, you must increase the size of the memory type to
+ * a type guaranteed to be written atomically in a single cycle, without writing
+ * an adjacent memory location.
+ *
+ * WiredTiger additionally requires atomic writes for 64-bit memory locations,
+ * and so cannot run on machines with a 32-bit memory bus.
+ *
+ * We don't depend on writes across cache lines being atomic, and to make sure
+ * that never happens, we check address alignment: we know of no architectures
+ * with cache lines other than a multiple of 4 bytes in size, so aligned 4-byte
+ * accesses will always be in a single cache line.
+ *
+ * Atomic writes are often associated with memory barriers, implemented by the
+ * WT_READ_BARRIER and WT_WRITE_BARRIER macros. WiredTiger's requirement as
+ * described by the Solaris membar_enter description:
+ *
+ * No stores from after the memory barrier will reach visibility and
+ * no loads from after the barrier will be resolved before the lock
+ * acquisition reaches global visibility
+ *
+ * In other words, the WT_WRITE_BARRIER macro must ensure that memory stores by
+ * the processor, made before the WT_WRITE_BARRIER call, be visible to all
+ * processors in the system before any memory stores by the processor, made
+ * after the WT_WRITE_BARRIER call, are visible to any processor. The
+ * WT_READ_BARRIER macro ensures that all loads before the barrier are complete
+ * before any loads after the barrier. The compiler cannot reorder or cache
+ * values across a barrier.
+ *
+ * Lock and unlock operations imply both read and write barriers. In other
+ * words, barriers are not required for values protected by locking.
+ *
+ * Data locations may also be marked volatile, forcing the compiler to re-load
+ * the data on each access. This is a weaker semantic than barriers provide,
+ * only ensuring that the compiler will not cache values. It makes no ordering
+ * guarantees and may have no effect on systems with weaker cache guarantees.
+ *
+ * In summary, locking > barriers > volatile.
+ *
+ * To avoid locking shared data structures such as statistics and to permit
+ * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS
+ * (compare and swap) operations.
+ */
+#define __WT_ATOMIC_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val))
+#define __WT_ATOMIC_FETCH_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val))
+#define __WT_ATOMIC_CAS(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_bool_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_val_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_STORE(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_lock_test_and_set(&(v), val))
+#define __WT_ATOMIC_SUB(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 1)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 2)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 4)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 8)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8)
+
+/* Compile read-write barrier */
+#define WT_BARRIER() __asm__ volatile("" ::: "memory")
+
+/* Pause instruction to prevent excess processor bus usage */
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+
+#if defined(x86_64) || defined(__x86_64__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("mfence" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lfence" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("sfence" ::: "memory"); \
+} while (0)
+
+#elif defined(i386) || defined(__i386__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#else
+#error "No write barrier implementation for this hardware"
+#endif
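Editor's note: a minimal sketch of the cooperating-threads handoff the
atomic-writes comment above describes, written directly against the GCC
__sync builtins these macros wrap. The slot variable, the function names,
and the zero-means-empty convention are illustrative assumptions (the value
published must therefore be non-zero):

#include <stdint.h>

static volatile uint32_t ex_slot;       /* Shared 32-bit location */

/* Thread X: install a value only if the slot is currently empty. */
static int
ex_try_publish(uint32_t value)
{
        /* Atomically: if (ex_slot == 0) ex_slot = value; */
        return (__sync_bool_compare_and_swap(&ex_slot, 0, value));
}

/* Thread Y: take the published value and release the slot. */
static uint32_t
ex_take(void)
{
        uint32_t v;

        while ((v = ex_slot) == 0)      /* Spin until a value appears */
                ;
        __sync_synchronize();           /* Full barrier before using it */
        ex_slot = 0;                    /* Reset for the next round */
        return (v);
}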
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
new file mode 100644
index 00000000000..720f512cf2d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Publish a value to a shared location. All previous stores must complete
+ * before the value is made public.
+ */
+#define WT_PUBLISH(v, val) do { \
+ WT_WRITE_BARRIER(); \
+ (v) = (val); \
+} while (0)
+
+/*
+ * Read a shared location and guarantee that subsequent reads do not see any
+ * earlier state.
+ */
+#define WT_ORDERED_READ(v, val) do { \
+ (v) = (val); \
+ WT_READ_BARRIER(); \
+} while (0)
+
+/*
+ * Atomic versions of the flag set/clear macros.
+ */
+#define F_ISSET_ATOMIC(p, mask) ((p)->flags_atomic & (uint8_t)(mask))
+
+#define F_SET_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CAS_ATOMIC(p, mask, ret) do { \
+ uint8_t __orig; \
+ ret = 0; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ if ((__orig & (uint8_t)(mask)) != 0) { \
+ ret = EBUSY; \
+ break; \
+ } \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CLR_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig & ~(uint8_t)(mask))); \
+} while (0)
+
+#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */
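Editor's note: a minimal sketch of how WT_PUBLISH and WT_ORDERED_READ are
meant to pair up, assuming this header is in scope; the structure and names
are illustrative. The writer completes its ordinary stores before publishing
the pointer, and the reader's barrier keeps its later loads from being
satisfied with earlier state:

struct ex_thing {
        int payload;
};

static struct ex_thing *ex_shared;      /* Shared pointer, NULL until ready */

/* Writer: fill in the structure, then make it visible. */
static void
ex_writer(struct ex_thing *t)
{
        t->payload = 42;                /* Ordinary stores first... */
        WT_PUBLISH(ex_shared, t);       /* ...write barrier, then the store */
}

/* Reader: load the pointer, then safely read what it references. */
static int
ex_reader(void)
{
        struct ex_thing *t;

        WT_ORDERED_READ(t, ex_shared);  /* Load, then read barrier */
        return (t == NULL ? -1 : t->payload);
}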
diff --git a/src/third_party/wiredtiger/src/include/intpack.i b/src/third_party/wiredtiger/src/include/intpack.i
new file mode 100644
index 00000000000..01559657acd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/intpack.i
@@ -0,0 +1,371 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Variable-length integer encoding.
+ * We need up to 64 bits, signed and unsigned. Further, we want the packed
+ * representation to have the same lexicographic ordering as the integer
+ * values. This avoids the need for special-purpose comparison code.
+ *
+ * Try hard to keep small values small (up to ~2 bytes): that gives the biggest
+ * benefit for common cases storing small values. After that, just encode the
+ * length in the first byte: we could squeeze in a couple of extra bits, but
+ * the marginal benefit is small, and we want this code to be relatively
+ * easy to implement in client code or scripting APIs.
+ *
+ * First byte  | Next |                        |
+ * byte        | bytes| Min Value              | Max Value
+ * ------------+------+------------------------+--------------------------------
+ * [00 00xxxx] | free | N/A                    | N/A
+ * [00 01llll] | llll | -2^64                  | -2^13 - 2^6
+ * [00 1xxxxx] | 1    | -2^13 - 2^6            | -2^6 - 1
+ * [01 xxxxxx] | 0    | -2^6                   | -1
+ * [10 xxxxxx] | 0    | 0                      | 2^6 - 1
+ * [11 0xxxxx] | 1    | 2^6                    | 2^13 + 2^6 - 1
+ * [11 10llll] | llll | 2^13 + 2^6             | 2^64 - 1
+ * [11 11xxxx] | free | N/A                    | N/A
+ */
+
+#define NEG_MULTI_MARKER (uint8_t)0x10
+#define NEG_2BYTE_MARKER (uint8_t)0x20
+#define NEG_1BYTE_MARKER (uint8_t)0x40
+#define POS_1BYTE_MARKER (uint8_t)0x80
+#define POS_2BYTE_MARKER (uint8_t)0xc0
+#define POS_MULTI_MARKER (uint8_t)0xe0
+
+#define NEG_1BYTE_MIN ((-1) << 6)
+#define NEG_2BYTE_MIN (((-1) << 13) + NEG_1BYTE_MIN)
+#define POS_1BYTE_MAX ((1 << 6) - 1)
+#define POS_2BYTE_MAX ((1 << 13) + POS_1BYTE_MAX)
+
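+/*
+ * Editor's note -- a worked example derived from the table above: the
+ * unsigned value 1000 falls in the two-byte range, so packing first
+ * subtracts POS_1BYTE_MAX + 1 = 64, leaving 936 = 0x3A8. The first
+ * byte is POS_2BYTE_MARKER | (936 >> 8) = 0xc0 | 0x03 = 0xc3, and the
+ * second is 936 & 0xff = 0xa8, so 1000 packs as the bytes c3 a8.
+ * Unpacking reverses the steps: ((0x03 << 8) | 0xa8) + 64 = 1000.
+ */
+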
+/* Extract bits <start> to <end> from a value (counting from LSB == 0). */
+#define GET_BITS(x, start, end) \
+ (((uint64_t)(x) & ((1U << (start)) - 1U)) >> (end))
+
+#define WT_SIZE_CHECK(l, maxl) \
+ WT_RET_TEST((maxl) != 0 && (size_t)(l) > (maxl), ENOMEM)
+
+/* Count the leading zero bytes. */
+#if defined(__GNUC__)
+#define WT_LEADING_ZEROS(x, i) \
+ (i = (x == 0) ? (int)sizeof (x) : __builtin_clzll(x) >> 3)
+#elif defined(_MSC_VER)
+#define WT_LEADING_ZEROS(x, i) do { \
+ if (x == 0) i = (int)sizeof(x); \
+ else { \
+ unsigned long __index; \
+ _BitScanReverse64(&__index, x); \
+ __index = 63 ^ __index; \
+ i = (int)(__index >> 3); } \
+ } while (0)
+#else
+#define WT_LEADING_ZEROS(x, i) do { \
+ uint64_t __x = (x); \
+ uint64_t __m = (uint64_t)0xff << 56; \
+ for (i = 0; !(__x & __m) && i != 8; i++) \
+ __m >>= 8; \
+} while (0)
+#endif
+
+/*
+ * __wt_vpack_posint --
+ * Packs a positive variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_posint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+ int len, lz, shift;
+
+ WT_LEADING_ZEROS(x, lz);
+ len = (int)sizeof (x) - lz;
+ WT_SIZE_CHECK(len + 1, maxlen);
+ p = *pp;
+
+ /* There are four bits we can use in the first byte. */
+ *p++ |= (len & 0xf);
+
+ for (shift = (len - 1) << 3; len != 0; --len, shift -= 8)
+ *p++ = (uint8_t)(x >> shift);
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_negint --
+ * Packs a negative variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_negint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+ int len, lz, shift;
+
+ WT_LEADING_ZEROS(~x, lz);
+ len = (int)sizeof (x) - lz;
+ WT_SIZE_CHECK(len + 1, maxlen);
+ p = *pp;
+
+ /*
+ * There are four size bits we can use in the first byte.
+ * For negative numbers, we store the number of leading 0xff bytes
+ * to maintain ordering (if this is not obvious, it may help to
+ * remember that -1 is the largest negative number).
+ */
+ *p++ |= (lz & 0xf);
+
+ for (shift = (len - 1) << 3; len != 0; shift -= 8, --len)
+ *p++ = (uint8_t)(x >> shift);
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_posint --
+ * Reads a variable-length positive integer from the specified location.
+ */
+static inline int
+__wt_vunpack_posint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+ uint64_t x;
+ const uint8_t *p;
+ uint8_t len;
+
+ /* There are four length bits in the first byte. */
+ p = *pp;
+ len = (*p++ & 0xf);
+ WT_SIZE_CHECK(len + 1, maxlen);
+
+ for (x = 0; len != 0; --len)
+ x = (x << 8) | *p++;
+
+ *retp = x;
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_negint --
+ * Reads a variable-length negative integer from the specified location.
+ */
+static inline int
+__wt_vunpack_negint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+ uint64_t x;
+ const uint8_t *p;
+ uint8_t len;
+
+ /* There are four length bits in the first byte. */
+ p = *pp;
+ len = (int)sizeof (x) - (*p++ & 0xf);
+ WT_SIZE_CHECK(len + 1, maxlen);
+
+ for (x = UINT64_MAX; len != 0; --len)
+ x = (x << 8) | *p++;
+
+ *retp = x;
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_uint --
+ * Variable-sized packing for unsigned integers
+ */
+static inline int
+__wt_vpack_uint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x <= POS_1BYTE_MAX)
+ *p++ = POS_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ else if (x <= POS_2BYTE_MAX) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= POS_1BYTE_MAX + 1;
+ *p++ = POS_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else {
+ x -= POS_2BYTE_MAX + 1;
+ *p = POS_MULTI_MARKER;
+ return (__wt_vpack_posint(pp, maxlen, x));
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_int --
+ * Variable-sized packing for signed integers
+ */
+static inline int
+__wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x < NEG_2BYTE_MIN) {
+ *p = NEG_MULTI_MARKER;
+ return (__wt_vpack_negint(pp, maxlen, (uint64_t)x));
+ } else if (x < NEG_1BYTE_MIN) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= NEG_2BYTE_MIN;
+ *p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else if (x < 0) {
+ x -= NEG_1BYTE_MIN;
+ *p++ = NEG_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ } else
+ /* For non-negative values, use the unsigned code above. */
+ return (__wt_vpack_uint(pp, maxlen, (uint64_t)x));
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_uint --
+ * Variable-sized unpacking for unsigned integers
+ */
+static inline int
+__wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t *xp)
+{
+ const uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ switch (*p & 0xf0) {
+ case POS_1BYTE_MARKER:
+ case POS_1BYTE_MARKER | 0x10:
+ case POS_1BYTE_MARKER | 0x20:
+ case POS_1BYTE_MARKER | 0x30:
+ *xp = GET_BITS(*p, 6, 0);
+ p += 1;
+ break;
+ case POS_2BYTE_MARKER:
+ case POS_2BYTE_MARKER | 0x10:
+ WT_SIZE_CHECK(2, maxlen);
+ *xp = GET_BITS(*p++, 5, 0) << 8;
+ *xp |= *p++;
+ *xp += POS_1BYTE_MAX + 1;
+ break;
+ case POS_MULTI_MARKER:
+ WT_RET(__wt_vunpack_posint(pp, maxlen, xp));
+ *xp += POS_2BYTE_MAX + 1;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_int --
+ * Variable-sized packing for signed integers
+ */
+static inline int
+__wt_vunpack_int(const uint8_t **pp, size_t maxlen, int64_t *xp)
+{
+ const uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ switch (*p & 0xf0) {
+ case NEG_MULTI_MARKER:
+ WT_RET(__wt_vunpack_negint(pp, maxlen, (uint64_t *)xp));
+ return (0);
+ case NEG_2BYTE_MARKER:
+ case NEG_2BYTE_MARKER | 0x10:
+ WT_SIZE_CHECK(2, maxlen);
+ *xp = (int64_t)(GET_BITS(*p++, 5, 0) << 8);
+ *xp |= *p++;
+ *xp += NEG_2BYTE_MIN;
+ break;
+ case NEG_1BYTE_MARKER:
+ case NEG_1BYTE_MARKER | 0x10:
+ case NEG_1BYTE_MARKER | 0x20:
+ case NEG_1BYTE_MARKER | 0x30:
+ *xp = NEG_1BYTE_MIN + (int64_t)GET_BITS(*p, 6, 0);
+ p += 1;
+ break;
+ default:
+ /* Identical to the unsigned case. */
+ return (__wt_vunpack_uint(pp, maxlen, (uint64_t *)xp));
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vsize_posint --
+ * Return the packed size of a positive variable-length integer.
+ */
+static inline size_t
+__wt_vsize_posint(uint64_t x)
+{
+ int lz;
+
+ WT_LEADING_ZEROS(x, lz);
+ return ((size_t)(WT_INTPACK64_MAXSIZE - lz));
+}
+
+/*
+ * __wt_vsize_negint --
+ * Return the packed size of a negative variable-length integer.
+ */
+static inline size_t
+__wt_vsize_negint(uint64_t x)
+{
+ int lz;
+
+ WT_LEADING_ZEROS(~x, lz);
+ return ((size_t)(WT_INTPACK64_MAXSIZE - lz));
+}
+
+/*
+ * __wt_vsize_uint --
+ * Return the packed size of an unsigned integer.
+ */
+static inline size_t
+__wt_vsize_uint(uint64_t x)
+{
+ if (x <= POS_1BYTE_MAX)
+ return (1);
+ if (x <= POS_2BYTE_MAX)
+ return (2);
+ x -= POS_2BYTE_MAX + 1;
+ return (__wt_vsize_posint(x));
+}
+
+/*
+ * __wt_vsize_int --
+ * Return the packed size of a signed integer.
+ */
+static inline size_t
+__wt_vsize_int(int64_t x)
+{
+ if (x < NEG_2BYTE_MIN)
+ return (__wt_vsize_negint((uint64_t)x));
+ if (x < NEG_1BYTE_MIN)
+ return (2);
+ if (x < 0)
+ return (1);
+ /* For non-negative values, use the unsigned code above. */
+ return (__wt_vsize_uint((uint64_t)x));
+}
diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h
new file mode 100644
index 00000000000..7c0a103a8ee
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lint.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val) \
+ ((v) += (val))
+#define __WT_ATOMIC_FETCH_ADD(v, val) \
+ ((v) += (val), (v))
+#define __WT_ATOMIC_CAS(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_STORE(v, val) \
+ ((v) = (val))
+#define __WT_ATOMIC_SUB(v, val) \
+ ((v) -= (val), (v))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val)
+
+static inline void WT_BARRIER(void) { return; }
+static inline void WT_FULL_BARRIER(void) { return; }
+static inline void WT_PAUSE(void) { return; }
+static inline void WT_READ_BARRIER(void) { return; }
+static inline void WT_WRITE_BARRIER(void) { return; }
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
new file mode 100644
index 00000000000..15054e34906
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -0,0 +1,177 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_LOG_FILENAME "WiredTigerLog" /* Log file name */
+
+/* Logging subsystem declarations. */
+#define LOG_ALIGN 128
+#define WT_LOG_SLOT_BUF_INIT_SIZE (64 * 1024)
+
+#define INIT_LSN(l) do { \
+ (l)->file = 1; \
+ (l)->offset = 0; \
+} while (0)
+
+#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0)
+
+/*
+ * Both of the macros below need to change if the content of __wt_lsn
+ * ever changes. The value is the following:
+ * txnid, record type, operation type, file id, operation key, operation value
+ */
+#define LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI)
+#define LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu)
+
+#define LOG_SKIP_HEADER(data) \
+ ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
+#define LOG_REC_SIZE(size) \
+ ((size) - offsetof(WT_LOG_RECORD, record))
+
+#define MAX_LSN(l) do { \
+ (l)->file = UINT32_MAX; \
+ (l)->offset = INT64_MAX; \
+} while (0)
+
+/*
+ * Compare two LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2
+ * and 1 if lsn1 > lsn2.
+ */
+#define LOG_CMP(lsn1, lsn2) \
+ ((lsn1)->file != (lsn2)->file ? \
+ ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
+ ((lsn1)->offset != (lsn2)->offset ? \
+ ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
+
+/*
+ * Possible values for the consolidation array slot states:
+ * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
+ * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_FREE - slot is available for allocation.
+ * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
+ * WT_LOG_SLOT_READY - slot is ready for threads to join.
+ * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
+ */
+#define WT_LOG_SLOT_DONE 0
+#define WT_LOG_SLOT_FREE 1
+#define WT_LOG_SLOT_PENDING 2
+#define WT_LOG_SLOT_READY 3
+typedef struct {
+ int64_t slot_state; /* Slot state */
+ uint64_t slot_group_size; /* Group size */
+ int32_t slot_error; /* Error value */
+#define SLOT_INVALID_INDEX 0xffffffff
+ uint32_t slot_index; /* Active slot index */
+ wt_off_t slot_start_offset; /* Starting file offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
+ WT_FH *slot_fh; /* File handle for this group */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
+ int32_t slot_churn; /* Active slots are scarce. */
+
+#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */
+#define SLOT_BUFFERED 0x02 /* Buffer writes */
+#define SLOT_CLOSEFH 0x04 /* Close old fh on release */
+#define SLOT_SYNC 0x08 /* Needs sync on release */
+ uint32_t flags; /* Flags */
+} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
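+/*
+ * Editor's note -- a minimal sketch of reading the slot-state encoding
+ * described above (the function name is illustrative): a slot is in the
+ * ready/joining range while its state is WT_LOG_SLOT_READY or greater,
+ * since values above READY count the threads consolidating on the slot.
+ */
+static inline int
+ex_log_slot_joinable(WT_LOGSLOT *slot)
+{
+ return (slot->slot_state >= WT_LOG_SLOT_READY);
+}
+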
+
+typedef struct {
+ WT_LOGSLOT *slot;
+ wt_off_t offset;
+} WT_MYSLOT;
+
+ /* Offset of first record */
+#define LOG_FIRST_RECORD log->allocsize
+
+typedef struct {
+ uint32_t allocsize; /* Allocation alignment size */
+ wt_off_t log_written; /* Amount of log written this period */
+ /*
+ * Log file information
+ */
+ uint32_t fileid; /* Current log file number */
+ WT_FH *log_fh; /* Logging file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+
+ /*
+ * System LSNs
+ */
+ WT_LSN alloc_lsn; /* Next LSN for allocation */
+ WT_LSN ckpt_lsn; /* Last checkpoint LSN */
+ WT_LSN first_lsn; /* First LSN */
+ WT_LSN sync_lsn; /* LSN of the last sync */
+ WT_LSN trunc_lsn; /* End LSN for recovery truncation */
+ WT_LSN write_lsn; /* Last LSN written to log file */
+
+ /*
+ * Synchronization resources
+ */
+ WT_SPINLOCK log_lock; /* Locked: Logging fields */
+ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
+ WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+
+ /* Notify any waiting threads when sync_lsn is updated. */
+ WT_CONDVAR *log_sync_cond;
+
+ /*
+ * Consolidation array information
+ * SLOT_ACTIVE must be less than SLOT_POOL.
+ * Our testing shows that the more consolidation we generate,
+ * the better the performance we see, which equates to an
+ * active slot count of one.
+ */
+#define SLOT_ACTIVE 1
+#define SLOT_POOL 16
+ uint32_t pool_index; /* Global pool index */
+ WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */
+ WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */
+
+#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
+ uint32_t flags;
+} WT_LOG;
+
+typedef struct {
+ uint32_t len; /* 00-03: Record length including hdr */
+ uint32_t checksum; /* 04-07: Checksum of the record */
+ uint8_t unused[8]; /* 08-15: Padding */
+ uint8_t record[0]; /* Beginning of actual data */
+} WT_LOG_RECORD;
+
+/*
+ * WT_LOG_DESC --
+ * The log file's description.
+ */
+struct __wt_log_desc {
+#define WT_LOG_MAGIC 0x101064
+ uint32_t log_magic; /* 00-03: Magic number */
+#define WT_LOG_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_LOG_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+ uint64_t log_size; /* 08-15: Log file size */
+};
+
+/*
+ * WT_LOG_REC_DESC --
+ * A descriptor for a log record type.
+ */
+struct __wt_log_rec_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
+
+/*
+ * WT_LOG_OP_DESC --
+ * A descriptor for a log operation type.
+ */
+struct __wt_log_op_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
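Editor's note: a small sketch of how the LSN macros above compose, assuming
WT_LSN and this header are in scope; the function name is illustrative. An
initialized LSN orders at or before anything a running log produces, and
MAX_LSN is a sentinel that orders after everything:

static int
ex_lsn_in_range(WT_LSN *current)
{
        WT_LSN init_lsn, max_lsn;

        INIT_LSN(&init_lsn);            /* file 1, offset 0 */
        MAX_LSN(&max_lsn);              /* UINT32_MAX / INT64_MAX sentinel */

        return (LOG_CMP(&init_lsn, current) <= 0 &&
            LOG_CMP(current, &max_lsn) < 0);
}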
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
new file mode 100644
index 00000000000..99532b97850
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_LSM_WORKER_COOKIE --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_cookie {
+ WT_LSM_CHUNK **chunk_array;
+ size_t chunk_alloc;
+ u_int nchunks;
+};
+
+/*
+ * WT_LSM_WORKER_ARGS --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_args {
+ WT_SESSION_IMPL *session; /* Session */
+ WT_CONDVAR *work_cond; /* Owned by the manager */
+ wt_thread_t tid; /* Thread id */
+ u_int id; /* My manager slot id */
+ uint32_t type; /* Types of operations handled */
+#define WT_LSM_WORKER_RUN 0x01
+ uint32_t flags; /* Worker flags */
+};
+
+/*
+ * WT_CURSOR_LSM --
+ * An LSM cursor.
+ */
+struct __wt_cursor_lsm {
+ WT_CURSOR iface;
+
+ WT_LSM_TREE *lsm_tree;
+ uint64_t dsk_gen;
+
+ u_int nchunks; /* Number of chunks in the cursor */
+ u_int nupdates; /* Updates needed (including
+ snapshot isolation checks). */
+ WT_BLOOM **blooms; /* Bloom filter handles. */
+ size_t bloom_alloc;
+
+ WT_CURSOR **cursors; /* Cursor handles. */
+ size_t cursor_alloc;
+
+ WT_CURSOR *current; /* The current cursor for iteration */
+ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */
+
+ uint64_t *switch_txn; /* Switch txn for each chunk */
+ size_t txnid_alloc;
+
+ u_int update_count; /* Updates performed. */
+
+#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */
+#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */
+#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */
+#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */
+#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the
+ current key */
+#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */
+#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */
+ uint32_t flags;
+};
+
+/*
+ * WT_LSM_CHUNK --
+ * A single chunk (file) in an LSM tree.
+ */
+struct __wt_lsm_chunk {
+ const char *uri; /* Data source for this chunk */
+ const char *bloom_uri; /* URI of Bloom filter, if any */
+ struct timespec create_ts; /* Creation time (for rate limiting) */
+ uint64_t count; /* Approximate count of records */
+ uint64_t size; /* Final chunk size */
+
+ uint64_t switch_txn; /*
+ * Largest transaction that can write
+ * to this chunk, set by a worker
+ * thread when the chunk is switched
+ * out, or by compact to get the most
+ * recent chunk flushed.
+ */
+
+ uint32_t id; /* ID used to generate URIs */
+ uint32_t generation; /* Merge generation */
+ uint32_t refcnt; /* Number of worker thread references */
+	uint32_t bloom_busy;		/* Number of threads using the
+					   chunk's Bloom filter */
+
+ int8_t empty; /* 1/0: checkpoint missing */
+ int8_t evicted; /* 1/0: in-memory chunk was evicted */
+
+#define WT_LSM_CHUNK_BLOOM 0x01
+#define WT_LSM_CHUNK_MERGING 0x02
+#define WT_LSM_CHUNK_ONDISK 0x04
+#define WT_LSM_CHUNK_STABLE 0x08
+ uint32_t flags;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+/*
+ * Different types of work units. Used by LSM worker threads to choose which
+ * type of work they will execute, and by work units to define which action
+ * is required.
+ */
+#define WT_LSM_WORK_BLOOM 0x01 /* Create a bloom filter */
+#define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */
+#define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */
+#define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */
+#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */
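
Since WT_LSM_WORKER_ARGS carries a `type` mask built from these bits, dispatch reduces to a mask test; a minimal sketch, using the FLD_ISSET flag helper defined in misc.h (the function name is hypothetical):

/* Sketch: does a worker's type mask cover a work unit's operation type? */
static int
worker_handles(const WT_LSM_WORKER_ARGS *worker, uint32_t unit_type)
{
	return (FLD_ISSET(worker->type, unit_type) ? 1 : 0);
}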
+
+/*
+ * WT_LSM_WORK_UNIT --
+ * A definition of maintenance that an LSM tree needs done.
+ */
+struct __wt_lsm_work_unit {
+ TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */
+ uint32_t type; /* Type of operation */
+#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */
+ uint32_t flags; /* Flags for operation */
+ WT_LSM_TREE *lsm_tree;
+};
+
+/*
+ * WT_LSM_MANAGER --
+ * A structure that holds resources used to manage any LSM trees in a
+ * database.
+ */
+struct __wt_lsm_manager {
+ /*
+	 * Queues of work units for LSM worker threads. We maintain three
+	 * queues so that each queue stays FIFO, rather than managing the
+	 * order of work by shuffling entries within a single queue:
+	 * one queue for switches, since a switch should never wait for
+	 * other work to be done;
+	 * one queue for application-requested work, for example flushing
+	 * and creating Bloom filters;
+	 * one queue for longer-running operations such as merges.
+ */
+ TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh;
+ TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh;
+ TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh;
+ WT_SPINLOCK switch_lock; /* Lock for switch queue */
+ WT_SPINLOCK app_lock; /* Lock for application queue */
+ WT_SPINLOCK manager_lock; /* Lock for manager queue */
+ WT_CONDVAR *work_cond; /* Used to notify worker of activity */
+ uint32_t lsm_workers; /* Current number of LSM workers */
+ uint32_t lsm_workers_max;
+#define WT_LSM_MAX_WORKERS 20
+ WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS];
+};
+
+/*
+ * WT_LSM_TREE --
+ * An LSM tree.
+ */
+struct __wt_lsm_tree {
+ const char *name, *config, *filename;
+ const char *key_format, *value_format;
+ const char *bloom_config, *file_config;
+
+ WT_COLLATOR *collator;
+ const char *collator_name;
+
+ int refcnt; /* Number of users of the tree */
+#define LSM_TREE_MAX_QUEUE 100
+ int queue_ref;
+ WT_RWLOCK *rwlock;
+ TAILQ_ENTRY(__wt_lsm_tree) q;
+
+ WT_DSRC_STATS stats; /* LSM-level statistics */
+
+ uint64_t dsk_gen;
+
+ long ckpt_throttle; /* Rate limiting due to checkpoints */
+ long merge_throttle; /* Rate limiting due to merges */
+ uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */
+ struct timespec last_flush_ts; /* Timestamp last flush finished */
+ struct timespec work_push_ts; /* Timestamp last work unit added */
+ uint64_t merge_progressing; /* Bumped when merges are active */
+ uint32_t merge_syncing; /* Bumped when merges are syncing */
+
+ /* Configuration parameters */
+ uint32_t bloom_bit_count;
+ uint32_t bloom_hash_count;
+ uint64_t chunk_size;
+ uint64_t chunk_max;
+ u_int merge_min, merge_max;
+
+ u_int merge_idle; /* Count of idle merge threads */
+
+#define WT_LSM_BLOOM_MERGED 0x00000001
+#define WT_LSM_BLOOM_OFF 0x00000002
+#define WT_LSM_BLOOM_OLDEST 0x00000004
+ uint32_t bloom; /* Bloom creation policy */
+
+ WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */
+ size_t chunk_alloc; /* Space allocated for chunks */
+ u_int nchunks; /* Number of active chunks */
+ uint32_t last; /* Last allocated ID */
+ int modified; /* Have there been updates? */
+
+ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
+ size_t old_alloc; /* Space allocated for old chunks */
+ u_int nold_chunks; /* Number of old chunks */
+ int freeing_old_chunks; /* Whether chunks are being freed */
+ uint32_t merge_aggressiveness; /* Increase amount of work per merge */
+
+#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
+#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */
+#define WT_LSM_TREE_NEED_SWITCH 0x04 /* New chunk needs creating */
+#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */
+#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */
+ uint32_t flags;
+
+#define WT_LSM_TREE_EXCLUSIVE 0x01 /* Tree is opened exclusively */
+ uint8_t flags_atomic;
+};
+
+/*
+ * WT_LSM_DATA_SOURCE --
+ * Implementation of the WT_DATA_SOURCE interface for LSM.
+ */
+struct __wt_lsm_data_source {
+ WT_DATA_SOURCE iface;
+
+ WT_RWLOCK *rwlock;
+};
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
new file mode 100644
index 00000000000..e4d7fd64f94
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_WIREDTIGER "WiredTiger" /* Version file */
+#define WT_SINGLETHREAD "WiredTiger.lock" /* Locking file */
+
+#define WT_BASECONFIG "WiredTiger.basecfg" /* Configuration */
+#define WT_USERCONFIG "WiredTiger.config" /* Configuration */
+
+#define WT_METADATA_BACKUP "WiredTiger.backup" /* Hot backup file */
+
+#define WT_METADATA_TURTLE "WiredTiger.turtle" /* Metadata metadata */
+#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
+
+#define WT_METADATA_URI "metadata:" /* Metadata alias */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_IS_METADATA(dh) \
+ (strcmp((dh)->name, WT_METAFILE_URI) == 0)
+#define WT_METAFILE_ID 0 /* Metadata file ID */
+
+#define WT_METADATA_VERSION "WiredTiger version" /* Version keys */
+#define WT_METADATA_VERSION_STR "WiredTiger version string"
+
+/*
+ * WT_CKPT --
+ * Encapsulation of checkpoint information, shared by the metadata, the
+ * btree engine, and the block manager.
+ */
+#define WT_CHECKPOINT "WiredTigerCheckpoint"
+#define WT_CKPT_FOREACH(ckptbase, ckpt) \
+ for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt))
+
+struct __wt_ckpt {
+ char *name; /* Name or NULL */
+
+ WT_ITEM addr; /* Checkpoint cookie string */
+ WT_ITEM raw; /* Checkpoint cookie raw */
+
+ int64_t order; /* Checkpoint order */
+
+ uintmax_t sec; /* Timestamp */
+
+ uint64_t ckpt_size; /* Checkpoint size */
+
+ uint64_t write_gen; /* Write generation */
+
+ void *bpriv; /* Block manager private */
+
+#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */
+#define WT_CKPT_DELETE 0x02 /* Checkpoint to be deleted */
+#define WT_CKPT_FAKE 0x04 /* Checkpoint is a fake */
+#define WT_CKPT_UPDATE 0x08 /* Checkpoint requires update */
+ uint32_t flags;
+};
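
The WT_CKPT_FOREACH macro walks the name-terminated array; for example, counting the entries scheduled for deletion (the function name is illustrative):

/* Sketch: count checkpoints flagged for deletion. */
static u_int
ckpt_count_deleted(WT_CKPT *ckptbase)
{
	WT_CKPT *ckpt;
	u_int deleted = 0;

	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_DELETE))
			++deleted;
	return (deleted);
}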
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
new file mode 100644
index 00000000000..bf2c4ccb8cf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Quiet compiler warnings about unused function parameters and variables,
+ * and unused function return values.
+ */
+#define WT_UNUSED(var) (void)(var)
+
+/* Basic constants. */
+#define WT_MILLION (1000000)
+#define WT_BILLION (1000000000)
+
+#define WT_KILOBYTE (1024)
+#define WT_MEGABYTE (1048576)
+#define WT_GIGABYTE (1073741824)
+#define WT_TERABYTE ((uint64_t)1099511627776)
+#define WT_PETABYTE ((uint64_t)1125899906842624)
+
+/*
+ * Initial number of directory-list entries; the list can grow dynamically.
+ */
+#define WT_DIR_ENTRY 32
+
+#define WT_DIRLIST_EXCLUDE 0x1 /* Exclude files matching prefix */
+#define WT_DIRLIST_INCLUDE 0x2 /* Include files matching prefix */
+
+/*
+ * Sizes that cannot be larger than 2**32 are stored in uint32_t fields in
+ * common structures to save space. To minimize conversions from size_t to
+ * uint32_t through the code, we use the following macros.
+ */
+#define WT_STORE_SIZE(s) ((uint32_t)(s))
+#define WT_PTRDIFF(end, begin) \
+ ((size_t)((uint8_t *)(end) - (uint8_t *)(begin)))
+#define WT_PTRDIFF32(end, begin) \
+ WT_STORE_SIZE(WT_PTRDIFF((end), (begin)))
+#define WT_BLOCK_FITS(p, len, begin, maxlen) \
+ ((uint8_t *)(p) >= (uint8_t *)(begin) && \
+ ((uint8_t *)(p) + (len) <= (uint8_t *)(begin) + (maxlen)))
+#define WT_PTR_IN_RANGE(p, begin, maxlen) \
+ WT_BLOCK_FITS((p), 1, (begin), (maxlen))
+
+/*
+ * Align an unsigned value of any type to a specified power-of-2, including the
+ * offset result of a pointer subtraction; do the calculation using the largest
+ * unsigned integer type available.
+ */
+#define WT_ALIGN(n, v) \
+ ((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
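
For example, WT_ALIGN(1000, 512) computes (1000 + 511) & ~511 = 1024, while WT_ALIGN(1024, 512) is unchanged at 1024; the mask trick is only correct when v is a power of two, as the comment requires.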
+
+/* Min, max. */
+#define WT_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define WT_MAX(a, b) ((a) < (b) ? (b) : (a))
+
+/* Elements in an array. */
+#define WT_ELEMENTS(a) (sizeof(a) / sizeof(a[0]))
+
+/* Skip lists: up to 10 levels; an element gains a level with 1/4 chance. */
+#define WT_SKIP_MAXDEPTH 10
+#define WT_SKIP_PROBABILITY (UINT32_MAX >> 2)
+
+/*
+ * __wt_calloc_def --
+ * Simple calls don't need separate sizeof arguments.
+ */
+#define __wt_calloc_def(session, number, addr) \
+ __wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr)
+
+/*
+ * __wt_realloc_def --
+ * Common case allocate-and-grow function.
+ * Starts by allocating the requested number of items (at least 10), then
+ * doubles each time the list needs to grow.
+ */
+#define __wt_realloc_def(session, sizep, number, addr) \
+ (((number) * sizeof(**(addr)) <= *(sizep)) ? 0 : \
+ __wt_realloc(session, sizep, WT_MAX(*(sizep) * 2, \
+ WT_MAX(10, (number)) * sizeof(**(addr))), addr))
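
A usage sketch of the allocate-and-grow pattern with hypothetical names; WT_RET is WiredTiger's return-on-error macro:

/* Sketch: append a cursor to a dynamically grown array. */
static int
cursor_list_append(WT_SESSION_IMPL *session,
    WT_CURSOR ***listp, size_t *allocp, u_int *countp, WT_CURSOR *c)
{
	/* Grows to at least 10 entries, then doubles as needed. */
	WT_RET(__wt_realloc_def(session, allocp, *countp + 1, listp));
	(*listp)[(*countp)++] = c;
	return (0);
}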
+/*
+ * Our internal free function clears the underlying address atomically so there
+ * is a smaller chance of racing threads seeing intermediate results while a
+ * structure is being free'd. (That would be a bug, of course, but I'd rather
+ * not drop core, just the same.) That's a non-standard "free" API, and the
+ * resulting bug is a mother to find -- make sure we get it right, don't make
+ * the caller remember to put the & operator on the pointer.
+ */
+#define __wt_free(session, p) do { \
+ if ((p) != NULL) \
+ __wt_free_int(session, (void *)&(p)); \
+} while (0)
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_overwrite_and_free(session, p) do { \
+ memset(p, WT_DEBUG_BYTE, sizeof(*(p))); \
+ __wt_free(session, p); \
+} while (0)
+#define __wt_overwrite_and_free_len(session, p, len) do { \
+ memset(p, WT_DEBUG_BYTE, len); \
+ __wt_free(session, p); \
+} while (0)
+#else
+#define __wt_overwrite_and_free(session, p) __wt_free(session, p)
+#define __wt_overwrite_and_free_len(session, p, len) __wt_free(session, p)
+#endif
+
+/*
+ * Flag set, clear and test.
+ *
+ * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure
+ * referenced by its argument), LF_XXX (handles a local variable named "flags"),
+ * and FLD_XXX (handles any variable, anywhere).
+ *
+ * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the
+ * hex constant might be a negative integer), and to ensure the hex constant is
+ * the correct size before applying the bitwise not operator.
+ */
+#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask)))
+#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask)))
+#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask)))
+
+#define LF_CLR(mask) ((flags) &= ~((uint32_t)(mask)))
+#define LF_ISSET(mask) ((flags) & ((uint32_t)(mask)))
+#define LF_SET(mask) ((flags) |= ((uint32_t)(mask)))
+
+#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
+#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
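
A sketch showing the three flavors side by side; the WT_LSM_TREE argument, the field pointer, and the 0x01/0x02 masks are placeholders:

/* Sketch: F_XXX on a struct, LF_XXX on a local, FLD_XXX on any lvalue. */
static void
flag_flavors(WT_LSM_TREE *lsm_tree, uint32_t *fieldp)
{
	uint32_t flags = 0;

	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);	/* Struct member "flags" */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);

	LF_SET(0x01);				/* Local variable "flags" */
	(void)flags;

	FLD_SET(*fieldp, 0x02);			/* Arbitrary lvalue */
}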
+
+/* Verbose messages. */
+#ifdef HAVE_VERBOSE
+#define WT_VERBOSE_ISSET(session, f) \
+ (FLD_ISSET(S2C(session)->verbose, f))
+#else
+#define WT_VERBOSE_ISSET(session, f) 0
+#endif
+
+/*
+ * Clear a structure, two flavors: inline when we want to guarantee there's
+ * no function call or setup/tear-down of a loop, and the default where the
+ * compiler presumably chooses. Gcc 4.3 is supposed to get this right, but
+ * we've seen problems when calling memset to clear structures in performance
+ * critical paths.
+ */
+#define WT_CLEAR_INLINE(type, s) do { \
+ static const type __clear; \
+ s = __clear; \
+} while (0)
+#define WT_CLEAR(s) \
+ memset(&(s), 0, sizeof(s))
+
+/* Check if a string matches a prefix. */
+#define WT_PREFIX_MATCH(str, pfx) \
+ (((const char *)str)[0] == ((const char *)pfx)[0] && \
+ strncmp((str), (pfx), strlen(pfx)) == 0)
+
+/* Check if a non-nul-terminated string matches a prefix. */
+#define WT_PREFIX_MATCH_LEN(str, len, pfx) \
+ ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx))
+
+/* Check if a string matches a prefix, and move past it. */
+#define WT_PREFIX_SKIP(str, pfx) \
+ (WT_PREFIX_MATCH(str, pfx) ? ((str) += strlen(pfx), 1) : 0)
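
For example, stripping a URI scheme before using the object name (the URI value is hypothetical); note that WT_PREFIX_SKIP both tests the prefix and advances the pointer:

#include <stdio.h>

/* Sketch: test a prefix and step past it in one expression. */
static void
prefix_example(void)
{
	const char *uri = "file:mytable.wt";

	if (WT_PREFIX_SKIP(uri, "file:"))
		printf("object name: %s\n", uri);	/* "mytable.wt" */
}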
+
+/*
+ * Check if a variable string equals a constant string. Inline the common
+ * case for WiredTiger of a single byte string. This is required because not
+ * all compilers optimize this case in strcmp (e.g., clang).
+ */
+#define WT_STREQ(s, cs) \
+ (sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \
+ strcmp(s, cs) == 0)
+
+/* Check if a string matches a byte string of len bytes. */
+#define WT_STRING_MATCH(str, bytes, len) \
+ (((const char *)str)[0] == ((const char *)bytes)[0] && \
+ strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
+
+/*
+ * Macro that produces a string literal that isn't wrapped in quotes, to avoid
+ * tripping up spell checkers.
+ */
+#define WT_UNCHECKED_STRING(str) #str
+
+/* Function return value and scratch buffer declaration and initialization. */
+#define WT_DECL_ITEM(i) WT_ITEM *i = NULL
+#define WT_DECL_RET int ret = 0
+
+/* If a WT_ITEM data field points somewhere in its allocated memory. */
+#define WT_DATA_IN_ITEM(i) \
+ ((i)->mem != NULL && (i)->data >= (i)->mem && \
+ WT_PTRDIFF((i)->data, (i)->mem) < (i)->memsize)
+
+/* Copy the data and size fields of an item. */
+#define WT_ITEM_SET(dst, src) do { \
+ (dst).data = (src).data; \
+ (dst).size = (src).size; \
+} while (0)
+
+/*
+ * In diagnostic mode we track the locations from which hazard pointers and
+ * scratch buffers were acquired.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags, __FILE__, __LINE__)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags, __FILE__, __LINE__)
+#else
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
new file mode 100644
index 00000000000..73caed09c8c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_verbose --
+ * Verbose message.
+ */
+static inline int
+__wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+#ifdef HAVE_VERBOSE
+ WT_DECL_RET;
+ va_list ap;
+
+ if (WT_VERBOSE_ISSET(session, flag)) {
+ va_start(ap, fmt);
+ ret = __wt_eventv(session, 1, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+ }
+ return (ret);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fmt);
+ WT_UNUSED(flag);
+ return (0);
+#endif
+}
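
A usage sketch, assuming one of WiredTiger's verbose categories such as WT_VERB_LSM and hypothetical chunk_id/nchunks locals; the message is only formatted when the category is enabled:

WT_RET(__wt_verbose(session, WT_VERB_LSM,
    "switched to chunk %u of %u", chunk_id, nchunks));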
diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h
new file mode 100644
index 00000000000..8f44a329940
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/msvc.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+#include <intrin.h>
+
+#ifndef _M_AMD64
+#error "Only x64 is supported with MSVC"
+#endif
+
+#define inline __inline
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val))
+#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s \
+ ((t*)&(v), (t)(new), (t)(old)) == (t)(old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s((t*)&(v), (t)(new), (t)(old)))
+#define __WT_ATOMIC_STORE(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchange ## s((t*)&(v), (t)(val)))
+#define __WT_ATOMIC_SUB(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 1, 8, char)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short)
+#define WT_ATOMIC_CAS2(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 2, 16, short)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 2, 16, short)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 4, , long)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64)
+#define WT_ATOMIC_CAS8(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_STORE8(v, val) \
+ __WT_ATOMIC_STORE(v, val, 8, 64, __int64)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64)
+
+static inline void WT_BARRIER(void) { _ReadWriteBarrier(); }
+static inline void WT_FULL_BARRIER(void) { _mm_mfence(); }
+static inline void WT_PAUSE(void) { _mm_pause(); }
+static inline void WT_READ_BARRIER(void) { _mm_lfence(); }
+static inline void WT_WRITE_BARRIER(void) { _mm_sfence(); }
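
A sketch of the 4-byte flavors on a shared counter; `long` is 4 bytes under MSVC x64, so the embedded static assertion passes (the counter and function names are hypothetical):

/* Sketch: atomic enter/leave/claim on a shared counter. */
static volatile long active;

static void
enter(void)
{
	(void)WT_ATOMIC_ADD4(active, 1);
}

static void
leave(void)
{
	(void)WT_ATOMIC_SUB4(active, 1);
}

static int
try_claim(void)
{
	return (WT_ATOMIC_CAS4(active, 0, 1));	/* Nonzero on success */
}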
diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h
new file mode 100644
index 00000000000..b71496dd595
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Condition variables:
+ *
+ * WiredTiger uses condition variables to signal between threads, and for
+ * locking operations that are expected to block.
+ */
+struct __wt_condvar {
+ const char *name; /* Mutex name for debugging */
+
+ wt_mutex_t mtx; /* Mutex */
+ wt_cond_t cond; /* Condition variable */
+
+	int waiters;			/* Number of waiters, or
+ -1 if signalled with no waiters. */
+};
+
+/*
+ * Read/write locks:
+ *
+ * WiredTiger uses read/write locks for shared/exclusive access to resources.
+ */
+struct __wt_rwlock {
+ const char *name; /* Lock name for debugging */
+
+ wt_rwlock_t rwlock; /* Read/write lock */
+};
+
+/*
+ * Spin locks:
+ *
+ * WiredTiger uses spinlocks for fast mutual exclusion (where operations done
+ * while holding the spin lock are expected to complete in a small number of
+ * instructions).
+ */
+#define SPINLOCK_GCC 0
+#define SPINLOCK_PTHREAD_MUTEX 1
+#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 2
+#define SPINLOCK_PTHREAD_MUTEX_LOGGING 3
+#define SPINLOCK_MSVC 4
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+typedef volatile int
+ WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_MSVC ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+typedef struct {
+ wt_mutex_t lock;
+
+ uint64_t counter; /* Statistics: counter */
+
+ const char *name; /* Statistics: mutex name */
+ int8_t id; /* Statistics: current holder ID */
+
+ int8_t initialized; /* Lock initialized, for cleanup */
+} WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
new file mode 100644
index 00000000000..0d5a8586051
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -0,0 +1,368 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Spin locks:
+ *
+ * These are used for cases where fast mutual exclusion is needed (where
+ * operations done while holding the spin lock are expected to complete in
+ * a small number of instructions).
+ */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/* Default to spinning 1000 times before yielding. */
+#ifndef WT_SPIN_COUNT
+#define WT_SPIN_COUNT 1000
+#endif
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+
+ *(t) = 0;
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ *(t) = 0;
+}
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY);
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ int i;
+
+ WT_UNUSED(session);
+
+ while (__sync_lock_test_and_set(t, 1)) {
+ for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+ WT_PAUSE();
+ if (*t)
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ __sync_lock_release(t);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+ pthread_mutexattr_t attr;
+
+ WT_RET(pthread_mutexattr_init(&attr));
+ WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP));
+ WT_RET(pthread_mutex_init(&t->lock, &attr));
+#else
+ WT_RET(pthread_mutex_init(&t->lock, NULL));
+#endif
+
+ t->name = name;
+ t->initialized = 1;
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ WT_RET(__wt_spin_lock_register_lock(session, t));
+#endif
+
+ WT_UNUSED(session);
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ __wt_spin_lock_unregister_lock(session, t);
+#endif
+ if (t->initialized) {
+ (void)pthread_mutex_destroy(&t->lock);
+ t->initialized = 0;
+ }
+}
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ return (pthread_mutex_trylock(&t->lock));
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ pthread_mutex_lock(&t->lock);
+}
+
+#endif
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * When logging statistics, we track which spinlocks block and why.
+ */
+#define WT_DECL_SPINLOCK_ID(i) \
+ static int i = WT_SPINLOCK_REGISTER
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock, idp, __FILE__, __LINE__)
+#define __wt_spin_lock(session, lock) do { \
+ WT_DECL_SPINLOCK_ID(__id); \
+ __wt_spin_lock_func(session, lock, &__id, __FILE__, __LINE__); \
+} while (0)
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session,
+ WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C_SAFE(session);
+ /* If we're not maintaining statistics, it's simple. */
+ if (session == NULL || !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST))
+ return (pthread_mutex_trylock(&t->lock));
+
+ /*
+ * If this caller hasn't yet registered, do so. The caller's location
+ * ID is a static offset into a per-connection structure, and that has
+ * problems: first, if there are multiple connections, we'll need to
+ * hold some kind of lock to avoid racing when setting that value, and
+ * second, if/when there are multiple connections and/or a single
+ * connection is closed and re-opened, the variable may be initialized
+ * and the underlying connection information may not. Check both.
+ */
+ if (*idp == WT_SPINLOCK_REGISTER ||
+ conn->spinlock_block[*idp].name == NULL)
+ WT_RET(__wt_spin_lock_register_caller(
+ session, t->name, file, line, idp));
+
+ /*
+ * Try to acquire the mutex: on failure, update blocking statistics, on
+ * success, set our ID as the mutex holder.
+ *
+ * Note the race between acquiring the lock and setting our ID as the
+ * holder, this can appear in the output as mutexes blocking in ways
+ * that can't actually happen (although still an indicator of a mutex
+ * that's busier than we'd like).
+ */
+ if ((ret = pthread_mutex_trylock(&t->lock)) == 0)
+ t->id = *idp;
+ else
+ if (*idp >= 0) {
+ ++conn->spinlock_block[*idp].total;
+ if (t->id >= 0)
+ ++conn->spinlock_block[*idp].blocked[t->id];
+ }
+
+ /* Update the mutex counter and flush to minimize the windows. */
+ ++t->counter;
+ WT_FULL_BARRIER();
+ return (ret);
+}
+
+/*
+ * __wt_spin_lock_func --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock_func(WT_SESSION_IMPL *session,
+ WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+ /* If we're not maintaining statistics, it's simple. */
+	if (session == NULL ||
+	    !FLD_ISSET(S2C(session)->stat_flags, WT_STAT_CONN_FAST)) {
+ pthread_mutex_lock(&t->lock);
+ return;
+ }
+
+ /* Try to acquire the mutex. */
+ if (__wt_spin_trylock_func(session, t, idp, file, line) == 0)
+ return;
+
+ /*
+ * On failure, wait on the mutex; once acquired, set our ID as the
+ * holder and flush to minimize the windows.
+ */
+ pthread_mutex_lock(&t->lock);
+ t->id = *idp;
+ WT_FULL_BARRIER();
+}
+
+#endif
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ pthread_mutex_unlock(&t->lock);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_MSVC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+
+ InitializeCriticalSectionAndSpinCount(&t->lock, 4000);
+
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ DeleteCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	BOOL b;
+
+	WT_UNUSED(session);
+
+	b = TryEnterCriticalSection(&t->lock);
+	return (b == 0 ? EBUSY : 0);
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ EnterCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ LeaveCriticalSection(&t->lock);
+}
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h
new file mode 100644
index 00000000000..846249294fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_SYSCALL_RETRY(call, ret) do { \
+ int __retry; \
+ for (__retry = 0; __retry < 10; ++__retry) { \
+ if ((call) == 0) { \
+ (ret) = 0; \
+ break; \
+ } \
+ switch ((ret) = __wt_errno()) { \
+ case 0: \
+ /* The call failed but didn't set errno. */ \
+ (ret) = WT_ERROR; \
+ break; \
+ case EAGAIN: \
+ case EBUSY: \
+ case EINTR: \
+ case EIO: \
+ case EMFILE: \
+ case ENFILE: \
+ case ENOSPC: \
+ __wt_sleep(0L, 500000L); \
+ continue; \
+ default: \
+ break; \
+ } \
+ break; \
+ } \
+} while (0)
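
A usage sketch: the macro retries the listed transient errors (sleeping half a millisecond between attempts via WiredTiger's __wt_sleep) and leaves the final result in `ret`; close(2) here is just an example call:

#include <unistd.h>

/* Sketch: retry a close that may fail with EINTR, EIO, and friends. */
static int
close_with_retry(int fd)
{
	int ret;

	WT_SYSCALL_RETRY(close(fd), ret);
	return (ret);
}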
+
+#define WT_TIMEDIFF(end, begin) \
+ (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
+#define WT_TIMECMP(t1, t2) \
+ ((t1).tv_sec < (t2).tv_sec ? -1 : \
+	    (t1).tv_sec == (t2).tv_sec ?				\
+ (t1).tv_nsec < (t2).tv_nsec ? -1 : \
+ (t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1)
+
+struct __wt_fh {
+ char *name; /* File name */
+ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
+
+ u_int ref; /* Reference count */
+
+#ifndef _WIN32
+ int fd; /* POSIX file handle */
+#else
+ HANDLE filehandle; /* Windows file handle */
+ HANDLE filehandle_secondary; /* Windows file handle
+ for file size changes */
+#endif
+ wt_off_t size; /* File size */
+ wt_off_t extend_size; /* File extended size */
+ wt_off_t extend_len; /* File extend chunk size */
+
+ int direct_io; /* O_DIRECT configured */
+
+ int fallocate_available; /* fallocate/posix_fallocate */
+ int fallocate_requires_locking;
+};
+
+#ifndef _WIN32
+#define WT_SIZET_FMT "zu" /* size_t format string */
+#else
+#define WT_SIZET_FMT "Iu" /* size_t format string */
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os_windows.h b/src/third_party/wiredtiger/src/include/os_windows.h
new file mode 100644
index 00000000000..fcae531184f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os_windows.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Define WT threading and concurrency primitives
+ * Assumes Windows 7+/2008 R2+
+ */
+typedef CONDITION_VARIABLE wt_cond_t;
+typedef CRITICAL_SECTION wt_mutex_t;
+typedef HANDLE wt_thread_t;
+typedef SRWLOCK wt_rwlock_t;
+
+/* Timespec is a POSIX structure not defined in Windows */
+struct timespec {
+ time_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+
+#define strncasecmp _strnicmp
+
+/*
+ * Windows portability: POSIX types that Windows lacks.
+ * Eventually WiredTiger will migrate away from these types.
+ */
+typedef uint32_t u_int;
+typedef unsigned char u_char;
+typedef unsigned long u_long;
+
+/* Versions earlier than VS 2015 (_MSC_VER 1900) lack a conforming snprintf */
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+/*
+ * Windows does not define ssize_t; the Python headers may declare it,
+ * so guard the typedef.
+ */
+#ifndef HAVE_SSIZE_T
+typedef int ssize_t;
+#endif
+
+/*
+ * Provide a custom version of vsnprintf that returns the
+ * needed buffer length instead of -1 on truncation
+ */
+#define vsnprintf _wt_vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList);
+
+/* Provide a custom version of localtime_r */
+struct tm *localtime_r(const time_t* timer, struct tm* result);
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
new file mode 100644
index 00000000000..6e0e7be13eb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -0,0 +1,685 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Throughout this code we have to be aware of default argument conversion.
+ *
+ * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the
+ * gory details. The short version is that we have fewer cases to deal with
+ * because the compiler promotes shorter types to int or unsigned int.
+ */
+typedef struct {
+ union {
+ int64_t i;
+ uint64_t u;
+ const char *s;
+ WT_ITEM item;
+ } u;
+ uint32_t size;
+ int8_t havesize;
+ char type;
+} WT_PACK_VALUE;
+
+#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 }
+#define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT
+
+typedef struct {
+ WT_SESSION_IMPL *session;
+ const char *cur, *end, *orig;
+ unsigned long repeats;
+ WT_PACK_VALUE lastv;
+} WT_PACK;
+
+#define WT_PACK_INIT { NULL, NULL, NULL, NULL, 0, WT_PACK_VALUE_INIT }
+#define WT_DECL_PACK(pack) WT_PACK pack = WT_PACK_INIT
+
+typedef struct {
+ WT_CONFIG config;
+ char buf[20];
+ int count;
+ int iskey;
+ int genname;
+} WT_PACK_NAME;
+
+/*
+ * __pack_initn --
+ * Initialize a pack iterator with the specified string and length.
+ */
+static inline int
+__pack_initn(
+ WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len)
+{
+ if (*fmt == '@' || *fmt == '<' || *fmt == '>')
+ return (EINVAL);
+ if (*fmt == '.')
+ ++fmt;
+
+ pack->session = session;
+ pack->cur = pack->orig = fmt;
+ pack->end = fmt + len;
+ pack->repeats = 0;
+ return (0);
+}
+
+/*
+ * __pack_init --
+ * Initialize a pack iterator with the specified string.
+ */
+static inline int
+__pack_init(WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt)
+{
+ return (__pack_initn(session, pack, fmt, strlen(fmt)));
+}
+
+/*
+ * __pack_name_init --
+ * Initialize the name of a pack iterator.
+ */
+static inline int
+__pack_name_init(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *names,
+ int iskey, WT_PACK_NAME *pn)
+{
+ WT_CLEAR(*pn);
+ pn->iskey = iskey;
+
+ if (names->str != NULL)
+ WT_RET(__wt_config_subinit(session, &pn->config, names));
+ else
+ pn->genname = 1;
+
+ return (0);
+}
+
+/*
+ * __pack_name_next --
+ * Get the next field type from a pack iterator.
+ */
+static inline int
+__pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name)
+{
+ WT_CONFIG_ITEM ignore;
+
+ if (pn->genname) {
+ (void)snprintf(pn->buf, sizeof(pn->buf),
+ (pn->iskey ? "key%d" : "value%d"), pn->count);
+ WT_CLEAR(*name);
+ name->str = pn->buf;
+ name->len = strlen(pn->buf);
+ name->type = WT_CONFIG_ITEM_STRING;
+ pn->count++;
+ }
+ else
+ WT_RET(__wt_config_next(&pn->config, name, &ignore));
+
+ return (0);
+}
+
+/*
+ * __pack_next --
+ *	Get the next value from a pack iterator.
+ */
+static inline int
+__pack_next(WT_PACK *pack, WT_PACK_VALUE *pv)
+{
+ char *endsize;
+
+ if (pack->repeats > 0) {
+ *pv = pack->lastv;
+ --pack->repeats;
+ return (0);
+ }
+
+next: if (pack->cur == pack->end)
+ return (WT_NOTFOUND);
+
+ if (isdigit(*pack->cur)) {
+ pv->havesize = 1;
+ pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10));
+ pack->cur = endsize;
+ } else {
+ pv->havesize = 0;
+ pv->size = 1;
+ }
+
+ pv->type = *pack->cur++;
+ pack->repeats = 0;
+
+ switch (pv->type) {
+ case 'S':
+ case 's':
+ case 'x':
+ return (0);
+ case 't':
+ if (pv->size < 1 || pv->size > 8)
+ WT_RET_MSG(pack->session, EINVAL,
+ "Bitfield sizes must be between 1 and 8 bits "
+ "in format '%.*s'",
+ (int)(pack->end - pack->orig), pack->orig);
+ return (0);
+ case 'u':
+ case 'U':
+ /* Special case for items with a size prefix. */
+ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u';
+ return (0);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'l':
+ case 'L':
+ case 'q':
+ case 'Q':
+ case 'r':
+ case 'R':
+ /* Integral types repeat <size> times. */
+ if (pv->size == 0)
+ goto next;
+ pack->repeats = pv->size - 1;
+ pack->lastv = *pv;
+ return (0);
+ default:
+ WT_RET_MSG(pack->session, EINVAL,
+ "Invalid type '%c' found in format '%.*s'",
+ pv->type, (int)(pack->end - pack->orig), pack->orig);
+	}
+}
+
+#define WT_PACK_GET(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ pv.u.s = va_arg(ap, const char *); \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ pv.u.item.data = __item->data; \
+ pv.u.item.size = __item->size; \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ pv.u.i = va_arg(ap, int); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 't': \
+ pv.u.u = va_arg(ap, unsigned int); \
+ break; \
+ case 'l': \
+ pv.u.i = va_arg(ap, long); \
+ break; \
+ case 'L': \
+ pv.u.u = va_arg(ap, unsigned long); \
+ break; \
+ case 'q': \
+ pv.u.i = va_arg(ap, int64_t); \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ pv.u.u = va_arg(ap, uint64_t); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __pack_size --
+ * Get the size of a packed value.
+ */
+static inline size_t
+__pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
+{
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ return (pv->size);
+ case 'j':
+ case 'J':
+ if (pv->type == 'j' || pv->havesize)
+ s = pv->size;
+ else {
+ ssize_t len;
+
+ /* The string was previously validated. */
+ len = __wt_json_strlen(pv->u.item.data,
+ pv->u.item.size);
+ WT_ASSERT(session, len >= 0);
+ s = (size_t)len + 1;
+ }
+ return (s);
+ case 's':
+ case 'S':
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen(pv->u.s) + 1;
+ return (s);
+ case 'U':
+ case 'u':
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U')
+ s += __wt_vsize_uint(s + pad);
+ return (s + pad);
+ case 'b':
+ case 'B':
+ case 't':
+ return (1);
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ return (__wt_vsize_int(pv->u.i));
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ return (__wt_vsize_uint(pv->u.u));
+ case 'R':
+ return (sizeof(uint64_t));
+ }
+
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __pack_write --
+ * Pack a value into a buffer.
+ */
+static inline int
+__pack_write(
+ WT_SESSION_IMPL *session, WT_PACK_VALUE *pv, uint8_t **pp, size_t maxlen)
+{
+ uint8_t *oldp;
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ WT_SIZE_CHECK(pv->size, maxlen);
+ memset(*pp, 0, pv->size);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ /*
+ * XXX if pv->havesize, only want to know if there is a
+ * '\0' in the first pv->size characters.
+ */
+ s = strlen(pv->u.s);
+ if ((pv->type == 's' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.s, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'j':
+ case 'J':
+ s = pv->u.item.size;
+ if ((pv->type == 'j' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ if (s > 0) {
+ oldp = *pp;
+ WT_RET(__wt_json_strncpy((char **)pp, maxlen,
+ pv->u.item.data, s));
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ if (pad > 0) {
+ WT_SIZE_CHECK(pad, maxlen);
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'U':
+ case 'u':
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U') {
+ oldp = *pp;
+ WT_RET(__wt_vpack_uint(pp, maxlen, s + pad));
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.item.data, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)(pv->u.i + 0x80);
+ *pp += 1;
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)pv->u.u;
+ *pp += 1;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vpack_int(pp, maxlen, pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vpack_uint(pp, maxlen, pv->u.u));
+ break;
+ case 'R':
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ *(uint64_t *)*pp = pv->u.u;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+/*
+ * __unpack_read --
+ * Read a packed value from a buffer.
+ */
+static inline int
+__unpack_read(WT_SESSION_IMPL *session,
+ WT_PACK_VALUE *pv, const uint8_t **pp, size_t maxlen)
+{
+ size_t s;
+
+ switch (pv->type) {
+ case 'x':
+ WT_SIZE_CHECK(pv->size, maxlen);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen((const char *)*pp) + 1;
+ if (s > 0)
+ pv->u.s = (const char *)*pp;
+ WT_SIZE_CHECK(s, maxlen);
+ *pp += s;
+ break;
+ case 'U':
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ /* FALLTHROUGH */
+ case 'u':
+ if (pv->havesize)
+ s = pv->size;
+ else if (pv->type == 'U')
+ s = (size_t)pv->u.u;
+ else
+ s = maxlen;
+ WT_SIZE_CHECK(s, maxlen);
+ pv->u.item.data = *pp;
+ pv->u.item.size = s;
+ *pp += s;
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.i = (int8_t)(*(*pp)++ - 0x80);
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.u = *(*pp)++;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vunpack_int(pp, maxlen, &pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ break;
+ case 'R':
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ pv->u.u = *(uint64_t *)*pp;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+#define WT_UNPACK_PUT(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ *va_arg(ap, const char **) = pv.u.s; \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ __item->data = pv.u.item.data; \
+ __item->size = pv.u.item.size; \
+ break; \
+ case 'b': \
+ *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \
+ break; \
+ case 'h': \
+ *va_arg(ap, int16_t *) = (short)pv.u.i; \
+ break; \
+ case 'i': \
+ case 'l': \
+ *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \
+ break; \
+ case 'q': \
+ *va_arg(ap, int64_t *) = pv.u.i; \
+ break; \
+ case 'B': \
+ case 't': \
+ *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \
+ break; \
+ case 'H': \
+ *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \
+ break; \
+ case 'I': \
+ case 'L': \
+ *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ *va_arg(ap, uint64_t *) = pv.u.u; \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __wt_struct_packv --
+ * Pack a byte string (va_list version).
+ */
+static inline int
+__wt_struct_packv(WT_SESSION_IMPL *session,
+ void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_PACK_GET(session, pv, ap);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_sizev --
+ * Calculate the size of a packed byte string (va_list version).
+ */
+static inline int
+__wt_struct_sizev(
+ WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+ size_t total;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ *sizep = __pack_size(session, &pv);
+ return (0);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (total = 0; __pack_next(&pack, &pv) == 0;) {
+ WT_PACK_GET(session, pv, ap);
+ total += __pack_size(session, &pv);
+ }
+ *sizep = total;
+ return (0);
+}
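
Because a va_list is consumed by __wt_struct_sizev, a caller that sizes and then packs must replay the arguments with va_copy; a hedged sketch of that pattern (the wrapper name and the ENOMEM choice are illustrative):

#include <errno.h>
#include <stdarg.h>

/* Sketch: size a record, then pack it into a fixed buffer. */
static int
size_then_pack(WT_SESSION_IMPL *session,
    void *buf, size_t buflen, size_t *usedp, const char *fmt, ...)
{
	va_list ap, ap2;
	int ret;

	va_start(ap, fmt);
	va_copy(ap2, ap);
	if ((ret = __wt_struct_sizev(session, usedp, fmt, ap)) == 0)
		ret = *usedp > buflen ? ENOMEM :
		    __wt_struct_packv(session, buf, *usedp, fmt, ap2);
	va_end(ap2);
	va_end(ap);
	return (ret);
}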
+
+/*
+ * __wt_struct_unpackv --
+ * Unpack a byte string (va_list version).
+ */
+static inline int
+__wt_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
+ WT_UNPACK_PUT(session, pv, ap);
+		return (ret);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_UNPACK_PUT(session, pv, ap);
+ }
+
+	/* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_size_adjust --
+ * Adjust the size field for a packed structure.
+ *
+ * Sometimes we want to include the size as a field in a packed structure.
+ * This is done by calling __wt_struct_size with the expected format and
+ * a size of zero. Then we want to pack the structure using the final
+ * size. This function adjusts the size appropriately (taking into
+ * account the size of the final size or the size field itself).
+ */
+static inline void
+__wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep)
+{
+ size_t prev_size = 1;
+ size_t orig_size = *sizep;
+ size_t field_size0 = __wt_vsize_uint(orig_size);
+ size_t field_size1 =
+ __wt_vsize_uint(orig_size + field_size0 - prev_size);
+ *sizep += field_size1 - prev_size;
+
+ /*
+ * Make sure the field size we calculated matches the adjusted size.
+ * This relies on the fact that we are only adjusting by a small number
+ * of bytes, so we won't cross multiple boundaries in the packing
+ * routine. If that were not true, we would need to iterate here until
+ * the field size stops growing.
+ */
+ WT_ASSERT(session, field_size1 == __wt_vsize_uint(*sizep));
+}
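
A worked example, assuming WiredTiger's variable-length integers store values up to 63 in a single byte: for an original size of 100, field_size0 = __wt_vsize_uint(100) = 2, field_size1 = __wt_vsize_uint(100 + 2 - 1) = 2, and the adjusted size is 100 + 2 - 1 = 101, which still packs in two bytes, so the closing assertion holds.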
diff --git a/src/third_party/wiredtiger/src/include/posix.h b/src/third_party/wiredtiger/src/include/posix.h
new file mode 100644
index 00000000000..e3b43ea38ab
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/posix.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Some systems don't configure 64-bit MIN/MAX by default. */
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 0xffffffffffffffffULL
+#endif
+#ifndef LLONG_MAX
+#define LLONG_MAX 0x7fffffffffffffffLL
+#endif
+#ifndef LLONG_MIN
+#define LLONG_MIN (-0x7fffffffffffffffLL - 1)
+#endif
+
+/* Define O_BINARY for Posix systems */
+#define O_BINARY 0
+
+/*
+ * Define WT threading and concurrency primitives
+ */
+typedef pthread_cond_t wt_cond_t;
+typedef pthread_mutex_t wt_mutex_t;
+typedef pthread_t wt_thread_t;
+
+/*
+ * !!!
+ * Don't touch this structure without understanding the read/write
+ * locking functions.
+ */
+typedef union { /* Read/write lock */
+#ifdef WORDS_BIGENDIAN
+ WiredTiger read/write locks require modification for big-endian systems.
+#else
+ uint64_t u;
+ uint32_t us;
+ struct {
+ uint16_t writers;
+ uint16_t readers;
+ uint16_t users;
+ uint16_t pad;
+ } s;
+#endif
+} wt_rwlock_t;
diff --git a/src/third_party/wiredtiger/src/include/queue.h b/src/third_party/wiredtiger/src/include/queue.h
new file mode 100644
index 00000000000..42e736e7b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/queue.h
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
+ */
+
+#ifndef _DB_QUEUE_H_
+#define _DB_QUEUE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ
+ * _HEAD + + + +
+ * _HEAD_INITIALIZER + + + +
+ * _ENTRY + + + +
+ * _INIT + + + +
+ * _EMPTY + + + +
+ * _FIRST + + + +
+ * _NEXT + + + +
+ * _PREV - - - +
+ * _LAST - - + +
+ * _FOREACH + + + +
+ * _FOREACH_REVERSE - - - +
+ * _INSERT_HEAD + + + +
+ * _INSERT_BEFORE - + - +
+ * _INSERT_AFTER + + + +
+ * _INSERT_TAIL - - + +
+ * _CONCAT - - + +
+ * _REMOVE_HEAD + - + -
+ * _REMOVE + + + +
+ *
+ */
+
+/*
+ * XXX
+ * We #undef all of the macros because there are incompatible versions of this
+ * file and these macros on various systems. What makes the problem worse is
+ * they are included and/or defined by system include files which we may have
+ * already loaded into Berkeley DB before getting here. For example, FreeBSD's
+ * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
+ * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
+ * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
+ */
+#undef LIST_EMPTY
+#undef LIST_ENTRY
+#undef LIST_FIRST
+#undef LIST_FOREACH
+#undef LIST_HEAD
+#undef LIST_HEAD_INITIALIZER
+#undef LIST_INIT
+#undef LIST_INSERT_AFTER
+#undef LIST_INSERT_BEFORE
+#undef LIST_INSERT_HEAD
+#undef LIST_NEXT
+#undef LIST_REMOVE
+#undef QMD_TRACE_ELEM
+#undef QMD_TRACE_HEAD
+#undef QUEUE_MACRO_DEBUG
+#undef SLIST_EMPTY
+#undef SLIST_ENTRY
+#undef SLIST_FIRST
+#undef SLIST_FOREACH
+#undef SLIST_FOREACH_PREVPTR
+#undef SLIST_HEAD
+#undef SLIST_HEAD_INITIALIZER
+#undef SLIST_INIT
+#undef SLIST_INSERT_AFTER
+#undef SLIST_INSERT_HEAD
+#undef SLIST_NEXT
+#undef SLIST_REMOVE
+#undef SLIST_REMOVE_HEAD
+#undef STAILQ_CONCAT
+#undef STAILQ_EMPTY
+#undef STAILQ_ENTRY
+#undef STAILQ_FIRST
+#undef STAILQ_FOREACH
+#undef STAILQ_HEAD
+#undef STAILQ_HEAD_INITIALIZER
+#undef STAILQ_INIT
+#undef STAILQ_INSERT_AFTER
+#undef STAILQ_INSERT_HEAD
+#undef STAILQ_INSERT_TAIL
+#undef STAILQ_LAST
+#undef STAILQ_NEXT
+#undef STAILQ_REMOVE
+#undef STAILQ_REMOVE_HEAD
+#undef STAILQ_REMOVE_HEAD_UNTIL
+#undef TAILQ_CONCAT
+#undef TAILQ_EMPTY
+#undef TAILQ_ENTRY
+#undef TAILQ_FIRST
+#undef TAILQ_FOREACH
+#undef TAILQ_FOREACH_REVERSE
+#undef TAILQ_HEAD
+#undef TAILQ_HEAD_INITIALIZER
+#undef TAILQ_INIT
+#undef TAILQ_INSERT_AFTER
+#undef TAILQ_INSERT_BEFORE
+#undef TAILQ_INSERT_HEAD
+#undef TAILQ_INSERT_TAIL
+#undef TAILQ_LAST
+#undef TAILQ_NEXT
+#undef TAILQ_PREV
+#undef TAILQ_REMOVE
+#undef TRACEBUF
+#undef TRASHIT
+
+#define QUEUE_MACRO_DEBUG 0
+#if QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+ char * lastfile;
+ int lastline;
+ char * prevfile;
+ int prevline;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+
+#define QMD_TRACE_HEAD(head) do { \
+ (head)->trace.prevline = (head)->trace.lastline; \
+ (head)->trace.prevfile = (head)->trace.lastfile; \
+ (head)->trace.lastline = __LINE__; \
+ (head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+ (elem)->trace.prevline = (elem)->trace.lastline; \
+ (elem)->trace.prevfile = (elem)->trace.lastfile; \
+ (elem)->trace.lastline = __LINE__; \
+ (elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
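+
+/*
+ * When QUEUE_MACRO_DEBUG is set to 1 above, each TAILQ head and entry
+ * embeds a struct qm_trace, and the tail queue macros record the
+ * __FILE__/__LINE__ of the last two modifications.  If a queue is found
+ * corrupted, a debugger can then report where it was most recently
+ * altered, for example:
+ *
+ *     (gdb) print head.trace.lastfile
+ *     (gdb) print head.trace.lastline
+ */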
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var); \
+ (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
+ for ((varp) = &SLIST_FIRST((head)); \
+ ((var) = *(varp)) != NULL; \
+ (varp) = &SLIST_NEXT((var), field))
+
+#define SLIST_INIT(head) do { \
+ SLIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
+ SLIST_NEXT((slistelm), field) = (elm); \
+} while (0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
+ SLIST_FIRST((head)) = (elm); \
+} while (0)
+
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if (SLIST_FIRST((head)) == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = SLIST_FIRST((head)); \
+ while (SLIST_NEXT(curelm, field) != (elm)) \
+ curelm = SLIST_NEXT(curelm, field); \
+ SLIST_NEXT(curelm, field) = \
+ SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
+ } \
+} while (0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
+} while (0)
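+
+/*
+ * SLIST_FOREACH_PREVPTR tracks the address of the pointer referencing the
+ * current element, allowing O(1) unlinking at the cursor instead of the
+ * O(n) rescan SLIST_REMOVE performs.  A minimal sketch (hypothetical
+ * names):
+ *
+ *     struct entry *e, **prevp;
+ *     SLIST_FOREACH_PREVPTR(e, prevp, &head, link)
+ *         if (e->value == target) {
+ *             *prevp = SLIST_NEXT(e, link);
+ *             break;
+ *         }
+ */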
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first;/* first element */ \
+ struct type **stqh_last;/* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (0)
+
+#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
+
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var); \
+ (var) = STAILQ_NEXT((var), field))
+
+#define STAILQ_INIT(head) do { \
+ STAILQ_FIRST((head)) = NULL; \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_NEXT((tqelm), field) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_FIRST((head)) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ STAILQ_NEXT((elm), field) = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+} while (0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *) \
+ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if (STAILQ_FIRST((head)) == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = STAILQ_FIRST((head)); \
+ while (STAILQ_NEXT(curelm, field) != (elm)) \
+ curelm = STAILQ_NEXT(curelm, field); \
+ if ((STAILQ_NEXT(curelm, field) = \
+ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+ } \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if ((STAILQ_FIRST((head)) = \
+ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
+ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
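+
+/*
+ * As noted above, singly-linked tail queues suit FIFO queues.  A minimal
+ * producer/consumer sketch (hypothetical names):
+ *
+ *     STAILQ_HEAD(jobq, job) q = STAILQ_HEAD_INITIALIZER(q);
+ *
+ * The producer appends:
+ *     STAILQ_INSERT_TAIL(&q, j, link);
+ * The consumer pops from the head:
+ *     while (!STAILQ_EMPTY(&q)) {
+ *         j = STAILQ_FIRST(&q);
+ *         STAILQ_REMOVE_HEAD(&q, link);
+ *         process(j);
+ *     }
+ */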
+
+/*
+ * List declarations.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List functions.
+ */
+
+#define LIST_EMPTY(head) ((head)->lh_first == NULL)
+
+#define LIST_FIRST(head) ((head)->lh_first)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = LIST_FIRST((head)); \
+ (var); \
+ (var) = LIST_NEXT((var), field))
+
+#define LIST_INIT(head) do { \
+ LIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+ LIST_NEXT((listelm), field)->field.le_prev = \
+ &LIST_NEXT((elm), field); \
+ LIST_NEXT((listelm), field) = (elm); \
+ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
+} while (0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ LIST_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
+} while (0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
+ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+ LIST_FIRST((head)) = (elm); \
+ (elm)->field.le_prev = &LIST_FIRST((head)); \
+} while (0)
+
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_REMOVE(elm, field) do { \
+ if (LIST_NEXT((elm), field) != NULL) \
+ LIST_NEXT((elm), field)->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = LIST_NEXT((elm), field); \
+} while (0)
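+
+/*
+ * Note that le_prev does not point at the previous element; it points at
+ * the previous element's le_next pointer (or at lh_first for the first
+ * element).  That is why LIST_REMOVE above unlinks in O(1) with
+ *
+ *     *(elm)->field.le_prev = LIST_NEXT((elm), field);
+ *
+ * without walking the list, and why a list cannot be traversed backward:
+ * no pointer to the previous element itself is kept.
+ */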
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+		QMD_TRACE_HEAD(head1);					\
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+	QMD_TRACE_ELEM(&(listelm)->field);				\
+} while (0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+	QMD_TRACE_ELEM(&(listelm)->field);				\
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
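+
+/*
+ * TAILQ_LAST and TAILQ_PREV work by type punning: tqh_last and tqe_prev
+ * always hold the address of some "next" pointer, and a TAILQ_ENTRY has
+ * the same two-pointer layout as a TAILQ_HEAD.  Casting such an address
+ * to struct headname * makes ->tqh_last read the referenced element's
+ * tqe_prev field, and dereferencing that pointer yields the element one
+ * step back.  This is also why these two macros need the head's struct
+ * name as an argument while the other macros do not.
+ */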
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT((elm)->field.tqe_next); \
+ TRASHIT((elm)->field.tqe_prev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h
new file mode 100644
index 00000000000..e24a19b03ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/schema.h
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Character constants for projection plans */
+#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */
+#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */
+#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */
+#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */
+#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */
+
+struct __wt_colgroup {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+};
+
+struct __wt_index {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+
+ const char *idxkey_format; /* Index key format (hides primary) */
+ const char *key_format; /* Key format */
+ const char *key_plan; /* Key projection plan */
+ const char *value_plan; /* Value projection plan */
+};
+
+/*
+ * WT_TABLE --
+ * Handle for a logical table. A table consists of one or more column
+ * groups, each of which holds some set of columns all sharing a primary
+ * key; and zero or more indices, each of which holds some set of columns
+ * in an index key that can be used to reconstruct the primary key.
+ */
+struct __wt_table {
+ const char *name, *config, *plan;
+ const char *key_format, *value_format;
+
+ WT_CONFIG_ITEM cgconf, colconf;
+
+ WT_COLGROUP **cgroups;
+ WT_INDEX **indices;
+ size_t idx_alloc;
+
+ TAILQ_ENTRY(__wt_table) q;
+
+ int cg_complete, idx_complete, is_simple;
+ u_int ncolgroups, nindices, nkey_columns;
+
+ uint32_t refcnt; /* Number of open cursors */
+ uint32_t schema_gen; /* Cached schema generation number */
+};
+
+/*
+ * Tables without explicit column groups have a single default column group
+ * containing all of the columns.
+ */
+#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1)
+
+/*
+ * WT_WITH_SCHEMA_LOCK --
+ * Acquire the schema lock, perform an operation, drop the lock.
+ */
+#define WT_WITH_SCHEMA_LOCK(session, op) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \
+ !F_ISSET(session, WT_SESSION_NO_SCHEMA_LOCK)); \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ (op); \
+ } else { \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ } \
+} while (0)
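+
+/*
+ * Typical usage is to wrap a schema operation, for example (the worker
+ * function name here is hypothetical):
+ *
+ *     WT_WITH_SCHEMA_LOCK(session,
+ *         ret = __schema_worker(session, uri, cfg));
+ *
+ * The macro runs the operation under an already-held schema lock when the
+ * session has one, and otherwise acquires and releases the lock around it,
+ * so schema operations need not know whether their caller held the lock.
+ */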
+
+/*
+ * WT_WITHOUT_SCHEMA_LOCK --
+ * Drop the schema lock, perform an operation, re-acquire the lock.
+ */
+#define WT_WITHOUT_SCHEMA_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ } else { \
+ (op); \
+ } \
+} while (0)
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
new file mode 100644
index 00000000000..70dc6b8764d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __page_write_gen_wrapped_check --
+ * Confirm the page's write generation number won't wrap.
+ */
+static inline int
+__page_write_gen_wrapped_check(WT_PAGE *page)
+{
+ return (page->modify->write_gen >
+ UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
+}
+
+/*
+ * __insert_serial_func --
+ * Worker function to add a WT_INSERT entry to a skiplist.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ u_int skipdepth)
+{
+ u_int i;
+
+ WT_UNUSED(session);
+
+ /*
+ * Confirm we are still in the expected position, and no item has been
+ * added where our insert belongs. Take extra care at the beginning
+ * and end of the list (at each level): retry if we race there.
+ *
+ * !!!
+ * Note the test for ins_stack[0] == NULL: that's the test for an
+ * uninitialized cursor, ins_stack[0] is cleared as part of
+ * initializing a cursor for a search.
+ */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_stack[i] == NULL ||
+ *ins_stack[i] != new_ins->next[i])
+ return (WT_RESTART);
+ if (new_ins->next[i] == NULL &&
+ ins_head->tail[i] != NULL &&
+ ins_stack[i] != &ins_head->tail[i]->next[i])
+ return (WT_RESTART);
+ }
+
+ /* Update the skiplist elements referencing the new WT_INSERT item. */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_head->tail[i] == NULL ||
+ ins_stack[i] == &ins_head->tail[i]->next[i])
+ ins_head->tail[i] = new_ins;
+ *ins_stack[i] = new_ins;
+ }
+
+ return (0);
+}
+
+/*
+ * __col_append_serial_func --
+ * Worker function to allocate a record number as necessary, then add a
+ * WT_INSERT entry to a skiplist.
+ */
+static inline int
+__col_append_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_BTREE *btree;
+ uint64_t recno;
+ u_int i;
+
+ btree = S2BT(session);
+
+ /*
+ * If the application didn't specify a record number, allocate a new one
+ * and set up for an append.
+ */
+ if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
+ WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
+ recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
+ for (i = 0; i < skipdepth; i++)
+ ins_stack[i] = ins_head->tail[i] == NULL ?
+ &ins_head->head[i] : &ins_head->tail[i]->next[i];
+ }
+
+ /* Confirm position and insert the new WT_INSERT item. */
+ WT_RET(__insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth));
+
+ /*
+ * Set the calling cursor's record number.
+ * If we extended the file, update the last record number.
+ */
+ *recnop = recno;
+ if (recno > btree->last_recno)
+ btree->last_recno = recno;
+
+ return (0);
+}
+
+/*
+ * __update_serial_func --
+ *	Worker function to add a WT_UPDATE entry to the page array.
+ */
+static inline int
+__update_serial_func(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_UPDATE **upd_entry, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_UPDATE *obsolete;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ /*
+ * Swap the update into place. If that fails, a new update was added
+ * after our search, we raced. Check if our update is still permitted,
+ * and if it is, do a full-barrier to ensure the update's next pointer
+ * is set before we update the linked list and try again.
+ */
+ while (!WT_ATOMIC_CAS8(*upd_entry, upd->next, upd)) {
+ WT_RET(__wt_txn_update_check(session, upd->next = *upd_entry));
+ WT_WRITE_BARRIER();
+ }
+
+ /*
+	 * If there are subsequent WT_UPDATE structures and we're evicting
+	 * pages, take the page-scanning lock and discard obsolete WT_UPDATE
+	 * structures.  Serialization is needed so only one thread does the
+ * obsolete check at a time, and to protect updates from disappearing
+ * under reconciliation.
+ */
+ if (upd->next != NULL &&
+ F_ISSET(S2C(session)->cache, WT_EVICT_ACTIVE)) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, upd->next);
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+ if (obsolete != NULL)
+ __wt_update_obsolete_free(session, page, obsolete);
+ }
+ return (0);
+}
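+
+/*
+ * The swap loop above is the lock-free publish-then-CAS pattern.  A
+ * minimal generic sketch in C11 atomics (illustrative only, not the
+ * WiredTiger primitives):
+ *
+ *     new->next = atomic_load(&head);
+ *     while (!atomic_compare_exchange_weak(&head, &new->next, new))
+ *         ;
+ *
+ * On CAS failure, new->next is reloaded with the current head and the
+ * loop retries.  The new element's next pointer must be visible to other
+ * threads before the CAS publishes the element, which is what
+ * WT_WRITE_BARRIER ensures on platforms with weaker memory ordering.
+ */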
+
+/*
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ * Serialization function section: BEGIN
+ */
+
+static inline int
+__wt_col_append_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __col_append_serial_func(
+ session, ins_head, ins_stack, new_ins, recnop, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+static inline int
+__wt_insert_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+static inline int
+__wt_update_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd,
+ WT_UPDATE **updp, size_t upd_size)
+{
+ WT_UPDATE *upd = *updp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *updp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ ret = __update_serial_func(
+ session, page, srch_upd, upd);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, upd);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, upd_size != 0);
+ incr_mem += upd_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * Serialization function section: END
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
new file mode 100644
index 00000000000..788ffe5eb45
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_DATA_HANDLE_CACHE --
+ * Per-session cache of handles to avoid synchronization when opening
+ * cursors.
+ */
+struct __wt_data_handle_cache {
+ WT_DATA_HANDLE *dhandle;
+
+ SLIST_ENTRY(__wt_data_handle_cache) l;
+};
+
+/*
+ * WT_HAZARD --
+ * A hazard pointer.
+ */
+struct __wt_hazard {
+ WT_PAGE *page; /* Page address */
+#ifdef HAVE_DIAGNOSTIC
+ const char *file; /* File/line where hazard acquired */
+ int line;
+#endif
+};
+
+/* Get the connection implementation for a session */
+#define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection)
+#define S2C_SAFE(session) ((session) == NULL ? NULL : S2C(session))
+
+/* Get the btree for a session */
+#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle)
+#define S2BT_SAFE(session) ((session)->dhandle == NULL ? NULL : S2BT(session))
+
+/*
+ * WT_SESSION_IMPL --
+ * Implementation of WT_SESSION.
+ */
+struct __wt_session_impl {
+ WT_SESSION iface;
+
+ void *lang_private; /* Language specific private storage */
+
+ u_int active; /* Non-zero if the session is in-use */
+
+ const char *name; /* Name */
+ const char *lastop; /* Last operation */
+ uint32_t id; /* UID, offset in session array */
+
+ WT_CONDVAR *cond; /* Condition variable */
+
+ uint32_t rnd[2]; /* Random number generation state */
+
+ WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
+
+ WT_DATA_HANDLE *dhandle; /* Current data handle */
+
+ /* Session handle reference list */
+ SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
+#define WT_DHANDLE_SWEEP_WAIT 60 /* Wait before discarding */
+#define WT_DHANDLE_SWEEP_PERIOD 20 /* Only sweep every 20 seconds */
+ time_t last_sweep; /* Last sweep for dead handles */
+
+ WT_CURSOR *cursor; /* Current cursor */
+ /* Cursors closed with the session */
+ TAILQ_HEAD(__cursors, __wt_cursor) cursors;
+
+ WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
+ WT_COMPACT *compact; /* Compact state */
+
+ WT_BTREE *metafile; /* Metadata file */
+ void *meta_track; /* Metadata operation tracking */
+ void *meta_track_next; /* Current position */
+ void *meta_track_sub; /* Child transaction / save point */
+ size_t meta_track_alloc; /* Currently allocated */
+ int meta_track_nest; /* Nesting level of meta transaction */
+#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL)
+
+ TAILQ_HEAD(__tables, __wt_table) tables;
+
+ WT_ITEM **scratch; /* Temporary memory for any function */
+ u_int scratch_alloc; /* Currently allocated */
+#ifdef HAVE_DIAGNOSTIC
+ /*
+	 * It's hard to figure out where a buffer was allocated after it has
+	 * leaked, so in diagnostic mode we track allocations.  DIAGNOSTIC
+	 * can't simply add fields to the WT_ITEM structure because it is
+	 * visible to applications; we keep a parallel structure instead.
+ */
+ struct __wt_scratch_track {
+ const char *file; /* Allocating file, line */
+ int line;
+ } *scratch_track;
+#endif
+
+ WT_TXN_ISOLATION isolation;
+ WT_TXN txn; /* Transaction state */
+ u_int ncursors; /* Count of active file cursors. */
+
+ WT_REF **excl; /* Eviction exclusive list */
+ u_int excl_next; /* Next empty slot */
+ size_t excl_allocated; /* Bytes allocated */
+
+ void *block_manager; /* Block-manager support */
+ int (*block_manager_cleanup)(WT_SESSION_IMPL *);
+
+ WT_DATA_HANDLE **ckpt_handle; /* Checkpoint support */
+ u_int ckpt_handle_next; /* Next empty slot */
+ size_t ckpt_handle_allocated; /* Bytes allocated */
+
+ void *reconcile; /* Reconciliation support */
+ int (*reconcile_cleanup)(WT_SESSION_IMPL *);
+
+ int compaction; /* Compaction did some work */
+
+ /*
+ * The split stash memory and hazard information persist past session
+ * close, because they are accessed by threads of control other than
+ * the thread owning the session. They live at the end of the
+ * structure so it's somewhat easier to clear everything but the fields
+ * that persist.
+ */
+#define WT_SESSION_CLEAR_SIZE(s) \
+ (WT_PTRDIFF(&(s)->flags, s) + sizeof((s)->flags))
+ uint32_t flags;
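+
+	/*
+	 * For example, clearing a session for reuse might look like
+	 * memset(session, 0, WT_SESSION_CLEAR_SIZE(session)), leaving the
+	 * fields below intact (a sketch; the actual call site may differ).
+	 */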
+
+ /*
+ * Splits can "free" memory that may still be in use, and we use a
+ * split generation number to track it, that is, the session stores a
+ * reference to the memory and allocates a split generation; when no
+ * session is reading from that split generation, the memory can be
+ * freed for real.
+ */
+ struct __wt_split_stash {
+ uint64_t split_gen; /* Split generation */
+ void *p; /* Memory, length */
+ size_t len;
+ } *split_stash; /* Split stash array */
+ size_t split_stash_cnt; /* Array entries */
+ size_t split_stash_alloc; /* Allocated bytes */
+
+ uint64_t split_gen; /* Reading split generation */
+
+ /*
+ * Hazard pointers.
+ * The number of hazard pointers that can be in use grows dynamically.
+ */
+#define WT_HAZARD_INCR 10
+ uint32_t hazard_size; /* Allocated slots in hazard array. */
+ uint32_t nhazard; /* Count of active hazard pointers */
+ WT_HAZARD *hazard; /* Hazard pointer array */
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
new file mode 100644
index 00000000000..11f42ac5500
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -0,0 +1,332 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_stats {
+ const char *desc; /* text description */
+ uint64_t v; /* 64-bit value */
+};
+
+/*
+ * Read/write statistics without any test for statistics configuration.
+ */
+#define WT_STAT(stats, fld) \
+ ((stats)->fld.v)
+#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
+} while (0)
+#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
+#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
+} while (0)
+#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
+#define WT_STAT_DECRV(stats, fld, value) do { \
+ (stats)->fld.v -= (value); \
+} while (0)
+#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
+#define WT_STAT_INCRV(stats, fld, value) do { \
+ (stats)->fld.v += (value); \
+} while (0)
+#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
+#define WT_STAT_SET(stats, fld, value) do { \
+ (stats)->fld.v = (uint64_t)(value); \
+} while (0)
+
+/*
+ * Read/write statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DECR(session, stats, fld) \
+ WT_STAT_FAST_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_INCR(session, stats, fld) \
+ WT_STAT_FAST_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_SET(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_SET(stats, fld, value); \
+} while (0)
+
+/*
+ * Read/write connection handle statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
+ WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
+ WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECR(session, fld) \
+ WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
+ WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_INCR(session, fld) \
+ WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
+ WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_SET(session, fld, value) \
+ WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+
+/*
+ * Read/write data-source handle statistics if the data-source handle is set
+ * and "fast" statistics are configured.
+ *
+ * XXX
+ * We shouldn't have to check if the data-source handle is NULL, but it's
+ * useful until everything is converted to using data-source handles.
+ */
+#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_DECRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_DECR(session, fld) \
+ WT_STAT_FAST_DATA_DECRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_INCRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_INCR(session, fld) \
+ WT_STAT_FAST_DATA_INCRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_SET( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
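+
+/*
+ * Typical usage, for illustration (the statistics fields exist below; the
+ * surrounding context is hypothetical):
+ *
+ *     WT_STAT_FAST_CONN_INCR(session, cursor_create);
+ *     WT_STAT_FAST_DATA_INCRV(session, cursor_insert_bytes, size);
+ *
+ * The non-atomic forms compile to a flag test plus an unlocked 64-bit add,
+ * so concurrent updates can race and drop counts: "fast" statistics
+ * deliberately trade accuracy for minimal overhead on hot paths.
+ */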
+
+/*
+ * DO NOT EDIT: automatically built by dist/stat.py.
+ */
+/* Statistics section: BEGIN */
+
+/*
+ * Statistics entries for connections.
+ */
+#define WT_CONNECTION_STATS_BASE 1000
+struct __wt_connection_stats {
+ WT_STATS async_alloc_race;
+ WT_STATS async_alloc_view;
+ WT_STATS async_cur_queue;
+ WT_STATS async_flush;
+ WT_STATS async_full;
+ WT_STATS async_max_queue;
+ WT_STATS async_nowork;
+ WT_STATS async_op_alloc;
+ WT_STATS async_op_compact;
+ WT_STATS async_op_insert;
+ WT_STATS async_op_remove;
+ WT_STATS async_op_search;
+ WT_STATS async_op_update;
+ WT_STATS block_byte_map_read;
+ WT_STATS block_byte_read;
+ WT_STATS block_byte_write;
+ WT_STATS block_map_read;
+ WT_STATS block_preload;
+ WT_STATS block_read;
+ WT_STATS block_write;
+ WT_STATS cache_bytes_dirty;
+ WT_STATS cache_bytes_inuse;
+ WT_STATS cache_bytes_max;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_deepen;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_eviction_queue_empty;
+ WT_STATS cache_eviction_queue_not_empty;
+ WT_STATS cache_eviction_server_evicting;
+ WT_STATS cache_eviction_server_not_evicting;
+ WT_STATS cache_eviction_slow;
+ WT_STATS cache_eviction_split;
+ WT_STATS cache_eviction_walk;
+ WT_STATS cache_pages_dirty;
+ WT_STATS cache_pages_inuse;
+ WT_STATS cache_read;
+ WT_STATS cache_write;
+ WT_STATS cond_wait;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS dh_session_handles;
+ WT_STATS dh_session_sweeps;
+ WT_STATS file_open;
+ WT_STATS log_buffer_grow;
+ WT_STATS log_buffer_size;
+ WT_STATS log_bytes_user;
+ WT_STATS log_bytes_written;
+ WT_STATS log_close_yields;
+ WT_STATS log_max_filesize;
+ WT_STATS log_reads;
+ WT_STATS log_scan_records;
+ WT_STATS log_scan_rereads;
+ WT_STATS log_scans;
+ WT_STATS log_slot_closes;
+ WT_STATS log_slot_consolidated;
+ WT_STATS log_slot_joins;
+ WT_STATS log_slot_races;
+ WT_STATS log_slot_switch_fails;
+ WT_STATS log_slot_toobig;
+ WT_STATS log_slot_toosmall;
+ WT_STATS log_slot_transitions;
+ WT_STATS log_sync;
+ WT_STATS log_writes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS lsm_rows_merged;
+ WT_STATS lsm_work_queue_app;
+ WT_STATS lsm_work_queue_manager;
+ WT_STATS lsm_work_queue_max;
+ WT_STATS lsm_work_queue_switch;
+ WT_STATS lsm_work_units_created;
+ WT_STATS lsm_work_units_discarded;
+ WT_STATS lsm_work_units_done;
+ WT_STATS memory_allocation;
+ WT_STATS memory_free;
+ WT_STATS memory_grow;
+ WT_STATS read_io;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_split_stashed_bytes;
+ WT_STATS rec_split_stashed_objects;
+ WT_STATS rwlock_read;
+ WT_STATS rwlock_write;
+ WT_STATS session_cursor_open;
+ WT_STATS session_open;
+ WT_STATS txn_begin;
+ WT_STATS txn_checkpoint;
+ WT_STATS txn_checkpoint_running;
+ WT_STATS txn_commit;
+ WT_STATS txn_fail_cache;
+ WT_STATS txn_pinned_range;
+ WT_STATS txn_rollback;
+ WT_STATS write_io;
+};
+
+/*
+ * Statistics entries for data sources.
+ */
+#define WT_DSRC_STATS_BASE 2000
+struct __wt_dsrc_stats {
+ WT_STATS allocation_size;
+ WT_STATS block_alloc;
+ WT_STATS block_checkpoint_size;
+ WT_STATS block_extension;
+ WT_STATS block_free;
+ WT_STATS block_magic;
+ WT_STATS block_major;
+ WT_STATS block_minor;
+ WT_STATS block_reuse_bytes;
+ WT_STATS block_size;
+ WT_STATS bloom_count;
+ WT_STATS bloom_false_positive;
+ WT_STATS bloom_hit;
+ WT_STATS bloom_miss;
+ WT_STATS bloom_page_evict;
+ WT_STATS bloom_page_read;
+ WT_STATS bloom_size;
+ WT_STATS btree_column_deleted;
+ WT_STATS btree_column_fix;
+ WT_STATS btree_column_internal;
+ WT_STATS btree_column_variable;
+ WT_STATS btree_compact_rewrite;
+ WT_STATS btree_entries;
+ WT_STATS btree_fixed_len;
+ WT_STATS btree_maximum_depth;
+ WT_STATS btree_maxintlitem;
+ WT_STATS btree_maxintlpage;
+ WT_STATS btree_maxleafitem;
+ WT_STATS btree_maxleafpage;
+ WT_STATS btree_overflow;
+ WT_STATS btree_row_internal;
+ WT_STATS btree_row_leaf;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_overflow_value;
+ WT_STATS cache_read;
+ WT_STATS cache_read_overflow;
+ WT_STATS cache_write;
+ WT_STATS compress_raw_fail;
+ WT_STATS compress_raw_fail_temporary;
+ WT_STATS compress_raw_ok;
+ WT_STATS compress_read;
+ WT_STATS compress_write;
+ WT_STATS compress_write_fail;
+ WT_STATS compress_write_too_small;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_insert_bulk;
+ WT_STATS cursor_insert_bytes;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_remove_bytes;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS cursor_update_bytes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_chunk_count;
+ WT_STATS lsm_generation_max;
+ WT_STATS lsm_lookup_no_bloom;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS rec_dictionary;
+ WT_STATS rec_multiblock_internal;
+ WT_STATS rec_multiblock_leaf;
+ WT_STATS rec_multiblock_max;
+ WT_STATS rec_overflow_key_internal;
+ WT_STATS rec_overflow_key_leaf;
+ WT_STATS rec_overflow_value;
+ WT_STATS rec_page_delete;
+ WT_STATS rec_page_match;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_prefix_compression;
+ WT_STATS rec_suffix_compression;
+ WT_STATS session_compact;
+ WT_STATS session_cursor_open;
+ WT_STATS txn_update_conflict;
+};
+
+/* Statistics section: END */
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
new file mode 100644
index 00000000000..c28a9231750
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_TXN_NONE 0 /* No txn running in a session. */
+#define WT_TXN_ABORTED UINT64_MAX /* Update rolled back, ignore. */
+
+/*
+ * Transaction ID comparison dealing with edge cases.
+ *
+ * WT_TXN_ABORTED is the largest possible ID (never visible to a running
+ * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all
+ * running transactions).
+ */
+#define TXNID_LE(t1, t2) \
+ ((t1) <= (t2))
+
+#define TXNID_LT(t1, t2) \
+ ((t1) != (t2) && TXNID_LE(t1, t2))
+
+#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+
+struct __wt_txn_state {
+ volatile uint64_t id;
+ volatile uint64_t snap_min;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+struct __wt_txn_global {
+ volatile uint64_t current; /* Current transaction ID. */
+
+ /* The oldest running transaction ID (may race). */
+ uint64_t last_running;
+
+ /*
+ * The oldest transaction ID that is not yet visible to some
+ * transaction in the system.
+ */
+ volatile uint64_t oldest_id;
+
+ /* The oldest session found in the last scan. */
+ uint32_t oldest_session;
+
+ /* Count of scanning threads, or -1 for exclusive access. */
+ volatile int32_t scan_count;
+
+ WT_TXN_STATE *states; /* Per-session transaction states */
+};
+
+typedef enum __wt_txn_isolation {
+ TXN_ISO_EVICTION, /* Internal: eviction context */
+ TXN_ISO_READ_UNCOMMITTED,
+ TXN_ISO_READ_COMMITTED,
+ TXN_ISO_SNAPSHOT
+} WT_TXN_ISOLATION;
+
+/*
+ * WT_TXN_OP --
+ * A transactional operation. Each transaction builds an in-memory array
+ * of these operations as it runs, then uses the array to either write log
+ * records during commit or undo the operations during rollback.
+ */
+struct __wt_txn_op {
+ uint32_t fileid;
+ enum {
+ TXN_OP_BASIC,
+ TXN_OP_INMEM,
+ TXN_OP_REF,
+ TXN_OP_TRUNCATE_COL,
+ TXN_OP_TRUNCATE_ROW
+ } type;
+ union {
+ /* TXN_OP_BASIC, TXN_OP_INMEM */
+ WT_UPDATE *upd;
+ /* TXN_OP_REF */
+ WT_REF *ref;
+ /* TXN_OP_TRUNCATE_COL */
+ struct {
+ uint64_t start, stop;
+ } truncate_col;
+ /* TXN_OP_TRUNCATE_ROW */
+ struct {
+ WT_ITEM start, stop;
+ enum {
+ TXN_TRUNC_ALL,
+ TXN_TRUNC_BOTH,
+ TXN_TRUNC_START,
+ TXN_TRUNC_STOP
+ } mode;
+ } truncate_row;
+ } u;
+};
+
+/*
+ * WT_TXN --
+ * Per-session transaction context.
+ */
+struct __wt_txn {
+ uint64_t id;
+
+ WT_TXN_ISOLATION isolation;
+
+ /*
+ * Snapshot data:
+ * ids < snap_min are visible,
+	 *	ids >= snap_max are invisible,
+ * everything else is visible unless it is in the snapshot.
+ */
+ uint64_t snap_min, snap_max;
+ uint64_t *snapshot;
+ uint32_t snapshot_count;
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ /* Array of modifications by this transaction. */
+ WT_TXN_OP *mod;
+ size_t mod_alloc;
+ u_int mod_count;
+
+ /* Scratch buffer for in-memory log records. */
+ WT_ITEM *logrec;
+
+ /* Requested notification when transactions are resolved. */
+ WT_TXN_NOTIFY *notify;
+
+ /* Checkpoint status. */
+ WT_LSN ckpt_lsn;
+ int full_ckpt;
+ uint32_t ckpt_nsnapshot;
+ WT_ITEM *ckpt_snapshot;
+
+#define TXN_AUTOCOMMIT 0x01
+#define TXN_ERROR 0x02
+#define TXN_HAS_ID 0x04
+#define TXN_HAS_SNAPSHOT 0x08
+#define TXN_RUNNING 0x10
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
new file mode 100644
index 00000000000..127176c67ea
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -0,0 +1,382 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
+static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
+
+/*
+ * __txn_next_op --
+ *	Allocate a slot for a new operation in the current transaction's
+ *	modification array.
+ */
+static inline int
+__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ *opp = NULL;
+
+ /*
+ * We're about to perform an update.
+ * Make sure we have allocated a transaction ID.
+ */
+ WT_RET(__wt_txn_id_check(session));
+ WT_ASSERT(session, F_ISSET(txn, TXN_HAS_ID));
+
+ WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
+ txn->mod_count + 1, &txn->mod));
+
+ *opp = &txn->mod[txn->mod_count++];
+ WT_CLEAR(**opp);
+ (*opp)->fileid = S2BT(session)->id;
+ return (0);
+}
+
+/*
+ * __wt_txn_unmodify --
+ * If threads race making updates, they may discard the last referenced
+ * WT_UPDATE item while the transaction is still active. This function
+ * removes the last update item from the "log".
+ */
+static inline void
+__wt_txn_unmodify(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_HAS_ID)) {
+ WT_ASSERT(session, txn->mod_count > 0);
+ txn->mod_count--;
+ }
+}
+
+/*
+ * __wt_txn_modify --
+ * Mark a WT_UPDATE object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ?
+ TXN_OP_INMEM : TXN_OP_BASIC;
+ op->u.upd = upd;
+ upd->txnid = session->txn.id;
+ return (ret);
+}
+
+/*
+ * __wt_txn_modify_ref --
+ * Remember a WT_REF object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = TXN_OP_REF;
+ op->u.ref = ref;
+ return (__wt_txn_log_op(session, NULL));
+}
+
+/*
+ * __wt_txn_visible_all --
+ *	Check if a given transaction ID is "globally visible", that is,
+ *	whether all sessions in the system will see the transaction ID.
+ */
+static inline int
+__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
+{
+ uint64_t oldest_id;
+
+ oldest_id = S2C(session)->txn_global.oldest_id;
+ return (TXNID_LT(id, oldest_id));
+}
+
+/*
+ * __wt_txn_visible --
+ * Can the current transaction see the given ID?
+ */
+static inline int
+__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ /*
+ * Eviction only sees globally visible updates, or if there is a
+ * checkpoint transaction running, use its transaction.
+ */
+ if (txn->isolation == TXN_ISO_EVICTION)
+ return (__wt_txn_visible_all(session, id));
+
+ /* Nobody sees the results of aborted transactions. */
+ if (id == WT_TXN_ABORTED)
+ return (0);
+
+ /* Changes with no associated transaction are always visible. */
+ if (id == WT_TXN_NONE)
+ return (1);
+
+ /*
+ * Read-uncommitted transactions see all other changes.
+ *
+ * All metadata reads are at read-uncommitted isolation. That's
+ * because once a schema-level operation completes, subsequent
+ * operations must see the current version of checkpoint metadata, or
+ * they may try to read blocks that may have been freed from a file.
+ * Metadata updates use non-transactional techniques (such as the
+ * schema and metadata locks) to protect access to in-flight updates.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED ||
+ S2BT_SAFE(session) == session->metafile)
+ return (1);
+
+ /* Transactions see their own changes. */
+ if (id == txn->id)
+ return (1);
+
+ /*
+ * TXN_ISO_SNAPSHOT, TXN_ISO_READ_COMMITTED: the ID is visible if it is
+	 * not the result of a concurrent transaction, that is, if it was
+ * committed before the snapshot was taken.
+ *
+ * The order here is important: anything newer than the maximum ID we
+ * saw when taking the snapshot should be invisible, even if the
+ * snapshot is empty.
+ */
+ if (TXNID_LE(txn->snap_max, id))
+ return (0);
+ if (txn->snapshot_count == 0 || TXNID_LT(id, txn->snap_min))
+ return (1);
+
+ return (bsearch(&id, txn->snapshot, txn->snapshot_count,
+ sizeof(uint64_t), __wt_txnid_cmp) == NULL);
+}
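+
+/*
+ * The bsearch above requires the snapshot array to be sorted and uses a
+ * three-way comparator of transaction IDs.  A sketch of a compatible
+ * comparator (the real __wt_txnid_cmp is defined elsewhere in the tree):
+ *
+ *     static int
+ *     txnid_cmp(const void *v1, const void *v2)
+ *     {
+ *         uint64_t id1 = *(const uint64_t *)v1;
+ *         uint64_t id2 = *(const uint64_t *)v2;
+ *
+ *         return (id1 < id2 ? -1 : (id1 > id2 ? 1 : 0));
+ *     }
+ */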
+
+/*
+ * __wt_txn_read --
+ * Get the first visible update in a list (or NULL if none are visible).
+ */
+static inline WT_UPDATE *
+__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ while (upd != NULL && !__wt_txn_visible(session, upd->txnid))
+ upd = upd->next;
+
+ return (upd);
+}
+
+/*
+ * __wt_txn_autocommit_check --
+ * If an auto-commit transaction is required, start one.
+ */
+static inline int
+__wt_txn_autocommit_check(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_AUTOCOMMIT)) {
+ F_CLR(txn, TXN_AUTOCOMMIT);
+ return (__wt_txn_begin(session, NULL));
+ }
+ return (0);
+}
+
+/*
+ * __wt_txn_new_id --
+ * Allocate a new transaction ID.
+ */
+static inline uint64_t
+__wt_txn_new_id(WT_SESSION_IMPL *session)
+{
+ /*
+ * We want the global value to lead the allocated values, so that any
+ * allocated transaction ID eventually becomes globally visible. When
+ * there are no transactions running, the oldest_id will reach the
+ * global current ID, so we want post-increment semantics. Our atomic
+ * add primitive does pre-increment, so adjust the result here.
+ */
+ return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1);
+}
+
+/*
+ * __wt_txn_id_check --
+ *	A transaction is about to do an update: start an auto-commit
+ * transaction if required and allocate a transaction ID.
+ */
+static inline int
+__wt_txn_id_check(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+
+ WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+ if (!F_ISSET(txn, TXN_HAS_ID)) {
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
+
+ /*
+ * Allocate a transaction ID.
+ *
+ * We use an atomic compare and swap to ensure that we get a
+ * unique ID that is published before the global counter is
+ * updated.
+ *
+ * If two threads race to allocate an ID, only the latest ID
+ * will proceed. The winning thread can be sure its snapshot
+ * contains all of the earlier active IDs. Threads that race
+ * and get an earlier ID may not appear in the snapshot, but
+ * they will loop and allocate a new ID before proceeding to
+ * make any updates.
+ *
+ * This potentially wastes transaction IDs when threads race to
+ * begin transactions: that is the price we pay to keep this
+ * path latch free.
+ */
+ do {
+ txn_state->id = txn->id = txn_global->current;
+ } while (!WT_ATOMIC_CAS8(
+ txn_global->current, txn->id, txn->id + 1));
+
+ /*
+ * If we have used 64-bits of transaction IDs, there is nothing
+ * more we can do.
+ */
+ if (txn->id == WT_TXN_ABORTED)
+ WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
+ F_SET(txn, TXN_HAS_ID);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_txn_update_check --
+ * Check if the current transaction can update an item.
+ */
+static inline int
+__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (txn->isolation == TXN_ISO_SNAPSHOT)
+ while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) {
+ if (upd->txnid != WT_TXN_ABORTED) {
+ WT_STAT_FAST_DATA_INCR(
+ session, txn_update_conflict);
+ return (WT_ROLLBACK);
+ }
+ upd = upd->next;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_txn_read_last --
+ * Called when the last page for a session is released.
+ */
+static inline void
+__wt_txn_read_last(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ /* Release the snap_min ID we put in the global table. */
+ if (!F_ISSET(txn, TXN_RUNNING) ||
+ txn->isolation != TXN_ISO_SNAPSHOT)
+ __wt_txn_release_snapshot(session);
+}
+
+/*
+ * __wt_txn_cursor_op --
+ * Called for each cursor operation.
+ */
+static inline void
+__wt_txn_cursor_op(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ /*
+ * If there is no transaction running (so we don't have an ID), and no
+ * snapshot allocated, put an ID in the global table to prevent any
+ * update that we are reading from being trimmed to save memory. Do a
+ * read before the write because this shared data is accessed a lot.
+ *
+ * !!!
+ * Note: We are updating the global table unprotected, so the
+ * oldest_id may move past this ID if a scan races with this
+ * value being published. That said, read-uncommitted operations
+ * always take the most recent version of a value, so for that version
+ * to be freed, two newer versions would have to be committed. Putting
+ * this snap_min ID in the table prevents the oldest ID from moving
+ * further forward, so that once a read-uncommitted cursor is
+ * positioned on a value, it can't be freed.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_ID) &&
+ TXNID_LT(txn_state->snap_min, txn_global->last_running))
+ txn_state->snap_min = txn_global->last_running;
+
+ if (txn->isolation != TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ __wt_txn_refresh(session, 1);
+}
+
+/*
+ * __wt_txn_am_oldest --
+ * Am I the oldest transaction in the system?
+ */
+static inline int
+__wt_txn_am_oldest(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+
+ if (txn->id == WT_TXN_NONE)
+ return (0);
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states;
+ i < session_cnt;
+ i++, s++)
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LT(id, txn->id))
+ return (0);
+
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
new file mode 100644
index 00000000000..5f05db11c4b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
+
+/*
+ * NOTE: If you see a compile failure in this file, your compiler is laying out
+ * structs in memory in a way WiredTiger does not expect. Please refer to the
+ * build instructions in the documentation (docs/html/install.html) for more
+ * information.
+ */
+
+/*
+ * Compile time assertions.
+ *
+ * If the argument to WT_STATIC_ASSERT is zero, the macro evaluates to:
+ *
+ * (void)sizeof(char[-1])
+ *
+ * which fails to compile (which is what we want, the assertion failed).
+ * If the value of the argument to WT_STATIC_ASSERT is non-zero, then the
+ * macro evaluates to:
+ *
+ * (void)sizeof(char[1]);
+ *
+ * which compiles with no warnings, and produces no code.
+ *
+ * For more details about why this works, see
+ * http://scaryreasoner.wordpress.com/2009/02/28/
+ */
+#define WT_STATIC_ASSERT(cond) (void)sizeof(char[1 - 2 * !(cond)])
+
+#define SIZE_CHECK(type, e) do { \
+ char __check_##type[1 - 2 * !(sizeof(type) == (e))]; \
+ (void)__check_##type; \
+} while (0)
+
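+/*
+ * SIZE_CHECK declares a local array whose size is negative when the type's
+ * size differs from the expected value, forcing a compile-time error at the
+ * point of use; the (void) reference avoids unused-variable warnings.
+ */
+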
+#define ALIGN_CHECK(type, a) \
+ WT_STATIC_ASSERT(WT_ALIGN(sizeof(type), (a)) == sizeof(type))
+
+/*
+ * __wt_verify_build --
+ * This function is never called: it exists so there is a place for code
+ * that checks build-time conditions.
+ */
+static inline void
+__wt_verify_build(void)
+{
+ /* Check specific structures weren't padded. */
+ SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE);
+ SIZE_CHECK(WT_REF, WT_REF_SIZE);
+
+ /*
+ * The btree code encodes key/value pairs in size_t's, and requires at
+ * least 8B size_t's.
+ */
+ WT_STATIC_ASSERT(sizeof(size_t) >= 8);
+
+ /*
+ * We require a wt_off_t fit into an 8B chunk because 8B is the largest
+ * integral value we can encode into an address cookie.
+ *
+ * WiredTiger has never been tested on a system with 4B file offsets;
+ * disallow them for now.
+ */
+ WT_STATIC_ASSERT(sizeof(wt_off_t) == 8);
+}
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
new file mode 100644
index 00000000000..09cbca89f17
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -0,0 +1,3463 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_H_
+#define __WIREDTIGER_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * Version information
+ *******************************************/
+#define WIREDTIGER_VERSION_MAJOR @VERSION_MAJOR@
+#define WIREDTIGER_VERSION_MINOR @VERSION_MINOR@
+#define WIREDTIGER_VERSION_PATCH @VERSION_PATCH@
+#define WIREDTIGER_VERSION_STRING @VERSION_STRING@
+
+/*******************************************
+ * Required includes
+ *******************************************/
+@wiredtiger_includes_decl@
+
+/*******************************************
+ * Portable type names
+ *******************************************/
+@off_t_decl@
+@uintmax_t_decl@
+@uintptr_t_decl@
+
+#if defined(DOXYGEN) || defined(SWIG)
+#define __F(func) func
+#else
+#define __F(func) (*func)
+#endif
+
+#ifdef SWIG
+%{
+#include <wiredtiger.h>
+%}
+#endif
+
+/*!
+ * @defgroup wt WiredTiger API
+ * The functions, handles and methods applications use to access and manage
+ * data with WiredTiger.
+ *
+ * @{
+ */
+
+/*******************************************
+ * Public forward structure declarations
+ *******************************************/
+struct __wt_async_callback;
+ typedef struct __wt_async_callback WT_ASYNC_CALLBACK;
+struct __wt_async_op; typedef struct __wt_async_op WT_ASYNC_OP;
+struct __wt_collator; typedef struct __wt_collator WT_COLLATOR;
+struct __wt_compressor; typedef struct __wt_compressor WT_COMPRESSOR;
+struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM;
+struct __wt_config_parser;
+ typedef struct __wt_config_parser WT_CONFIG_PARSER;
+struct __wt_connection; typedef struct __wt_connection WT_CONNECTION;
+struct __wt_cursor; typedef struct __wt_cursor WT_CURSOR;
+struct __wt_data_source; typedef struct __wt_data_source WT_DATA_SOURCE;
+struct __wt_event_handler; typedef struct __wt_event_handler WT_EVENT_HANDLER;
+struct __wt_extension_api; typedef struct __wt_extension_api WT_EXTENSION_API;
+struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR;
+struct __wt_item; typedef struct __wt_item WT_ITEM;
+struct __wt_lsn; typedef struct __wt_lsn WT_LSN;
+struct __wt_session; typedef struct __wt_session WT_SESSION;
+
+#if defined(SWIGJAVA)
+#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE
+#define WT_HANDLE_CLOSED(typename) typename##_CLOSED
+typedef WT_CURSOR WT_CURSOR_NULLABLE;
+typedef WT_CURSOR WT_CURSOR_CLOSED;
+typedef WT_SESSION WT_SESSION_CLOSED;
+typedef WT_CONNECTION WT_CONNECTION_CLOSED;
+#elif !defined(DOXYGEN)
+#define WT_HANDLE_NULLABLE(typename) typename
+#define WT_HANDLE_CLOSED(typename) typename
+#endif
+
+/*!
+ * A raw item of data to be managed, including a pointer to the data and a
+ * length.
+ *
+ * WT_ITEM structures do not need to be cleared before use.
+ */
+struct __wt_item {
+ /*!
+ * The memory reference of the data item.
+ *
+ * For items returned by a WT_CURSOR, the pointer is only valid until
+ * the next operation on that cursor. Applications that need to keep
+ * an item across multiple cursor operations must make a copy.
+ */
+ const void *data;
+
+ /*!
+ * The number of bytes in the data item.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ */
+ size_t size;
+
+#ifndef DOXYGEN
+#define WT_ITEM_ALIGNED 0x00000001
+#define WT_ITEM_INUSE 0x00000002
+ /* This appears in the middle of the struct to avoid padding. */
+ /*! Object flags (internal use). */
+ uint32_t flags;
+
+ /*! Managed memory chunk (internal use). */
+ void *mem;
+ /*! Managed memory size (internal use). */
+ size_t memsize;
+#endif
+};
+
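+/*
+ * Example (illustrative sketch, not normative): passing raw data through
+ * WT_ITEM structures with a cursor whose key and value formats are "u"; the
+ * cursor handle is assumed for illustration only.
+ *
+ *	WT_ITEM key, value;
+ *	int ret;
+ *
+ *	key.data = "key";
+ *	key.size = strlen("key");
+ *	value.data = "value";
+ *	value.size = strlen("value");
+ *	cursor->set_key(cursor, &key);
+ *	cursor->set_value(cursor, &value);
+ *	ret = cursor->insert(cursor);
+ */
+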
+/*
+ * We rely on this structure being aligned at 64 bits by the compiler;
+ * if we were paranoid, we could add an unused field to ensure the
+ * padding is correct.
+ *
+ * NOTE: If you change the contents of this structure you must also update
+ * the macros in log.h.
+ */
+/*!
+ * A log sequence number, representing a position in the transaction log.
+ */
+struct __wt_lsn {
+ uint32_t file; /*!< Log file number */
+ wt_off_t offset; /*!< Log file offset */
+};
+
+/*!
+ * The maximum packed size of a 64-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single long integers into at most this many bytes.
+ */
+#define WT_INTPACK64_MAXSIZE ((int)sizeof (int64_t) + 1)
+
+/*!
+ * The maximum packed size of a 32-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single integers into at most this many bytes.
+ */
+#define WT_INTPACK32_MAXSIZE ((int)sizeof (int32_t) + 1)
+
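+/*
+ * Example (illustrative sketch, not normative): packing a single 64-bit
+ * integer into a buffer sized with WT_INTPACK64_MAXSIZE; the session handle
+ * is assumed for illustration only.
+ *
+ *	uint8_t buf[WT_INTPACK64_MAXSIZE];
+ *	int ret;
+ *
+ *	ret = wiredtiger_struct_pack(
+ *	    session, buf, sizeof(buf), "q", (int64_t)42);
+ */
+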
+/*!
+ * A WT_CURSOR handle is the interface to a cursor.
+ *
+ * Cursors allow data to be searched, iterated and modified, implementing the
+ * CRUD (create, read, update and delete) operations. Cursors are opened in
+ * the context of a session. If a transaction is started, cursors operate in
+ * the context of the transaction until the transaction is resolved.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * cursors can also provide access to fields within the key and value if the
+ * formats are described in the WT_SESSION::create method.
+ *
+ * In the common case, a cursor is used to access records in a table. However,
+ * cursors can be used on subsets of tables (such as a single column or a
+ * projection of multiple columns), as an interface to statistics, configuration
+ * data or application-specific data sources. See WT_SESSION::open_cursor for
+ * more information.
+ *
+ * <b>Thread safety:</b> A WT_CURSOR handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_cursor {
+ WT_SESSION *session; /*!< The session handle for this cursor. */
+
+ /*!
+ * The name of the data source for the cursor, matches the \c uri
+ * parameter to WT_SESSION::open_cursor used to open the cursor.
+ */
+ const char *uri;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Get the key for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string key
+ *
+ * @snippet ex_all.c Get the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold key fields corresponding to
+ * WT_CURSOR::key_format.
+ * @errors
+ */
+ int __F(get_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Get the value for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string value
+ *
+ * @snippet ex_all.c Get the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold value fields corresponding to
+ * WT_CURSOR::value_format.
+ * @errors
+ */
+ int __F(get_value)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the key for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string key
+ *
+ * @snippet ex_all.c Set the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... key fields corresponding to WT_CURSOR::key_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the key will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the value for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string value
+ *
+ * @snippet ex_all.c Set the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... value fields corresponding to WT_CURSOR::value_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the value will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_value)(WT_CURSOR *cursor, ...);
+ /*! @} */
+
+ /*!
+ * @name Cursor positioning
+ * @{
+ */
+ /*!
+ * Return the ordering relationship between two cursors: both cursors
+ * must have the same data source and have valid keys.
+ *
+ * @snippet ex_all.c Cursor comparison
+ *
+ * @param cursor the cursor handle
+ * @param other another cursor handle
+ * @param comparep the status of the comparison: < 0 if
+ * <code>cursor</code> refers to a key that appears before
+ * <code>other</code>, 0 if the cursors refer to the same key,
+ * and > 0 if <code>cursor</code> refers to a key that appears after
+ * <code>other</code>.
+ * @errors
+ */
+ int __F(compare)(WT_CURSOR *cursor, WT_CURSOR *other, int *comparep);
+
+ /*!
+ * Return the next record.
+ *
+ * @snippet ex_all.c Return the next record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(next)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the previous record.
+ *
+ * @snippet ex_all.c Return the previous record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(prev)(WT_CURSOR *cursor);
+
+ /*!
+ * Reset the position of the cursor. Any resources held by the cursor
+ * are released, and the cursor's key and position are no longer valid.
+ * A subsequent iteration with WT_CURSOR::next will move to the first
+ * record, or with WT_CURSOR::prev will move to the last record.
+ *
+ * @snippet ex_all.c Reset the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(reset)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key. The key must first be set.
+ *
+ * @snippet ex_all.c Search for an exact match
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(search)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key if it exists, or an adjacent
+ * record. An adjacent record is either the smallest record larger
+ * than the key or the largest record smaller than the key (in other
+ * words, a logically adjacent key).
+ *
+ * The key must first be set.
+ *
+ * An example of a search for an exact or adjacent match:
+ *
+ * @snippet ex_all.c Search for an exact or adjacent match
+ *
+ * An example of a forward scan through the table, where all keys
+ * greater than or equal to a specified prefix are included in the
+ * scan:
+ *
+ * @snippet ex_all.c Forward scan greater than or equal
+ *
+ * An example of a backward scan through the table, where all keys
+ * less than a specified prefix are included in the scan:
+ *
+ * @snippet ex_all.c Backward scan less than
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @param exactp the status of the search: 0 if an exact match is
+ * found, < 0 if a smaller key is returned, > 0 if a larger key is
+ * returned
+ * @errors
+ */
+ int __F(search_near)(WT_CURSOR *cursor, int *exactp);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Insert a record and optionally update an existing record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists,
+ * the key's value will be updated, otherwise, the record will be
+ * inserted.
+ *
+ * @snippet ex_all.c Insert a new record or overwrite an existing record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must not already exist; the
+ * record will be inserted.
+ *
+ * @snippet ex_all.c Insert a new record and fail if the record exists
+ *
+ * If a cursor with record number keys was configured with
+ * "append=true" (not the default), the value must be set; a new record
+ * will be appended and the record number set as the cursor key value.
+ *
+ * @snippet ex_all.c Insert a new record and assign a record number
+ *
+ * The cursor ends with no position, and a subsequent call to the
+ * WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
+ * beginning (end) of the table.
+ *
+ * Inserting a new record after the current maximum record in a
+ * fixed-length bit field column-store (that is, a store with an
+ * 'r' type key and 't' type value) may implicitly create the missing
+ * records as records with a value of 0.
+ *
+ * When loading a large amount of data into a new object, using
+ * a cursor with the \c bulk configuration string enabled and
+ * loading the data in sorted order will be much faster than doing
+ * out-of-order inserts. See @ref tune_bulk_load for more information.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and a record with
+ * the specified key already exists, ::WT_DUPLICATE_KEY is returned.
+ */
+ int __F(insert)(WT_CURSOR *cursor);
+
+ /*!
+ * Update an existing record and optionally insert a record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists, the
+ * key's value will be updated, otherwise, the record will be inserted.
+ *
+ * @snippet ex_all.c Update an existing record or insert a new record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must already exist; the
+ * record will be updated.
+ *
+ * @snippet ex_all.c Update an existing record and fail if DNE
+ *
+ * On success, the cursor ends positioned at the modified record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(update)(WT_CURSOR *cursor);
+
+ /*!
+ * Remove a record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * the key must be set; the key's record will be removed if it exists,
+ * and no error will be returned if the record does not exist.
+ *
+ * @snippet ex_all.c Remove a record
+ *
+ * If the cursor was not configured with "overwrite=true", the key must
+ * be set and the key's record must exist; the record will be removed.
+ *
+ * @snippet ex_all.c Remove a record and fail if DNE
+ *
+ * Removing a record in a fixed-length bit field column-store
+ * (that is, a store with an 'r' type key and 't' type value) is
+ * identical to setting the record's value to 0.
+ *
+ * On success, the cursor ends positioned at the removed record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(remove)(WT_CURSOR *cursor);
+ /*! @} */
+
+ /*!
+ * Close the cursor.
+ *
+ * This releases the resources associated with the cursor handle.
+ * Cursors are closed implicitly by ending the enclosing connection or
+ * closing the session in which they were opened.
+ *
+ * @snippet ex_all.c Close the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor);
+
+ /*
+ * Protected fields, only to be used by cursor implementations.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(wt_cursor) q;
+ */
+ struct {
+ WT_CURSOR *tqe_next;
+ WT_CURSOR **tqe_prev;
+ } q; /* Linked list of WT_CURSORs. */
+
+ uint64_t recno; /* Record number, normal and raw mode */
+ uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE];
+
+ void *json_private; /* JSON specific storage */
+ void *lang_private; /* Language specific private storage */
+
+ WT_ITEM key, value;
+ int saved_err; /* Saved error in set_{key,value}. */
+ /*
+ * URI used internally, may differ from the URI provided by the
+ * user on open.
+ */
+ const char *internal_uri;
+
+#define WT_CURSTD_APPEND 0x0001
+#define WT_CURSTD_BULK 0x0002
+#define WT_CURSTD_DATA_SOURCE 0x0004
+#define WT_CURSTD_DUMP_HEX 0x0008
+#define WT_CURSTD_DUMP_JSON 0x0010
+#define WT_CURSTD_DUMP_PRINT 0x0020
+#define WT_CURSTD_KEY_EXT 0x0040 /* Key points out of the tree. */
+#define WT_CURSTD_KEY_INT 0x0080 /* Key points into the tree. */
+#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
+#define WT_CURSTD_OPEN 0x0100
+#define WT_CURSTD_OVERWRITE 0x0200
+#define WT_CURSTD_RAW 0x0400
+#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */
+#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
+ uint32_t flags;
+#endif
+};
+
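+/*
+ * Example (illustrative sketch, not normative): a search followed by an
+ * insert if the record is missing, using string key and value formats; the
+ * "table:example" URI is assumed for illustration only.
+ *
+ *	WT_CURSOR *cursor;
+ *	const char *value;
+ *	int ret;
+ *
+ *	ret = session->open_cursor(
+ *	    session, "table:example", NULL, NULL, &cursor);
+ *	cursor->set_key(cursor, "key");
+ *	if ((ret = cursor->search(cursor)) == 0)
+ *		ret = cursor->get_value(cursor, &value);
+ *	else if (ret == WT_NOTFOUND) {
+ *		cursor->set_key(cursor, "key");
+ *		cursor->set_value(cursor, "value");
+ *		ret = cursor->insert(cursor);
+ *	}
+ *	ret = cursor->close(cursor);
+ */
+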
+/*! Asynchronous operation types. */
+typedef enum {
+ WT_AOP_NONE=0, /*!< No operation type set */
+ WT_AOP_COMPACT, /*!< WT_ASYNC_OP::compact */
+ WT_AOP_INSERT, /*!< WT_ASYNC_OP::insert */
+ WT_AOP_REMOVE, /*!< WT_ASYNC_OP::remove */
+ WT_AOP_SEARCH, /*!< WT_ASYNC_OP::search */
+ WT_AOP_UPDATE /*!< WT_ASYNC_OP::update */
+} WT_ASYNC_OPTYPE;
+
+/*!
+ * A WT_ASYNC_OP handle is the interface to an asynchronous operation.
+ *
+ * An asynchronous operation describes a data manipulation to be performed
+ * asynchronously by a WiredTiger worker thread. These operations implement
+ * the CRUD (create, read, update and delete) operations. Each operation
+ * is a self-contained work unit. The operation will be performed in the
+ * context of the worker thread's session. Each operation is performed
+ * within the context of a transaction. The application is notified of its
+ * completion with a callback. The transaction is resolved once the callback
+ * returns.
+ *
+ * The table referenced in an operation must already exist.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * operations can also provide access to fields within the key and value if
+ * the formats are described in the WT_SESSION::create method.
+ *
+ * <b>Thread safety:</b> A WT_ASYNC_OP handle may not be shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_async_op {
+ /*! The connection for this operation. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*
+ * Don't expose app_private to non-C language bindings - they have
+ * their own way to attach data to an operation.
+ */
+#if !defined(SWIG)
+ /*!
+ * A location for applications to store information that will be
+ * available in the callback from an async operation.
+ */
+ void *app_private;
+#endif
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::get_key method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_key
+ */
+ int __F(get_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::get_value method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_value
+ */
+ int __F(get_value)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_key method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_value method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_value)(WT_ASYNC_OP *op, ...);
+ /*! @} */
+
+ /*!
+ * @name Positioning
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::search method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::search
+ */
+ int __F(search)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::insert method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::insert
+ */
+ int __F(insert)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::update method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::update
+ */
+ int __F(update)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::remove method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::remove
+ */
+ int __F(remove)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_SESSION::compact method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_SESSION::compact
+ */
+ int __F(compact)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * Get the unique identifier for this operation.
+ *
+ * @snippet ex_async.c async get identifier
+ *
+ * @param op the operation handle
+ * @returns the id of the operation
+ */
+ uint64_t __F(get_id)(WT_ASYNC_OP *op);
+
+ /*!
+ * Get the type for this operation.
+ *
+ * @snippet ex_async.c async get type
+ *
+ * @param op the operation handle
+ * @returns the ::WT_ASYNC_OPTYPE of the operation
+ */
+ WT_ASYNC_OPTYPE __F(get_type)(WT_ASYNC_OP *op);
+
+ /*
+ * Protected fields, only to be used by internal implementation.
+ * Everything we need for maintaining the key/value is part of
+ * a cursor. So, include one here so that we can use the cursor
+ * functions to manage them.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ WT_CURSOR c;
+#endif
+};
+
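+/*
+ * Example (illustrative sketch, not normative): queueing one asynchronous
+ * insert.  The notify callback signature is assumed to match the
+ * WT_ASYNC_CALLBACK declaration later in this file, and the "table:example"
+ * URI is for illustration only.
+ *
+ *	static int
+ *	notify(WT_ASYNC_CALLBACK *cb,
+ *	    WT_ASYNC_OP *op, int op_ret, uint32_t flags)
+ *	{
+ *		return (0);
+ *	}
+ *	static WT_ASYNC_CALLBACK callback = { notify };
+ *
+ *	WT_ASYNC_OP *op;
+ *	int ret;
+ *
+ *	ret = conn->async_new_op(
+ *	    conn, "table:example", NULL, &callback, &op);
+ *	op->set_key(op, "key");
+ *	op->set_value(op, "value");
+ *	ret = op->insert(op);
+ */
+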
+/*!
+ * All data operations are performed in the context of a WT_SESSION. This
+ * encapsulates the thread and transactional context of the operation.
+ *
+ * <b>Thread safety:</b> A WT_SESSION handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_session {
+ /*! The connection for this session. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * Close the session handle.
+ *
+ * This will release the resources associated with the session handle,
+ * including rolling back any active transactions and closing any
+ * cursors that remain open in the session.
+ *
+ * @snippet ex_all.c Close a session
+ *
+ * @param session the session handle
+ * @configempty{session.close, see dist/api_data.py}
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session,
+ const char *config);
+
+ /*!
+ * Reconfigure a session handle.
+ *
+ * @snippet ex_all.c Reconfigure a session
+ *
+ * WT_SESSION::reconfigure will fail if a transaction is in progress
+ * in the session.
+ *
+ * All cursors are reset.
+ *
+ * @param session the session handle
+ * @configstart{session.reconfigure, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_SESSION *session, const char *config);
+
+ /*!
+ * @name Cursor handles
+ * @{
+ */
+
+ /*!
+ * Open a new cursor on a data source or duplicate an existing cursor.
+ *
+ * @snippet ex_all.c Open a cursor
+ *
+ * An existing cursor can be duplicated by passing it as the \c to_dup
+ * parameter and setting the \c uri parameter to \c NULL:
+ *
+ * @snippet ex_all.c Duplicate a cursor
+ *
+ * Cursors being duplicated must have a key set, and successfully
+ * duplicated cursors are positioned at the same place in the data
+ * source as the original.
+ *
+ * To reconfigure a cursor, duplicate it with a new configuration value:
+ *
+ * @snippet ex_all.c Reconfigure a cursor
+ *
+ * Cursor handles should be discarded by calling WT_CURSOR::close.
+ *
+ * Cursors capable of supporting transactional operations operate in the
+ * context of the current transaction, if any.
+ *
+ * WT_SESSION::rollback_transaction implicitly resets all cursors.
+ *
+ * Cursors are relatively light-weight objects but may hold references
+ * to heavier-weight objects; applications should re-use cursors when
+ * possible, but instantiating new cursors is not so expensive that
+ * applications need to cache cursors at all costs.
+ *
+ * @param session the session handle
+ * @param uri the data source on which the cursor operates; cursors
+ * are usually opened on tables, however, cursors can be opened on
+ * any data source, regardless of whether it is ultimately stored
+ * in a table. Some cursor types may have limited functionality
+ * (for example, they may be read-only or not support transactional
+ * updates). See @ref data_sources for more information.
+ * <br>
+ * @copydoc doc_cursor_types
+ * @param to_dup a cursor to duplicate
+ * @configstart{session.open_cursor, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for cursors with record number keys., a
+ * boolean flag; default \c false.}
+ * @config{bulk, configure the cursor for bulk-loading\, a fast\,
+ * initial load path (see @ref tune_bulk_load for more information).
+ * Bulk-load may only be used for newly created objects and cursors
+ * configured for bulk-load only support the WT_CURSOR::insert and
+ * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
+ * must be loaded in sorted order. The value is usually a true/false
+ * flag; when bulk-loading fixed-length column store objects\, the
+ * special value \c bitmap allows chunks of a memory resident bitmap to
+ * be loaded directly into a file by passing a \c WT_ITEM to
+ * WT_CURSOR::set_value where the \c size field indicates the number of
+ * records in the bitmap (as specified by the object's \c value_format
+ * configuration). Bulk-loaded bitmap values must end on a byte boundary
+ * relative to the bit count (except for the last set of values
+ * loaded)., a string; default \c false.}
+ * @config{checkpoint, the name of a checkpoint to open (the reserved
+ * name "WiredTigerCheckpoint" opens the most recent internal checkpoint
+ * taken for the object). The cursor does not support data
+ * modification., a string; default empty.}
+ * @config{dump, configure the cursor for dump format inputs and
+ * outputs: "hex" selects a simple hexadecimal format\, "json" selects a
+ * JSON format with each record formatted as fields named by column
+ * names if available\, and "print" selects a format where only
+ * non-printing characters are hexadecimal encoded. These formats are
+ * compatible with the @ref util_dump and @ref util_load commands., a
+ * string\, chosen from the following options: \c "hex"\, \c "json"\, \c
+ * "print"; default empty.}
+ * @config{next_random, configure the cursor to return a pseudo-random
+ * record from the object; valid only for row-store cursors. Cursors
+ * configured with \c next_random=true only support the WT_CURSOR::next
+ * and WT_CURSOR::close methods. See @ref cursor_random for details., a
+ * boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{readonly, only query operations are supported by this cursor.
+ * An error is returned if a modification is attempted using the cursor.
+ * The default is false for all cursor types except for log and metadata
+ * cursors., a boolean flag; default \c false.}
+ * @config{statistics, Specify the statistics to be gathered. Choosing
+ * "all" gathers statistics regardless of cost and may include
+ * traversing on-disk files; "fast" gathers a subset of relatively
+ * inexpensive statistics. The selection must agree with the database
+ * \c statistics configuration specified to ::wiredtiger_open or
+ * WT_CONNECTION::reconfigure. For example\, "all" or "fast" can be
+ * configured when the database is configured with "all"\, but the
+ * cursor open will fail if "all" is specified when the database is
+ * configured with "fast"\, and the cursor open will fail in all cases
+ * when the database is configured with "none". If \c statistics is not
+ * configured\, the default configuration is the database configuration.
+ * The "clear" configuration resets statistics after gathering them\,
+ * where appropriate (for example\, a cache size statistic is not
+ * cleared\, while the count of cursor insert operations will be
+ * cleared). See @ref statistics for more information., a list\, with
+ * values chosen from the following options: \c "all"\, \c "fast"\, \c
+ * "clear"; default empty.}
+ * @config{target, if non-empty\, backup the list of objects; valid only
+ * for a backup data source., a list of strings; default empty.}
+ * @configend
+ * @param[out] cursorp a pointer to the newly opened cursor
+ * @errors
+ */
+ int __F(open_cursor)(WT_SESSION *session,
+ const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup,
+ const char *config, WT_CURSOR **cursorp);
+ /*! @} */
+
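+	/*
+	 * Example (illustrative sketch, not normative): duplicating an
+	 * open, positioned cursor by passing it as to_dup with a NULL
+	 * uri; the original cursor handle is assumed for illustration.
+	 *
+	 *	WT_CURSOR *dup;
+	 *	int ret;
+	 *
+	 *	ret = session->open_cursor(
+	 *	    session, NULL, cursor, NULL, &dup);
+	 */
+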
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Create a table, column group, index or file.
+ *
+ * @snippet ex_all.c Create a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to create, such as
+ * \c "table:stock". For a description of URI formats
+ * see @ref data_sources.
+ * @configstart{session.create, see dist/api_data.py}
+ * @config{allocation_size, the file unit allocation size\, in bytes\,
+ * must be a power-of-two; smaller values decrease the file space required
+ * by overflow items\, and the default value of 4KB is a good choice
+ * absent requirements from the operating system or storage device., an
+ * integer between 512B and 128MB; default \c 4KB.}
+ * @config{app_metadata, application-owned metadata for this object., a
+ * string; default empty.}
+ * @config{block_allocation, configure block allocation. Permitted
+ * values are \c "first" or \c "best"; the \c "first" configuration uses
+ * a first-available algorithm during block allocation\, the \c "best"
+ * configuration uses a best-fit algorithm., a string\, chosen from the
+ * following options: \c "first"\, \c "best"; default \c best.}
+ * @config{block_compressor, configure a compressor for file blocks.
+ * Permitted values are empty (off) or \c "bzip2"\, \c "snappy" or
+ * custom compression engine \c "name" created with
+ * WT_CONNECTION::add_compressor. See @ref compression for more
+ * information., a string; default empty.}
+ * @config{cache_resident, do not ever evict the object's pages; see
+ * @ref tuning_cache_resident for more information., a boolean flag;
+ * default \c false.}
+ * @config{checksum, configure block checksums; permitted values are
+ * <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no
+ * blocks) and <code>uncompressed</code> (checksum only blocks which
+ * are not compressed for any reason). The \c uncompressed setting is
+ * for applications which can rely on decompression to fail if a block
+ * has been corrupted., a string\, chosen from the following options: \c
+ * "on"\, \c "off"\, \c "uncompressed"; default \c uncompressed.}
+ * @config{colgroups, comma-separated list of names of column groups.
+ * Each column group is stored separately\, keyed by the primary key of
+ * the table. If no column groups are specified\, all columns are
+ * stored together in a single file. All value columns in the table
+ * must appear in at least one column group. Each column group must be
+ * created with a separate call to WT_SESSION::create., a list of
+ * strings; default empty.}
+ * @config{collator, configure custom collation for keys. Value must be
+ * a collator name created with WT_CONNECTION::add_collator., a string;
+ * default empty.}
+ * @config{columns, list of the column names. Comma-separated list of
+ * the form <code>(column[\,...])</code>. For tables\, the number of
+ * entries must match the total number of values in \c key_format and \c
+ * value_format. For colgroups and indices\, all column names must
+ * appear in the list of columns for the table., a list of strings;
+ * default empty.}
+ * @config{dictionary, the maximum number of unique values remembered in
+ * the Btree row-store leaf page value dictionary; see @ref
+ * file_formats_compression for more information., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{exclusive, fail if the object exists. When false (the
+ * default)\, if the object exists\, check that its settings match the
+ * specified configuration., a boolean flag; default \c false.}
+ * @config{format, the file format., a string\, chosen from the
+ * following options: \c "btree"; default \c btree.}
+ * @config{huffman_key, configure Huffman encoding for keys. Permitted
+ * values are empty (off)\, \c "english"\, \c "utf8<file>" or \c
+ * "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{huffman_value, configure Huffman encoding for values.
+ * Permitted values are empty (off)\, \c "english"\, \c "utf8<file>" or
+ * \c "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{internal_item_max, the largest key stored within an internal
+ * node\, in bytes. If non-zero\, any key larger than the specified
+ * size will be stored as an overflow item (which may require additional
+ * I/O to access). If zero\, a default size is chosen that permits at
+ * least 8 keys per internal page., an integer greater than or equal to
+ * 0; default \c 0.}
+ * @config{internal_key_truncate, configure internal key truncation\,
+ * discarding unnecessary trailing bytes on internal keys (ignored for
+ * custom collators)., a boolean flag; default \c true.}
+ * @config{internal_page_max, the maximum page size for internal nodes\,
+ * in bytes; the size must be a multiple of the allocation size and is
+ * significant for applications wanting to avoid excessive L2 cache
+ * misses while searching the tree. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 4KB.}
+ * @config{key_format, the format of the data packed into key items.
+ * See @ref schema_format_types for details. By default\, the
+ * key_format is \c 'u' and applications use WT_ITEM structures to
+ * manipulate raw byte arrays. By default\, records are stored in
+ * row-store files: keys of type \c 'r' are record numbers and records
+ * referenced by record number are stored in column-store files., a
+ * format string; default \c u.}
+ * @config{leaf_item_max, the largest key or value stored within a leaf
+ * node\, in bytes. If non-zero\, any key or value larger than the
+ * specified size will be stored as an overflow item (which may require
+ * additional I/O to access). If zero\, a default size is chosen that
+ * permits at least 4 key and value pairs per leaf page., an integer
+ * greater than or equal to 0; default \c 0.}
+ * @config{leaf_page_max, the maximum page size for leaf nodes\, in
+ * bytes; the size must be a multiple of the allocation size\, and is
+ * significant for applications wanting to maximize sequential data
+ * transfer from a storage device. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 32KB.}
+ * @config{lsm = (, options only relevant for LSM data sources., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;auto_throttle, Throttle inserts into
+ * LSM trees if flushing to disk isn't keeping up., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom, create bloom
+ * filters on LSM tree chunks as they are merged., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_bit_count,
+ * the number of bits used per item for LSM bloom filters., an integer
+ * between 2 and 1000; default \c 16.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_config, config string used when
+ * creating Bloom filter files\, passed to WT_SESSION::create., a
+ * string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_hash_count, the number of hash
+ * values per item used for LSM bloom filters., an integer between 2 and
+ * 100; default \c 8.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_oldest,
+ * create a bloom filter on the oldest LSM tree chunk. Only supported
+ * if bloom filters are enabled., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_max, the maximum size a single
+ * chunk can be. Chunks larger than this size are not considered for
+ * further merges. This is a soft limit\, and chunks larger than this
+ * value can be created. Must be larger than chunk_size., an integer
+ * between 100MB and 10TB; default \c 5GB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_size, the maximum size of the
+ * in-memory chunk of an LSM tree. This limit is soft - it is possible
+ * for chunks to be temporarily larger than this value. This overrides
+ * the \c memory_page_max setting., an integer between 512K and 500MB;
+ * default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_max, the
+ * maximum number of chunks to include in a merge operation., an integer
+ * between 2 and 100; default \c 15.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_min, the minimum number of
+ * chunks to include in a merge operation. If set to 0 or 1 half the
+ * value of merge_max is used., an integer no more than 100; default \c
+ * 0.}
+ * @config{ ),,}
+ * @config{memory_page_max, the maximum size a page can grow to in
+ * memory before being reconciled to disk. The specified size will be
+ * adjusted to a lower bound of <code>50 * leaf_page_max</code>\, and an
+ * upper bound of <code>cache_size / 2</code>. This limit is soft - it
+ * is possible for pages to be temporarily larger than this value. This
+ * setting is ignored for LSM trees\, see \c chunk_size., an integer
+ * between 512B and 10TB; default \c 5MB.}
+ * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
+ * in bytes. If non-zero\, schedule writes for dirty blocks belonging
+ * to this object in the system buffer cache after that many bytes from
+ * this object are written into the buffer cache., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{os_cache_max, maximum system buffer cache usage\, in bytes.
+ * If non-zero\, evict object blocks from the system buffer cache after
+ * that many bytes from this object are read or written into the buffer
+ * cache., an integer greater than or equal to 0; default \c 0.}
+ * @config{prefix_compression, configure prefix compression on row-store
+ * leaf pages., a boolean flag; default \c false.}
+ * @config{prefix_compression_min, minimum gain before prefix
+ * compression will be used on row-store leaf pages., an integer greater
+ * than or equal to 0; default \c 4.}
+ * @config{split_pct, the Btree page split size as a percentage of the
+ * maximum Btree page size\, that is\, when a Btree page is split\, it
+ * will be split into smaller pages\, where each page is the specified
+ * percentage of the maximum Btree page size., an integer between 25 and
+ * 100; default \c 75.}
+ * @config{type, set the type of data source used to store a column
+ * group\, index or simple table. By default\, a \c "file:" URI is
+ * derived from the object name. The \c type configuration can be used
+ * to switch to a different data source\, such as LSM or an extension
+ * configured by the application., a string; default \c file.}
+ * @config{value_format, the format of the data packed into value items.
+ * See @ref schema_format_types for details. By default\, the
+ * value_format is \c 'u' and applications use a WT_ITEM structure to
+ * manipulate raw byte arrays. Value items of type 't' are bitfields\,
+ * and when configured with record number type keys\, will be stored
+ * using a fixed-length store., a format string; default \c u.}
+ * @configend
+ * @errors
+ */
+ int __F(create)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Compact a live row- or column-store btree or LSM tree.
+ *
+ * @snippet ex_all.c Compact a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to compact, such as
+ * \c "table:stock"
+ * @configstart{session.compact, see dist/api_data.py}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @errors
+ */
+ int __F(compact)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Drop (delete) an object.
+ *
+ * @snippet ex_all.c Drop a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to drop, such as \c "table:stock"
+ * @configstart{session.drop, see dist/api_data.py}
+ * @config{force, return success if the object does not exist., a
+ * boolean flag; default \c false.}
+ * @config{remove_files, should the underlying files be removed?., a
+ * boolean flag; default \c true.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(drop)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Insert a ::WT_LOGREC_MESSAGE type record in the database log files
+ * (the database must be configured for logging when this method is
+ * called).
+ *
+ * @param session the session handle
+ * @param fmt a printf format specifier
+ * @errors
+ */
+ int __F(log_printf)(WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Rename an object.
+ *
+ * @snippet ex_all.c Rename a table
+ *
+ * @param session the session handle
+ * @param uri the current URI of the object, such as \c "table:old"
+ * @param newuri the new URI of the object, such as \c "table:new"
+ * @configempty{session.rename, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(rename)(WT_SESSION *session,
+ const char *uri, const char *newuri, const char *config);
+
+ /*!
+ * Salvage a file or table.
+ *
+ * Salvage rebuilds the file, or files of which a table is comprised,
+ * discarding any corrupted file blocks.
+ *
+ * Previously deleted records may re-appear, and inserted records may
+ * disappear, when salvage is done, so salvage should not be run
+ * unless it is known to be necessary. Normally, salvage should be
+ * called after a file or table has been corrupted, as reported by the
+ * WT_SESSION::verify method.
+ *
+ * Files are rebuilt in place; the salvage method overwrites the
+ * existing files.
+ *
+ * @snippet ex_all.c Salvage a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to salvage
+ * @configstart{session.salvage, see dist/api_data.py}
+ * @config{force, force salvage even of files that do not appear to be
+ * WiredTiger files., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(salvage)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Truncate a file, table or cursor range.
+ *
+ * Truncate a file or table.
+ * @snippet ex_all.c Truncate a table
+ *
+ * Truncate a cursor range. When truncating based on a cursor position,
+ * it is not required that the cursor reference a record in the object,
+ * only that the key be set. This allows applications to discard portions
+ * of the object name space without knowing exactly what records the
+ * object contains.
+ * @snippet ex_all.c Truncate a range
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to truncate
+ * @param start optional cursor marking the first record discarded;
+ * if <code>NULL</code>, the truncate starts from the beginning of
+ * the object
+ * @param stop optional cursor marking the last record discarded;
+ * if <code>NULL</code>, the truncate continues to the end of the
+ * object
+ * @configempty{session.truncate, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(truncate)(WT_SESSION *session,
+ const char *name,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *start,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *stop,
+ const char *config);
+
+ /*!
+ * Upgrade a file or table.
+ *
+ * The upgrade method upgrades a file or table, if an upgrade is required.
+ *
+ * @snippet ex_all.c Upgrade a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to upgrade
+ * @configempty{session.upgrade, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(upgrade)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Verify a file or table.
+ *
+ * Verify reports whether a file, or the files of which a table is
+ * comprised, have been corrupted. The WT_SESSION::salvage method
+ * can be used to repair a corrupted file.
+ *
+ * @snippet ex_all.c Verify a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to verify
+ * @configstart{session.verify, see dist/api_data.py}
+ * @config{dump_address, Display addresses and page types as pages are
+ * verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_blocks, Display the contents of on-disk blocks as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_offsets, Display the contents of specific on-disk
+ * blocks\, using the application's message handler\, intended for
+ * debugging., a list of strings; default empty.}
+ * @config{dump_pages, Display the contents of in-memory pages as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(verify)(WT_SESSION *session,
+ const char *name, const char *config);
+ /*! @} */
+
+ /*!
+ * @name Transactions
+ * @{
+ */
+ /*!
+ * Start a transaction in this session.
+ *
+ * The transaction remains active until ended by
+ * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction.
+ * Operations performed on cursors capable of supporting transactional
+ * operations that are already open in this session, or which are opened
+ * before the transaction ends, will operate in the context of the
+ * transaction.
+ *
+ * WT_SESSION::begin_transaction will fail if a transaction is already
+ * in progress in the session.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configstart{session.begin_transaction, see dist/api_data.py}
+ * @config{isolation, the isolation level for this transaction; defaults
+ * to the session's isolation level., a string\, chosen from the
+ * following options: \c "read-uncommitted"\, \c "read-committed"\, \c
+ * "snapshot"; default empty.}
+ * @config{name, name of the transaction for tracing and debugging., a
+ * string; default empty.}
+ * @config{priority, priority of the transaction for resolving
+ * conflicts. Transactions with higher values are less likely to
+ * abort., an integer between -100 and 100; default \c 0.}
+ * @config{sync, whether to sync log records when the transaction
+ * commits\, inherited from ::wiredtiger_open \c transaction_sync., a
+ * boolean flag; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(begin_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Commit the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * If WT_SESSION::commit_transaction returns an error, the transaction
+ * was rolled back, not committed.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.commit_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(commit_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Roll back the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * All cursors are reset.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.rollback_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(rollback_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Write a transactionally consistent snapshot of a database or set of
+ * objects. The checkpoint includes all transactions committed before
+ * the checkpoint starts. Existing checkpoints may optionally be
+ * discarded.
+ *
+ * @snippet ex_all.c Checkpoint examples
+ *
+ * @param session the session handle
+ * @configstart{session.checkpoint, see dist/api_data.py}
+ * @config{drop, specify a list of checkpoints to drop. The list may
+ * additionally contain one of the following keys: \c "from=all" to drop
+ * all checkpoints\, \c "from=<checkpoint>" to drop all checkpoints
+ * after and including the named checkpoint\, or \c "to=<checkpoint>" to
+ * drop all checkpoints before and including the named checkpoint.
+ * Checkpoints cannot be dropped while a hot backup is in progress or if
+ * open in a cursor., a list of strings; default empty.}
+ * @config{force, by default\, checkpoints may be skipped if the
+ * underlying object has not been modified\, this option forces the
+ * checkpoint., a boolean flag; default \c false.}
+ * @config{name, if non-empty\, specify a name for the checkpoint (note
+ * that checkpoints including LSM trees may not be named)., a string;
+ * default empty.}
+ * @config{target, if non-empty\, checkpoint the list of objects., a
+ * list of strings; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(checkpoint)(WT_SESSION *session, const char *config);
+
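+	/*
+	 * Example (illustrative sketch, not normative): a minimal
+	 * transaction followed by an unnamed checkpoint; error handling
+	 * is elided and the cursor is assumed to be open in this session
+	 * with string key and value formats.
+	 *
+	 *	ret = session->begin_transaction(session, NULL);
+	 *	cursor->set_key(cursor, "key");
+	 *	cursor->set_value(cursor, "value");
+	 *	ret = cursor->insert(cursor);
+	 *	ret = session->commit_transaction(session, NULL);
+	 *	ret = session->checkpoint(session, NULL);
+	 */
+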
+ /*!
+ * Return the transaction ID range pinned by the session handle.
+ *
+ * The ID range is approximate and is calculated based on the oldest
+ * ID needed for the active transaction in this session, compared
+ * to the newest transaction in the system.
+ *
+ * @snippet ex_all.c transaction pinned range
+ *
+ * @param session the session handle
+ * @param[out] range the range of IDs pinned by this session. Zero if
+ * there is no active transaction.
+ * @errors
+ */
+ int __F(transaction_pinned_range)(WT_SESSION* session, uint64_t *range);
+
+ /*! @} */
+};
+
+/*!
+ * A connection to a WiredTiger database. The connection may be opened within
+ * the same address space as the caller or accessed over a socket connection.
+ *
+ * Most applications will open a single connection to a database for each
+ * process. The first process to open a connection to a database will access
+ * the database in its own address space. Subsequent connections (if allowed)
+ * will communicate with the first process over a socket connection to perform
+ * their operations.
+ *
+ * <b>Thread safety:</b> A WT_CONNECTION handle may be shared between threads,
+ * see @ref threads for more information.
+ */
+struct __wt_connection {
+ /*!
+ * @name Async operation handles
+ * @{
+ */
+ /*!
+ * Wait for all outstanding operations to complete.
+ *
+ * @snippet ex_async.c async flush
+ *
+ * @param connection the connection handle
+ * @errors
+ */
+ int __F(async_flush)(WT_CONNECTION *connection);
+
+ /*!
+ * Return an async operation handle.
+ *
+ * @snippet ex_async.c async handle allocation
+ *
+ * @param connection the connection handle
+ * @param uri the data source on which the operation operates
+ * @configstart{connection.async_new_op, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for operations with record number
+ * keys., a boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @param callback the operation callback
+ * @param[out] asyncopp the new op handle
+ * @errors
+ * If there are no available handles, \c EBUSY is returned.
+ */
+ int __F(async_new_op)(WT_CONNECTION *connection,
+ const char *uri, const char *config, WT_ASYNC_CALLBACK *callback,
+ WT_ASYNC_OP **asyncopp);
+ /*! @} */
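+
+	/*
+	 * A hedged sketch of allocating an async operation handle; the URI
+	 * and the callback structure (my_callback) are illustrative
+	 * assumptions. If no handles are available, EBUSY is returned and
+	 * the call can be retried, for example after
+	 * WT_CONNECTION::async_flush:
+	 *
+	 *	WT_ASYNC_OP *op;
+	 *	int ret;
+	 *
+	 *	while ((ret = conn->async_new_op(
+	 *	    conn, "table:access", NULL, &my_callback, &op)) == EBUSY)
+	 *		(void)conn->async_flush(conn);
+	 */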
+
+ /*!
+ * Close a connection.
+ *
+ * Any open sessions will be closed.
+ *
+ * @snippet ex_all.c Close a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.close, see dist/api_data.py}
+ * @config{leak_memory, don't free memory during close., a boolean flag;
+ * default \c false.}
+ * @configend
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection,
+ const char *config);
+
+ /*!
+ * Reconfigure a connection handle.
+ *
+ * @snippet ex_all.c Reconfigure a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.reconfigure, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a
+ * set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous
+ * operation., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max, maximum number of expected
+ * simultaneous asynchronous operations., an integer between 10 and
+ * 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the
+ * number of worker threads to service asynchronous requests., an
+ * integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., an integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log
+ * record bytes to be written to the log between each checkpoint. A
+ * database can configure both log_size and wait to set an upper bound
+ * for checkpoints; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the checkpoint name., a string;
+ * default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{error_prefix, prefix string for error messages., a string;
+ * default empty.}
+ * @config{eviction = (, eviction configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads started will vary depending on the current eviction
+ * load., an integer between 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the
+ * current eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has
+ * less dirty memory than the value\, as a percentage of the total cache
+ * size. Dirty pages will only be evicted if the cache is full enough
+ * to trigger eviction., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less
+ * total memory than the value\, as a percentage of the total cache
+ * size. Must be less than \c eviction_trigger., an integer between 10
+ * and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using
+ * this much memory\, as a percentage of the total cache size., an
+ * integer between 10 and 99; default \c 95.}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where
+ * possible., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of
+ * threads to manage merging LSM trees in the database., an integer
+ * between 3 and 20; default \c 4.}
+ * @config{ ),,}
+ * @config{shared_cache = (, shared cache configuration options. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared
+ * cache is redistributed., an integer between 1MB and 10TB; default \c
+ * 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is
+ * shared between databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
+ * database is guaranteed to have available from the shared cache. This
+ * setting is per database. Defaults to the chunk size., an integer;
+ * default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory
+ * to allocate for the shared cache. Setting this will update the value
+ * if one is already set., an integer between 1MB and 10TB; default \c
+ * 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of
+ * cost\, "fast" maintains a subset of statistics that are relatively
+ * inexpensive\, "none" turns off all statistics. The "clear"
+ * configuration resets statistics after they are gathered\, where
+ * appropriate (for example\, a cache size statistic is not cleared\,
+ * while the count of cursor insert operations will be cleared). When
+ * "clear" is configured for the database\, gathered statistics are
+ * reset each time a statistics cursor is used to gather statistics\, as
+ * well as each time statistics are logged using the \c statistics_log
+ * configuration. See @ref statistics for more information., a list\,
+ * with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is
+ * configured to maintain\, to a file. See @ref statistics for more
+ * information., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database
+ * close., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the pathname to a file into
+ * which the log records are written\, may contain ISO C standard
+ * strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a
+ * string; default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources, if non-empty\, include
+ * statistics for the list of data source URIs\, if they are open at the
+ * time of the statistics logging. The list may include URIs matching a
+ * single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default
+ * empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp
+ * prepended to each log record\, may contain strftime conversion
+ * specifications., a string; default \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * write of the log records., an integer between 0 and 100000; default
+ * \c 0.}
+ * @config{ ),,}
+ * @config{verbose, enable messages for various events. Only available
+ * if WiredTiger is configured with --enable-verbose. Options are given
+ * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a
+ * list\, with values chosen from the following options: \c "api"\, \c
+ * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
+ * "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c "metadata"\,
+ * \c "mutex"\, \c "overflow"\, \c "read"\, \c "reconcile"\, \c
+ * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c
+ * "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\, \c
+ * "write"; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_CONNECTION *connection, const char *config);
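+
+	/*
+	 * A minimal sketch of reconfiguring a running connection, assuming an
+	 * open connection handle named conn; the new cache size is an
+	 * arbitrary illustrative value:
+	 *
+	 *	int ret = conn->reconfigure(conn, "cache_size=500MB");
+	 */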
+
+ /*!
+ * The home directory of the connection.
+ *
+ * @snippet ex_all.c Get the database home directory
+ *
+ * @param connection the connection handle
+ * @returns a pointer to a string naming the home directory
+ */
+ const char *__F(get_home)(WT_CONNECTION *connection);
+
+ /*!
+ * Add configuration options for a method. See
+ * @ref custom_ds_config_add for more information.
+ *
+ * @snippet ex_all.c Configure method configuration
+ *
+ * @param connection the connection handle
+ * @param method the name of the method
+ * @param uri the object type or NULL for all object types
+ * @param config the additional configuration's name and default value
+ * @param type the additional configuration's type (must be one of
+ * \c "boolean"\, \c "int", \c "list" or \c "string")
+ * @param check the additional configuration check string, or NULL if
+ * none
+ * @errors
+ */
+ int __F(configure_method)(WT_CONNECTION *connection,
+ const char *method, const char *uri,
+ const char *config, const char *type, const char *check);
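+
+	/*
+	 * A hedged sketch of adding an application-specific configuration
+	 * option; the method name, option name and default value are
+	 * illustrative assumptions (see @ref custom_ds_config_add for the
+	 * supported names):
+	 *
+	 *	int ret = conn->configure_method(conn,
+	 *	    "session.open_cursor", NULL, "my_option=default",
+	 *	    "string", NULL);
+	 */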
+
+ /*!
+ * Return if opening this handle created the database.
+ *
+ * @snippet ex_all.c Check if the database is newly created
+ *
+ * @param connection the connection handle
+ * @returns false (zero) if the connection existed before the call to
+ * ::wiredtiger_open, true (non-zero) if it was created by opening this
+ * handle.
+ */
+ int __F(is_new)(WT_CONNECTION *connection);
+
+ /*!
+ * @name Session handles
+ * @{
+ */
+ /*!
+ * Open a session.
+ *
+ * @snippet ex_all.c Open a session
+ *
+ * @param connection the connection handle
+ * @param errhandler An error handler. If <code>NULL</code>, the
+ * connection's error handler is used
+ * @configstart{connection.open_session, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @param[out] sessionp the new session handle
+ * @errors
+ */
+ int __F(open_session)(WT_CONNECTION *connection,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_SESSION **sessionp);
+ /*! @} */
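+
+	/*
+	 * A minimal sketch of opening a session with snapshot isolation,
+	 * assuming an open connection handle named conn:
+	 *
+	 *	WT_SESSION *session;
+	 *	int ret;
+	 *
+	 *	ret = conn->open_session(
+	 *	    conn, NULL, "isolation=snapshot", &session);
+	 */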
+
+ /*!
+ * @name Extensions
+ * @{
+ */
+ /*!
+ * Load an extension.
+ *
+ * @snippet ex_all.c Load an extension
+ *
+ * @param connection the connection handle
+ * @param path the filename of the extension module, or \c "local" to
+ * search the current application binary for the initialization
+ * function, see @ref extensions for more details.
+ * @configstart{connection.load_extension, see dist/api_data.py}
+ * @config{config, configuration string passed to the entry point of the
+ * extension as its WT_CONFIG_ARG argument., a string; default empty.}
+ * @config{entry, the entry point of the extension\, called to
+ * initialize the extension when it is loaded. The signature of the
+ * function must match ::wiredtiger_extension_init., a string; default
+ * \c wiredtiger_extension_init.}
+ * @config{terminate, an optional function in the extension that is
+ * called before the extension is unloaded during WT_CONNECTION::close.
+ * The signature of the function must match
+ * ::wiredtiger_extension_terminate., a string; default \c
+ * wiredtiger_extension_terminate.}
+ * @configend
+ * @errors
+ */
+ int __F(load_extension)(WT_CONNECTION *connection,
+ const char *path, const char *config);
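+
+	/*
+	 * A minimal sketch of loading an extension; the library path is an
+	 * illustrative assumption:
+	 *
+	 *	int ret = conn->load_extension(
+	 *	    conn, "/usr/local/lib/libwiredtiger_snappy.so", NULL);
+	 */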
+
+ /*!
+ * Add a custom data source. See @ref custom_data_sources for more
+ * information.
+ *
+ * The application must first implement the WT_DATA_SOURCE interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
+ *
+ * @param connection the connection handle
+ * @param prefix the URI prefix for this data source, e.g., "file:"
+ * @param data_source the application-supplied implementation of
+ * WT_DATA_SOURCE to manage this data source.
+ * @configempty{connection.add_data_source, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix,
+ WT_DATA_SOURCE *data_source, const char *config);
+
+ /*!
+ * Add a custom collation function.
+ *
+ * The application must first implement the WT_COLLATOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_COLLATOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the collation to be used in calls to
+ * WT_SESSION::create
+ * @param collator the application-supplied collation handler
+ * @configempty{connection.add_collator, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_collator)(WT_CONNECTION *connection,
+ const char *name, WT_COLLATOR *collator, const char *config);
+
+ /*!
+ * Add a compression function.
+ *
+ * The application must first implement the WT_COMPRESSOR interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ *
+ * @param connection the connection handle
+ * @param name the name of the compression function to be used in calls
+ * to WT_SESSION::create
+ * @param compressor the application-supplied compression handler
+ * @configempty{connection.add_compressor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_compressor)(WT_CONNECTION *connection,
+ const char *name, WT_COMPRESSOR *compressor, const char *config);
+
+ /*!
+ * Add a custom extractor for index keys or column groups.
+ * @notyet{custom extractors}
+ *
+ * The application must first implement the WT_EXTRACTOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the extractor to be used in calls to
+ * WT_SESSION::create
+ * @param extractor the application-supplied extractor
+ * @configempty{connection.add_extractor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_extractor)(WT_CONNECTION *connection, const char *name,
+ WT_EXTRACTOR *extractor, const char *config);
+
+ /*!
+ * Return a reference to the WiredTiger extension functions.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
+ *
+ * @param wt_conn the WT_CONNECTION handle
+ * @returns a reference to a WT_EXTENSION_API structure.
+ */
+ WT_EXTENSION_API *__F(get_extension_api)(WT_CONNECTION *wt_conn);
+ /*! @} */
+};
+
+/*!
+ * Open a connection to a database.
+ *
+ * @snippet ex_all.c Open a connection
+ *
+ * @param home The path to the database home directory. See @ref home
+ * for more information.
+ * @param errhandler An error handler. If <code>NULL</code>, a builtin error
+ * handler is installed that writes error messages to stderr
+ * @configstart{wiredtiger_open, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous operation., a
+ * boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max,
+ * maximum number of expected simultaneous asynchronous operations., an integer
+ * between 10 and 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the number of worker threads to
+ * service asynchronous requests., an integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{buffer_alignment, in-memory alignment (in bytes) for buffers used for
+ * I/O. The default value of -1 indicates a platform-specific alignment value
+ * should be used (4KB on Linux systems\, zero elsewhere)., an integer between
+ * -1 and 1MB; default \c -1.}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not both., an
+ * integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log record
+ * bytes to be written to the log between each checkpoint. A database can
+ * configure both log_size and wait to set an upper bound for checkpoints;
+ * setting this value above 0 configures periodic checkpoints., an integer
+ * between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the
+ * checkpoint name., a string; default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic checkpoints., an
+ * integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{checkpoint_sync, flush files to stable storage when closing or
+ * writing checkpoints., a boolean flag; default \c true.}
+ * @config{config_base, write the base configuration file if creating the
+ * database\, see @ref config_base for more information., a boolean flag;
+ * default \c true.}
+ * @config{create, create the database if it does not exist., a boolean flag;
+ * default \c false.}
+ * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
+ * list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io
+ * requires care\, see @ref tuning_system_buffer_cache_direct_io for important
+ * warnings. Including \c "data" will cause WiredTiger data files to use \c
+ * O_DIRECT\, including \c "log" will cause WiredTiger log files to use \c
+ * O_DIRECT\, and including \c "checkpoint" will cause WiredTiger data files
+ * opened at a checkpoint (i.e: read only) to use \c O_DIRECT., a list\, with
+ * values chosen from the following options: \c "checkpoint"\, \c "data"\, \c
+ * "log"; default empty.}
+ * @config{error_prefix, prefix string for error messages., a string; default
+ * empty.}
+ * @config{eviction = (, eviction configuration options., a set of related
+ * configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of threads
+ * WiredTiger will start to help evict pages from cache. The number of threads
+ * started will vary depending on the current eviction load., an integer between
+ * 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum
+ * number of threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the current
+ * eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has less
+ * dirty memory than the value\, as a percentage of the total cache size. Dirty
+ * pages will only be evicted if the cache is full enough to trigger eviction.,
+ * an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less total
+ * memory than the value\, as a percentage of the total cache size. Must be
+ * less than \c eviction_trigger., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using this much
+ * memory\, as a percentage of the total cache size., an integer between 10 and
+ * 99; default \c 95.}
+ * @config{exclusive, fail if the database already exists\, generally used with
+ * the \c create option., a boolean flag; default \c false.}
+ * @config{extensions, list of shared library extensions to load (using dlopen).
+ * Any values specified to a library extension are passed to
+ * WT_CONNECTION::load_extension as the \c config parameter (for example\,
+ * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings;
+ * default empty.}
+ * @config{file_extend, file extension configuration. If set\, extend files of
+ * the set type in allocations of the set size\, instead of a block at a time as
+ * each new block is written. For example\,
+ * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the
+ * following options: \c "data"\, \c "log"; default empty.}
+ * @config{hazard_max, maximum number of simultaneous hazard pointers per
+ * session handle., an integer greater than or equal to 15; default \c 1000.}
+ * @config{log = (, enable logging., a set of related configuration options
+ * defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;archive, automatically
+ * archive unneeded log files., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable logging subsystem., a boolean
+ * flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, the
+ * maximum size of log files., an integer between 100KB and 2GB; default \c
+ * 100MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the path to a directory into
+ * which the log files are written. If the value is not an absolute path name\,
+ * the files are created relative to the database home., a string; default \c
+ * "".}
+ * @config{ ),,}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where possible., a
+ * boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of threads
+ * to manage merging LSM trees in the database., an integer between 3 and 20;
+ * default \c 4.}
+ * @config{ ),,}
+ * @config{mmap, Use memory mapping to access files when possible., a boolean
+ * flag; default \c true.}
+ * @config{multiprocess, permit sharing between processes (will automatically
+ * start an RPC server for primary processes and use RPC for secondary
+ * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default
+ * \c false.}
+ * @config{session_max, maximum expected number of sessions (including server
+ * threads)., an integer greater than or equal to 1; default \c 100.}
+ * @config{shared_cache = (, shared cache configuration options. A database
+ * should configure either a cache_size or a shared_cache not both., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared cache is
+ * redistributed., an integer between 1MB and 10TB; default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is shared between
+ * databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
+ * shared cache. Setting this will update the value if one is already set., an
+ * integer between 1MB and 10TB; default \c 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of cost\,
+ * "fast" maintains a subset of statistics that are relatively inexpensive\,
+ * "none" turns off all statistics. The "clear" configuration resets statistics
+ * after they are gathered\, where appropriate (for example\, a cache size
+ * statistic is not cleared\, while the count of cursor insert operations will
+ * be cleared). When "clear" is configured for the database\, gathered
+ * statistics are reset each time a statistics cursor is used to gather
+ * statistics\, as well as each time statistics are logged using the \c
+ * statistics_log configuration. See @ref statistics for more information., a
+ * list\, with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is configured to
+ * maintain\, to a file. See @ref statistics for more information., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database close.,
+ * a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the
+ * pathname to a file into which the log records are written\, may contain ISO C
+ * standard strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a string;
+ * default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources,
+ * if non-empty\, include statistics for the list of data source URIs\, if they
+ * are open at the time of the statistics logging. The list may include URIs
+ * matching a single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp prepended to each log
+ * record\, may contain strftime conversion specifications., a string; default
+ * \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait
+ * between each write of the log records., an integer between 0 and 100000;
+ * default \c 0.}
+ * @config{ ),,}
+ * @config{transaction_sync = (, how to sync log records when the transaction
+ * commits., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, whether to sync the log on every
+ * commit by default\, can be overridden by the \c sync setting to
+ * WT_SESSION::begin_transaction., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;method, the method used to ensure log records
+ * are stable on disk\, see @ref tune_durability for more information., a
+ * string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c
+ * "none"; default \c fsync.}
+ * @config{ ),,}
+ * @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c
+ * WIREDTIGER_HOME environment variables regardless of whether or not the
+ * process is running with special privileges. See @ref home for more
+ * information., a boolean flag; default \c false.}
+ * @config{verbose, enable messages for various events. Only available if
+ * WiredTiger is configured with --enable-verbose. Options are given as a
+ * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
+ * values chosen from the following options: \c "api"\, \c "block"\, \c
+ * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
+ * \c "log"\, \c "lsm"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
+ * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\,
+ * \c "split"\, \c "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\,
+ * \c "write"; default empty.}
+ * @configend
+ * Additionally, if files named \c WiredTiger.config or \c WiredTiger.basecfg
+ * appear in the WiredTiger home directory, they are read for configuration
+ * values (see @ref config_file and @ref config_base for details).
+ * See @ref config_order for ordering of the configuration mechanisms.
+ * @param[out] connectionp A pointer to the newly opened connection handle
+ * @errors
+ */
+int wiredtiger_open(const char *home,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_CONNECTION **connectionp);
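+
+/*
+ * A minimal sketch of opening (and creating, if necessary) a database; the
+ * home directory path is an illustrative assumption and error handling is
+ * abbreviated:
+ *
+ *	WT_CONNECTION *conn;
+ *	int ret;
+ *
+ *	ret = wiredtiger_open("/path/to/database/home", NULL, "create", &conn);
+ *	if (ret != 0)
+ *		fprintf(stderr, "wiredtiger_open: %s\n",
+ *		    wiredtiger_strerror(ret));
+ */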
+
+/*!
+ * Return information about an error as a string; wiredtiger_strerror is a
+ * superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_all.c Display an error
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX function
+ * @returns a string representation of the error
+ */
+const char *wiredtiger_strerror(int err);
+
+#if !defined(SWIG)
+/*!
+ * The interface implemented by applications to accept notifications
+ * of the completion of asynchronous operations.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::async_new_op.
+ *
+ * @snippet ex_async.c async handle allocation
+ */
+struct __wt_async_callback {
+ /*!
+ * Callback to receive completion notification.
+ *
+ * @param[in] op the operation handle
+ * @param[in] op_ret the result of the async operation
+ * @param[in] flags currently unused
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_async.c async example callback implementation
+ */
+ int (*notify)(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op,
+ int op_ret, uint32_t flags);
+};
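+
+/*
+ * A hedged sketch of implementing the callback; the function and structure
+ * names are illustrative assumptions:
+ *
+ *	static int
+ *	my_notify(WT_ASYNC_CALLBACK *cb,
+ *	    WT_ASYNC_OP *op, int op_ret, uint32_t flags)
+ *	{
+ *		(void)cb; (void)op; (void)flags;
+ *		return (op_ret == 0 || op_ret == WT_NOTFOUND ? 0 : op_ret);
+ *	}
+ *	static WT_ASYNC_CALLBACK my_callback = { my_notify };
+ */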
+#endif
+
+/*!
+ * The interface implemented by applications to handle error, informational and
+ * progress messages. Entries set to NULL are ignored and the default handlers
+ * will continue to be used.
+ */
+struct __wt_event_handler {
+ /*!
+ * Callback to handle error messages; by default, error messages are
+ * written to the stderr stream.
+ *
+ * Error handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the error
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param error a WiredTiger, C99 or POSIX error code, which can
+ * be converted to a string using ::wiredtiger_strerror
+ * @param message an error string
+ */
+ int (*handle_error)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message);
+
+ /*!
+ * Callback to handle informational messages; by default, informational
+ * messages are written to the stdout stream.
+ *
+ * Message handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the message
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param message an informational string
+ */
+ int (*handle_message)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message);
+
+ /*!
+ * Callback to handle progress messages; by default, no progress
+ * messages are written.
+ *
+ * Progress handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the
+ * progress message was generated. The handle may have been created by
+ * the application or automatically by WiredTiger.
+ * @param operation a string representation of the operation
+ * @param progress a counter
+ */
+ int (*handle_progress)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress);
+
+ /*!
+ * Callback to handle automatic close of a WiredTiger handle.
+ *
+ * Close handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session The session handle that is being closed if the
+ * cursor parameter is NULL.
+ * @param cursor The cursor handle that is being closed, or NULL if
+ * it is a session handle being closed.
+ */
+ int (*handle_close)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, WT_CURSOR *cursor);
+};
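+
+/*
+ * A hedged sketch of a custom error handler; entries left NULL fall back to
+ * the default handlers, and the names are illustrative assumptions:
+ *
+ *	static int
+ *	my_handle_error(WT_EVENT_HANDLER *handler,
+ *	    WT_SESSION *session, int error, const char *message)
+ *	{
+ *		(void)handler; (void)session;
+ *		fprintf(stderr, "WiredTiger (%d): %s\n", error, message);
+ *		return (0);
+ *	}
+ *	static WT_EVENT_HANDLER my_event_handler = {
+ *		my_handle_error, NULL, NULL, NULL };
+ */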
+
+/*!
+ * @name Data packing and unpacking
+ * @{
+ */
+
+/*!
+ * Pack a structure into a buffer.
+ *
+ * See @ref packing for a description of the permitted format strings.
+ *
+ * @section pack_examples Packing Examples
+ *
+ * For example, the string <code>"iSh"</code> will pack a 32-bit integer
+ * followed by a NUL-terminated string, followed by a 16-bit integer. The
+ * default, big-endian encoding will be used, with no alignment. This could be
+ * used in C as follows:
+ *
+ * @snippet ex_all.c Pack fields into a buffer
+ *
+ * Then later, the values can be unpacked as follows:
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_pack(WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+/*!
+ * Calculate the size required to pack a structure.
+ *
+ * Note that for variable-sized fields including variable-sized strings and
+ * integers, the calculated size merely reflects the expected sizes specified
+ * in the format string itself.
+ *
+ * @snippet ex_all.c Get the packed size
+ *
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to ::wiredtiger_struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_size(WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+/*!
+ * Unpack a structure from a buffer.
+ *
+ * Reverse of ::wiredtiger_struct_pack: gets values out of a
+ * packed byte string.
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_unpack(WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
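+
+/*
+ * A minimal sketch of the "iSh" example above, assuming an open session
+ * handle and abbreviated error checking:
+ *
+ *	char buf[100];
+ *	size_t size;
+ *	int32_t i;
+ *	const char *s;
+ *	int16_t h;
+ *
+ *	(void)wiredtiger_struct_size(
+ *	    session, &size, "iSh", 42, "hello", 12);
+ *	if (size <= sizeof(buf)) {
+ *		(void)wiredtiger_struct_pack(
+ *		    session, buf, sizeof(buf), "iSh", 42, "hello", 12);
+ *		(void)wiredtiger_struct_unpack(
+ *		    session, buf, sizeof(buf), "iSh", &i, &s, &h);
+ *	}
+ */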
+
+#if !defined(SWIG)
+
+/*!
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ * The stream is an opaque handle returned by ::wiredtiger_pack_start or
+ * ::wiredtiger_unpack_start; it must be closed with ::wiredtiger_pack_close.
+ */
+typedef struct __wt_pack_stream WT_PACK_STREAM;
+
+/*!
+ * Start a packing operation into a buffer with the given format string. This
+ * should be followed by a series of calls to ::wiredtiger_pack_item,
+ * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint
+ * to fill in the values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory to hold the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_pack_start(WT_SESSION *session,
+ const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Start an unpacking operation from a buffer with the given format string.
+ * This should be followed by a series of calls to ::wiredtiger_unpack_item,
+ * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint
+ * to retrieve the packed values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory holding the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_unpack_start(WT_SESSION *session,
+ const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Close a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] usedp the number of bytes in the buffer used by the stream
+ * @errors
+ */
+int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp);
+
+/*!
+ * Pack an item into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to pack
+ * @errors
+ */
+int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Pack a signed integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param i a signed integer to pack
+ * @errors
+ */
+int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i);
+
+/*!
+ * Pack a string into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param s a string to pack
+ * @errors
+ */
+int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s);
+
+/*!
+ * Pack an unsigned integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param u an unsigned integer to pack
+ * @errors
+ */
+int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u);
+
+/*!
+ * Unpack an item from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to unpack
+ * @errors
+ */
+int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Unpack a signed integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] ip the unpacked signed integer
+ * @errors
+ */
+int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip);
+
+/*!
+ * Unpack a string from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] sp the unpacked string
+ * @errors
+ */
+int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp);
+
+/*!
+ * Unpack an unsigned integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] up the unpacked unsigned integer
+ * @errors
+ */
+int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up);
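+
+/*
+ * A minimal streaming sketch: pack an unsigned 64-bit integer and a string,
+ * then unpack them again. The format string "QS" is an assumption based on
+ * @ref packing, and error handling is elided:
+ *
+ *	WT_PACK_STREAM *ps;
+ *	char buf[100];
+ *	size_t used;
+ *	uint64_t u;
+ *	const char *s;
+ *
+ *	(void)wiredtiger_pack_start(session, "QS", buf, sizeof(buf), &ps);
+ *	(void)wiredtiger_pack_uint(ps, 42);
+ *	(void)wiredtiger_pack_str(ps, "hello");
+ *	(void)wiredtiger_pack_close(ps, &used);
+ *
+ *	(void)wiredtiger_unpack_start(session, "QS", buf, used, &ps);
+ *	(void)wiredtiger_unpack_uint(ps, &u);
+ *	(void)wiredtiger_unpack_str(ps, &s);
+ *	(void)wiredtiger_pack_close(ps, &used);
+ */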
+/*! @} */
+
+/*!
+ * @name Configuration string parsing
+ * @{
+ */
+
+/*!
+ * The configuration information returned by the WiredTiger configuration
+ * parsing functions in the WT_EXTENSION_API and the public API.
+ */
+struct __wt_config_item {
+ /*!
+ * The value of a configuration string.
+ *
+ * Regardless of the type of the configuration string (boolean, int,
+ * list or string), the \c str field will reference the value of the
+ * configuration string.
+ *
+	 * The bytes referenced by \c str are <b>not</b> nul-terminated;
+	 * use the \c len field instead of a terminating nul byte.
+ */
+ const char *str;
+
+ /*! The number of bytes in the value referenced by \c str. */
+ size_t len;
+
+ /*!
+ * The value of a configuration boolean or integer.
+ *
+ * If the configuration string's value is "true" or "false", the
+	 * \c val field will be set to 1 (true) or 0 (false).
+ *
+ * If the configuration string can be legally interpreted as an integer,
+ * using the strtoll function rules as specified in ISO/IEC 9899:1990
+ * ("ISO C90"), that integer will be stored in the \c val field.
+ */
+ int64_t val;
+
+ /*! Permitted values of the \c type field. */
+ enum {
+ /*! A string value with quotes stripped. */
+ WT_CONFIG_ITEM_STRING,
+ /*! A boolean literal ("true" or "false"). */
+ WT_CONFIG_ITEM_BOOL,
+ /*! An unquoted identifier: a string value without quotes. */
+ WT_CONFIG_ITEM_ID,
+ /*! A numeric value. */
+ WT_CONFIG_ITEM_NUM,
+ /*! A nested structure or list, including brackets. */
+ WT_CONFIG_ITEM_STRUCT
+ }
+ /*!
+ * The type of value determined by the parser. In all cases,
+ * the \c str and \c len fields are set.
+ */
+ type;
+};
+
+/*!
+ * Create a handle that can be used to parse or create configuration strings
+ * compatible with WiredTiger APIs.
+ * This API is outside the scope of a WiredTiger connection handle, since
+ * applications may need to generate configuration strings prior to calling
+ * ::wiredtiger_open.
+ * @param session the session handle to be used for error reporting. If NULL,
+ * error messages will be written to stdout.
+ * @param config the configuration string being parsed. The string must
+ * remain valid for the lifetime of the parser handle.
+ * @param len the number of valid bytes in \c config
+ * @param[out] config_parserp A pointer to the newly opened handle
+ * @errors
+ */
+int wiredtiger_config_parser_open(WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+/*!
+ * A handle that can be used to search and traverse configuration strings
+ * compatible with WiredTiger APIs.
+ * To parse the contents of a list or nested configuration string use a new
+ * configuration parser handle based on the content of the ::WT_CONFIG_ITEM
+ * retrieved from the parent configuration string.
+ *
+ * @section config_parse_examples Configuration String Parsing examples
+ *
+ * This could be used in C to create a configuration parser as follows:
+ *
+ * @snippet ex_config_parse.c Create a configuration parser
+ *
+ * Once the parser has been created the content can be queried directly:
+ *
+ * @snippet ex_config_parse.c get
+ *
+ * Or the content can be traversed linearly:
+ *
+ * @snippet ex_config_parse.c next
+ *
+ * Nested configuration values can be queried using a shorthand notation:
+ *
+ * @snippet ex_config_parse.c nested get
+ *
+ * Nested configuration values can be traversed using multiple
+ * ::WT_CONFIG_PARSER handles:
+ *
+ * @snippet ex_config_parse.c nested traverse
+ */
+struct __wt_config_parser {
+
+ /*!
+	 * Close the configuration scanner, releasing any resources.
+ *
+ * @param config_parser the configuration parser handle
+ * @errors
+ *
+ */
+ int __F(close)(WT_CONFIG_PARSER *config_parser);
+
+ /*!
+ * Return the next key/value pair.
+ *
+	 * When iteration would pass the end of the configuration string,
+	 * ::WT_NOTFOUND will be returned.
+ *
+ * If an item has no explicitly assigned value, the item will be
+ * returned in \c key and the \c value will be set to the boolean
+ * \c "true" value.
+ *
+ * @param config_parser the configuration parser handle
+ * @param key the returned key
+ * @param value the returned value
+ * @errors
+ *
+ */
+ int __F(next)(WT_CONFIG_PARSER *config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Return the value of an item in the configuration string.
+ *
+ * @param config_parser the configuration parser handle
+ * @param key configuration key string
+ * @param value the returned value
+ * @errors
+ *
+ */
+ int __F(get)(WT_CONFIG_PARSER *config_parser,
+ const char *key, WT_CONFIG_ITEM *value);
+};
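+
+/*
+ * A minimal sketch of parsing a configuration string; the string contents
+ * and key are illustrative, and error checking is abbreviated:
+ *
+ *	WT_CONFIG_PARSER *parser;
+ *	WT_CONFIG_ITEM v;
+ *	const char *config = "path=/dev/loop,page_size=1024";
+ *
+ *	(void)wiredtiger_config_parser_open(
+ *	    NULL, config, strlen(config), &parser);
+ *	if (parser->get(parser, "page_size", &v) == 0)
+ *		printf("page_size: %" PRId64 "\n", v.val);
+ *	(void)parser->close(parser);
+ */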
+
+#endif /* !defined(SWIG) */
+/*! @} */
+
+/*!
+ * Get version information.
+ *
+ * @snippet ex_all.c Get the WiredTiger library version #1
+ * @snippet ex_all.c Get the WiredTiger library version #2
+ *
+ * @param majorp a location where the major version number is returned
+ * @param minorp a location where the minor version number is returned
+ * @param patchp a location where the patch version number is returned
+ * @returns a string representation of the version
+ */
+const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
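+
+/*
+ * A minimal usage sketch:
+ *
+ *	int major, minor, patch;
+ *	printf("WiredTiger %s\n",
+ *	    wiredtiger_version(&major, &minor, &patch));
+ */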
+
+/*******************************************
+ * Error returns
+ *******************************************/
+/*!
+ * @anchor error_returns
+ * @name Error returns
+ * Most functions and methods in WiredTiger return an integer code indicating
+ * whether the operation succeeded or failed. A return of zero indicates
+ * success, all non-zero return values indicate some kind of failure.
+ *
+ * WiredTiger reserves all values from -31,800 to -31,999 as possible error
+ * return values. WiredTiger may also return C99/POSIX error codes such as
+ * \c ENOMEM, \c EINVAL and \c ENOTSUP, with the usual meanings.
+ *
+ * The following are all of the WiredTiger-specific error returns:
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ * Error return section: BEGIN
+ */
+/*!
+ * Attempt to insert an existing key.
+ * This error is generated when the application attempts to insert a record with
+ * the same key as an existing record without the 'overwrite' configuration to
+ * WT_SESSION::open_cursor.
+ */
+#define WT_DUPLICATE_KEY -31800
+/*!
+ * Non-specific WiredTiger error.
+ * This error is returned when an error is not covered by a specific error
+ * return.
+ */
+#define WT_ERROR -31801
+/*!
+ * Item not found.
+ * This error indicates an operation did not find a value to return. This
+ * includes cursor search and other operations where no record matched the
+ * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove.
+ */
+#define WT_NOTFOUND -31802
+/*!
+ * WiredTiger library panic.
+ * This error indicates an underlying problem that requires the application
+ * to exit and restart.
+ */
+#define WT_PANIC -31803
+/*! @cond internal */
+/*! Restart the operation (internal). */
+#define WT_RESTART -31804
+/*! @endcond */
+/*!
+ * Conflict between concurrent operations.
+ * This error is generated when an operation cannot be completed due to a
+ * conflict with concurrent operations. The operation may be retried; if a
+ * transaction is in progress, it should be rolled back and the operation
+ * retried in a new transaction.
+ */
+#define WT_ROLLBACK -31805
+/*
+ * Error return section: END
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ */
+/*! @} */
+
+#ifndef DOXYGEN
+#define WT_DEADLOCK WT_ROLLBACK /* Backward compatibility */
+#endif
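+
+/*
+ * A hedged sketch of the retry pattern described for ::WT_ROLLBACK; the
+ * application function do_op() is a hypothetical stand-in for one
+ * transactional operation performed in an already-started transaction:
+ *
+ *	while ((ret = do_op(session)) == WT_ROLLBACK) {
+ *		(void)session->rollback_transaction(session, NULL);
+ *		(void)session->begin_transaction(session, NULL);
+ *	}
+ */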
+
+/*! @} */
+
+/*!
+ * @defgroup wt_ext WiredTiger Extension API
+ * The functions and interfaces applications use to customize and extend the
+ * behavior of WiredTiger.
+ * @{
+ */
+
+/*******************************************
+ * Forward structure declarations for the extension API
+ *******************************************/
+struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG;
+
+/*!
+ * The interface implemented by applications to provide custom ordering of
+ * records.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_collator.
+ *
+ * @snippet ex_extending.c add collator nocase
+ *
+ * @snippet ex_extending.c add collator prefix10
+ */
+struct __wt_collator {
+ /*!
+ * Callback to compare keys.
+ *
+ * @param[out] cmp set to -1 if <code>key1 < key2</code>,
+ * 0 if <code>key1 == key2</code>,
+ * 1 if <code>key1 > key2</code>.
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_all.c Implement WT_COLLATOR
+ *
+ * @snippet ex_extending.c case insensitive comparator
+ *
+ * @snippet ex_extending.c n character comparator
+ */
+ int (*compare)(WT_COLLATOR *collator, WT_SESSION *session,
+ const WT_ITEM *key1, const WT_ITEM *key2, int *cmp);
+
+ /*!
+ * If non-NULL, this callback is called to customize the collator
+ * for each data source. If the callback returns a non-NULL
+ * collator, that instance is used instead of this one for all
+ * comparisons.
+ */
+ int (*customize)(WT_COLLATOR *collator, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ITEM *appcfg, WT_COLLATOR **customp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+ * The WT_COLLATOR::terminate callback is intended to allow cleanup,
+ * the handle will not be subsequently accessed by WiredTiger.
+ */
+ int (*terminate)(WT_COLLATOR *collator, WT_SESSION *session);
+};
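+
+/*
+ * A hedged sketch of a byte-wise collator; the names are illustrative
+ * assumptions and the unused callbacks are left NULL:
+ *
+ *	static int
+ *	my_compare(WT_COLLATOR *collator, WT_SESSION *session,
+ *	    const WT_ITEM *k1, const WT_ITEM *k2, int *cmp)
+ *	{
+ *		size_t len;
+ *		int v;
+ *
+ *		(void)collator; (void)session;
+ *		len = k1->size < k2->size ? k1->size : k2->size;
+ *		v = memcmp(k1->data, k2->data, len);
+ *		if (v == 0 && k1->size != k2->size)
+ *			v = k1->size < k2->size ? -1 : 1;
+ *		*cmp = v < 0 ? -1 : (v > 0 ? 1 : 0);
+ *		return (0);
+ *	}
+ *	static WT_COLLATOR my_collator = { my_compare, NULL, NULL };
+ */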
+
+/*!
+ * The interface implemented by applications to provide custom compression.
+ *
+ * Compressors must implement the WT_COMPRESSOR interface: the
+ * WT_COMPRESSOR::compress and WT_COMPRESSOR::decompress callbacks must be
+ * specified, and WT_COMPRESSOR::pre_size is optional. To build your own
+ * compressor, use one of the compressors in \c ext/compressors as a template:
+ * \c ext/nop_compress is a simple compressor that passes through data
+ * unchanged, and is a reasonable starting point.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_compressor.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_compressor {
+ /*!
+ * Callback to compress a chunk of data.
+ *
+ * WT_COMPRESSOR::compress takes a source buffer and a destination
+ * buffer, by default of the same size. If the callback can compress
+ * the buffer to a smaller size in the destination, it does so, sets
+ * the \c compression_failed return to 0 and returns 0. If compression
+ * does not produce a smaller result, the callback sets the
+ * \c compression_failed return to 1 and returns 0. If another
+ * error occurs, it returns an errno or WiredTiger error code.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the compressed representation.
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least as large
+ * as \c src_len.
+ *
+ * If compression would not shrink the data or the \c dst buffer is not
+ * large enough to hold the compressed data, the callback should set
+ * \c compression_failed to a non-zero value and return 0.
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] compression_failed non-zero if compression did not
+ * decrease the length of the data (compression may not have completed)
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR compress
+ */
+ int (*compress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp, int *compression_failed);
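+
+	/*
+	 * A hedged sketch of a pass-through implementation in the style of
+	 * \c ext/nop_compress: report that compression did not shrink the
+	 * data, so WiredTiger stores the block uncompressed:
+	 *
+	 *	static int
+	 *	nop_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len,
+	 *	    uint8_t *dst, size_t dst_len,
+	 *	    size_t *result_lenp, int *compression_failed)
+	 *	{
+	 *		(void)compressor; (void)session; (void)src;
+	 *		(void)src_len; (void)dst; (void)dst_len;
+	 *		(void)result_lenp;
+	 *		*compression_failed = 1;
+	 *		return (0);
+	 *	}
+	 */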
+
+ /*!
+ * Callback to compress a list of byte strings.
+ *
+ * WT_COMPRESSOR::compress_raw gives applications fine-grained control
+ * over disk block size when writing row-store or variable-length
+ * column-store pages. Where this level of control is not required by
+ * the underlying storage device, set the WT_COMPRESSOR::compress_raw
+ * callback to \c NULL and WiredTiger will internally split each page
+ * into blocks, each block then compressed by WT_COMPRESSOR::compress.
+ *
+ * WT_COMPRESSOR::compress_raw takes a source buffer and an array of
+ * 0-based offsets of byte strings in that buffer. The callback then
+ * encodes none, some or all of the byte strings and copies the encoded
+ * representation into a destination buffer. The callback returns the
+ * number of byte strings encoded and the bytes needed for the encoded
+ * representation. The encoded representation has header information
+ * prepended and is written as a block to the underlying file object.
+ *
+ * On entry, \c page_max is the configured maximum size for objects of
+ * this type. (This value is provided for convenience, and will be
+ * either the \c internal_page_max or \c leaf_page_max value specified
+ * to WT_SESSION::create when the object was created.)
+ *
+ * On entry, \c split_pct is the configured Btree page split size for
+ * this object. (This value is provided for convenience, and will be
+ * the \c split_pct value specified to WT_SESSION::create when the
+ * object was created.)
+ *
+ * On entry, \c extra is a count of additional bytes that will be added
+ * to the encoded representation before it is written. In other words,
+ * if the target write size is 8KB, the returned encoded representation
+ * should be less than or equal to (8KB - \c extra). The method does
+ * not need to skip bytes in the destination buffer based on \c extra,
+ * the method should only use \c extra to decide how many bytes to store
+ * into the destination buffer for its ideal block size.
+ *
+ * On entry, \c src points to the source buffer; \c offsets is an array
+ * of \c slots 0-based offsets into \c src, where each offset is the
+ * start of a byte string, except for the last offset, which is the
+ * offset of the first byte past the end of the last byte string. (In
+ * other words, <code>offsets[0]</code> will be 0, the offset of the
+ * first byte of the first byte string in \c src, and
+ * <code>offsets[slots]</code> is the total length of all of the byte
+ * strings in the \c src buffer.)
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least the
+ * maximum size for the page being written (that is, when writing a
+ * row-store leaf page, the destination buffer will be at least as
+ * large as the \c leaf_page_max configuration value).
+ *
+ * After successful completion, the callback should return \c 0, and
+ * set \c result_slotsp to the number of byte strings encoded and
+ * \c result_lenp to the bytes needed for the encoded representation.
+ *
+ * There is no requirement the callback encode any or all of the byte
+ * strings passed by WiredTiger. If the callback does not encode any
+ * of the byte strings and compression should not be retried, the
+ * callback should set \c result_slotsp to 0.
+ *
+ * If the callback does not encode any of the byte strings and
+ * compression should be retried with additional byte strings, the
+ * callback must return \c EAGAIN. In that case, WiredTiger will
+ * accumulate more rows and repeat the call.
+ *
+ * If there are no more rows to accumulate or the callback indicates
+ * that it cannot be retried, WiredTiger writes the remaining rows
+ * using \c WT_COMPRESSOR::compress.
+ *
+ * On entry, \c final is zero if there are more rows to be written as
+ * part of this page (if there will be additional data provided to the
+ * callback), and non-zero if there are no more rows to be written as
+ * part of this page. If \c final is set and the callback fails to
+ * encode any rows, WiredTiger writes the remaining rows without further
+ * calls to the callback. If \c final is set and the callback encodes
+ * any number of rows, WiredTiger continues to call the callback until
+ * all of the rows are encoded or the callback fails to encode any rows.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is intended for applications
+ * wanting to create disk blocks in specific sizes.
+ * WT_COMPRESSOR::compress_raw is not a replacement for
+ * WT_COMPRESSOR::compress: objects which WT_COMPRESSOR::compress_raw
+ * cannot handle (for example, overflow key or value items), or which
+ * WT_COMPRESSOR::compress_raw chooses not to compress for any reason
+ * (for example, if WT_COMPRESSOR::compress_raw callback chooses not to
+ * compress a small number of rows, but the page being written has no
+ * more rows to accumulate), will be passed to WT_COMPRESSOR::compress.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is only called for objects
+ * where it is applicable, that is, for row-store and variable-length
+ * column-store objects, where both row-store key prefix compression
+ * and row-store and variable-length column-store dictionary compression
+ * are \b not configured. When WT_COMPRESSOR::compress_raw is not
+ * applicable, the WT_COMPRESSOR::compress callback is used instead.
+ *
+ * @param[in] page_max the configured maximum page size for this object
+ * @param[in] split_pct the configured page split size for this object
+ * @param[in] extra the count of the additional bytes
+ * @param[in] src the data to compress
+ * @param[in] offsets the byte offsets of the byte strings in src
+ * @param[in] slots the number of entries in offsets
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[in] final non-zero if there are no more rows to accumulate
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] result_slotsp the number of byte offsets taken
+ * @returns zero for success, non-zero to indicate an error.
+ */
+ int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len,
+ int final,
+ size_t *result_lenp, uint32_t *result_slotsp);
+
+ /*!
+ * Callback to decompress a chunk of data.
+ *
+ * WT_COMPRESSOR::decompress takes a source buffer and a destination
+	 * buffer. The buffer roles are reversed from \c compress: the source
+	 * buffer holds the compressed value, and the destination buffer is
+	 * sized to hold the original data. If the callback successfully
+ * decompresses the source buffer to the destination buffer, it returns
+ * 0. If an error occurs, it returns an errno or WiredTiger error code.
+ * The source buffer that WT_COMPRESSOR::decompress takes may have a
+ * size that is rounded up from the size originally produced by
+ * WT_COMPRESSOR::compress, with the remainder of the buffer set to
+ * zeroes. Most compressors do not care about this difference if the
+ * size to be decompressed can be implicitly discovered from the
+ * compressed data. If your compressor cares, you may need to allocate
+ * space for, and store, the actual size in the compressed buffer. See
+ * the source code for the included snappy compressor for an example.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the decompressed representation.
+ *
+ * If the \c dst buffer is not big enough to hold the decompressed
+ * data, the callback should return an error.
+ *
+ * @param[in] src the data to decompress
+ * @param[in] src_len the length of the data to decompress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the decompressed data
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR decompress
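+	 *
+	 * A minimal sketch (the function name is hypothetical), mirroring a
+	 * no-op compressor that stored the page uncompressed:
+	 *
+	 * @code
+	 *	static int
+	 *	my_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len,
+	 *	    uint8_t *dst, size_t dst_len,
+	 *	    size_t *result_lenp)
+	 *	{
+	 *		(void)compressor; (void)session;
+	 *
+	 *		// The source may be rounded up past the stored size;
+	 *		// the destination is sized to the original data.
+	 *		if (dst_len > src_len)
+	 *			return (EINVAL);
+	 *		memcpy(dst, src, dst_len);
+	 *		*result_lenp = dst_len;
+	 *		return (0);
+	 *	}
+	 * @endcode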
+ */
+ int (*decompress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp);
+
+ /*!
+ * Callback to size a destination buffer for compression
+ *
+	 * WT_COMPRESSOR::pre_size is an optional callback that, given the
+	 * source buffer and size, produces the size of the destination buffer
+	 * to be given to WT_COMPRESSOR::compress.  It is intended for
+	 * compressors that require an output buffer larger than the source
+	 * buffer, for example, compressors that assume the output buffer is
+	 * sized for the worst case and make no overrun checks during
+	 * compression.  If your compressor works like this,
+	 * WT_COMPRESSOR::pre_size will need to be defined; see the source
+	 * code for the snappy compressor for an example.  However, if your
+	 * compressor detects and avoids overruns against its target buffer,
+	 * you will not need to define WT_COMPRESSOR::pre_size.
+	 *
+	 * If not NULL, this callback is called before each call to
+	 * WT_COMPRESSOR::compress to determine the size of the destination
+	 * buffer to provide, and should set \c result_lenp to a suitable
+	 * buffer size, typically the maximum length required by
+	 * WT_COMPRESSOR::compress.  If the callback is NULL, the destination
+	 * buffer will be the same size as the source buffer.  This is always
+	 * sufficient, since a compression result that is larger than the
+	 * source buffer is discarded by WiredTiger.
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[out] result_lenp the required destination buffer size
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR presize
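+	 *
+	 * A sketch for a hypothetical format whose worst case is the input
+	 * plus a small fixed header (the 16-byte constant is illustrative):
+	 *
+	 * @code
+	 *	static int
+	 *	my_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len, size_t *result_lenp)
+	 *	{
+	 *		(void)compressor; (void)session; (void)src;
+	 *
+	 *		// Worst case: the input does not shrink and a
+	 *		// 16-byte header is prepended.
+	 *		*result_lenp = src_len + 16;
+	 *		return (0);
+	 *	}
+	 * @endcode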
+ */
+ int (*pre_size)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len, size_t *result_lenp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+	 * The WT_COMPRESSOR::terminate callback is intended to allow cleanup;
+	 * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR terminate
+ */
+ int (*terminate)(WT_COMPRESSOR *compressor, WT_SESSION *session);
+};
+
+/*!
+ * Applications can extend WiredTiger by providing new implementations of the
+ * WT_DATA_SOURCE class.  Each data source supports a different URI scheme
+ * that applications pass to WT_SESSION::create, WT_SESSION::open_cursor and
+ * related methods.  See @ref custom_data_sources for more information.
+ *
+ * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_DATA_SOURCE
+ * interface from multiple threads concurrently. It is the responsibility of
+ * the implementation to protect any shared data.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_data_source.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
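+ *
+ * For example (a sketch; the \c my_* functions are hypothetical
+ * implementations of the callbacks below, listed in declaration order):
+ *
+ * @code
+ *	static WT_DATA_SOURCE my_dsrc = {
+ *	    my_create, my_compact, my_drop, my_open_cursor, my_rename,
+ *	    my_salvage, my_truncate, my_range_truncate, my_verify,
+ *	    my_checkpoint, my_terminate
+ *	};
+ *
+ *	ret = conn->add_data_source(conn, "my:", &my_dsrc, NULL);
+ * @endcode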
+ */
+struct __wt_data_source {
+ /*!
+ * Callback to create a new object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE create
+ */
+ int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to compact an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE compact
+ */
+ int (*compact)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to drop an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE drop
+ */
+ int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to initialize a cursor.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE open_cursor
+ */
+ int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor);
+
+ /*!
+ * Callback to rename an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE rename
+ */
+ int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, const char *newuri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to salvage an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE salvage
+ */
+ int (*salvage)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE truncate
+ */
+ int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate a range of an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE range truncate
+ */
+ int (*range_truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ WT_CURSOR *start, WT_CURSOR *stop);
+
+ /*!
+ * Callback to verify an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE verify
+ */
+ int (*verify)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to checkpoint the database.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE checkpoint
+ */
+ int (*checkpoint)(
+ WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+	 * The WT_DATA_SOURCE::terminate callback is intended to allow cleanup;
+	 * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE terminate
+ */
+ int (*terminate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session);
+};
+
+/*!
+ * The interface implemented by applications to provide custom extraction of
+ * index keys or column group values.
+ *
+ * Applications register implementations with WiredTiger by calling
+ * WT_CONNECTION::add_extractor.
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ */
+struct __wt_extractor {
+ /*!
+ * Callback to extract a value for an index or column group.
+ *
+ * @errors
+ *
+ * @snippet ex_all.c WT_EXTRACTOR
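+	 *
+	 * A minimal sketch that indexes the first 4 bytes of each value
+	 * (illustration only; the function name is hypothetical):
+	 *
+	 * @code
+	 *	static int
+	 *	my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
+	 *	    const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result)
+	 *	{
+	 *		(void)extractor; (void)session; (void)key;
+	 *
+	 *		if (value->size < 4)
+	 *			return (EINVAL);
+	 *		result->data = value->data;
+	 *		result->size = 4;
+	 *		return (0);
+	 *	}
+	 * @endcode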
+ */
+ int (*extract)(WT_EXTRACTOR *extractor, WT_SESSION *session,
+ const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result);
+};
+
+/*!
+ * Entry point to an extension, called when the extension is loaded.
+ *
+ * @param connection the connection handle
+ * @param config the config information passed to WT_CONNECTION::load_extension
+ * @errors
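+ *
+ * For example, a sketch of an entry point that registers a compressor,
+ * assuming a \c my_compressor structure populated with the WT_COMPRESSOR
+ * callbacks:
+ *
+ * @code
+ *	int
+ *	wiredtiger_extension_init(
+ *	    WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+ *	{
+ *		(void)config;
+ *
+ *		return (connection->add_compressor(
+ *		    connection, "my_compress", &my_compressor, NULL));
+ *	}
+ * @endcode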
+ */
+extern int wiredtiger_extension_init(
+ WT_CONNECTION *connection, WT_CONFIG_ARG *config);
+
+/*!
+ * Optional cleanup function for an extension, called during
+ * WT_CONNECTION::close.
+ *
+ * @param connection the connection handle
+ * @errors
+ */
+extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
+
+/*! @} */
+
+/*******************************************
+ * Statistic reference.
+ *******************************************/
+/*!
+ * @addtogroup wt
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ * Statistics section: BEGIN
+ */
+
+/*!
+ * @name Connection statistics
+ * @anchor statistics_keys
+ * @anchor statistics_conn
+ * Statistics are accessed through cursors with \c "statistics:" URIs.
+ * Individual statistics can be queried through the cursor using the following
+ * keys. See @ref data_statistics for more information.
+ * @{
+ */
+/*! async: number of allocation state races */
+#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000
+/*! async: number of op slots viewed for alloc */
+#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001
+/*! async: current work queue length */
+#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002
+/*! async: number of async flush calls */
+#define WT_STAT_CONN_ASYNC_FLUSH 1003
+/*! async: number of times op allocation failed */
+#define WT_STAT_CONN_ASYNC_FULL 1004
+/*! async: maximum work queue length */
+#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005
+/*! async: number of times worker found no work */
+#define WT_STAT_CONN_ASYNC_NOWORK 1006
+/*! async: op allocations */
+#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007
+/*! async: op compact calls */
+#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008
+/*! async: op insert calls */
+#define WT_STAT_CONN_ASYNC_OP_INSERT 1009
+/*! async: op remove calls */
+#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010
+/*! async: op search calls */
+#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011
+/*! async: op update calls */
+#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012
+/*! block manager: mapped bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013
+/*! block manager: bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_READ 1014
+/*! block manager: bytes written */
+#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015
+/*! block manager: mapped blocks read */
+#define WT_STAT_CONN_BLOCK_MAP_READ 1016
+/*! block manager: blocks pre-loaded */
+#define WT_STAT_CONN_BLOCK_PRELOAD 1017
+/*! block manager: blocks read */
+#define WT_STAT_CONN_BLOCK_READ 1018
+/*! block manager: blocks written */
+#define WT_STAT_CONN_BLOCK_WRITE 1019
+/*! cache: tracked dirty bytes in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020
+/*! cache: bytes currently in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_INUSE 1021
+/*! cache: maximum bytes configured */
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1022
+/*! cache: bytes read into cache */
+#define WT_STAT_CONN_CACHE_BYTES_READ 1023
+/*! cache: bytes written from cache */
+#define WT_STAT_CONN_CACHE_BYTES_WRITE 1024
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1025
+/*! cache: unmodified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1026
+/*! cache: page split during eviction deepened the tree */
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1027
+/*! cache: modified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1028
+/*! cache: pages selected for eviction unable to be evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1029
+/*! cache: pages evicted because they exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1030
+/*! cache: failed eviction of pages that exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1031
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1032
+/*! cache: internal pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1033
+/*! cache: eviction server candidate queue empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1034
+/*! cache: eviction server candidate queue not empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1035
+/*! cache: eviction server evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1036
+/*! cache: eviction server populating queue, but not evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1037
+/*! cache: eviction server unable to reach eviction goal */
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1038
+/*! cache: pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1039
+/*! cache: pages walked for eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1040
+/*! cache: tracked dirty pages in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1041
+/*! cache: pages currently held in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1042
+/*! cache: pages read into cache */
+#define WT_STAT_CONN_CACHE_READ 1043
+/*! cache: pages written from cache */
+#define WT_STAT_CONN_CACHE_WRITE 1044
+/*! conn: pthread mutex condition wait calls */
+#define WT_STAT_CONN_COND_WAIT 1045
+/*! Btree: cursor create calls */
+#define WT_STAT_CONN_CURSOR_CREATE 1046
+/*! Btree: cursor insert calls */
+#define WT_STAT_CONN_CURSOR_INSERT 1047
+/*! Btree: cursor next calls */
+#define WT_STAT_CONN_CURSOR_NEXT 1048
+/*! Btree: cursor prev calls */
+#define WT_STAT_CONN_CURSOR_PREV 1049
+/*! Btree: cursor remove calls */
+#define WT_STAT_CONN_CURSOR_REMOVE 1050
+/*! Btree: cursor reset calls */
+#define WT_STAT_CONN_CURSOR_RESET 1051
+/*! Btree: cursor search calls */
+#define WT_STAT_CONN_CURSOR_SEARCH 1052
+/*! Btree: cursor search near calls */
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1053
+/*! Btree: cursor update calls */
+#define WT_STAT_CONN_CURSOR_UPDATE 1054
+/*! dhandle: session dhandles swept */
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1055
+/*! dhandle: session sweep attempts */
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1056
+/*! conn: files currently open */
+#define WT_STAT_CONN_FILE_OPEN 1057
+/*! log: log buffer size increases */
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1058
+/*! log: total log buffer size */
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1059
+/*! log: user provided log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_USER 1060
+/*! log: log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1061
+/*! log: yields waiting for previous log file close */
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1062
+/*! log: maximum log file size */
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1063
+/*! log: log read operations */
+#define WT_STAT_CONN_LOG_READS 1064
+/*! log: records processed by log scan */
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1065
+/*! log: log scan records requiring two reads */
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1066
+/*! log: log scan operations */
+#define WT_STAT_CONN_LOG_SCANS 1067
+/*! log: consolidated slot closures */
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1068
+/*! log: logging bytes consolidated */
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1069
+/*! log: consolidated slot joins */
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1070
+/*! log: consolidated slot join races */
+#define WT_STAT_CONN_LOG_SLOT_RACES 1071
+/*! log: slots selected for switching that were unavailable */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1072
+/*! log: record size exceeded maximum */
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1073
+/*! log: failed to find a slot large enough for record */
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1074
+/*! log: consolidated slot join transitions */
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1075
+/*! log: log sync operations */
+#define WT_STAT_CONN_LOG_SYNC 1076
+/*! log: log write operations */
+#define WT_STAT_CONN_LOG_WRITES 1077
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1078
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1079
+/*! LSM: rows merged in an LSM tree */
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1080
+/*! LSM: App work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1081
+/*! LSM: Merge work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1082
+/*! LSM: tree queue hit maximum */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1083
+/*! LSM: Switch work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1084
+/*! LSM: tree maintenance operations scheduled */
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1085
+/*! LSM: tree maintenance operations discarded */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1086
+/*! LSM: tree maintenance operations executed */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1087
+/*! conn: memory allocations */
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1088
+/*! conn: memory frees */
+#define WT_STAT_CONN_MEMORY_FREE 1089
+/*! conn: memory re-allocations */
+#define WT_STAT_CONN_MEMORY_GROW 1090
+/*! conn: total read I/Os */
+#define WT_STAT_CONN_READ_IO 1091
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_CONN_REC_PAGES 1092
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1093
+/*! reconciliation: split bytes currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094
+/*! reconciliation: split objects currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095
+/*! conn: pthread mutex shared lock read-lock calls */
+#define WT_STAT_CONN_RWLOCK_READ 1096
+/*! conn: pthread mutex shared lock write-lock calls */
+#define WT_STAT_CONN_RWLOCK_WRITE 1097
+/*! session: open cursor count */
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098
+/*! session: open session count */
+#define WT_STAT_CONN_SESSION_OPEN 1099
+/*! txn: transaction begins */
+#define WT_STAT_CONN_TXN_BEGIN 1100
+/*! txn: transaction checkpoints */
+#define WT_STAT_CONN_TXN_CHECKPOINT 1101
+/*! txn: transaction checkpoint currently running */
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102
+/*! txn: transactions committed */
+#define WT_STAT_CONN_TXN_COMMIT 1103
+/*! txn: transaction failures due to cache overflow */
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1104
+/*! txn: transaction range of IDs currently pinned */
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1105
+/*! txn: transactions rolled back */
+#define WT_STAT_CONN_TXN_ROLLBACK 1106
+/*! conn: total write I/Os */
+#define WT_STAT_CONN_WRITE_IO 1107
+
+/*!
+ * @}
+ * @name Statistics for data sources
+ * @anchor statistics_dsrc
+ * @{
+ */
+/*! block manager: file allocation unit size */
+#define WT_STAT_DSRC_ALLOCATION_SIZE 2000
+/*! block manager: blocks allocated */
+#define WT_STAT_DSRC_BLOCK_ALLOC 2001
+/*! block manager: checkpoint size */
+#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002
+/*! block manager: allocations requiring file extension */
+#define WT_STAT_DSRC_BLOCK_EXTENSION 2003
+/*! block manager: blocks freed */
+#define WT_STAT_DSRC_BLOCK_FREE 2004
+/*! block manager: file magic number */
+#define WT_STAT_DSRC_BLOCK_MAGIC 2005
+/*! block manager: file major version number */
+#define WT_STAT_DSRC_BLOCK_MAJOR 2006
+/*! block manager: minor version number */
+#define WT_STAT_DSRC_BLOCK_MINOR 2007
+/*! block manager: file bytes available for reuse */
+#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008
+/*! block manager: file size in bytes */
+#define WT_STAT_DSRC_BLOCK_SIZE 2009
+/*! LSM: bloom filters in the LSM tree */
+#define WT_STAT_DSRC_BLOOM_COUNT 2010
+/*! LSM: bloom filter false positives */
+#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011
+/*! LSM: bloom filter hits */
+#define WT_STAT_DSRC_BLOOM_HIT 2012
+/*! LSM: bloom filter misses */
+#define WT_STAT_DSRC_BLOOM_MISS 2013
+/*! LSM: bloom filter pages evicted from cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014
+/*! LSM: bloom filter pages read into cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015
+/*! LSM: total size of bloom filters */
+#define WT_STAT_DSRC_BLOOM_SIZE 2016
+/*! btree: column-store variable-size deleted values */
+#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2017
+/*! btree: column-store fixed-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2018
+/*! btree: column-store internal pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2019
+/*! btree: column-store variable-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2020
+/*! btree: pages rewritten by compaction */
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2021
+/*! btree: number of key/value pairs */
+#define WT_STAT_DSRC_BTREE_ENTRIES 2022
+/*! btree: fixed-record size */
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2023
+/*! btree: maximum tree depth */
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024
+/*! btree: maximum internal page item size */
+#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025
+/*! btree: maximum internal page size */
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026
+/*! btree: maximum leaf page item size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027
+/*! btree: maximum leaf page size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028
+/*! btree: overflow pages */
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2029
+/*! btree: row-store internal pages */
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030
+/*! btree: row-store leaf pages */
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031
+/*! cache: bytes read into cache */
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2032
+/*! cache: bytes written from cache */
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034
+/*! cache: unmodified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035
+/*! cache: modified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036
+/*! cache: data source pages selected for eviction unable to be evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038
+/*! cache: internal pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039
+/*! cache: overflow values cached in memory */
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2040
+/*! cache: pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ 2041
+/*! cache: overflow pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2042
+/*! cache: pages written from cache */
+#define WT_STAT_DSRC_CACHE_WRITE 2043
+/*! compression: raw compression call failed, no additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2044
+/*! compression: raw compression call failed, additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2045
+/*! compression: raw compression call succeeded */
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2046
+/*! compression: compressed pages read */
+#define WT_STAT_DSRC_COMPRESS_READ 2047
+/*! compression: compressed pages written */
+#define WT_STAT_DSRC_COMPRESS_WRITE 2048
+/*! compression: page written failed to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2049
+/*! compression: page written was too small to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2050
+/*! cursor: create calls */
+#define WT_STAT_DSRC_CURSOR_CREATE 2051
+/*! cursor: insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT 2052
+/*! cursor: bulk-loaded cursor-insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2053
+/*! cursor: cursor-insert key and value bytes inserted */
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2054
+/*! cursor: next calls */
+#define WT_STAT_DSRC_CURSOR_NEXT 2055
+/*! cursor: prev calls */
+#define WT_STAT_DSRC_CURSOR_PREV 2056
+/*! cursor: remove calls */
+#define WT_STAT_DSRC_CURSOR_REMOVE 2057
+/*! cursor: cursor-remove key bytes removed */
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2058
+/*! cursor: reset calls */
+#define WT_STAT_DSRC_CURSOR_RESET 2059
+/*! cursor: search calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH 2060
+/*! cursor: search near calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2061
+/*! cursor: update calls */
+#define WT_STAT_DSRC_CURSOR_UPDATE 2062
+/*! cursor: cursor-update value bytes updated */
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2063
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2064
+/*! LSM: chunks in the LSM tree */
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2065
+/*! LSM: highest merge generation in the LSM tree */
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2066
+/*! LSM: queries that could have benefited from a Bloom filter that did
+ * not exist */
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2067
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2068
+/*! reconciliation: dictionary matches */
+#define WT_STAT_DSRC_REC_DICTIONARY 2069
+/*! reconciliation: internal page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2070
+/*! reconciliation: leaf page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2071
+/*! reconciliation: maximum blocks required for a page */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2072
+/*! reconciliation: internal-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2073
+/*! reconciliation: leaf-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2074
+/*! reconciliation: overflow values written */
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2075
+/*! reconciliation: pages deleted */
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2076
+/*! reconciliation: page checksum matches */
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2077
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_DSRC_REC_PAGES 2078
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2079
+/*! reconciliation: leaf page key bytes discarded using prefix compression */
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2080
+/*! reconciliation: internal page key bytes discarded using suffix
+ * compression */
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2081
+/*! session: object compaction */
+#define WT_STAT_DSRC_SESSION_COMPACT 2082
+/*! session: open cursor count */
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2083
+/*! txn: update conflicts */
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2084
+/*! @} */
+/*
+ * Statistics section: END
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ */
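+
+/*
+ * For example, an individual statistic can be read by passing one of the
+ * keys above to a statistics cursor (a sketch, error handling omitted;
+ * statistics cursors return a description, a printable value string and
+ * an int64_t value):
+ *
+ *	WT_CURSOR *cursor;
+ *	const char *desc, *pvalue;
+ *	int64_t value;
+ *	int ret;
+ *
+ *	ret = session->open_cursor(
+ *	    session, "statistics:", NULL, NULL, &cursor);
+ *	cursor->set_key(cursor, WT_STAT_CONN_CACHE_BYTES_INUSE);
+ *	ret = cursor->search(cursor);
+ *	ret = cursor->get_value(cursor, &desc, &pvalue, &value);
+ *	ret = cursor->close(cursor);
+ */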
+/*!
+ * @name Log record and operation types
+ * @anchor log_types
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/log.py.
+ * Log record declarations: BEGIN
+ */
+/*! invalid operation */
+#define WT_LOGOP_INVALID 0
+/*! checkpoint */
+#define WT_LOGREC_CHECKPOINT 0
+/*! transaction commit */
+#define WT_LOGREC_COMMIT 1
+/*! file sync */
+#define WT_LOGREC_FILE_SYNC 2
+/*! message */
+#define WT_LOGREC_MESSAGE 3
+/*! column put */
+#define WT_LOGOP_COL_PUT 1
+/*! column remove */
+#define WT_LOGOP_COL_REMOVE 2
+/*! column truncate */
+#define WT_LOGOP_COL_TRUNCATE 3
+/*! row put */
+#define WT_LOGOP_ROW_PUT 4
+/*! row remove */
+#define WT_LOGOP_ROW_REMOVE 5
+/*! row truncate */
+#define WT_LOGOP_ROW_TRUNCATE 6
+/*
+ * Log record declarations: END
+ * DO NOT EDIT: automatically built by dist/log.py.
+ */
+/*! @} */
+/*! @} */
+
+#undef __F
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger_ext.h b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
new file mode 100644
index 00000000000..fd0282cd50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_EXT_H_
+#define __WIREDTIGER_EXT_H_
+
+#include <wiredtiger.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if !defined(SWIG)
+
+/*!
+ * @addtogroup wt_ext
+ * @{
+ */
+
+/*!
+ * Read-committed isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_COMMITTED 1
+/*!
+ * Read-uncommitted isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_UNCOMMITTED 2
+/*!
+ * Snapshot isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_SNAPSHOT 3
+
+typedef struct __wt_txn_notify WT_TXN_NOTIFY;
+/*!
+ * The interface implemented by applications to be notified when the
+ * session's transaction is resolved; see
+ * WT_EXTENSION_API::transaction_notify.
+ */
+struct __wt_txn_notify {
+ /*!
+ * A method called when the session's current transaction is committed
+ * or rolled back.
+ *
+ * @param notify a pointer to the event handler
+ * @param session the current session handle
+ * @param txnid the transaction ID
+ * @param committed an integer value which is non-zero if the
+ * transaction is being committed.
+ */
+ int (*notify)(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+ uint64_t txnid, int committed);
+};
+
+/*!
+ * Table of WiredTiger extension methods.
+ *
+ * This structure is used to provide a set of WiredTiger methods to extension
+ * modules without needing to link the modules with the WiredTiger library.
+ *
+ * The extension methods may be used both by modules that are linked with
+ * the WiredTiger library (for example, a data source configured using the
+ * WT_CONNECTION::add_data_source method), and by modules not linked with the
+ * WiredTiger library (for example, a compression module configured using the
+ * WT_CONNECTION::add_compressor method).
+ *
+ * To use these functions:
+ * - include the wiredtiger_ext.h header file,
+ * - declare a variable which references a WT_EXTENSION_API structure, and
+ * - initialize the variable using WT_CONNECTION::get_extension_api method.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
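+ *
+ * For example, a sketch of the initialization step in an extension's
+ * entry point:
+ *
+ * @code
+ *	static WT_EXTENSION_API *wt_api;
+ *
+ *	int
+ *	wiredtiger_extension_init(
+ *	    WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+ *	{
+ *		(void)config;
+ *
+ *		wt_api = connection->get_extension_api(connection);
+ *		return (0);
+ *	}
+ * @endcode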
+ *
+ * The following code is from the sample compression module, where compression
+ * extension functions are configured in the extension's entry point:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_extension_api {
+/* !!! To maintain backwards compatibility, this structure is append-only. */
+#if !defined(DOXYGEN)
+ /*
+ * Private fields.
+ */
+ WT_CONNECTION *conn; /* Enclosing connection */
+#endif
+ /*!
+ * Insert an error message into the WiredTiger error stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API err_printf
+ */
+ int (*err_printf)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Insert a message into the WiredTiger message stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API msg_printf
+ */
+	int (*msg_printf)(WT_EXTENSION_API *wt_api,
+	    WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Return information about an error as a string; the strerror method
+ * is a superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API strerror
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX
+ * function
+ * @returns a string representation of the error
+ */
+ const char *(*strerror)(int err);
+
+ /*!
+ * Allocate short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param bytes the number of bytes of memory needed
+ * @returns A valid memory reference on success or NULL on error
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_alloc
+ */
+ void *(*scr_alloc)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, size_t bytes);
+
+ /*!
+ * Free short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param ref a memory reference returned by WT_EXTENSION_API::scr_alloc
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_free
+ */
+	void (*scr_free)(
+	    WT_EXTENSION_API *wt_api, WT_SESSION *session, void *ref);
+
+ /*!
+ * Configure the extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param config the configuration information passed to an application
+	 * @param collatorp the selected collator, if any
+ * @param ownp set if the collator terminate method should be called
+ * when no longer needed
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collator config
+ */
+ int (*collator_config)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, WT_COLLATOR **collatorp, int *ownp);
+
+ /*!
+ * The extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param collator the collator (or NULL if none available)
+ * @param first first item
+ * @param second second item
+ * @param[out] cmp set less than 0 if \c first collates less than
+ * \c second, set equal to 0 if \c first collates equally to \c second,
+ * set greater than 0 if \c first collates greater than \c second
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collate
+ */
+ int (*collate)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmp);
+
+ /*!
+ * @copydoc wiredtiger_config_parser_open
+ */
+ int (*config_parser_open)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+ /*!
+ * Return the value of a configuration string.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+	 * @param config the configuration information passed to an application
+	 * @param key configuration key string
+ * @param value the returned value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION config_get
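+	 *
+	 * For example (a sketch; \c "my_option" is a hypothetical key):
+	 *
+	 * @code
+	 *	WT_CONFIG_ITEM v;
+	 *
+	 *	ret = wt_api->config_get(
+	 *	    wt_api, session, config, "my_option", &v);
+	 *	// On success, v.str and v.len reference the value string
+	 *	// and v.val holds its integer value, if any.
+	 * @endcode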
+ */
+ int (*config_get)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, const char *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Insert a row into the metadata if it does not already exist.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata insert
+ */
+ int (*metadata_insert)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Remove a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata remove
+ */
+ int (*metadata_remove)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, const char *key);
+
+ /*!
+ * Return a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+	 * @param[out] valuep the row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata search
+ */
+ int (*metadata_search)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char **valuep);
+
+ /*!
+ * Update a row in the metadata by either inserting a new record or
+ * updating an existing record.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata update
+ */
+ int (*metadata_update)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Pack a structure into a buffer.
+ * See ::wiredtiger_struct_pack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
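+	 *
+	 * For example, sizing and packing an integer and a string (a
+	 * sketch, error handling omitted):
+	 *
+	 * @code
+	 *	char buf[64];
+	 *	size_t size;
+	 *
+	 *	ret = wt_api->struct_size(
+	 *	    wt_api, session, &size, "iS", 42, "hello");
+	 *	if (ret == 0 && size <= sizeof(buf))
+	 *		ret = wt_api->struct_pack(wt_api,
+	 *		    session, buf, sizeof(buf), "iS", 42, "hello");
+	 * @endcode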
+ */
+ int (*struct_pack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Calculate the size required to pack a structure.
+ * See ::wiredtiger_struct_size for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to WT_EXTENSION_API::struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_size)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+ /*!
+ * Unpack a structure from a buffer.
+ * See ::wiredtiger_struct_unpack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Return the current transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction ID.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction ID
+ */
+ uint64_t (*transaction_id)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Return the current transaction's isolation level; returns one of
+ * ::WT_TXN_ISO_READ_COMMITTED, ::WT_TXN_ISO_READ_UNCOMMITTED, or
+ * ::WT_TXN_ISO_SNAPSHOT.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction's isolation level.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction isolation level
+ */
+ int (*transaction_isolation_level)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Request notification of transaction resolution by specifying a
+ * function to be called when the session's current transaction is
+ * either committed or rolled back. If the transaction is being
+ * committed, but the notification function returns an error, the
+ * transaction will be rolled back.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param notify a handler for commit or rollback events
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction notify
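+	 *
+	 * For example (a sketch; the handler names are hypothetical):
+	 *
+	 * @code
+	 *	static int
+	 *	my_notify(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+	 *	    uint64_t txnid, int committed)
+	 *	{
+	 *		(void)notify; (void)session; (void)txnid;
+	 *		(void)committed;
+	 *
+	 *		// Returning non-zero on commit forces a rollback.
+	 *		return (0);
+	 *	}
+	 *
+	 *	static WT_TXN_NOTIFY my_handler = { my_notify };
+	 *
+	 *	ret = wt_api->transaction_notify(wt_api, session, &my_handler);
+	 * @endcode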
+ */
+ int (*transaction_notify)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, WT_TXN_NOTIFY *notify);
+
+ /*!
+ * Return the oldest transaction ID not yet visible to a running
+ * transaction.
+ *
+	 * @param wt_api the extension handle
+ * @returns the oldest transaction ID not yet visible to a running
+ * transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction oldest
+ */
+ uint64_t (*transaction_oldest)(WT_EXTENSION_API *wt_api);
+
+ /*!
+ * Return if the current transaction can see the given transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param transaction_id the transaction ID
+ * @returns true (non-zero) if the transaction ID is visible to the
+ * current transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction visible
+ */
+ int (*transaction_visible)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, uint64_t transaction_id);
+
+ /*!
+ * @copydoc wiredtiger_version
+ */
+ const char *(*version)(int *majorp, int *minorp, int *patchp);
+};
+
+/*!
+ * @typedef WT_CONFIG_ARG
+ *
+ * A configuration object passed to some extension interfaces. This is an
+ * opaque type: configuration values can be queried using
+ * WT_EXTENSION_API::config_get.
+ */
+
+/*! @} */
+#endif /* SWIG */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_EXT_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
new file mode 100644
index 00000000000..e9482c688d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * WiredTiger public include file, and configuration control.
+ *******************************************/
+#include "wiredtiger_config.h"
+#include "wiredtiger_ext.h"
+
+/*******************************************
+ * WiredTiger system include files.
+ *******************************************/
+#ifndef _WIN32
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#endif
+#include <ctype.h>
+#ifndef _WIN32
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifdef _WIN32
+#include <io.h>
+#endif
+#include <limits.h>
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#include <time.h>
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+/*******************************************
+ * WiredTiger externally maintained include files.
+ *******************************************/
+#include "queue.h"
+
+/*
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ * Forward type declarations for internal types: BEGIN
+ */
+struct __wt_addr;
+ typedef struct __wt_addr WT_ADDR;
+struct __wt_async;
+ typedef struct __wt_async WT_ASYNC;
+struct __wt_async_cursor;
+ typedef struct __wt_async_cursor WT_ASYNC_CURSOR;
+struct __wt_async_format;
+ typedef struct __wt_async_format WT_ASYNC_FORMAT;
+struct __wt_async_op_impl;
+ typedef struct __wt_async_op_impl WT_ASYNC_OP_IMPL;
+struct __wt_async_worker_state;
+ typedef struct __wt_async_worker_state WT_ASYNC_WORKER_STATE;
+struct __wt_block;
+ typedef struct __wt_block WT_BLOCK;
+struct __wt_block_ckpt;
+ typedef struct __wt_block_ckpt WT_BLOCK_CKPT;
+struct __wt_block_desc;
+ typedef struct __wt_block_desc WT_BLOCK_DESC;
+struct __wt_block_header;
+ typedef struct __wt_block_header WT_BLOCK_HEADER;
+struct __wt_bloom;
+ typedef struct __wt_bloom WT_BLOOM;
+struct __wt_bloom_hash;
+ typedef struct __wt_bloom_hash WT_BLOOM_HASH;
+struct __wt_bm;
+ typedef struct __wt_bm WT_BM;
+struct __wt_btree;
+ typedef struct __wt_btree WT_BTREE;
+struct __wt_cache;
+ typedef struct __wt_cache WT_CACHE;
+struct __wt_cache_pool;
+ typedef struct __wt_cache_pool WT_CACHE_POOL;
+struct __wt_cell;
+ typedef struct __wt_cell WT_CELL;
+struct __wt_cell_unpack;
+ typedef struct __wt_cell_unpack WT_CELL_UNPACK;
+struct __wt_ckpt;
+ typedef struct __wt_ckpt WT_CKPT;
+struct __wt_col;
+ typedef struct __wt_col WT_COL;
+struct __wt_col_rle;
+ typedef struct __wt_col_rle WT_COL_RLE;
+struct __wt_colgroup;
+ typedef struct __wt_colgroup WT_COLGROUP;
+struct __wt_compact;
+ typedef struct __wt_compact WT_COMPACT;
+struct __wt_condvar;
+ typedef struct __wt_condvar WT_CONDVAR;
+struct __wt_config;
+ typedef struct __wt_config WT_CONFIG;
+struct __wt_config_check;
+ typedef struct __wt_config_check WT_CONFIG_CHECK;
+struct __wt_config_entry;
+ typedef struct __wt_config_entry WT_CONFIG_ENTRY;
+struct __wt_config_parser_impl;
+ typedef struct __wt_config_parser_impl WT_CONFIG_PARSER_IMPL;
+struct __wt_connection_impl;
+ typedef struct __wt_connection_impl WT_CONNECTION_IMPL;
+struct __wt_connection_stats;
+ typedef struct __wt_connection_stats WT_CONNECTION_STATS;
+struct __wt_connection_stats_spinlock;
+ typedef struct __wt_connection_stats_spinlock WT_CONNECTION_STATS_SPINLOCK;
+struct __wt_cursor_backup;
+ typedef struct __wt_cursor_backup WT_CURSOR_BACKUP;
+struct __wt_cursor_backup_entry;
+ typedef struct __wt_cursor_backup_entry WT_CURSOR_BACKUP_ENTRY;
+struct __wt_cursor_btree;
+ typedef struct __wt_cursor_btree WT_CURSOR_BTREE;
+struct __wt_cursor_bulk;
+ typedef struct __wt_cursor_bulk WT_CURSOR_BULK;
+struct __wt_cursor_config;
+ typedef struct __wt_cursor_config WT_CURSOR_CONFIG;
+struct __wt_cursor_data_source;
+ typedef struct __wt_cursor_data_source WT_CURSOR_DATA_SOURCE;
+struct __wt_cursor_dump;
+ typedef struct __wt_cursor_dump WT_CURSOR_DUMP;
+struct __wt_cursor_index;
+ typedef struct __wt_cursor_index WT_CURSOR_INDEX;
+struct __wt_cursor_json;
+ typedef struct __wt_cursor_json WT_CURSOR_JSON;
+struct __wt_cursor_log;
+ typedef struct __wt_cursor_log WT_CURSOR_LOG;
+struct __wt_cursor_lsm;
+ typedef struct __wt_cursor_lsm WT_CURSOR_LSM;
+struct __wt_cursor_metadata;
+ typedef struct __wt_cursor_metadata WT_CURSOR_METADATA;
+struct __wt_cursor_stat;
+ typedef struct __wt_cursor_stat WT_CURSOR_STAT;
+struct __wt_cursor_table;
+ typedef struct __wt_cursor_table WT_CURSOR_TABLE;
+struct __wt_data_handle;
+ typedef struct __wt_data_handle WT_DATA_HANDLE;
+struct __wt_data_handle_cache;
+ typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE;
+struct __wt_dlh;
+ typedef struct __wt_dlh WT_DLH;
+struct __wt_dsrc_stats;
+ typedef struct __wt_dsrc_stats WT_DSRC_STATS;
+struct __wt_evict_entry;
+ typedef struct __wt_evict_entry WT_EVICT_ENTRY;
+struct __wt_evict_worker;
+ typedef struct __wt_evict_worker WT_EVICT_WORKER;
+struct __wt_ext;
+ typedef struct __wt_ext WT_EXT;
+struct __wt_extlist;
+ typedef struct __wt_extlist WT_EXTLIST;
+struct __wt_fh;
+ typedef struct __wt_fh WT_FH;
+struct __wt_hazard;
+ typedef struct __wt_hazard WT_HAZARD;
+struct __wt_ikey;
+ typedef struct __wt_ikey WT_IKEY;
+struct __wt_index;
+ typedef struct __wt_index WT_INDEX;
+struct __wt_insert;
+ typedef struct __wt_insert WT_INSERT;
+struct __wt_insert_head;
+ typedef struct __wt_insert_head WT_INSERT_HEAD;
+struct __wt_log_desc;
+ typedef struct __wt_log_desc WT_LOG_DESC;
+struct __wt_log_op_desc;
+ typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
+struct __wt_log_rec_desc;
+ typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
+struct __wt_lsm_chunk;
+ typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
+struct __wt_lsm_data_source;
+ typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
+struct __wt_lsm_manager;
+ typedef struct __wt_lsm_manager WT_LSM_MANAGER;
+struct __wt_lsm_tree;
+ typedef struct __wt_lsm_tree WT_LSM_TREE;
+struct __wt_lsm_work_unit;
+ typedef struct __wt_lsm_work_unit WT_LSM_WORK_UNIT;
+struct __wt_lsm_worker_args;
+ typedef struct __wt_lsm_worker_args WT_LSM_WORKER_ARGS;
+struct __wt_lsm_worker_cookie;
+ typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
+struct __wt_multi;
+ typedef struct __wt_multi WT_MULTI;
+struct __wt_named_collator;
+ typedef struct __wt_named_collator WT_NAMED_COLLATOR;
+struct __wt_named_compressor;
+ typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR;
+struct __wt_named_data_source;
+ typedef struct __wt_named_data_source WT_NAMED_DATA_SOURCE;
+struct __wt_ovfl_reuse;
+ typedef struct __wt_ovfl_reuse WT_OVFL_REUSE;
+struct __wt_ovfl_track;
+ typedef struct __wt_ovfl_track WT_OVFL_TRACK;
+struct __wt_ovfl_txnc;
+ typedef struct __wt_ovfl_txnc WT_OVFL_TXNC;
+struct __wt_page;
+ typedef struct __wt_page WT_PAGE;
+struct __wt_page_deleted;
+ typedef struct __wt_page_deleted WT_PAGE_DELETED;
+struct __wt_page_header;
+ typedef struct __wt_page_header WT_PAGE_HEADER;
+struct __wt_page_index;
+ typedef struct __wt_page_index WT_PAGE_INDEX;
+struct __wt_page_modify;
+ typedef struct __wt_page_modify WT_PAGE_MODIFY;
+struct __wt_process;
+ typedef struct __wt_process WT_PROCESS;
+struct __wt_ref;
+ typedef struct __wt_ref WT_REF;
+struct __wt_row;
+ typedef struct __wt_row WT_ROW;
+struct __wt_rwlock;
+ typedef struct __wt_rwlock WT_RWLOCK;
+struct __wt_salvage_cookie;
+ typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_scratch_track;
+ typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
+struct __wt_session_impl;
+ typedef struct __wt_session_impl WT_SESSION_IMPL;
+struct __wt_size;
+ typedef struct __wt_size WT_SIZE;
+struct __wt_split_stash;
+ typedef struct __wt_split_stash WT_SPLIT_STASH;
+struct __wt_stats;
+ typedef struct __wt_stats WT_STATS;
+struct __wt_table;
+ typedef struct __wt_table WT_TABLE;
+struct __wt_txn;
+ typedef struct __wt_txn WT_TXN;
+struct __wt_txn_global;
+ typedef struct __wt_txn_global WT_TXN_GLOBAL;
+struct __wt_txn_op;
+ typedef struct __wt_txn_op WT_TXN_OP;
+struct __wt_txn_state;
+ typedef struct __wt_txn_state WT_TXN_STATE;
+struct __wt_upd_skipped;
+ typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
+struct __wt_update;
+ typedef struct __wt_update WT_UPDATE;
+/*
+ * Forward type declarations for internal types: END
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ */
+
+/*******************************************
+ * WiredTiger internal include files.
+ *******************************************/
+#if defined(_lint)
+#include "lint.h"
+#elif defined(__GNUC__)
+#include "gcc.h"
+#elif defined(_MSC_VER)
+#include "msvc.h"
+#endif
+#include "hardware.h"
+
+#ifdef _WIN32
+#include "os_windows.h"
+#else
+#include "posix.h"
+#endif
+
+#include "misc.h"
+#include "mutex.h"
+
+#include "stat.h" /* required by dhandle.h */
+#include "dhandle.h" /* required by btree.h */
+
+#include "api.h"
+#include "async.h"
+#include "block.h"
+#include "bloom.h"
+#include "btmem.h"
+#include "btree.h"
+#include "cache.h"
+#include "config.h"
+#include "compact.h"
+#include "cursor.h"
+#include "dlh.h"
+#include "error.h"
+#include "flags.h"
+#include "log.h"
+#include "lsm.h"
+#include "meta.h"
+#include "os.h"
+#include "schema.h"
+#include "txn.h"
+
+#include "session.h" /* required by connection.h */
+#include "connection.h"
+
+#include "extern.h"
+#include "verify_build.h"
+
+#include "buf.i"
+#include "misc.i"
+#include "intpack.i" /* required by cell.i, packing.i */
+#include "packing.i"
+#include "cell.i" /* required by btree.i */
+
+#include "mutex.i" /* required by btree.i */
+#include "txn.i" /* required by btree.i */
+
+#include "btree.i" /* required by cursor.i */
+#include "cache.i" /* required by cursor.i */
+#include "cursor.i"
+
+#include "bitstring.i"
+#include "column.i"
+#include "serial.i"
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
new file mode 100644
index 00000000000..d13002cdc5a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -0,0 +1,1243 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_log_ckpt --
+ * Record the given LSN as the checkpoint LSN and signal the archive
+ * thread as needed.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ log->ckpt_lsn = *ckp_lsn;
+ if (conn->arch_cond != NULL)
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ return (0);
+}
+
+/*
+ * __wt_log_written_reset --
+ *	Interface to reset the amount of log written during this
+ *	checkpoint period.  Called from the checkpoint code.
+ */
+void
+__wt_log_written_reset(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ if (!conn->logging)
+ return;
+ log = conn->log;
+ log->log_written = 0;
+ return;
+}
+
+/*
+ * __wt_log_get_files --
+ * Retrieve the list of all existing log files.
+ */
+int
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+ WT_CONNECTION_IMPL *conn;
+ const char *log_path;
+
+ *countp = 0;
+ *filesp = NULL;
+
+ conn = S2C(session);
+ log_path = conn->log_path;
+ if (log_path == NULL)
+ log_path = "";
+ return (__wt_dirlist(session, log_path, WT_LOG_FILENAME,
+ WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ * Retrieve the list of active log files (those that are not candidates
+ * for archiving).
+ */
+int
+__wt_log_get_active_files(
+ WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ char **files;
+ uint32_t id;
+ u_int count, i;
+
+ id = 0;
+ log = S2C(session)->log;
+
+ WT_RET(__wt_log_get_files(session, &files, &count));
+
+ /* Filter out any files that are below the checkpoint LSN. */
+ for (i = 0; i < count; ) {
+ WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+ if (id < log->ckpt_lsn.file) {
+ __wt_free(session, files[i]);
+ files[i] = files[count - 1];
+ files[--count] = NULL;
+ } else
+ i++;
+ }
+
+ *filesp = files;
+ *countp = count;
+
+ if (0) {
+err: __wt_log_files_free(session, files, count);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_log_files_free --
+ * Free memory associated with a log file list.
+ */
+void
+__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
+{
+ u_int i;
+
+ for (i = 0; i < count; i++)
+ __wt_free(session, files[i]);
+ __wt_free(session, files);
+}
+
+/*
+ * __wt_log_filename --
+ * Given a log number, return a WT_ITEM of a generated log file name.
+ */
+int
+__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf)
+{
+ const char *log_path;
+
+ log_path = S2C(session)->log_path;
+
+ if (log_path != NULL && log_path[0] != '\0')
+ WT_RET(__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
+ log_path, WT_LOG_FILENAME, id));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
+ WT_LOG_FILENAME, id));
+
+ return (0);
+}
+
+/*
+ * __wt_log_extract_lognum --
+ *	Given a log file name, extract the log number.
+ */
+int
+__wt_log_extract_lognum(
+ WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+{
+ const char *p;
+
+ WT_UNUSED(session);
+
+ if (id == NULL || name == NULL)
+ return (WT_ERROR);
+ if ((p = strrchr(name, '.')) == NULL ||
+ sscanf(++p, "%" PRIu32, id) != 1)
+ WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+ return (0);
+}
+
+/*
+ * __wt_log_remove --
+ * Given a log number, remove that log file.
+ */
+int
+__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+ WT_DECL_ITEM(path);
+ WT_DECL_RET;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &path));
+ WT_ERR(__wt_log_filename(session, lognum, path));
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_remove: remove log %s", (char *)path->data));
+ WT_ERR(__wt_remove(session, path->data));
+err: __wt_scr_free(&path);
+ return (ret);
+}
+
+/*
+ * __log_openfile --
+ * Open a log file with the given log file number and return the WT_FH.
+ */
+static int
+__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id)
+{
+ WT_DECL_ITEM(path);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &path));
+ WT_ERR(__wt_log_filename(session, id, path));
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "opening log %s", (const char *)path->data));
+ WT_ERR(__wt_open(
+ session, path->data, ok_create, 0, WT_FILE_TYPE_LOG, fh));
+err: __wt_scr_free(&path);
+ return (ret);
+}
+
+/*
+ * __wt_log_open --
+ * Open the appropriate log file for the connection. The purpose is
+ * to find the last log file that exists, open it and set our initial
+ * LSNs to the end of that file. If none exist, call __wt_log_newfile
+ * to create it.
+ */
+int
+__wt_log_open(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t firstlog, lastlog, lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ log->fileid = lastlog;
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_open: first log %" PRIu32 " last log %" PRIu32,
+	    firstlog, lastlog));
+ log->first_lsn.file = firstlog;
+ log->first_lsn.offset = 0;
+
+ /*
+ * Start logging at the beginning of the next log file, no matter
+ * where the previous log file ends.
+ */
+ WT_ERR(__wt_log_newfile(session, 1));
+
+ /*
+ * If there were log files, run recovery.
+ * XXX belongs at a higher level than this.
+ */
+ if (logcount > 0) {
+ log->trunc_lsn = log->alloc_lsn;
+ WT_ERR(__wt_txn_recover(conn));
+ }
+
+err: __wt_log_files_free(session, logfiles, logcount);
+ return (ret);
+}
+
+/*
+ * __wt_log_close --
+ * Close the log file.
+ */
+int
+__wt_log_close(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "closing old log %s", log->log_close_fh->name));
+ WT_RET(__wt_close(session, log->log_close_fh));
+ }
+ if (log->log_fh != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "closing log %s", log->log_fh->name));
+ WT_RET(__wt_close(session, log->log_fh));
+ log->log_fh = NULL;
+ }
+ return (0);
+}
+
+/*
+ * __log_fill --
+ * Copy a thread's log records into the assigned slot.
+ */
+static int
+__log_fill(WT_SESSION_IMPL *session,
+ WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+{
+ WT_DECL_RET;
+ WT_LOG_RECORD *logrec;
+
+ logrec = (WT_LOG_RECORD *)record->mem;
+ /*
+ * Call __wt_write. For now the offset is the real byte offset.
+ * If the offset becomes a unit of LOG_ALIGN this is where we would
+ * multiply by LOG_ALIGN to get the real file byte offset for write().
+ */
+ if (direct)
+ WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+ myslot->offset + myslot->slot->slot_start_offset,
+ (size_t)logrec->len, (void *)logrec));
+ else
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
+ if (lsnp != NULL) {
+ *lsnp = myslot->slot->slot_start_lsn;
+ lsnp->offset += (wt_off_t)myslot->offset;
+ }
+err:
+ if (ret != 0 && myslot->slot->slot_error == 0)
+ myslot->slot->slot_error = ret;
+ return (ret);
+}
+
+/*
+ * __log_size_fit --
+ * Return whether or not recsize will fit in the log file.
+ */
+static int
+__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
+}
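+
+/*
+ * A worked example of the fit check, with made-up numbers: if log_file_max
+ * is 100MB (104857600 bytes), a 128-byte record at LSN offset 104857472
+ * fails the strict less-than test (104857472 + 128 == 104857600), so the
+ * record forces a switch to a new log file rather than exactly filling the
+ * current one.
+ */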
+
+/*
+ * __log_truncate --
+ *	Truncate the log to the given LSN.  If this_log is set, truncate only
+ *	the log file indicated by the given LSN.  If not set, also truncate
+ *	the log files between the given LSN and the trunc_lsn: because we
+ *	pre-allocate log files, this frees that space and keeps the log
+ *	traversable.  We use the trunc_lsn because logging has already opened
+ *	the new/next log file before recovery ran.  This function assumes it
+ *	runs during recovery or another dedicated phase, not during live
+ *	operation.
+ */
+static int
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh, *tmp_fh;
+ WT_LOG *log;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+
+ /*
+ * Truncate the log file to the given LSN.
+ */
+ WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+ WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+
+ /*
+ * If we just want to truncate the current log, return and skip
+ * looking for intervening logs.
+ */
+ if (this_log)
+ goto err;
+ WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
+ WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+ /*
+ * If there are intervening files pre-allocated,
+ * truncate them to the end of the log file header.
+ */
+ WT_ERR(__wt_ftruncate(session,
+ log_fh, LOG_FIRST_RECORD));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+ }
+ }
+err: if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ return (ret);
+}
+
+/*
+ * __log_filesize --
+ *	Return an estimate of the real end of the log file.
+ */
+static int
+__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ wt_off_t log_size, off, off1;
+ uint32_t allocsize, bufsz;
+ char *buf, *zerobuf;
+
+ conn = S2C(session);
+ log = conn->log;
+ if (eof == NULL)
+ return (0);
+ *eof = 0;
+ WT_RET(__wt_filesize(session, fh, &log_size));
+ if (log == NULL)
+ allocsize = LOG_ALIGN;
+ else
+ allocsize = log->allocsize;
+
+ /*
+ * It can be very slow looking for the last real record in the log
+ * in very small chunks. Walk backward by a megabyte at a time. When
+ * we find a part of the log that is not just zeroes, walk to find
+ * the last record.
+ */
+ buf = zerobuf = NULL;
+ if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+ bufsz = WT_MEGABYTE;
+ else
+ bufsz = allocsize;
+ WT_RET(__wt_calloc_def(session, bufsz, &buf));
+ WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+ /*
+ * Read in a chunk starting at the end of the file. Keep going until
+ * we reach the beginning or we find a chunk that contains any non-zero
+ * bytes. Compare against a known zero byte chunk.
+ */
+ for (off = log_size - (wt_off_t)bufsz;
+ off >= 0;
+ off -= (wt_off_t)bufsz) {
+ WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+ if (memcmp(buf, zerobuf, bufsz) != 0)
+ break;
+ }
+
+ /*
+	 * If we found a non-zero chunk, we walk it below by the real allocsize
+	 * to find the real end; otherwise we reached the beginning of the
+	 * file.  The offset can go negative if the log file size is not a
+	 * multiple of a megabyte, so clamp it: the first chunk of the log
+	 * file is always non-zero.
+ */
+ if (off < 0)
+ off = 0;
+
+ /*
+ * We know all log records are aligned at log->allocsize. The first
+ * item in a log record is always a 32-bit length. Look for any
+ * non-zero length at the allocsize boundary. This may not be a true
+ * log record since it could be the middle of a large record. But we
+ * know no log record starts after it. Return an estimate of the log
+ * file size.
+ */
+ for (off1 = bufsz - allocsize;
+ off1 > 0; off1 -= (wt_off_t)allocsize)
+ if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
+ break;
+ off = off + off1;
+
+ /*
+	 * Set EOF to just past the last non-zero record we found, that is,
+	 * the first zero-filled position.
+ */
+ *eof = off + (wt_off_t)allocsize;
+err:
+ if (buf != NULL)
+ __wt_free(session, buf);
+ if (zerobuf != NULL)
+ __wt_free(session, zerobuf);
+ return (ret);
+}
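+
+/*
+ * To illustrate the scan above with hypothetical numbers: for a 128MB
+ * pre-allocated file with a 128-byte allocsize, the first loop reads
+ * backward in 1MB chunks until it finds a chunk containing non-zero bytes,
+ * and the second loop then walks that chunk backward on 128-byte boundaries
+ * looking for a non-zero length word.  That is roughly 128 large reads plus
+ * at most 8,192 four-byte compares, instead of a million 128-byte reads.
+ */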
+
+/*
+ * __log_acquire --
+ * Called with the log slot lock held. Can be called recursively
+ * from __wt_log_newfile when we change log files.
+ */
+static int
+__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * Called locked. Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__wt_log_newfile(session, 0));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, SLOT_CLOSEFH);
+ }
+ /*
+	 * Checkpoints can be configured based on the amount of log written.
+	 * Add this log record to the sum and, if needed, signal the
+	 * checkpoint condition.  The logging subsystem manages the
+	 * accumulated field.  There is a bit of a layering violation here:
+	 * we check the connection's checkpoint field and use its condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)recsize;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
+ }
+
+ /*
+ * Need to minimally fill in slot info here. Our slot start LSN
+ * comes after any potential new log file creations.
+ */
+ slot->slot_start_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ /*
+ * Pre-allocate on the first real write into the log file.
+ */
+ if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
+ if (!log->log_fh->fallocate_available ||
+ (ret = __wt_fallocate(session, log->log_fh,
+ LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
+ ret = __wt_ftruncate(session, log->log_fh,
+ LOG_FIRST_RECORD + conn->log_file_max);
+ WT_RET(ret);
+ }
+
+ log->alloc_lsn.offset += (wt_off_t)recsize;
+ slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_error = 0;
+ slot->slot_fh = log->log_fh;
+ return (0);
+}
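+
+/*
+ * A sketch of the LSN arithmetic above, assuming no file switch is needed:
+ * if alloc_lsn is (file 3, offset 4096) and recsize is 512, the slot gets
+ * slot_start_lsn (3, 4096) and slot_end_lsn (3, 4608), and alloc_lsn
+ * advances to (3, 4608) so the next caller allocates contiguous space.
+ */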
+
+/*
+ * __log_release --
+ * Release a log slot.
+ */
+static int
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN sync_lsn;
+ size_t write_size;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If we're going to have to close our log file, make a local copy
+ * of the file handle structure.
+ */
+ close_fh = NULL;
+ if (F_ISSET(slot, SLOT_CLOSEFH)) {
+ close_fh = log->log_close_fh;
+ log->log_close_fh = NULL;
+ F_CLR(slot, SLOT_CLOSEFH);
+ }
+
+ /* Write the buffered records */
+ if (F_ISSET(slot, SLOT_BUFFERED)) {
+ write_size = (size_t)
+ (slot->slot_end_lsn.offset - slot->slot_start_offset);
+ WT_ERR(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could be holes
+ * in the log file.
+ */
+ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
+ __wt_yield();
+ log->write_lsn = slot->slot_end_lsn;
+ /*
+	 * Try to consolidate calls to fsync so that threads wait less.
+	 * Acquire a spin lock so that threads finishing writing to the log
+	 * wait while the current fsync completes; that fsync covers the log
+	 * up to log->write_lsn as of when the lock was taken.
+ */
+ while (F_ISSET(slot, SLOT_SYNC) &&
+ LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ continue;
+ }
+ /*
+		 * Record the current end of the log after we grabbed the
+		 * lock.  That is as far as our fsync call will guarantee.
+ */
+ sync_lsn = log->write_lsn;
+ if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ WT_STAT_FAST_CONN_INCR(session, log_sync);
+ ret = __wt_fsync(session, log->log_fh);
+ if (ret == 0) {
+ F_CLR(slot, SLOT_SYNC);
+ log->sync_lsn = sync_lsn;
+ ret = __wt_cond_signal(
+ session, log->log_sync_cond);
+ }
+ }
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ WT_ERR(ret);
+ }
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
+ /*
+ * If we have a file to close, close it now.
+ */
+ if (close_fh)
+ WT_ERR(__wt_close(session, close_fh));
+
+err: if (ret != 0 && slot->slot_error == 0)
+ slot->slot_error = ret;
+ return (ret);
+}
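+
+/*
+ * The sync loop above amounts to the following pattern: a thread needing
+ * durability either performs the fsync itself, covering everything up to
+ * write_lsn as of when it took log_sync_lock, or waits on log_sync_cond
+ * for another thread's fsync to advance sync_lsn past its slot_end_lsn.
+ * One fsync can therefore satisfy many concurrent commits.
+ */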
+
+/*
+ * __wt_log_newfile --
+ * Create the next log file and write the file header record into it.
+ */
+int
+__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+	 * Set aside the log file handle to be closed later.  Other threads
+	 * may still be using it to write to the log.  If the log file size
+	 * is small, we could fill a log file before the previous one has
+	 * been closed; wait for it to close before proceeding.
+ */
+ while (log->log_close_fh != NULL) {
+ __wt_errx(session,
+ "log_newfile: Log file size %" PRIuMAX " too small",
+ (uintmax_t)conn->log_file_max);
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ log->fileid++;
+ WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = log->log_fh->size;
+
+ /*
+ * Set up the log descriptor record. Use a scratch buffer to
+ * get correct alignment for direct I/O.
+ */
+ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+ WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+ memset(buf->mem, 0, log->allocsize);
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ desc = (WT_LOG_DESC *)logrec->record;
+ desc->log_magic = WT_LOG_MAGIC;
+ desc->majorv = WT_LOG_MAJOR_VERSION;
+ desc->minorv = WT_LOG_MINOR_VERSION;
+ desc->log_size = (uint64_t)conn->log_file_max;
+
+ /*
+ * Now that the record is set up, initialize the record header.
+ */
+ logrec->len = log->allocsize;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, log->allocsize);
+ WT_CLEAR(tmp);
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+
+ /*
+ * Recursively call __log_acquire to allocate log space for the
+ * log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __log_release because we're not waiting for
+ * earlier operations to complete.
+ */
+ WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_create) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = tmp.slot_end_lsn;
+ log->write_lsn = tmp.slot_end_lsn;
+ }
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_log_read --
+ * Read the log record at the given LSN. Return the record (including
+ * the log header) in the WT_ITEM. Caller is responsible for freeing it.
+ */
+int
+__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ uint32_t cksum, rdup_len, reclen;
+
+ WT_UNUSED(flags);
+ /*
+ * If the caller didn't give us an LSN or something to return,
+ * there's nothing to do.
+ */
+ if (lsnp == NULL || record == NULL)
+ return (0);
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If the offset isn't on an allocation boundary it must be wrong.
+ */
+ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ERR(__wt_buf_init(session, record, log->allocsize));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));
+ /*
+	 * The first 4 bytes are the real record length.  See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)record->mem;
+ if (reclen == 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+ if (reclen > log->allocsize) {
+ rdup_len = __wt_rduppo2(reclen, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
+ }
+ /*
+	 * We've read the record, now verify the checksum.
+ */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum)
+ WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
+ record->size = logrec->len;
+ WT_STAT_FAST_CONN_INCR(session, log_reads);
+err:
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __wt_log_scan --
+ * Scan the logs, calling a function on each record found.
+ */
+int
+__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
+ int (*func)(WT_SESSION_IMPL *session,
+ WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_ITEM buf;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN end_lsn, rd_lsn, start_lsn;
+ wt_off_t log_size;
+ uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+ u_int i, logcount;
+ int eol;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ eol = 0;
+ WT_CLEAR(buf);
+
+ /*
+ * If the caller did not give us a callback function there is nothing
+ * to do.
+ */
+ if (func == NULL)
+ return (0);
+
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "__wt_log_scan truncating to %u/%" PRIuMAX,
+ log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));
+
+ if (log != NULL) {
+ allocsize = log->allocsize;
+
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FIRST))
+ start_lsn = log->first_lsn;
+ else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else
+ return (WT_ERROR); /* Illegal usage */
+ } else {
+ if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR,
+ "choose either a start LSN or a start flag");
+
+ /* Offsets must be on allocation boundaries. */
+ if (lsnp->offset % allocsize != 0 ||
+ lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ /*
+ * Log cursors may not know the starting LSN. If an
+ * LSN pointer is passed in, but it is the INIT_LSN,
+ * start from the first_lsn.
+ */
+ start_lsn = *lsnp;
+ if (IS_INIT_LSN(&start_lsn))
+ start_lsn = log->first_lsn;
+ }
+ end_lsn = log->alloc_lsn;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log
+ * if log files exist. We just need to set the LSNs from what
+ * is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger
+ * records and larger allocation boundaries should always be
+ * a multiple of this.
+ */
+ allocsize = LOG_ALIGN;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ if (logcount == 0)
+ /*
+			 * Return ENOTSUP if no log files exist.
+ */
+ return (ENOTSUP);
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+ &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ start_lsn.file = firstlog;
+ end_lsn.file = lastlog;
+ start_lsn.offset = end_lsn.offset = 0;
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ }
+ WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ rd_lsn = start_lsn;
+ WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+ for (;;) {
+ if (rd_lsn.offset + allocsize > log_size) {
+advance:
+ /*
+ * If we read the last record, go to the next file.
+ */
+ WT_ERR(__wt_close(session, log_fh));
+ log_fh = NULL;
+ eol = 1;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_ERR(__log_truncate(session, &rd_lsn, 1));
+ rd_lsn.file++;
+ rd_lsn.offset = 0;
+ /*
+			 * Avoid an error message when we reach the end of
+			 * the log by checking here.
+ */
+ if (rd_lsn.file > end_lsn.file)
+ break;
+ WT_ERR(__log_openfile(
+ session, 0, &log_fh, rd_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ continue;
+ }
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ASSERT(session, buf.memsize >= allocsize);
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+ /*
+		 * The first 4 bytes are the real record length.  See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)buf.mem;
+ /*
+ * Log files are pre-allocated. We never expect a zero length
+ * unless we've reached the end of the log. The log can be
+ * written out of order, so when recovery finds the end of
+ * the log, truncate the file and remove any later log files
+ * that may exist.
+ */
+ if (reclen == 0) {
+ /* This LSN is the end. */
+ break;
+ }
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
+ /*
+ * The log file end could be the middle of this
+ * log record.
+ */
+ if (rd_lsn.offset + rdup_len > log_size)
+ goto advance;
+ /*
+ * We need to round up and read in the full padded
+ * record, especially for direct I/O.
+ */
+ WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
+ WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
+ }
+ /*
+		 * We've read the record, now verify the checksum.
+ */
+ buf.size = reclen;
+ logrec = (WT_LOG_RECORD *)buf.mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum) {
+ /*
+ * A checksum mismatch means we have reached the end of
+ * the useful part of the log. This should be found on
+ * the first pass through recovery. In the second pass
+ * where we truncate the log, this is where it should
+ * end.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ break;
+ }
+
+ /*
+ * We have a valid log record. If it is not the log file
+ * header, invoke the callback.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_scan_records);
+ if (rd_lsn.offset != 0) {
+ WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ break;
+ }
+ rd_lsn.offset += (wt_off_t)rdup_len;
+ }
+
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+ LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err: WT_STAT_FAST_CONN_INCR(session, log_scans);
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ __wt_buf_free(session, &buf);
+ /*
+	 * If the caller wants one record and it is at the end of the log,
+ * return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+ ret = WT_NOTFOUND;
+ if (ret == ENOENT)
+ ret = 0;
+ if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
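+
+/*
+ * A hypothetical caller sketch (the names here are illustrative, not part
+ * of the tree): scan every record from the start of the log, counting the
+ * records seen.
+ *
+ *	static int
+ *	count_record(WT_SESSION_IMPL *session,
+ *	    WT_ITEM *record, WT_LSN *lsnp, void *cookie)
+ *	{
+ *		WT_UNUSED(session);
+ *		WT_UNUSED(record);
+ *		WT_UNUSED(lsnp);
+ *		++*(uint64_t *)cookie;		/* Count this record. */
+ *		return (0);
+ *	}
+ *
+ *	uint64_t count = 0;
+ *	WT_RET(__wt_log_scan(
+ *	    session, NULL, WT_LOGSCAN_FIRST, count_record, &count));
+ */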
+
+/*
+ * __log_direct_write --
+ * Write a log record without using the consolidation arrays.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+ int locked;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ log = S2C(session)->log;
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+ WT_CLEAR(tmp);
+
+ /* Fast path the contended case. */
+ if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+ return (EAGAIN);
+ locked = 1;
+
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(&tmp, SLOT_SYNC);
+ WT_ERR(__log_acquire(session, record->size, &tmp));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+ WT_ERR(__log_release(session, &tmp));
+
+err: if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ return (ret);
+}
+
+/*
+ * __wt_log_write --
+ * Write a record into the log.
+ */
+int
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN lsn;
+ WT_MYSLOT myslot;
+ uint32_t rdup_len;
+ int locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ INIT_LSN(&lsn);
+ myslot.slot = NULL;
+ /*
+ * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
+ * a header at the beginning for us to fill in.
+ *
+ * If using direct_io, the caller should pass us an aligned record.
+ * But we need to make sure it is big enough and zero-filled so
+ * that we can write the full amount. Do this whether or not
+ * direct_io is in use because it makes the reading code cleaner.
+ */
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
+ rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ASSERT(session, record->data == record->mem);
+ /*
+ * If the caller's record only partially fills the necessary
+ * space, we need to zero-fill the remainder.
+ */
+ if (record->size != rdup_len) {
+ memset((uint8_t *)record->mem + record->size, 0,
+ rdup_len - record->size);
+ record->size = rdup_len;
+ }
+ logrec = (WT_LOG_RECORD *)record->mem;
+ logrec->len = (uint32_t)record->size;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, record->size);
+
+ WT_STAT_FAST_CONN_INCR(session, log_writes);
+
+ if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+ ret = __log_direct_write(session, record, lsnp, flags);
+ if (ret == 0)
+ return (0);
+ if (ret != EAGAIN)
+ WT_ERR(ret);
+ /*
+ * An EAGAIN return means we failed to get the try lock -
+ * fall through to the consolidation code in that case.
+ */
+ }
+
+ /*
+ * As soon as we see contention for the log slot, disable direct
+ * log writes. We get better performance by forcing writes through
+ * the consolidation code. This is because individual writes flood
+ * the I/O system faster than they contend on the log slot lock.
+ */
+ F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+ if ((ret = __wt_log_slot_join(
+ session, rdup_len, flags, &myslot)) == ENOMEM) {
+ /*
+		 * If we couldn't find a consolidated slot for this record,
+		 * write the record directly.
+ */
+ while ((ret = __log_direct_write(
+ session, record, lsnp, flags)) == EAGAIN)
+ ;
+ WT_ERR(ret);
+ /*
+ * Increase the buffer size of any slots we can get access
+ * to, so future consolidations are likely to succeed.
+ */
+ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
+ return (0);
+ }
+ WT_ERR(ret);
+ if (myslot.offset == 0) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ locked = 1;
+ WT_ERR(__wt_log_slot_close(session, myslot.slot));
+ WT_ERR(__log_acquire(
+ session, myslot.slot->slot_group_size, myslot.slot));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__wt_log_slot_notify(session, myslot.slot));
+ } else
+ WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+ WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
+ if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
+ WT_ERR(__log_release(session, myslot.slot));
+ WT_ERR(__wt_log_slot_free(myslot.slot));
+ } else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /* Wait for our writes to reach disk */
+ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ }
+err:
+ if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ if (ret == 0 && lsnp != NULL)
+ *lsnp = lsn;
+ /*
+ * If we're synchronous and some thread had an error, we don't know
+ * if our write made it out to the file or not. The error could be
+ * before or after us. So, if anyone got an error, we report it.
+ * If we're not synchronous, only report if our own operation got
+ * an error.
+ */
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
+ myslot.slot != NULL)
+ ret = myslot.slot->slot_error;
+ return (ret);
+}
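+
+/*
+ * A worked example of the padding above: with a 128-byte allocsize, a
+ * 100-byte record (header included) is rounded up to an rdup_len of 128,
+ * the trailing 28 bytes are zero-filled, logrec->len becomes 128 and the
+ * checksum is computed over the full padded record with the checksum field
+ * itself zeroed, matching how __wt_log_read and __wt_log_scan verify it.
+ */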
+
+/*
+ * __wt_log_vprintf --
+ * Write a message into the log.
+ */
+int
+__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ va_list ap_copy;
+ const char *rec_fmt = WT_UNCHECKED_STRING(I);
+ uint32_t rectype = WT_LOGREC_MESSAGE;
+ size_t header_size, len;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+
+ va_copy(ap_copy, ap);
+ len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
+ va_end(ap_copy);
+
+ WT_RET(
+ __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
+
+ /*
+ * We're writing a record with the type (an integer) followed by a
+ * string (NUL-terminated data). To avoid writing the string into
+ * a buffer before copying it, we write the header first, then the
+ * raw bytes of the string.
+ */
+ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ rec_fmt, rectype));
+ logrec->size += (uint32_t)header_size;
+
+ (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_printf: %s", (char *)logrec->data + logrec->size));
+
+ logrec->size += len;
+ WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err: __wt_scr_free(&logrec);
+ return (ret);
+}
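+
+/*
+ * For example, logging the message "hello" through this path produces a
+ * record laid out as the WT_LOG_RECORD header, then the packed
+ * WT_LOGREC_MESSAGE record type, then the raw bytes "hello\0";
+ * __wt_log_write then pads and checksums the whole record as usual.
+ */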
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
new file mode 100644
index 00000000000..f3db79f4daf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -0,0 +1,437 @@
+/* DO NOT EDIT: automatically built by dist/log.py. */
+
+#include "wt_internal.h"
+
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+ WT_ITEM *logrec;
+
+ WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec));
+ WT_CLEAR(*(WT_LOG_RECORD *)logrec->data);
+ logrec->size = offsetof(WT_LOG_RECORD, record);
+
+ *logrecp = logrec;
+ return (0);
+}
+
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+ WT_UNUSED(session);
+ __wt_scr_free(logrecp);
+}
+
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+ uint64_t rectype;
+
+ WT_UNUSED(session);
+ WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype));
+ *rectypep = (uint32_t)rectype;
+ return (0);
+}
+
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end,
+ uint32_t *optypep, uint32_t *opsizep)
+{
+ return (__wt_struct_unpack(
+ session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep));
+}
+
+int
+__wt_logop_col_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_col_put_unpack(
+ session, pp, end, &fileid, &recno, &value));
+
+ fprintf(out, " \"optype\": \"col_put\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_col_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+
+ WT_RET(__wt_logop_col_remove_unpack(
+ session, pp, end, &fileid, &recno));
+
+ fprintf(out, " \"optype\": \"col_remove\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t start, uint64_t stop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *startp, uint64_t *stopp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t start;
+ uint64_t stop;
+
+ WT_RET(__wt_logop_col_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop));
+
+ fprintf(out, " \"optype\": \"col_truncate\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start);
+ fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop);
+ return (0);
+}
+
+int
+__wt_logop_row_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_row_put_unpack(
+ session, pp, end, &fileid, &key, &value));
+
+ fprintf(out, " \"optype\": \"row_put\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_row_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+
+ WT_RET(__wt_logop_row_remove_unpack(
+ session, pp, end, &fileid, &key));
+
+ fprintf(out, " \"optype\": \"row_remove\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop, mode));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop, mode));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp, modep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM start;
+ WT_ITEM stop;
+ uint32_t mode;
+
+ WT_RET(__wt_logop_row_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop, &mode));
+
+ fprintf(out, " \"optype\": \"row_truncate\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%.*s\",\n",
+ (int)start.size, (const char *)start.data);
+ fprintf(out, " \"stop\": \"%.*s\",\n",
+ (int)stop.size, (const char *)stop.data);
+ fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", mode);
+ return (0);
+}
+
+int
+__wt_txn_op_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t optype, opsize;
+
+ /* Peek at the size and the type. */
+ WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ break;
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
new file mode 100644
index 00000000000..c12f47d231b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file implements the consolidated array algorithm as described in
+ * the paper:
+ * Scalability of write-ahead logging on multicore and multisocket hardware
+ * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
+ * and Anastasia Ailamaki.
+ *
+ * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
+ * be found at:
+ * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ */
+
+/*
+ * __wt_log_slot_init --
+ * Initialize the slot array.
+ */
+int
+__wt_log_slot_init(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int32_t i;
+
+ conn = S2C(session);
+ log = conn->log;
+ for (i = 0; i < SLOT_POOL; i++) {
+ log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
+ log->slot_pool[i].slot_index = SLOT_INVALID_INDEX;
+ }
+
+ /*
+ * Set up the available slots from the pool the first time.
+ */
+ for (i = 0; i < SLOT_ACTIVE; i++) {
+ slot = &log->slot_pool[i];
+ slot->slot_index = (uint32_t)i;
+ slot->slot_state = WT_LOG_SLOT_READY;
+ log->slot_array[i] = slot;
+ }
+
+ /*
+	 * Allocate memory for buffers now that the arrays are set up.  Split
+ * this out to make error handling simpler.
+ */
+ for (i = 0; i < SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE));
+ F_SET(&log->slot_pool[i], SLOT_BUFFERED);
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL);
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
+ */
+int
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ for (i = 0; i < SLOT_POOL; i++)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ return (0);
+}
+
+/*
+ * __wt_log_slot_join --
+ *	Join a consolidated logging slot.  Callers should be prepared to deal
+ *	with an ENOMEM return, which indicates no slot could accommodate
+ *	the log record.
+ */
+int
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslotp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t cur_state, new_state, old_state;
+ uint32_t allocated_slot, slot_grow_attempts;
+
+ conn = S2C(session);
+ log = conn->log;
+ slot_grow_attempts = 0;
+find_slot:
+ allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE;
+ slot = log->slot_array[allocated_slot];
+ old_state = slot->slot_state;
+join_slot:
+ /*
+ * WT_LOG_SLOT_READY and higher means the slot is available for
+ * joining. Any other state means it is in use and transitioning
+ * from the active array.
+ */
+ if (old_state < WT_LOG_SLOT_READY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
+ goto find_slot;
+ }
+ /*
+ * Add in our size to the state and then atomically swap that
+ * into place if it is still the same value.
+ */
+ new_state = old_state + (int64_t)mysize;
+ if (new_state < old_state) {
+ /* Our size doesn't fit here. */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
+ goto find_slot;
+ }
+ /*
+ * If the slot buffer isn't big enough to hold this update, mark
+ * the slot for a buffer size increase and find another slot.
+ */
+ if (new_state > (int64_t)slot->slot_buf.memsize) {
+ F_SET(slot, SLOT_BUF_GROW);
+ if (++slot_grow_attempts > 5) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
+ return (ENOMEM);
+ }
+ goto find_slot;
+ }
+ cur_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, old_state, new_state);
+ /*
+ * We lost a race to add our size into this slot. Check the state
+ * and try again.
+ */
+ if (cur_state != old_state) {
+ old_state = cur_state;
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ goto join_slot;
+ }
+ WT_ASSERT(session, myslotp != NULL);
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, SLOT_SYNC);
+ myslotp->slot = slot;
+ myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
+ return (0);
+}
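+
+/*
+ * A worked example of the state arithmetic, with made-up numbers: a freshly
+ * readied slot has slot_state == WT_LOG_SLOT_READY.  If thread A joins with
+ * a 512-byte record and thread B with 256 bytes, the CAS loop moves the
+ * state to WT_LOG_SLOT_READY + 768, and the threads are handed offsets 0
+ * and 512 respectively (old_state - WT_LOG_SLOT_READY), so each copies into
+ * a disjoint region of the slot buffer.
+ */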
+
+/*
+ * __wt_log_slot_close --
+ * Close a slot and do not allow any other threads to join this slot.
+ * Remove this from the active slot array and move a new slot from
+ *	the pool into its place.  Set up the size of this group.
+ *	Must be called with the logging spinlock held.
+ */
+int
+__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *newslot;
+ int64_t old_state;
+ int32_t yields;
+ uint32_t pool_i, switch_fails;
+
+ conn = S2C(session);
+ log = conn->log;
+ switch_fails = 0;
+retry:
+ /*
+ * Find an unused slot in the pool.
+ */
+ pool_i = log->pool_index;
+ newslot = &log->slot_pool[pool_i];
+ if (++log->pool_index >= SLOT_POOL)
+ log->pool_index = 0;
+ if (newslot->slot_state != WT_LOG_SLOT_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails);
+ /*
+		 * If it takes a number of attempts to find an available slot,
+		 * it's likely all slots are waiting to be released.  This
+		 * churn count is used to change how long we pause before
+		 * closing the slot, which leads to more consolidation and
+		 * less churn.
+ */
+ if (++switch_fails % SLOT_POOL == 0 &&
+ switch_fails != 0 && slot->slot_churn < 5)
+ ++slot->slot_churn;
+ __wt_yield();
+ goto retry;
+ } else if (slot->slot_churn > 0) {
+ --slot->slot_churn;
+ WT_ASSERT(session, slot->slot_churn >= 0);
+ }
+
+ /* Pause to allow other threads a chance to consolidate. */
+ for (yields = slot->slot_churn; yields >= 0; yields--)
+ __wt_yield();
+
+ /*
+ * Swap out the slot we're going to use and put a free one in the
+ * slot array in its place so that threads can use it right away.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ newslot->slot_state = WT_LOG_SLOT_READY;
+ newslot->slot_index = slot->slot_index;
+ log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i];
+ old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
+ slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
+ /*
+ * Note that this statistic may be much bigger than in reality,
+ * especially when compared with the total bytes written in
+ * __log_fill. The reason is that this size reflects any
+ * rounding up that is needed and the total bytes in __log_fill
+ * is the amount of user bytes.
+ */
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, (uint64_t)slot->slot_group_size);
+ return (0);
+}
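+
+/*
+ * Continuing the example from __wt_log_slot_join: when the leader closes
+ * the slot, the state is atomically replaced with WT_LOG_SLOT_PENDING and
+ * slot_group_size becomes old_state - WT_LOG_SLOT_READY (768 bytes for the
+ * two joiners), which is exactly the amount of log space the leader then
+ * reserves with __log_acquire.
+ */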
+
+/*
+ * __wt_log_slot_notify --
+ * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
+ */
+int
+__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_UNUSED(session);
+
+ slot->slot_state =
+ (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_wait --
+ * Wait for slot leader to allocate log area and tell us our log offset.
+ */
+int
+__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_UNUSED(session);
+
+ while (slot->slot_state > WT_LOG_SLOT_DONE)
+ __wt_yield();
+ return (0);
+}
+
+/*
+ * __wt_log_slot_release --
+ * Each thread in a consolidated group releases its portion to
+ * signal it has completed writing its piece of the log.
+ */
+int64_t
+__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+{
+ int64_t newsize;
+
+ /*
+ * Add my size into the state. When it reaches WT_LOG_SLOT_DONE
+	 * all participating threads have completed copying their piece.
+ */
+ newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size);
+ return (newsize);
+}
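+
+/*
+ * Finishing the worked example: __wt_log_slot_notify sets the state to
+ * WT_LOG_SLOT_DONE - 768, then each joiner adds its own size back as it
+ * finishes copying (512, then 256).  The thread whose addition brings the
+ * state to exactly WT_LOG_SLOT_DONE knows the buffer is complete and
+ * releases the slot to be written and freed.
+ */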
+
+/*
+ * __wt_log_slot_free --
+ * Free a slot back into the pool.
+ */
+int
+__wt_log_slot_free(WT_LOGSLOT *slot)
+{
+ slot->slot_state = WT_LOG_SLOT_FREE;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_grow_buffers --
+ * Increase the buffer size of all available slots in the buffer pool.
+ * Go to some lengths to include active (but unused) slots to handle
+ * the case where all log write record sizes exceed the size of the
+ * active buffer.
+ */
+int
+__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t orig_state;
+ uint64_t old_size, total_growth;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+ total_growth = 0;
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ /*
+ * Take the log slot lock to prevent other threads growing buffers
+ * at the same time. Could tighten the scope of this lock, or have
+ * a separate lock if there is contention.
+ */
+ __wt_spin_lock(session, &log->log_slot_lock);
+ for (i = 0; i < SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ /* Avoid atomic operations if they won't succeed. */
+ if (slot->slot_state != WT_LOG_SLOT_FREE &&
+ slot->slot_state != WT_LOG_SLOT_READY)
+ continue;
+ /* Don't keep growing unrelated buffers. */
+ if (slot->slot_buf.memsize > (10 * newsize) &&
+ !F_ISSET(slot, SLOT_BUF_GROW))
+ continue;
+ orig_state = WT_ATOMIC_CAS_VAL8(
+ slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_FREE) {
+ orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
+ WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_READY)
+ continue;
+ }
+
+ /* We have a slot - now go ahead and grow the buffer. */
+ old_size = slot->slot_buf.memsize;
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
+ WT_MAX(slot->slot_buf.memsize * 2, newsize)));
+ slot->slot_state = orig_state;
+ total_growth += slot->slot_buf.memsize - old_size;
+ }
+err: __wt_spin_unlock(session, &log->log_slot_lock);
+ WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
new file mode 100644
index 00000000000..f50706fb2e9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -0,0 +1,1519 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_FORALL_CURSORS(clsm, c, i) \
+ for ((i) = (clsm)->nchunks; (i) > 0;) \
+ if (((c) = (clsm)->cursors[--i]) != NULL)
+
+#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
+ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp)
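+
+/*
+ * Note the iteration order: WT_FORALL_CURSORS visits the chunk cursors
+ * from newest (cursors[nchunks - 1]) to oldest (cursors[0]), skipping NULL
+ * entries, so searches encounter the most recently written version of a
+ * key first.
+ */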
+
+static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *);
+static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t);
+static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *);
+
+/*
+ * __clsm_enter_update --
+ * Make sure an LSM cursor is ready to perform an update.
+ */
+static int
+__clsm_enter_update(WT_CURSOR_LSM *clsm)
+{
+ WT_CURSOR *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *primary_chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int have_primary, ovfl, waited;
+
+ lsm_tree = clsm->lsm_tree;
+ if (clsm->nchunks == 0 ||
+ (primary = clsm->cursors[clsm->nchunks - 1]) == NULL)
+ return (0);
+ session = (WT_SESSION_IMPL *)primary->session;
+ primary_chunk = clsm->primary_chunk;
+ have_primary = (primary_chunk != NULL &&
+ primary_chunk->switch_txn == WT_TXN_NONE);
+ ovfl = 0;
+
+ /*
+ * In LSM there are multiple btrees active at one time. The tree
+ * switch code needs to use btree API methods, and it wants to
+ * operate on the btree for the primary chunk. Set that up now.
+ *
+ * If the primary chunk has grown too large, set a flag so the worker
+ * thread will switch when it gets a chance to avoid introducing high
+ * latency into application threads. Don't do this indefinitely: if a
+ * chunk grows twice as large as the configured size, block until it
+ * can be switched.
+ */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ if (have_primary)
+ WT_WITH_BTREE(session,
+ ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, lsm_tree->chunk_size));
+
+ if (ovfl || !have_primary) {
+ /*
+ * Check that we are up-to-date: don't set the switch
+ * if the tree has changed since we last opened
+			 * cursors; that can lead to switching multiple times
+ * when only one switch is required, creating very
+ * small chunks.
+ */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (clsm->dsk_gen == lsm_tree->dsk_gen &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ ret = __wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ }
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_RET(ret);
+ ovfl = 0;
+ }
+ } else if (have_primary)
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, 2 * lsm_tree->chunk_size));
+
+ /*
+ * If there is no primary chunk, or it has really overflowed, which
+ * either means a worker thread has fallen behind or there has just
+ * been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread if we got to
+ * here, but that is problematic because there is a transaction in
+ * progress and it could roll back, leaving the metadata inconsistent.
+ */
+ if (ovfl || !have_primary) {
+ for (waited = 0;
+ clsm->dsk_gen == lsm_tree->dsk_gen;
+ ++waited) {
+ if (waited % 100 == 0)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_enter --
+ * Start an operation on an LSM cursor, update if the tree has changed.
+ */
+static inline int
+__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t *switch_txnp;
+ uint64_t snap_min;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /* Merge cursors never update. */
+ if (F_ISSET(clsm, WT_CLSM_MERGE))
+ return (0);
+
+ if (reset) {
+ WT_ASSERT(session, !F_ISSET(&clsm->iface,
+ WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
+ WT_RET(__clsm_reset_cursors(clsm, NULL));
+ }
+
+ for (;;) {
+ /*
+ * If the cursor looks up-to-date, check if the cache is full.
+ * In case this call blocks, the check will be repeated before
+ * proceeding.
+ */
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ WT_RET(__wt_cache_full_check(session));
+
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /* Update the maximum transaction ID in the primary chunk. */
+ if (update) {
+ WT_RET(__clsm_enter_update(clsm));
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /*
+ * Ensure that there is a transaction snapshot active.
+ */
+ WT_RET(__wt_txn_autocommit_check(session));
+
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT)
+ __wt_txn_cursor_op(session);
+
+ /*
+ * Figure out how many updates are required for
+ * snapshot isolation.
+ *
+ * This is not a normal visibility check on the maximum
+ * transaction ID in each chunk: any transaction ID
+ * that overlaps with our snapshot is a potential
+ * conflict.
+ */
+ clsm->nupdates = 1;
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ WT_ASSERT(session,
+ F_ISSET(&session->txn, TXN_HAS_SNAPSHOT));
+ snap_min = session->txn.snap_min;
+ for (switch_txnp =
+ &clsm->switch_txn[clsm->nchunks - 2];
+ clsm->nupdates < clsm->nchunks;
+ clsm->nupdates++, switch_txnp--) {
+ if (TXNID_LT(*switch_txnp, snap_min))
+ break;
+ WT_ASSERT(session,
+ !__wt_txn_visible_all(
+ session, *switch_txnp));
+ }
+ }
+ }
+
+ /*
+ * Stop when we are up-to-date, as long as this is:
+ * - a snapshot isolation update and the cursor is set up for
+ * that;
+ * - an update operation with a primary chunk, or
+ * - a read operation and the cursor is open for reading.
+ */
+ if ((!update ||
+ session->txn.isolation != TXN_ISO_SNAPSHOT ||
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
+ ((update && clsm->primary_chunk != NULL) ||
+ (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
+ break;
+
+open: WT_WITH_SCHEMA_LOCK(session,
+ ret = __clsm_open_cursors(clsm, update, 0, 0));
+ WT_RET(ret);
+ }
+
+ if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ WT_RET(__cursor_enter(session));
+ F_SET(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_leave --
+ * Finish an operation on an LSM cursor.
+ */
+static int
+__clsm_leave(WT_CURSOR_LSM *clsm)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ if (F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ WT_RET(__cursor_leave(session));
+ F_CLR(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
+}
+
+/*
+ * We need a tombstone to mark deleted records, and we use the special
+ * value below for that purpose. We use two 0x14 (Device Control 4) bytes to
+ * minimize the likelihood of colliding with an application-chosen encoding
+ * byte.  If the application uses two leading DC4 bytes for some reason, we'll do
+ * a wasted data copy each time a new value is inserted into the object.
+ */
+static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 };
+
+/*
+ * __clsm_deleted --
+ * Check whether the current value is a tombstone.
+ */
+static inline int
+__clsm_deleted(WT_CURSOR_LSM *clsm, const WT_ITEM *item)
+{
+ return (!F_ISSET(clsm, WT_CLSM_MINOR_MERGE) &&
+ item->size == __tombstone.size &&
+ memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
+}
+
+/*
+ * __clsm_deleted_encode --
+ * Encode values that collide with the tombstone's name space.
+ */
+static inline int
+__clsm_deleted_encode(WT_SESSION_IMPL *session,
+ const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
+{
+ WT_ITEM *tmp;
+
+ /*
+ * If value requires encoding, get a scratch buffer of the right size
+ * and create a copy of the data with the first byte of the tombstone
+ * appended.
+ */
+ if (value->size >= __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0) {
+ WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
+ tmp = *tmpp;
+
+ memcpy(tmp->mem, value->data, value->size);
+ memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1);
+ final_value->data = tmp->mem;
+ final_value->size = value->size + 1;
+ } else {
+ final_value->data = value->data;
+ final_value->size = value->size;
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_deleted_decode --
+ * Decode values that start with the tombstone.
+ */
+static inline void
+__clsm_deleted_decode(WT_ITEM *value)
+{
+ /*
+ * Take care with this check: when an LSM cursor is used for a merge,
+ * and/or to create a Bloom filter, it is valid to return the tombstone
+ * value.
+ */
+ if (value->size > __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
+ --value->size;
+}
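+
+/*
+ * A worked example of the encoding (illustrative bytes only): storing
+ * the two-byte value "\x14\x14" would collide with the tombstone, so
+ * __clsm_deleted_encode appends another DC4 byte and writes the
+ * three-byte value "\x14\x14\x14". On read, __clsm_deleted_decode sees
+ * a value longer than the tombstone with a matching prefix and trims it
+ * back to "\x14\x14". A stored value of exactly "\x14\x14" is therefore
+ * unambiguous: it can only be a tombstone written by __clsm_remove.
+ */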
+
+/*
+ * __clsm_close_cursors --
+ * Close any btree cursors that are not needed.
+ */
+static int
+__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ u_int i;
+
+ if (clsm->cursors == NULL || clsm->nchunks == 0)
+ return (0);
+
+ /*
+ * Walk the cursors, closing any we don't need. Note that the exit
+ * condition here is special: we don't use WT_FORALL_CURSORS, and we
+ * must be careful with unsigned integer wrapping.
+ */
+ for (i = start; i < end; i++) {
+ if ((c = (clsm)->cursors[i]) != NULL) {
+ clsm->cursors[i] = NULL;
+ WT_RET(c->close(c));
+ }
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ clsm->blooms[i] = NULL;
+ WT_RET(__wt_bloom_close(bloom));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_open_cursors --
+ * Open cursors for the current set of files.
+ */
+static int
+__clsm_open_cursors(
+ WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *c, **cp, *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+ const char *checkpoint, *ckpt_cfg[3];
+ uint64_t saved_gen;
+ u_int i, nchunks, ngood, nupdates;
+ int locked;
+
+ c = &clsm->iface;
+ session = (WT_SESSION_IMPL *)c->session;
+ txn = &session->txn;
+ lsm_tree = clsm->lsm_tree;
+ chunk = NULL;
+
+ ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
+ ckpt_cfg[2] = NULL;
+
+ /* Copy the key, so we don't lose the cursor position. */
+ if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
+ WT_RET(__wt_buf_set(
+ session, &c->key, c->key.data, c->key.size));
+
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ if (update) {
+ if (txn->isolation == TXN_ISO_SNAPSHOT)
+ F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
+ } else
+ F_SET(clsm, WT_CLSM_OPEN_READ);
+
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * If there is no in-memory chunk in the tree for an update operation,
+ * create one.
+ *
+ * !!!
+ * It is exceedingly unlikely that we get here at all, but if we were to
+ * switch chunks in this thread and our transaction rolled back, it would
+ * leave the metadata inconsistent. Signal for the LSM worker thread
+ * to create the chunk instead to avoid the issue.
+ */
+ if (update && (lsm_tree->nchunks == 0 ||
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
+ chunk->switch_txn != WT_TXN_NONE)) {
+ /* Release our lock because switch will get a write lock. */
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * Give the worker thread a chance to run before locking the
+ * tree again -- we will loop in __clsm_enter until there is an
+ * in-memory chunk in the tree.
+ */
+ __wt_sleep(0, 1000);
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ }
+
+ /* Merge cursors have already figured out how many chunks they need. */
+retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
+ nchunks = clsm->nchunks;
+ ngood = 0;
+
+ /*
+ * We may have raced with another merge completing. Check that
+ * we're starting at the right offset in the chunk array.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id) {
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++) {
+ chunk = lsm_tree->chunk[start_chunk];
+ if (chunk->id == start_id)
+ break;
+ }
+ /* We have to find the start chunk: merge locked it. */
+ WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
+ }
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ } else {
+ nchunks = lsm_tree->nchunks;
+
+ /*
+ * If we are only opening the cursor for updates, only open the
+ * primary chunk, plus any other chunks that might be required
+ * to detect snapshot isolation conflicts.
+ */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->txnid_alloc, nchunks,
+ &clsm->switch_txn));
+ if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
+ ngood = nupdates = 0;
+ else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ /*
+ * Keep going until all updates in the next
+ * chunk are globally visible. Copy the maximum
+ * transaction IDs into the cursor as we go.
+ */
+ for (ngood = nchunks - 1, nupdates = 1;
+ ngood > 0;
+ ngood--, nupdates++) {
+ chunk = lsm_tree->chunk[ngood - 1];
+ clsm->switch_txn[ngood - 1] = chunk->switch_txn;
+ if (__wt_txn_visible_all(
+ session, chunk->switch_txn))
+ break;
+ }
+ } else {
+ nupdates = 1;
+ ngood = nchunks - 1;
+ }
+
+ /* Check how many cursors are already open. */
+ for (cp = clsm->cursors + ngood;
+ ngood < clsm->nchunks && ngood < nchunks;
+ cp++, ngood++) {
+ chunk = lsm_tree->chunk[ngood];
+
+ /* If the cursor isn't open yet, we're done. */
+ if (*cp == NULL)
+ break;
+
+ /* Easy case: the URIs don't match. */
+ if (strcmp((*cp)->uri, chunk->uri) != 0)
+ break;
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ if (checkpoint == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty)
+ break;
+
+ /* Make sure the Bloom config matches. */
+ if (clsm->blooms[ngood] == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ break;
+ }
+
+ /* Spurious generation bump? */
+ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+ goto err;
+ }
+
+ /*
+ * Close any cursors we no longer need. If the cursor is a
+ * pure update cursor, close everything -- we usually only need
+ * a single chunk open in that case and we haven't walked all
+ * of the other slots in the loop above.
+ *
+ * Drop the LSM tree lock while we do this: if the cache is
+ * full, we may block while closing a cursor. Save the
+ * generation number and retry if it has changed under us.
+ */
+ if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
+ ngood = 0;
+ if (clsm->cursors != NULL && ngood < clsm->nchunks) {
+ saved_gen = lsm_tree->dsk_gen;
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_ERR(__clsm_close_cursors(
+ clsm, ngood, clsm->nchunks));
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ if (lsm_tree->dsk_gen != saved_gen)
+ goto retry;
+ }
+
+ /* Detach from our old primary. */
+ clsm->primary_chunk = NULL;
+ clsm->current = NULL;
+ }
+
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->bloom_alloc, nchunks, &clsm->blooms));
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->cursor_alloc, nchunks, &clsm->cursors));
+
+ clsm->nchunks = nchunks;
+
+ /* Open the cursors for chunks that have changed. */
+ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+ /* Copy the maximum transaction ID. */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ clsm->switch_txn[i] = chunk->switch_txn;
+
+ /*
+ * Read from the checkpoint if the file has been written.
+ * Once all cursors switch, the in-memory tree can be evicted.
+ */
+ WT_ASSERT(session, *cp == NULL);
+ ret = __wt_open_cursor(session, chunk->uri, c,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
+ ckpt_cfg : NULL, cp);
+
+ /*
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ ret = __wt_open_cursor(
+ session, chunk->uri, c, NULL, cp);
+ if (ret == 0)
+ chunk->empty = 1;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Set up all cursors other than the primary to do conflict
+ * checks only on insert operations. This allows us to execute
+ * inserts on non-primary chunks as a way of checking for
+ * write conflicts with concurrent updates.
+ */
+ if (i != nchunks - 1)
+ (*cp)->insert = __wt_curfile_update_check;
+
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
+ lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count,
+ c, &clsm->blooms[i]));
+
+ /* Child cursors always use overwrite and raw mode. */
+ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
+ }
+
+ /* The last chunk is our new primary. */
+ if (chunk != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ chunk->switch_txn == WT_TXN_NONE) {
+ clsm->primary_chunk = chunk;
+ primary = clsm->cursors[clsm->nchunks - 1];
+ /*
+ * Disable eviction for the in-memory chunk. Also clear the
+ * bulk load flag here, otherwise eviction will be enabled by
+ * the first update.
+ */
+ btree = ((WT_CURSOR_BTREE *)(primary))->btree;
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ WT_WITH_BTREE(session, btree,
+ __wt_btree_evictable(session, 0));
+ }
+ }
+
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+
+err:
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that all cursors are open as expected. */
+ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
+ for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+
+ /* Make sure the cursor is open. */
+ WT_ASSERT(session, *cp != NULL);
+
+ /* Easy case: the URIs should match. */
+ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty) ?
+ checkpoint != NULL : checkpoint == NULL);
+
+ /* Make sure the Bloom config matches. */
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
+ !F_ISSET(clsm, WT_CLSM_MERGE)) ?
+ clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
+ }
+ }
+#endif
+ if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_clsm_init_merge --
+ * Initialize an LSM cursor for a merge.
+ */
+int
+__wt_clsm_init_merge(
+ WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ F_SET(clsm, WT_CLSM_MERGE);
+ if (start_chunk != 0)
+ F_SET(clsm, WT_CLSM_MINOR_MERGE);
+ clsm->nchunks = nchunks;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __clsm_open_cursors(clsm, 0, start_chunk, start_id));
+ return (ret);
+}
+
+/*
+ * __clsm_get_current --
+ * Find the smallest / largest of the cursors and copy its key/value.
+ */
+static int
+__clsm_get_current(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp)
+{
+ WT_CURSOR *c, *current;
+ int cmp, multiple;
+ u_int i;
+
+ current = NULL;
+ multiple = 0;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (current == NULL) {
+ current = c;
+ continue;
+ }
+ WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
+ if (smallest ? cmp < 0 : cmp > 0) {
+ current = c;
+ multiple = 0;
+ } else if (cmp == 0)
+ multiple = 1;
+ }
+
+ c = &clsm->iface;
+ if ((clsm->current = current) == NULL) {
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ return (WT_NOTFOUND);
+ }
+
+ if (multiple)
+ F_SET(clsm, WT_CLSM_MULTIPLE);
+ else
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+
+ WT_RET(current->get_key(current, &c->key));
+ WT_RET(current->get_value(current, &c->value));
+
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((*deletedp = __clsm_deleted(clsm, &c->value)) == 0)
+ F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ return (0);
+}
+
+/*
+ * __clsm_compare --
+ * WT_CURSOR->compare implementation for the LSM cursor type.
+ */
+static int
+__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LSM *alsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
+ alsm = (WT_CURSOR_LSM *)a;
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare the keys.
+ */
+ if (strcmp(a->uri, b->uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+
+ WT_CURSOR_NEEDKEY(a);
+ WT_CURSOR_NEEDKEY(b);
+
+ WT_ERR(__wt_compare(
+ session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_next --
+ * WT_CURSOR->next method for the LSM cursor type.
+ */
+static int
+__clsm_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a forward scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->next(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp < 0)
+ ret = c->next(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ } else
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_NEXT);
+ F_CLR(clsm, WT_CLSM_ITERATE_PREV);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * forward.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the smallest cursor forward. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+
+ /* Find the cursor(s) with the smallest key. */
+ if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_prev --
+ * WT_CURSOR->prev method for the LSM cursor type.
+ */
+static int
+__clsm_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a reverse scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->prev(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp > 0)
+ ret = c->prev(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_PREV);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * backwards.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the largest cursor backwards. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+
+ /* Find the cursor(s) with the largest key. */
+ if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_reset_cursors --
+ * Reset any positioned chunk cursors.
+ *
+ * If the skip parameter is non-NULL, that cursor is about to be used, so
+ * there is no need to reset it.
+ */
+static int
+__clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ u_int i;
+
+ /* Fast path if the cursor is not positioned. */
+ if ((clsm->current == NULL || clsm->current == skip) &&
+ !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV))
+ return (0);
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (c == skip)
+ continue;
+ if (F_ISSET(c, WT_CURSTD_KEY_INT))
+ WT_TRET(c->reset(c));
+ }
+
+ clsm->current = NULL;
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ return (ret);
+}
+
+/*
+ * __clsm_reset --
+ * WT_CURSOR->reset method for the LSM cursor type.
+ */
+static int
+__clsm_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when all
+ * we want to do is give up our position.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ WT_TRET(__clsm_reset_cursors(clsm, NULL));
+
+ /* In case we were left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_lookup --
+ * Position an LSM cursor.
+ */
+static int
+__clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
+{
+ WT_BLOOM *bloom;
+ WT_BLOOM_HASH bhash;
+ WT_CURSOR *c, *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int have_hash;
+
+ c = NULL;
+ cursor = &clsm->iface;
+ have_hash = 0;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ /* If there is a Bloom filter, see if we can skip the read. */
+ bloom = NULL;
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ if (!have_hash) {
+ WT_ERR(__wt_bloom_hash(
+ bloom, &cursor->key, &bhash));
+ have_hash = 1;
+ }
+
+ ret = __wt_bloom_hash_get(bloom, &bhash);
+ if (ret == WT_NOTFOUND) {
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_miss);
+ continue;
+ } else if (ret == 0)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_hit);
+ WT_ERR(ret);
+ }
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search(c)) == 0) {
+ WT_ERR(c->get_key(c, &cursor->key));
+ WT_ERR(c->get_value(c, value));
+ if (__clsm_deleted(clsm, value))
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ /* Update stats: the active chunk can't have a bloom filter. */
+ if (bloom != NULL)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_false_positive);
+ else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ }
+ WT_ERR(WT_NOTFOUND);
+
+done:
+err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ clsm->current = c;
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ if (value == &cursor->value)
+ F_SET(cursor, WT_CURSTD_VALUE_INT);
+ } else if (c != NULL)
+ WT_TRET(c->reset(c));
+
+ return (ret);
+}
+
+/*
+ * __clsm_search --
+ * WT_CURSOR->search method for the LSM cursor type.
+ */
+static int
+__clsm_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+
+ ret = __clsm_lookup(clsm, &cursor->value);
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_search_near --
+ * WT_CURSOR->search_near method for the LSM cursor type.
+ */
+static int
+__clsm_search_near(WT_CURSOR *cursor, int *exactp)
+{
+ WT_CURSOR *c, *larger, *smaller;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM v;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp, deleted;
+
+ larger = smaller = NULL;
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ /*
+ * search_near is somewhat fiddly: we can't just use a nearby key from
+ * the in-memory chunk because there could be a closer key on disk.
+ *
+ * As we search down the chunks, we stop as soon as we find an exact
+ * match. Otherwise, we maintain the smallest cursor larger than the
+ * search key and the largest cursor smaller than the search key. At
+ * the bottom, we prefer the larger cursor, but if no record is larger,
+ * use the smaller cursor, or if no record at all was found,
+ * WT_NOTFOUND.
+ */
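+ /*
+ * For example (hypothetical chunk contents): with one chunk
+ * holding keys { 10, 30 } and another { 20, 40 }, suppose a
+ * search_near(25) positions the chunk cursors on 30 and 20.
+ * The cursor on 20 is stepped forward to 40 because we prefer
+ * larger keys, 30 then wins the smallest-larger comparison,
+ * and the call returns key 30 with *exactp == 1.
+ */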
+ WT_FORALL_CURSORS(clsm, c, i) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ ret = 0;
+ continue;
+ } else if (ret != 0)
+ goto err;
+
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+
+ if (cmp == 0 && !deleted) {
+ clsm->current = c;
+ *exactp = 0;
+ goto done;
+ }
+
+ /*
+ * Prefer larger cursors. There are two reasons: (1) we expect
+ * prefix searches to be a common case (as in our own indices);
+ * and (2) we need a way to unambiguously know we have the
+ * "closest" result.
+ */
+ if (cmp < 0) {
+ if ((ret = c->next(c)) == 0)
+ cmp = 1;
+ else if (ret == WT_NOTFOUND)
+ ret = c->prev(c);
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If we land on a deleted item, try going forwards or
+ * backwards to find one that isn't deleted.
+ */
+ while (deleted && (ret = c->next(c)) == 0) {
+ cmp = 1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ while (deleted && (ret = c->prev(c)) == 0) {
+ cmp = -1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if (deleted)
+ continue;
+
+ /*
+ * We are trying to find the smallest cursor greater than the
+ * search key, or, if there is no larger key, the largest
+ * cursor smaller than the search key.
+ *
+ * It could happen that one cursor contains both of the closest
+ * records. In that case, we will track it in "larger", and it
+ * will be the one we finally choose.
+ */
+ if (cmp > 0) {
+ if (larger == NULL)
+ larger = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, larger, cmp));
+ if (cmp < 0) {
+ WT_ERR(larger->reset(larger));
+ larger = c;
+ }
+ }
+ } else {
+ if (smaller == NULL)
+ smaller = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, smaller, cmp));
+ if (cmp > 0) {
+ WT_ERR(smaller->reset(smaller));
+ smaller = c;
+ }
+ }
+ }
+
+ if (c != smaller && c != larger)
+ WT_ERR(c->reset(c));
+ }
+
+ if (larger != NULL) {
+ clsm->current = larger;
+ larger = NULL;
+ *exactp = 1;
+ } else if (smaller != NULL) {
+ clsm->current = smaller;
+ smaller = NULL;
+ *exactp = -1;
+ } else
+ ret = WT_NOTFOUND;
+
+done:
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0) {
+ c = clsm->current;
+ WT_TRET(c->get_key(c, &cursor->key));
+ WT_TRET(c->get_value(c, &cursor->value));
+ }
+ if (smaller != NULL)
+ WT_TRET(smaller->reset(smaller));
+ if (larger != NULL)
+ WT_TRET(larger->reset(larger));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ __clsm_deleted_decode(&cursor->value);
+ } else
+ clsm->current = NULL;
+
+ return (ret);
+}
+
+/*
+ * __clsm_put --
+ * Put an entry into the in-memory tree, trigger a file switch if
+ * necessary.
+ */
+static inline int
+__clsm_put(WT_SESSION_IMPL *session,
+ WT_CURSOR_LSM *clsm, const WT_ITEM *key, const WT_ITEM *value, int position)
+{
+ WT_CURSOR *c, *primary;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+
+ lsm_tree = clsm->lsm_tree;
+
+ WT_ASSERT(session,
+ clsm->primary_chunk != NULL &&
+ (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
+ TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
+
+ /*
+ * Clear the existing cursor position. Don't clear the primary cursor:
+ * we're about to use it anyway.
+ */
+ primary = clsm->cursors[clsm->nchunks - 1];
+ WT_RET(__clsm_reset_cursors(clsm, primary));
+
+ /* If necessary, set the position for future scans. */
+ if (position)
+ clsm->current = primary;
+
+ for (i = 0; i < clsm->nupdates; i++) {
+ c = clsm->cursors[(clsm->nchunks - i) - 1];
+ c->set_key(c, key);
+ c->set_value(c, value);
+ WT_RET((position && i == 0) ? c->update(c) : c->insert(c));
+ }
+
+ /*
+ * Update the record count. It is in a shared structure, but it's only
+ * approximate, so don't worry about protecting access.
+ *
+ * Throttle if necessary. Every 100 update operations on each cursor,
+ * check if throttling is required. Don't rely only on the shared
+ * counter because it can race, and because for some workloads, there
+ * may not be enough records per chunk to get effective throttling.
+ */
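+ /*
+ * For example (illustrative values): with ckpt_throttle == 1200
+ * and merge_throttle == 300, both in microseconds, the check
+ * below ends in __wt_sleep(0, 1500), stalling this cursor for
+ * roughly 1.5ms.
+ */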
+ if ((++clsm->primary_chunk->count % 100 == 0 ||
+ ++clsm->update_count >= 100) &&
+ lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+ clsm->update_count = 0;
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ __wt_sleep(0,
+ lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_insert --
+ * WT_CURSOR->insert method for the LSM cursor type.
+ */
+static int
+__clsm_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+
+ WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 0);
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_update --
+ * WT_CURSOR->update method for the LSM cursor type.
+ */
+static int
+__clsm_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0) {
+ WT_ERR(__clsm_deleted_encode(
+ session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 1);
+ }
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_remove --
+ * WT_CURSOR->remove method for the LSM cursor type.
+ */
+static int
+__clsm_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0)
+ ret = __clsm_put(session, clsm, &cursor->key, &__tombstone, 1);
+
+err: WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_close --
+ * WT_CURSOR->close method for the LSM cursor type.
+ */
+static int
+__clsm_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when
+ * closing, and the cursor may never have been used.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks));
+ __wt_free(session, clsm->blooms);
+ __wt_free(session, clsm->cursors);
+ __wt_free(session, clsm->switch_txn);
+
+ /* In case we were somehow left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+ /* The WT_LSM_TREE owns the URI. */
+ cursor->uri = NULL;
+ if (clsm->lsm_tree != NULL)
+ __wt_lsm_tree_release(session, clsm->lsm_tree);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_clsm_open --
+ * WT_SESSION->open_cursor method for LSM cursors.
+ */
+int
+__wt_clsm_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __clsm_compare, /* compare */
+ __clsm_next, /* next */
+ __clsm_prev, /* prev */
+ __clsm_reset, /* reset */
+ __clsm_search, /* search */
+ __clsm_search_near, /* search-near */
+ __clsm_insert, /* insert */
+ __clsm_update, /* update */
+ __clsm_remove, /* remove */
+ __clsm_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ clsm = NULL;
+ cursor = NULL;
+
+ if (!WT_PREFIX_MATCH(uri, "lsm:"))
+ return (EINVAL);
+
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "LSM does not support opening by checkpoint");
+
+ /* Get the LSM tree. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_RET(ret);
+
+ WT_ERR(__wt_calloc_def(session, 1, &clsm));
+
+ cursor = &clsm->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->uri = lsm_tree->name;
+ cursor->key_format = lsm_tree->key_format;
+ cursor->value_format = lsm_tree->value_format;
+
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ clsm->lsm_tree = lsm_tree;
+
+ /*
+ * The tree's dsk_gen starts at one, so starting the cursor on zero
+ * will force a call into open_cursors on the first operation.
+ */
+ clsm->dsk_gen = 0;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+
+ if (0) {
+err: __wt_lsm_tree_release(session, lsm_tree);
+ if (clsm != NULL) {
+ clsm->lsm_tree = NULL;
+ WT_TRET(__clsm_close(cursor));
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
new file mode 100644
index 00000000000..8f4b3ba49ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -0,0 +1,667 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_manager_aggressive_update(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_manager_run_server(WT_SESSION_IMPL *);
+static int __lsm_manager_worker_setup(WT_SESSION_IMPL *);
+
+static void * __lsm_worker_manager(void *);
+
+/*
+ * __wt_lsm_manager_config --
+ * Configure the LSM manager.
+ */
+int
+__wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONFIG_ITEM cval;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "lsm_manager.merge", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_LSM_MERGE);
+ WT_RET(__wt_config_gets(
+ session, cfg, "lsm_manager.worker_thread_max", &cval));
+ if (cval.val)
+ conn->lsm_manager.lsm_workers_max = (uint32_t)cval.val;
+ return (0);
+}
+
+/*
+ * __lsm_general_worker_start --
+ * Start up all of the general LSM worker threads.
+ */
+static int
+__lsm_general_worker_start(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+
+ /*
+ * Start the remaining worker threads.
+ * This should get more sophisticated in the future - only launching
+ * as many worker threads as are required to keep up with demand.
+ */
+ WT_ASSERT(session, manager->lsm_workers > 1);
+ for (; manager->lsm_workers < manager->lsm_workers_max;
+ manager->lsm_workers++) {
+ worker_args =
+ &manager->lsm_worker_cookies[manager->lsm_workers];
+ worker_args->work_cond = manager->work_cond;
+ worker_args->id = manager->lsm_workers;
+ worker_args->type =
+ WT_LSM_WORK_BLOOM |
+ WT_LSM_WORK_DROP |
+ WT_LSM_WORK_FLUSH |
+ WT_LSM_WORK_SWITCH;
+ F_SET(worker_args, WT_LSM_WORKER_RUN);
+ /*
+ * Only allow half of the threads to run merges, to avoid all
+ * workers getting stuck in long-running merge operations.
+ * Make sure the first worker is allowed, so that there is at
+ * least one thread capable of running merges. We know the
+ * first worker is id 2, so set merges on even numbered workers.
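+ * (For example, with lsm_workers_max == 5, this loop starts
+ * workers 2, 3 and 4, and workers 2 and 4 also run merges.)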
+ */
+ if (manager->lsm_workers % 2 == 0)
+ FLD_SET(worker_args->type, WT_LSM_WORK_MERGE);
+ WT_RET(__wt_lsm_worker_start(session, worker_args));
+ }
+ return (0);
+}
+
+/*
+ * __lsm_stop_workers --
+ * Stop worker threads until the number reaches the configured amount.
+ */
+static int
+__lsm_stop_workers(WT_SESSION_IMPL *session)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+ uint32_t i;
+
+ manager = &S2C(session)->lsm_manager;
+ /*
+ * Start at the end of the list of threads and stop them until we
+ * have the desired number. We want to keep all active threads
+ * packed at the front of the worker array.
+ */
+ WT_ASSERT(session, manager->lsm_workers != 0);
+ for (i = manager->lsm_workers - 1; i >= manager->lsm_workers_max; i--) {
+ worker_args = &manager->lsm_worker_cookies[i];
+ /*
+ * Clear this worker's flag so it stops.
+ */
+ F_CLR(worker_args, WT_LSM_WORKER_RUN);
+ WT_ASSERT(session, worker_args->tid != 0);
+ WT_RET(__wt_thread_join(session, worker_args->tid));
+ worker_args->tid = 0;
+ worker_args->type = 0;
+ worker_args->flags = 0;
+ manager->lsm_workers--;
+ /*
+ * We do not clear the sessions: they are allocated once
+ * when the connection is opened.
+ */
+ }
+ return (0);
+}
+
+/*
+ * __wt_lsm_manager_reconfig --
+ * Re-configure the LSM manager.
+ */
+int
+__wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_LSM_MANAGER *manager;
+ uint32_t orig_workers;
+
+ manager = &S2C(session)->lsm_manager;
+ orig_workers = manager->lsm_workers_max;
+
+ WT_RET(__wt_lsm_manager_config(session, cfg));
+ /*
+ * If LSM hasn't started yet, we've simply updated the settings
+ * and the normal code path will start the threads when it does.
+ */
+ if (manager->lsm_workers_max == 0)
+ return (0);
+ if (manager->lsm_workers == 0)
+ return (0);
+ /*
+ * If the number of workers has not changed, we're done.
+ */
+ if (orig_workers == manager->lsm_workers_max)
+ return (0);
+ /*
+ * If we want more threads, start them.
+ */
+ if (manager->lsm_workers_max > orig_workers)
+ return (__lsm_general_worker_start(session));
+
+ /*
+ * Otherwise we want to reduce the number of workers.
+ */
+ WT_ASSERT(session, manager->lsm_workers_max < orig_workers);
+ WT_RET(__lsm_stop_workers(session));
+ return (0);
+}
+
+/*
+ * __wt_lsm_manager_start --
+ * Start the LSM management infrastructure. Our queues and locks were
+ * initialized when the connection was initialized.
+ */
+int
+__wt_lsm_manager_start(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ WT_SESSION_IMPL *worker_session;
+ uint32_t i;
+
+ manager = &S2C(session)->lsm_manager;
+
+ /*
+ * We need at least a manager, a switch thread and a generic
+ * worker.
+ */
+ WT_ASSERT(session, manager->lsm_workers_max > 2);
+
+ /*
+ * Open sessions for all potential worker threads here - it's not
+ * safe to have worker threads open/close sessions themselves.
+ * All the LSM worker threads do their operations on read-only
+ * files. Use read-uncommitted isolation to avoid keeping
+ * updates in cache unnecessarily.
+ */
+ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+ WT_ERR(__wt_open_internal_session(
+ S2C(session), "lsm-worker", 1, 0, &worker_session));
+ worker_session->isolation = TXN_ISO_READ_UNCOMMITTED;
+ manager->lsm_worker_cookies[i].session = worker_session;
+ }
+
+ /* Start the LSM manager thread. */
+ WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid,
+ __lsm_worker_manager, &manager->lsm_worker_cookies[0]));
+
+ F_SET(S2C(session), WT_CONN_SERVER_LSM);
+
+ if (0) {
+err: for (i = 0;
+ (worker_session =
+ manager->lsm_worker_cookies[i].session) != NULL;
+ i++)
+ WT_TRET((&worker_session->iface)->close(
+ &worker_session->iface, NULL));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_manager_free_work_unit --
+ * Release an LSM tree work unit.
+ */
+void
+__wt_lsm_manager_free_work_unit(
+ WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry)
+{
+ if (entry != NULL) {
+ WT_ASSERT(session, entry->lsm_tree->queue_ref > 0);
+
+ (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1);
+ __wt_free(session, entry);
+ }
+}
+
+/*
+ * __wt_lsm_manager_destroy --
+ * Destroy the LSM manager threads and subsystem.
+ */
+int
+__wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ WT_SESSION *wt_session;
+ uint32_t i;
+ uint64_t removed;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+ removed = 0;
+
+ if (manager->lsm_workers > 0) {
+ /*
+ * Stop the main LSM manager thread first.
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_LSM))
+ __wt_yield();
+
+ /* Clean up open LSM handles. */
+ ret = __wt_lsm_tree_close_all(session);
+
+ WT_TRET(__wt_thread_join(
+ session, manager->lsm_worker_cookies[0].tid));
+ manager->lsm_worker_cookies[0].tid = 0;
+
+ /* Release memory from any operations left on the queue. */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+
+ /* Close all LSM worker sessions. */
+ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+ wt_session =
+ &manager->lsm_worker_cookies[i].session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_work_units_discarded, removed);
+
+ /* Free resources that were allocated during connection initialization. */
+ __wt_spin_destroy(session, &manager->switch_lock);
+ __wt_spin_destroy(session, &manager->app_lock);
+ __wt_spin_destroy(session, &manager->manager_lock);
+ WT_TRET(__wt_cond_destroy(session, &manager->work_cond));
+
+ return (ret);
+}
+
+/*
+ * __lsm_manager_aggressive_update --
+ * Update the merge aggressiveness for a single LSM tree.
+ */
+static int
+__lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ struct timespec now;
+ uint64_t chunk_wait, stallms;
+ u_int new_aggressive;
+
+ WT_RET(__wt_epoch(session, &now));
+ stallms = WT_TIMEDIFF(now, lsm_tree->last_flush_ts) / WT_MILLION;
+ /*
+ * Get aggressive if more than enough chunks for a merge should have
+ * been created by now. Use 10 seconds as a default if we don't have an
+ * estimate.
+ */
+ if (lsm_tree->nchunks > 1)
+ chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
+ 10000 : lsm_tree->chunk_fill_ms);
+ else
+ chunk_wait = 0;
+ new_aggressive = (u_int)(chunk_wait / lsm_tree->merge_min);
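+
+ /*
+ * For example (illustrative timings): if flushes have stalled for
+ * 60 seconds (stallms == 60000) and chunks normally fill in two
+ * seconds (chunk_fill_ms == 2000), chunk_wait is 30; with
+ * merge_min == 4 that yields new_aggressive == 7.
+ */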
+
+ if (new_aggressive > lsm_tree->merge_aggressiveness) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM merge %s got aggressive (old %u new %u), "
+ "merge_min %d, %u / %" PRIu64,
+ lsm_tree->name, lsm_tree->merge_aggressiveness,
+ new_aggressive, lsm_tree->merge_min, stallms,
+ lsm_tree->chunk_fill_ms));
+ lsm_tree->merge_aggressiveness = new_aggressive;
+ }
+ return (0);
+}
+
+/*
+ * __lsm_manager_worker_setup --
+ * Do setup owned by the LSM manager thread including starting the worker
+ * threads.
+ */
+static int
+__lsm_manager_worker_setup(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+
+ WT_ASSERT(session, manager->lsm_workers == 1);
+ /*
+ * The LSM manager is worker[0]. The switch thread is worker[1].
+ * Setup and start the switch/drop worker explicitly.
+ */
+ worker_args = &manager->lsm_worker_cookies[1];
+ worker_args->work_cond = manager->work_cond;
+ worker_args->id = manager->lsm_workers++;
+ worker_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH;
+ F_SET(worker_args, WT_LSM_WORKER_RUN);
+ /* Start the switch thread. */
+ WT_RET(__wt_lsm_worker_start(session, worker_args));
+ WT_RET(__lsm_general_worker_start(session));
+
+ return (0);
+}
+
+/*
+ * __lsm_manager_worker_shutdown --
+ * Shut down the LSM manager and worker threads.
+ */
+static int
+__lsm_manager_worker_shutdown(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ u_int i;
+
+ manager = &S2C(session)->lsm_manager;
+
+ /*
+ * Wait for the rest of the LSM workers to shut down. Stop at index
+ * one, since we (the manager) are at index 0.
+ */
+ for (i = 1; i < manager->lsm_workers; i++) {
+ WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0);
+ WT_TRET(__wt_cond_signal(session, manager->work_cond));
+ WT_TRET(__wt_thread_join(
+ session, manager->lsm_worker_cookies[i].tid));
+ }
+ return (ret);
+}
+
+/*
+ * __lsm_manager_run_server --
+ * Run manager thread operations.
+ */
+static int
+__lsm_manager_run_server(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_TREE *lsm_tree;
+ struct timespec now;
+ uint64_t fillms, pushms;
+
+ conn = S2C(session);
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ if (TAILQ_EMPTY(&conn->lsmqh)) {
+ __wt_sleep(0, 10000);
+ continue;
+ }
+ __wt_sleep(0, 10000);
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ continue;
+ WT_RET(__lsm_manager_aggressive_update(
+ session, lsm_tree));
+ WT_RET(__wt_epoch(session, &now));
+ pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
+ WT_TIMEDIFF(
+ now, lsm_tree->work_push_ts) / WT_MILLION;
+ fillms = 3 * lsm_tree->chunk_fill_ms;
+ if (fillms == 0)
+ fillms = 10000;
+ /*
+ * If the tree appears not to be triggering enough LSM
+ * maintenance, help it out. Additional work units don't
+ * hurt, and can be necessary if some work units aren't
+ * completed for some reason.
+ * If the tree hasn't been modified and there is more than
+ * one chunk, try to shrink the tree so queries run faster.
+ * If we are getting aggressive, ensure there are enough
+ * work units that chunks can get merged.
+ * If we aren't pushing enough work units compared to how
+ * often new chunks are being created, add some more.
+ */
+ if (lsm_tree->queue_ref >= LSM_TREE_MAX_QUEUE)
+ WT_STAT_FAST_CONN_INCR(session,
+ lsm_work_queue_max);
+ else if ((!lsm_tree->modified &&
+ lsm_tree->nchunks > 1) ||
+ (lsm_tree->queue_ref == 0 &&
+ lsm_tree->nchunks > 1) ||
+ (lsm_tree->merge_aggressiveness > 3 &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) ||
+ pushms > fillms) {
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_DROP, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "MGR %s: queue %d mod %d nchunks %d"
+ " flags 0x%x aggressive %d pushms %" PRIu64
+ " fillms %" PRIu64,
+ lsm_tree->name, lsm_tree->queue_ref,
+ lsm_tree->modified, lsm_tree->nchunks,
+ lsm_tree->flags,
+ lsm_tree->merge_aggressiveness,
+ pushms, fillms));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __lsm_worker_manager --
+ * A thread that manages all open LSM trees, and the shared LSM worker
+ * threads.
+ */
+static void *
+__lsm_worker_manager(void *arg)
+{
+ WT_DECL_RET;
+ WT_LSM_WORKER_ARGS *cookie;
+ WT_SESSION_IMPL *session;
+
+ cookie = (WT_LSM_WORKER_ARGS *)arg;
+ session = cookie->session;
+
+ WT_ERR(__lsm_manager_worker_setup(session));
+ WT_ERR(__lsm_manager_run_server(session));
+ WT_ERR(__lsm_manager_worker_shutdown(session));
+
+ if (ret != 0) {
+err: __wt_err(session, ret, "LSM worker manager thread error");
+ }
+ F_CLR(S2C(session), WT_CONN_SERVER_LSM);
+ return (NULL);
+}
+
+/*
+ * __wt_lsm_manager_clear_tree --
+ * Remove all entries for a tree from the LSM manager queues. This
+ * introduces an inefficiency if LSM trees are being opened and closed
+ * regularly.
+ */
+int
+__wt_lsm_manager_clear_tree(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ uint64_t removed;
+
+ manager = &S2C(session)->lsm_manager;
+ removed = 0;
+
+ /* Clear out the tree from the switch queue */
+ __wt_spin_lock(session, &manager->switch_lock);
+
+ /* Structure the loop so that it's safe to free as we iterate */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->switch_lock);
+ /* Clear out the tree from the application queue */
+ __wt_spin_lock(session, &manager->app_lock);
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->app_lock);
+ /* Clear out the tree from the manager queue */
+ __wt_spin_lock(session, &manager->manager_lock);
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->manager_lock);
+ WT_STAT_FAST_CONN_INCRV(session, lsm_work_units_discarded, removed);
+ return (0);
+}
+
+/*
+ * We assume this is only called from __wt_lsm_manager_pop_entry, where
+ * session, entry and type are available to use. If the queue is empty,
+ * the macro returns zero from the calling function.
+ */
+#define LSM_POP_ENTRY(qh, qlock, qlen) do { \
+ if (TAILQ_EMPTY(qh)) \
+ return (0); \
+ __wt_spin_lock(session, qlock); \
+ TAILQ_FOREACH(entry, (qh), q) { \
+ if (FLD_ISSET(type, entry->type)) { \
+ TAILQ_REMOVE(qh, entry, q); \
+ WT_STAT_FAST_CONN_DECR(session, qlen); \
+ break; \
+ } \
+ } \
+ __wt_spin_unlock(session, (qlock)); \
+} while (0)
+
+/*
+ * __wt_lsm_manager_pop_entry --
+ * Retrieve the head of the queue, if it matches the requested work
+ * unit type.
+ */
+int
+__wt_lsm_manager_pop_entry(
+ WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *entry;
+
+ manager = &S2C(session)->lsm_manager;
+ *entryp = NULL;
+ entry = NULL;
+
+ /*
+ * Pop the entry off the correct queue based on our work type.
+ */
+ if (type == WT_LSM_WORK_SWITCH)
+ LSM_POP_ENTRY(&manager->switchqh,
+ &manager->switch_lock, lsm_work_queue_switch);
+ else if (type == WT_LSM_WORK_MERGE)
+ LSM_POP_ENTRY(&manager->managerqh,
+ &manager->manager_lock, lsm_work_queue_manager);
+ else
+ LSM_POP_ENTRY(&manager->appqh,
+ &manager->app_lock, lsm_work_queue_app);
+ if (entry != NULL)
+ WT_STAT_FAST_CONN_INCR(session, lsm_work_units_done);
+ *entryp = entry;
+ return (0);
+}
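+
+/*
+ * A sketch of the expected consumer (hypothetical call site; the real
+ * loop lives in the LSM worker code): a worker pops a unit matching its
+ * type mask, services it, then frees it, which releases the queue_ref
+ * taken by __wt_lsm_manager_push_entry:
+ *
+ * WT_LSM_WORK_UNIT *entry;
+ * WT_RET(__wt_lsm_manager_pop_entry(
+ * session, WT_LSM_WORK_SWITCH, &entry));
+ * if (entry != NULL) {
+ * ...service the work unit...
+ * __wt_lsm_manager_free_work_unit(session, entry);
+ * }
+ */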
+
+/*
+ * Push a work unit onto the appropriate queue. This macro assumes we are
+ * called from __wt_lsm_manager_push_entry and we have session and entry
+ * available for use.
+ */
+#define LSM_PUSH_ENTRY(qh, qlock, qlen) do { \
+ __wt_spin_lock(session, qlock); \
+ TAILQ_INSERT_TAIL((qh), entry, q); \
+ WT_STAT_FAST_CONN_INCR(session, qlen); \
+ __wt_spin_unlock(session, qlock); \
+} while (0)
+
+/*
+ * __wt_lsm_manager_push_entry --
+ * Add an entry to the end of the switch queue.
+ */
+int
+__wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
+ uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *entry;
+
+ manager = &S2C(session)->lsm_manager;
+
+ WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts));
+
+ WT_RET(__wt_calloc_def(session, 1, &entry));
+ entry->type = type;
+ entry->flags = flags;
+ entry->lsm_tree = lsm_tree;
+ (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
+ WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created);
+
+ if (type == WT_LSM_WORK_SWITCH)
+ LSM_PUSH_ENTRY(&manager->switchqh,
+ &manager->switch_lock, lsm_work_queue_switch);
+ else if (type == WT_LSM_WORK_MERGE)
+ LSM_PUSH_ENTRY(&manager->managerqh,
+ &manager->manager_lock, lsm_work_queue_manager);
+ else
+ LSM_PUSH_ENTRY(&manager->appqh,
+ &manager->app_lock, lsm_work_queue_app);
+
+ WT_RET(__wt_cond_signal(session, manager->work_cond));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
new file mode 100644
index 00000000000..784837092cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -0,0 +1,489 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_merge_update_tree --
+ * Merge a set of chunks and populate a new one.
+ * Must be called with the LSM lock held.
+ */
+int
+__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks,
+ WT_LSM_CHUNK *chunk)
+{
+ size_t chunks_after_merge;
+ u_int i;
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+
+ /* Setup the array of obsolete chunks. */
+ WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
+ lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
+
+ /* Copy entries one at a time, so we can reuse gaps in the list. */
+ for (i = 0; i < nchunks; i++)
+ lsm_tree->old_chunks[lsm_tree->nold_chunks++] =
+ lsm_tree->chunk[start_chunk + i];
+
+ /* Update the current chunk list. */
+ chunks_after_merge = lsm_tree->nchunks - (nchunks + start_chunk);
+ memmove(lsm_tree->chunk + start_chunk + 1,
+ lsm_tree->chunk + start_chunk + nchunks,
+ chunks_after_merge * sizeof(*lsm_tree->chunk));
+ lsm_tree->nchunks -= nchunks - 1;
+ memset(lsm_tree->chunk + lsm_tree->nchunks, 0,
+ (nchunks - 1) * sizeof(*lsm_tree->chunk));
+ lsm_tree->chunk[start_chunk] = chunk;
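+
+ /*
+ * To illustrate (hypothetical state): merging nchunks == 3 starting
+ * at slot 1 of a 6-chunk array turns { A, B, C, D, E, F } into
+ * { A, M, E, F }: B, C and D were moved to old_chunks above, E and
+ * F slid down, and M is the newly installed merged chunk.
+ */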
+
+ return (0);
+}
+
+/*
+ * __wt_lsm_merge --
+ * Merge a set of chunks of an LSM tree.
+ */
+int
+__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *dest, *src;
+ WT_DECL_ITEM(bbuf);
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_LSM_CHUNK *chunk, *previous, *youngest;
+ uint32_t aggressive, generation, max_gap, max_gen, max_level, start_id;
+ uint64_t insert_count, record_count, chunk_size;
+ u_int dest_id, end_chunk, i, merge_max, merge_min, nchunks, start_chunk;
+ u_int verb;
+ int create_bloom, locked, in_sync, tret;
+ const char *cfg[3];
+ const char *drop_cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+
+ bloom = NULL;
+ chunk_size = 0;
+ create_bloom = 0;
+ dest = src = NULL;
+ locked = 0;
+ start_id = 0;
+ in_sync = 0;
+
+ /*
+ * If the tree is open read-only or we are compacting, be very
+ * aggressive. Otherwise, we can spend a long time waiting for merges
+ * to start in read-only applications.
+ */
+ if (!lsm_tree->modified ||
+ F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ lsm_tree->merge_aggressiveness = 10;
+
+ aggressive = lsm_tree->merge_aggressiveness;
+ merge_max = (aggressive > 5) ? 100 : lsm_tree->merge_max;
+ merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min;
+ max_gap = (aggressive + 4) / 5;
+ max_level = (lsm_tree->merge_throttle > 0) ? 0 : id + aggressive;
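+
+ /*
+ * For example (illustrative values): merge_aggressiveness == 7
+ * widens the window to merge_max == 100 and merge_min == 2,
+ * tolerates max_gap == 2 generations of slack, and raises
+ * max_level unless merges are being throttled.
+ */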
+
+ /*
+ * If there aren't any chunks to merge, or some of the chunks aren't
+ * yet written, we're done. A non-zero error indicates that the worker
+ * should assume there is no work to do: if there are unwritten chunks,
+ * the worker should write them immediately.
+ */
+ if (lsm_tree->nchunks < merge_min)
+ return (WT_NOTFOUND);
+
+ /*
+ * Use the lsm_tree lock to read the chunks (so no switches occur), but
+ * avoid holding it while the merge is in progress: that may take a
+ * long time.
+ */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ /*
+ * Only include chunks that already have a Bloom filter or are the
+ * result of a merge and not involved in a merge.
+ */
+ for (end_chunk = lsm_tree->nchunks - 1; end_chunk > 0; --end_chunk) {
+ chunk = lsm_tree->chunk[end_chunk];
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING))
+ continue;
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0)
+ break;
+ else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ break;
+ }
+
+ /*
+ * Give up immediately if there aren't enough on disk chunks in the
+ * tree for a merge.
+ */
+ if (end_chunk < merge_min - 1) {
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Look for the most efficient merge we can do. We define efficiency
+ * as collapsing as many levels as possible while processing the
+ * smallest number of rows.
+ *
+ * We make a distinction between "major" and "minor" merges. The
+ * difference is whether the oldest chunk is involved: if it is, we can
+ * discard tombstones, because there can be no older record for them
+ * to mark deleted.
+ *
+ * Respect the configured limit on the number of chunks to merge: start
+ * with the most recent set of chunks and work backwards until going
+ * further becomes significantly less efficient.
+ */
+ for (start_chunk = end_chunk + 1, record_count = 0;
+ start_chunk > 0; ) {
+ chunk = lsm_tree->chunk[start_chunk - 1];
+ youngest = lsm_tree->chunk[end_chunk];
+ nchunks = (end_chunk + 1) - start_chunk;
+
+ /*
+ * If the chunk is already involved in a merge or a Bloom
+ * filter is being built for it, stop.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING) || chunk->bloom_busy)
+ break;
+
+ /*
+ * Look for small merges before trying a big one: some threads
+ * should stay in low levels until we get more aggressive.
+ */
+ if (chunk->generation > max_level)
+ break;
+
+ /*
+ * If the size of the chunks selected so far exceeds the
+ * configured maximum chunk size, stop. Keep going if we can
+ * slide the window further into the tree: we don't want to
+ * leave small chunks in the middle.
+ */
+ if ((chunk_size += chunk->size) > lsm_tree->chunk_max)
+ if (nchunks < merge_min ||
+ (chunk->generation > youngest->generation &&
+ chunk_size - youngest->size > lsm_tree->chunk_max))
+ break;
+
+ /*
+ * If we have enough chunks for a merge and the next chunk is
+ * in too high a generation, stop.
+ */
+ if (nchunks >= merge_min) {
+ previous = lsm_tree->chunk[start_chunk];
+ max_gen = youngest->generation + max_gap;
+ if (previous->generation <= max_gen &&
+ chunk->generation > max_gen)
+ break;
+ }
+
+ F_SET(chunk, WT_LSM_CHUNK_MERGING);
+ record_count += chunk->count;
+ --start_chunk;
+
+ /*
+ * If we have a full window, or the merge would be too big,
+ * remove the youngest chunk.
+ */
+ if (nchunks == merge_max ||
+ chunk_size > lsm_tree->chunk_max) {
+ WT_ASSERT(session,
+ F_ISSET(youngest, WT_LSM_CHUNK_MERGING));
+ F_CLR(youngest, WT_LSM_CHUNK_MERGING);
+ record_count -= youngest->count;
+ chunk_size -= youngest->size;
+ --end_chunk;
+ }
+ }
+
+ nchunks = (end_chunk + 1) - start_chunk;
+ WT_ASSERT(session, nchunks <= merge_max);
+
+ if (nchunks > 0) {
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ }
+
+ chunk = lsm_tree->chunk[start_chunk];
+ youngest = lsm_tree->chunk[end_chunk];
+ start_id = chunk->id;
+
+ /*
+ * Don't do merges that are too small or across too many
+ * generations.
+ */
+ if (nchunks < merge_min ||
+ chunk->generation > youngest->generation + max_gap) {
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ F_CLR(chunk, WT_LSM_CHUNK_MERGING);
+ }
+ nchunks = 0;
+ }
+ }
+
+ /* Find the merge generation. */
+ for (generation = 0, i = 0; i < nchunks; i++)
+ generation = WT_MAX(generation,
+ lsm_tree->chunk[start_chunk + i]->generation + 1);
+
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (nchunks == 0)
+ return (WT_NOTFOUND);
+
+ /* Allocate an ID for the merge. */
+ dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ /*
+ * We only want to do the chunk loop if we're running with verbose,
+ * so we wrap these statements in the conditional. Avoid the loop
+ * in the normal path.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
+ ", generation %" PRIu32,
+ lsm_tree->name,
+ start_chunk, end_chunk, dest_id, record_count, generation));
+ for (verb = start_chunk; verb <= end_chunk; verb++)
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "%s: Chunk[%u] id %u",
+ lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
+ }
+
+ WT_RET(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = dest_id;
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
+ (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
+ start_chunk > 0) && record_count > 0)
+ create_bloom = 1;
+
+ /*
+ * Special setup for the merge cursor:
+ * first, reset to open the dependent cursors;
+ * then restrict the cursor to a specific number of chunks;
+ * then set MERGE so the cursor doesn't track updates to the tree.
+ */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+ WT_ERR(ret);
+ if (create_bloom) {
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config,
+ record_count, lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count, &bloom));
+ }
+
+ /* Discard pages we read as soon as we're done with them. */
+ F_SET(session, WT_SESSION_NO_CACHE);
+
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = "bulk,raw,skip_sort_check";
+ cfg[2] = NULL;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+
+#define LSM_MERGE_CHECK_INTERVAL 1000
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ WT_ERR(EINTR);
+ /*
+ * Help out with switching chunks in case the
+ * checkpoint worker is busy.
+ */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, lsm_tree));
+ WT_ERR(ret);
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ }
+
+ WT_ERR(src->get_key(src, &key));
+ dest->set_key(dest, &key);
+ WT_ERR(src->get_value(src, &value));
+ dest->set_value(dest, &value);
+ WT_ERR(dest->insert(dest));
+ if (create_bloom)
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+ record_count, insert_count));
+
+ /*
+ * Closing and syncing the files can take a while. Set the
+ * merge_syncing field so that compact knows it is still in
+ * progress.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+ in_sync = 1;
+ /*
+ * We've successfully created the new chunk. Now install it. We need
+ * to ensure that the NO_CACHE flag is cleared and the bloom filter
+ * is closed (even if a step fails), so track errors but don't return
+ * until we've cleaned up.
+ */
+ WT_TRET(src->close(src));
+ WT_TRET(dest->close(dest));
+ src = dest = NULL;
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ if (create_bloom) {
+ if (ret == 0)
+ WT_TRET(__wt_bloom_finalize(bloom));
+
+ /*
+		 * Read in a key to make sure the Bloom filter's btree handle is
+ * open before it becomes visible to application threads.
+ * Otherwise application threads will stall while it is opened
+ * and internal pages are read into cache.
+ */
+ if (ret == 0) {
+ WT_CLEAR(key);
+ WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+ }
+
+ WT_TRET(__wt_bloom_close(bloom));
+ bloom = NULL;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Open a handle on the new chunk before application threads attempt
+	 * to access it; opening it pre-loads internal pages into the file
+ * system cache.
+ */
+ cfg[1] = "checkpoint=" WT_CHECKPOINT;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+ WT_TRET(dest->close(dest));
+ dest = NULL;
+ ++lsm_tree->merge_progressing;
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ in_sync = 0;
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * Check whether we raced with another merge, and adjust the chunk
+ * array offset as necessary.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id)
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++)
+ if (lsm_tree->chunk[start_chunk]->id == start_id)
+ break;
+
+ /*
+	 * It is safe to error out here: the update can only fail before
+	 * it makes any changes to the tree.
+ */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, start_chunk, nchunks, chunk));
+
+ if (create_bloom)
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ chunk->count = insert_count;
+ chunk->generation = generation;
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+
+ /*
+ * We have no current way of continuing if the metadata update fails,
+ * so we will panic in that case. Put some effort into cleaning up
+	 * after ourselves here, so things have a chance of shutting down.
+ *
+ * Any errors that happened after the tree was locked are
+ * fatal - we can't guarantee the state of the tree.
+ */
+ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
+ WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");
+
+ lsm_tree->dsk_gen++;
+
+ /* Update the throttling while holding the tree lock. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+
+	/* Schedule a pass to discard old chunks. */
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_DROP, 0, lsm_tree));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (in_sync)
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ if (src != NULL)
+ WT_TRET(src->close(src));
+ if (dest != NULL)
+ WT_TRET(dest->close(dest));
+ if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ __wt_scr_free(&bbuf);
+ if (ret != 0) {
+ /* Drop the newly-created files on error. */
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __wt_schema_drop(session, chunk->uri, drop_cfg));
+ WT_TRET(tret);
+ if (create_bloom) {
+ WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
+ session, chunk->bloom_uri, drop_cfg));
+ WT_TRET(tret);
+ }
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+
+ if (ret == EINTR)
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge aborted due to close"));
+ else
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge failed with %s", wiredtiger_strerror(ret)));
+ }
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
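+
+#if 0
+/*
+ * A minimal, self-contained sketch of the copy loop at the heart of
+ * __wt_lsm_merge, written against the public WiredTiger API instead of
+ * the internal one; the home directory and table names are hypothetical
+ * and the snippet is illustrative only, not part of the build.
+ */
+#include <stdlib.h>
+#include <wiredtiger.h>
+
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *dest, *src;
+	WT_ITEM key, value;
+	WT_SESSION *session;
+	int ret;
+
+	if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
+		return (EXIT_FAILURE);
+	if (conn->open_session(conn, NULL, NULL, &session) != 0)
+		return (EXIT_FAILURE);
+	if (session->create(session,
+	    "table:src", "key_format=u,value_format=u") != 0 ||
+	    session->create(session,
+	    "table:dst", "key_format=u,value_format=u") != 0)
+		return (EXIT_FAILURE);
+
+	/* Raw cursors pass keys and values as WT_ITEMs, as merge does. */
+	if (session->open_cursor(
+	    session, "table:src", NULL, "raw", &src) != 0)
+		return (EXIT_FAILURE);
+	/* A bulk cursor requires an empty target, like a new chunk. */
+	if (session->open_cursor(
+	    session, "table:dst", NULL, "bulk,raw", &dest) != 0)
+		return (EXIT_FAILURE);
+
+	while ((ret = src->next(src)) == 0) {
+		if ((ret = src->get_key(src, &key)) != 0 ||
+		    (ret = src->get_value(src, &value)) != 0)
+			break;
+		dest->set_key(dest, &key);
+		dest->set_value(dest, &value);
+		if ((ret = dest->insert(dest)) != 0)
+			break;
+	}
+	/* WT_NOTFOUND from next means a clean end of the source table. */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	(void)conn->close(conn, NULL);
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+#endif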
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
new file mode 100644
index 00000000000..fbb5a9958d5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_meta_read --
+ * Read the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG cparser, lparser;
+ WT_CONFIG_ITEM ck, cv, lk, lv;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_NAMED_COLLATOR *ncoll;
+ const char *lsmconfig;
+ u_int nchunks;
+
+ chunk = NULL; /* -Wconditional-uninitialized */
+
+ WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
+ WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->key_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->key_format));
+ } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->value_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->value_format));
+ } else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
+ if (cv.len == 0)
+ continue;
+ TAILQ_FOREACH(ncoll, &S2C(session)->collqh, q) {
+ if (WT_STRING_MATCH(
+ ncoll->name, cv.str, cv.len)) {
+ lsm_tree->collator = ncoll->collator;
+ break;
+ }
+ }
+ if (lsm_tree->collator == NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "unknown collator '%.*s'",
+ (int)cv.len, cv.str);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->collator_name));
+ } else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->bloom_config);
+ /* Don't include the brackets. */
+ WT_ERR(__wt_strndup(session,
+ cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
+ } else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->file_config);
+ /* Don't include the brackets. */
+ WT_ERR(__wt_strndup(session,
+ cv.str + 1, cv.len - 2, &lsm_tree->file_config));
+ } else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
+ if (cv.val)
+ F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+ else
+ F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+ } else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
+ lsm_tree->bloom = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
+ lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
+ lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
+ lsm_tree->chunk_max = (uint64_t)cv.val;
+ else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
+ lsm_tree->chunk_size = (uint64_t)cv.val;
+ else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
+ lsm_tree->merge_max = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
+ lsm_tree->merge_min = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("last", ck.str, ck.len))
+ lsm_tree->last = (u_int)cv.val;
+ else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if (WT_STRING_MATCH("id", lk.str, lk.len)) {
+ WT_ERR(__wt_realloc_def(session,
+ &lsm_tree->chunk_alloc,
+ nchunks + 1, &lsm_tree->chunk));
+ WT_ERR(__wt_calloc_def(
+ session, 1, &chunk));
+ lsm_tree->chunk[nchunks++] = chunk;
+ chunk->id = (uint32_t)lv.val;
+ WT_ERR(__wt_lsm_tree_chunk_name(session,
+ lsm_tree, chunk->id, &chunk->uri));
+ F_SET(chunk,
+ WT_LSM_CHUNK_ONDISK |
+ WT_LSM_CHUNK_STABLE);
+ } else if (WT_STRING_MATCH(
+ "bloom", lk.str, lk.len)) {
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree,
+ chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ continue;
+ } else if (WT_STRING_MATCH(
+ "chunk_size", lk.str, lk.len)) {
+ chunk->size = (uint64_t)lv.val;
+ continue;
+ } else if (WT_STRING_MATCH(
+ "count", lk.str, lk.len)) {
+ chunk->count = (uint64_t)lv.val;
+ continue;
+ } else if (WT_STRING_MATCH(
+ "generation", lk.str, lk.len)) {
+ chunk->generation = (uint32_t)lv.val;
+ continue;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nchunks = nchunks;
+ } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+ WT_ERR(__wt_strndup(session,
+ lv.str, lv.len, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ continue;
+ }
+ WT_ERR(__wt_realloc_def(session,
+ &lsm_tree->old_alloc, nchunks + 1,
+ &lsm_tree->old_chunks));
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->old_chunks[nchunks++] = chunk;
+ WT_ERR(__wt_strndup(session,
+ lk.str, lk.len, &chunk->uri));
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nold_chunks = nchunks;
+ /* Values included for backward compatibility */
+ } else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
+ } else
+ WT_ERR(__wt_illegal_value(session, "LSM metadata"));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /*
+ * If the default merge_min was not overridden, calculate it now. We
+ * do this here so that trees created before merge_min was added get a
+ * sane value.
+ */
+ if (lsm_tree->merge_min < 2)
+ lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);
+
+err: __wt_free(session, lsmconfig);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_meta_write --
+ * Write the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+ int first;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
+ lsm_tree->key_format, lsm_tree->value_format,
+ lsm_tree->bloom_config, lsm_tree->file_config));
+ if (lsm_tree->collator_name != NULL)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",collator=%s", lsm_tree->collator_name));
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",last=%" PRIu32
+ ",chunk_max=%" PRIu64
+ ",chunk_size=%" PRIu64
+ ",auto_throttle=%" PRIu32
+ ",merge_max=%" PRIu32
+ ",merge_min=%" PRIu32
+ ",bloom=%" PRIu32
+ ",bloom_bit_count=%" PRIu32
+ ",bloom_hash_count=%" PRIu32,
+ lsm_tree->last, lsm_tree->chunk_max, lsm_tree->chunk_size,
+ F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
+ lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (i > 0)
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
+ if (chunk->size != 0)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",chunk_size=%" PRIu64, chunk->size));
+ if (chunk->count != 0)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",count=%" PRIu64, chunk->count));
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",generation=%" PRIu32, chunk->generation));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
+ first = 1;
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+ if (first)
+ first = 0;
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+ WT_ERR(ret);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
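+
+/*
+ * For reference, the string built above looks roughly like this for a
+ * small two-chunk tree (all numeric values are illustrative):
+ *
+ *	key_format=u,value_format=u,bloom_config=(),file_config=(),
+ *	last=2,chunk_max=5368709120,chunk_size=10485760,auto_throttle=1,
+ *	merge_max=15,merge_min=4,bloom=1,bloom_bit_count=16,
+ *	bloom_hash_count=8,chunks=[id=1,bloom,chunk_size=10485760,
+ *	count=100000,generation=1,id=2,generation=0],old_chunks=[]
+ */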
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
new file mode 100644
index 00000000000..dc7d17e7a2c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_stat_init --
+ *	Initialize an LSM statistics structure.
+ */
+static int
+__lsm_stat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(uribuf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+ char config[64];
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor), NULL, NULL };
+ const char *disk_cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor),
+ "checkpoint=" WT_CHECKPOINT, NULL, NULL };
+
+ locked = 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
+
+ /* Propagate all, fast and/or clear to the cursors we open. */
+ if (!F_ISSET(cst, WT_CONN_STAT_NONE)) {
+ (void)snprintf(config, sizeof(config),
+ "statistics=(%s%s%s)",
+ F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
+ F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "",
+ !F_ISSET(cst, WT_CONN_STAT_ALL) &&
+ F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "");
+ cfg[1] = disk_cfg[1] = config;
+ }
+
+ /*
+ * Set the cursor to reference the data source statistics; we don't
+	 * initialize it; instead we copy (rather than aggregate) the first
+ * chunk's statistics, which has the same effect.
+ */
+ stats = &cst->u.dsrc_stats;
+
+ /* Hold the LSM lock so that we can safely walk through the chunks. */
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * For each chunk, aggregate its statistics, as well as any associated
+ * bloom filter statistics, into the total statistics.
+ */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+
+ /*
+ * Get the statistics for the chunk's underlying object.
+ *
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->uri));
+ ret = __wt_curstat_open(session, uribuf->data,
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
+ &stat_cursor);
+ if (ret == WT_NOTFOUND &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ ret = __wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor);
+ WT_ERR(ret);
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the chunk's information, then aggregate into the
+ * top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+
+ /*
+ * We want to aggregate the table's statistics. Get a base set
+ * of statistics from the first chunk, then aggregate statistics
+ * from each new chunk.
+ */
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ continue;
+
+ /* Maintain a count of bloom filters. */
+ WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+
+ /* Get the bloom filter's underlying object. */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->bloom_uri));
+ WT_ERR(__wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor));
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the bloom filter's information, then aggregate
+ * into the top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ WT_STAT_SET(new,
+ bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8);
+ WT_STAT_SET(new, bloom_page_evict,
+ WT_STAT(new, cache_eviction_clean) +
+ WT_STAT(new, cache_eviction_dirty));
+ WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read));
+
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+	/* Set statistics that aren't aggregated directly into the cursor. */
+ WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
+
+ /* Aggregate, and optionally clear, LSM-level specific information. */
+ __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats);
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&lsm_tree->stats);
+
+ __wt_curstat_dsrc_final(cst);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ __wt_scr_free(&uribuf);
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_lsm_init --
+ *	Initialize the statistics for an LSM tree.
+ */
+int
+__wt_curstat_lsm_init(
+ WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session, ret = __lsm_stat_init(session, uri, cst));
+
+ return (ret);
+}
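+
+/*
+ * Applications reach this path by opening a statistics cursor on the
+ * tree, for example (the URI is illustrative):
+ *
+ *	session->open_cursor(session, "statistics:lsm:example", NULL,
+ *	    "statistics=(fast)", &cursor);
+ */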
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
new file mode 100644
index 00000000000..447a8eb60a6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -0,0 +1,1266 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **);
+static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
+
+/*
+ * __lsm_tree_discard --
+ * Free an LSM tree structure.
+ */
+static int
+__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+
+ /* We may be destroying an lsm_tree before it was added. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN))
+ TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+
+ __wt_free(session, lsm_tree->name);
+ __wt_free(session, lsm_tree->config);
+ __wt_free(session, lsm_tree->key_format);
+ __wt_free(session, lsm_tree->value_format);
+ __wt_free(session, lsm_tree->collator_name);
+ __wt_free(session, lsm_tree->bloom_config);
+ __wt_free(session, lsm_tree->file_config);
+
+ WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock));
+
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ if ((chunk = lsm_tree->chunk[i]) == NULL)
+ continue;
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->chunk);
+
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->old_chunks);
+ __wt_free(session, lsm_tree);
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_close --
+ * Close an LSM tree structure.
+ */
+static int
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ int i;
+
+ /* Stop any active merges. */
+ F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+
+ /*
+ * Wait for all LSM operations and work units that were in flight to
+ * finish.
+ */
+ for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
+ /*
+ * Remove any work units from the manager queues. Do this step
+ * repeatedly in case a work unit was in the process of being
+ * created when we cleared the active flag.
+ * !! Drop the schema lock whilst completing this step so that
+ * we don't block any operations that require the schema
+ * lock to complete. This is safe because any operation that
+ * is closing the tree should first have gotten exclusive
+ * access to the LSM tree via __wt_lsm_tree_get, so other
+ * schema level operations will return EBUSY, even though
+ * we're dropping the schema lock here.
+ */
+ if (i % 1000 == 0) {
+ WT_WITHOUT_SCHEMA_LOCK(session, ret =
+ __wt_lsm_manager_clear_tree(session, lsm_tree));
+ WT_RET(ret);
+ }
+ __wt_yield();
+ }
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_close_all --
+ * Close all LSM tree structures.
+ */
+int
+__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) {
+ /*
+ * Tree close assumes that we have a reference to the tree
+ * so it can tell when it's safe to do the close. We could
+		 * go through tree get here, but short circuit instead. There
+ * is no need to decrement the reference count since destroy
+ * is unconditional.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ WT_TRET(__lsm_tree_close(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_set_name --
+ *	Set or reset the name of an LSM tree.
+ */
+static int
+__lsm_tree_set_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, const char *uri)
+{
+ if (lsm_tree->name != NULL)
+ __wt_free(session, lsm_tree->name);
+ WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_bloom_name --
+ * Get the URI of the Bloom filter for a given chunk.
+ */
+int
+__wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_chunk_name --
+ * Get the URI of the file for a given chunk.
+ */
+int
+__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "file:%s-%06" PRIu32 ".lsm", lsm_tree->filename, id));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
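+
+/*
+ * For a tree named "lsm:example" (filename "example"), chunk 7's file
+ * is "file:example-000007.lsm" and its Bloom filter, if it has one, is
+ * "file:example-000007.bf".
+ */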
+
+/*
+ * __wt_lsm_tree_set_chunk_size --
+ * Set the size of the chunk. Should only be called for chunks that are
+ * on disk, or about to become on disk.
+ */
+int
+__wt_lsm_tree_set_chunk_size(
+ WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk)
+{
+ wt_off_t size;
+ const char *filename;
+
+ filename = chunk->uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL,
+ "Expected a 'file:' URI: %s", chunk->uri);
+ WT_RET(__wt_filesize_name(session, filename, &size));
+
+ chunk->size = (uint64_t)size;
+
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_setup_chunk --
+ * Initialize a chunk of an LSM tree.
+ */
+int
+__wt_lsm_tree_setup_chunk(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+ int exists;
+
+ WT_RET(__wt_epoch(session, &chunk->create_ts));
+
+ WT_RET(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+
+ /*
+ * If the underlying file exists, drop the chunk first - there may be
+ * some content hanging over from an aborted merge or checkpoint.
+ *
+ * Don't do this for the very first chunk: we are called during
+ * WT_SESSION::create, and doing a drop inside there does interesting
+ * things with handle locks and metadata tracking. It can never have
+ * been the result of an interrupted merge, anyway.
+ */
+ if (chunk->id > 1) {
+ WT_RET(__wt_exist(
+ session, chunk->uri + strlen("file:"), &exists));
+ if (exists)
+ WT_RET(__wt_schema_drop(session, chunk->uri, cfg));
+ }
+ return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
+}
+
+/*
+ * __wt_lsm_tree_create --
+ * Create an LSM tree structure for the given name.
+ */
+int
+__wt_lsm_tree_create(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_create), config, NULL };
+ const char *tmpconfig;
+
+ /* If the tree is open, it already exists. */
+ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * If the tree has metadata, it already exists.
+ *
+ * !!!
+ * Use a local variable: we don't care what the existing configuration
+ * is, but we don't want to overwrite the real config.
+ */
+ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) {
+ __wt_free(session, tmpconfig);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees cannot be configured as column stores");
+
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->key_format));
+ WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->value_format));
+
+ WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->collator_name));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
+ if (cval.val)
+ F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+ else
+ F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
+ FLD_SET(lsm_tree->bloom,
+ (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
+ if (cval.val != 0)
+ FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
+ WT_ERR_MSG(session, EINVAL,
+ "Bloom filters can only be created on newest and oldest "
+ "chunks if bloom filters are enabled");
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
+ if (cval.type == WT_CONFIG_ITEM_STRUCT) {
+ cval.str++;
+ cval.len -= 2;
+ }
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->bloom_config));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
+ lsm_tree->bloom_bit_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
+ lsm_tree->bloom_hash_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
+ lsm_tree->chunk_max = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
+ lsm_tree->chunk_size = (uint64_t)cval.val;
+ if (lsm_tree->chunk_size > lsm_tree->chunk_max)
+ WT_ERR_MSG(session, EINVAL,
+ "Chunk size (chunk_size) must be smaller than or equal to "
+ "the maximum chunk size (chunk_max)");
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
+ lsm_tree->merge_max = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
+ lsm_tree->merge_min = (uint32_t)cval.val;
+ if (lsm_tree->merge_min > lsm_tree->merge_max)
+ WT_ERR_MSG(session, EINVAL,
+ "LSM merge_min must be less than or equal to merge_max");
+
+ /*
+ * Set up the config for each chunk.
+ *
+ * Make the memory_page_max double the chunk size, so application
+ * threads don't immediately try to force evict the chunk when the
+ * worker thread clears the NO_EVICTION flag.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+ config, 2 * lsm_tree->chunk_max));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &lsm_tree->file_config));
+
+ /* Create the first chunk and flush the metadata. */
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ /* Discard our partially populated handle. */
+ ret = __lsm_tree_discard(session, lsm_tree);
+ lsm_tree = NULL;
+
+ /*
+ * Open our new tree and add it to the handle cache. Don't discard on
+ * error: the returned handle is NULL on error, and the metadata
+ * tracking macros handle cleaning up on failure.
+ */
+ if (ret == 0)
+ ret = __lsm_tree_open(session, uri, &lsm_tree);
+ if (ret == 0)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __lsm_tree_open_check --
+ * Validate the configuration of an LSM tree.
+ */
+static int
+__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG_ITEM cval;
+ uint64_t maxleafpage, required;
+ const char *cfg[] = { WT_CONFIG_BASE(
+ session, session_create), lsm_tree->file_config, NULL };
+
+ WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
+ maxleafpage = (uint64_t)cval.val;
+
+ /*
+ * Three chunks, plus one page for each participant in up to three
+ * concurrent merges.
+ */
+ required = 3 * lsm_tree->chunk_size +
+ 3 * (lsm_tree->merge_max * maxleafpage);
+ if (S2C(session)->cache_size < required)
+ WT_RET_MSG(session, EINVAL,
+ "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
+ "must be at least %" PRIu64 " (%" PRIu64 "MB)",
+ S2C(session)->cache_size,
+ S2C(session)->cache_size / WT_MEGABYTE,
+ required, required / WT_MEGABYTE);
+ return (0);
+}
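+
+/*
+ * For example, with a 10MB chunk_size, a merge_max of 15 and 32KB leaf
+ * pages, the check above requires a cache of at least
+ * 3 * 10MB + 3 * (15 * 32KB), roughly 31.4MB, before the tree will
+ * open.
+ */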
+
+/*
+ * __lsm_tree_open --
+ * Open an LSM tree structure.
+ */
+static int
+__lsm_tree_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ conn = S2C(session);
+ lsm_tree = NULL;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /* Start the LSM manager thread if it isn't running. */
+ if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
+ WT_RET(__wt_lsm_manager_start(session));
+
+ /* Make sure no one beat us to it. */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Try to open the tree. */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
+
+ /*
+ * Sanity check the configuration. Do it now since this is the first
+ * time we have the LSM tree configuration.
+ */
+ WT_ERR(__lsm_tree_open_check(session, lsm_tree));
+
+ if (lsm_tree->nchunks == 0) {
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
+ }
+
+ /* Set the generation number so cursors are opened on first usage. */
+ lsm_tree->dsk_gen = 1;
+
+ /*
+	 * Set up reference counting. Use separate reference counts for tree
+ * handles and queue entries, so that queue entries don't interfere
+ * with getting handles exclusive.
+ */
+ lsm_tree->refcnt = 1;
+ lsm_tree->queue_ref = 0;
+
+ /* Set a flush timestamp as a baseline. */
+ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+	/* Now the tree is set up, make it visible to others. */
+ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
+ F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+
+ *treep = lsm_tree;
+
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_get --
+ * Get an LSM tree structure for the given name. Optionally get exclusive
+ *	access to the handle. Exclusive access works separately from the LSM
+ *	tree lock, since operations that need exclusive access may also need
+ *	to take the LSM tree lock, for example outstanding work unit operations.
+ */
+int
+__wt_lsm_tree_get(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, WT_LSM_TREE **treep)
+{
+ WT_LSM_TREE *lsm_tree;
+
+ /* See if the tree is already open. */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ /*
+ * Short circuit if the handle is already held
+ * exclusively or exclusive access is requested and
+ * there are references held.
+ */
+ if ((exclusive && lsm_tree->refcnt > 0) ||
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE))
+ return (EBUSY);
+
+ if (exclusive) {
+ F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
+ F_CLR(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ return (EBUSY);
+ }
+ } else
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+
+ /*
+ * If we got a reference, but an exclusive reference
+ * beat us to it, give our reference up.
+ */
+ if (!exclusive &&
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE)) {
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ return (EBUSY);
+ }
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Open a new tree. */
+ return (__lsm_tree_open(session, uri, treep));
+}
+
+/*
+ * __wt_lsm_tree_release --
+ * Release an LSM tree structure.
+ */
+void
+__wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_ASSERT(session, lsm_tree->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ F_CLR_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+}
+
+/* How aggressively to ramp up or down throttle due to level 0 merging */
+#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
+/* Number of level 0 chunks that need to be present to throttle inserts */
+#define WT_LSM_MERGE_THROTTLE_THRESHOLD \
+ (2 * lsm_tree->merge_min)
+/* Minimal throttling time */
+#define WT_LSM_THROTTLE_START 20
+
+#define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \
+ (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = WT_LSM_THROTTLE_START; \
+ } while (0)
+
+#define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \
+ (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = 0; \
+ } while (0)
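+
+/*
+ * For example, with a merge_max of 15 the bump is 100 / 15 == 6
+ * percent: an increase from zero first snaps the throttle to the 20us
+ * floor and then grows it by about 6% per call, while a decrease
+ * shrinks it by about 6% per call and drops it straight to zero once
+ * it falls below the floor.
+ */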
+
+/*
+ * __wt_lsm_tree_throttle --
+ * Calculate whether LSM updates need to be throttled. Must be called
+ * with the LSM tree lock held.
+ */
+void
+__wt_lsm_tree_throttle(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
+{
+ WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
+ uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
+ uint32_t in_memory, gen0_chunks;
+
+ /* Never throttle in small trees. */
+ if (lsm_tree->nchunks < 3) {
+ lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
+ return;
+ }
+
+ cache_sz = S2C(session)->cache_size;
+
+ /*
+ * In the steady state, we expect that the checkpoint worker thread
+ * will keep up with inserts. If not, throttle the insert rate to
+ * avoid filling the cache with in-memory chunks. Threads sleep every
+ * 100 operations, so take that into account in the calculation.
+ *
+ * Also throttle based on whether merge threads are keeping up. If
+ * there are enough chunks that have never been merged we slow down
+ * inserts so that merges have some chance of keeping up.
+ *
+	 * Count the number of in-memory chunks, the number of unmerged chunks
+ * on disk, and find the most recent on-disk chunk (if any).
+ */
+ record_count = 1;
+ gen0_chunks = in_memory = 0;
+ ondisk = NULL;
+ for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
+ cp >= lsm_tree->chunk;
+ --cp)
+ if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
+ record_count += (*cp)->count;
+ ++in_memory;
+ } else {
+ /*
+ * Assign ondisk to the last chunk that has been
+			 * flushed since the tree was last opened (i.e., it's on
+ * disk and stable is not set).
+ */
+ if (ondisk == NULL &&
+ ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
+ ondisk = *cp;
+
+ if ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
+ ++gen0_chunks;
+ }
+
+ last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
+
+ /* Checkpoint throttling, based on the number of in-memory chunks. */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
+ lsm_tree->ckpt_throttle = 0;
+ else if (decrease_only)
+ ; /* Nothing to do */
+ else if (ondisk == NULL) {
+ /*
+ * No checkpoint has completed this run. Keep slowing down
+ * inserts until one does.
+ */
+ lsm_tree->ckpt_throttle =
+ WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
+ } else {
+ WT_ASSERT(session,
+ WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
+ lsm_tree->ckpt_throttle =
+ (long)((in_memory - 2) * timediff / (20 * record_count));
+
+ /*
+ * Get more aggressive as the number of in memory chunks
+ * consumes a large proportion of the cache. In memory chunks
+ * are allowed to grow up to twice as large as the configured
+ * value when checkpoints aren't keeping up. That worst case
+ * is when this calculation is relevant.
+ * There is nothing particularly special about the chosen
+ * multipliers.
+ */
+ cache_used = in_memory * lsm_tree->chunk_size * 2;
+ if (cache_used > cache_sz * 0.8)
+ lsm_tree->ckpt_throttle *= 5;
+ }
+
+ /*
+ * Merge throttling, based on the number of on-disk, level 0 chunks.
+ *
+ * Don't throttle if the tree has less than a single level's number
+ * of chunks.
+ */
+ if (lsm_tree->nchunks < lsm_tree->merge_max)
+ lsm_tree->merge_throttle = 0;
+ else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
+ WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
+ else if (!decrease_only)
+ WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);
+
+ /* Put an upper bound of 1s on both throttle calculations. */
+ lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);
+
+ /*
+ * Update our estimate of how long each in-memory chunk stays active.
+ * Filter out some noise by keeping a weighted history of the
+ * calculated value. Wait until we have enough chunks that we can
+ * check that the new value is sane: otherwise, after a long idle
+ * period, we can calculate a crazy value.
+ */
+ if (in_memory > 1 && ondisk != NULL) {
+ prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
+ WT_ASSERT(session, prev_chunk->generation == 0);
+ WT_ASSERT(session, WT_TIMECMP(
+ last_chunk->create_ts, prev_chunk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
+ WT_ASSERT(session,
+ WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
+ oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
+ if (timediff < 10 * oldtime)
+ lsm_tree->chunk_fill_ms =
+ (3 * lsm_tree->chunk_fill_ms +
+ timediff / 1000000) / 4;
+ }
+}
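+
+/*
+ * The chunk_fill_ms estimate maintained above is a 3:1 exponentially
+ * weighted moving average in milliseconds: each accepted sample moves
+ * the estimate a quarter of the way toward the latest chunk-to-chunk
+ * creation interval.
+ */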
+
+/*
+ * __wt_lsm_tree_switch --
+ * Switch to a new in-memory tree.
+ */
+int
+__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ uint32_t nchunks, new_id;
+ int first_switch;
+
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ nchunks = lsm_tree->nchunks;
+
+ first_switch = nchunks == 0 ? 1 : 0;
+ /*
+ * Check if a switch is still needed: we may have raced while waiting
+ * for a lock.
+ */
+ chunk = NULL;
+ if (!first_switch &&
+ (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ goto err;
+
+ /* Set the switch transaction in the previous chunk, if necessary. */
+ if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 0);
+
+ new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
+ nchunks + 1, &lsm_tree->chunk));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
+ "merge throttle %ld", lsm_tree->name,
+ new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));
+
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = new_id;
+ chunk->switch_txn = WT_TXN_NONE;
+ lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ ++lsm_tree->dsk_gen;
+
+ lsm_tree->modified = 1;
+
+err: WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ /*
+ * Errors that happen during a tree switch leave the tree in a state
+ * where we can't make progress. Error out of WiredTiger.
+ */
+ if (ret != 0)
+ WT_PANIC_RET(session, ret, "Failed doing LSM switch");
+ else if (!first_switch)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_drop --
+ * Drop an LSM tree.
+ */
+int
+__wt_lsm_tree_drop(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Drop the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ /* Drop any chunks on the obsolete list. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ ret = __wt_metadata_remove(session, name);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_rename --
+ * Rename an LSM tree.
+ */
+int
+__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
+ const char *olduri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ const char *old;
+ u_int i;
+ int locked;
+
+ old = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Set the new name. */
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
+
+ /* Rename the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ old = chunk->uri;
+ chunk->uri = NULL;
+
+ WT_ERR(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+ WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ old = chunk->bloom_uri;
+ chunk->bloom_uri = NULL;
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ WT_ERR(__wt_schema_rename(
+			    session, old, chunk->bloom_uri, cfg));
+ __wt_free(session, old);
+ }
+ }
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_ERR(__wt_metadata_remove(session, olduri));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (old != NULL)
+ __wt_free(session, old);
+ /*
+ * Discard this LSM tree structure. The first operation on the renamed
+ * tree will create a new one.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_truncate --
+ * Truncate an LSM tree.
+ */
+int
+__wt_lsm_tree_truncate(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int locked;
+
+ WT_UNUSED(cfg);
+ chunk = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Create the new chunk. */
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ /* Mark all chunks old. */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, 0, lsm_tree->nchunks, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (ret != 0) {
+ if (chunk != NULL) {
+ (void)__wt_schema_drop(session, chunk->uri, NULL);
+ __wt_free(session, chunk);
+ }
+ /*
+ * Discard the LSM tree structure on error. This will force the
+ * LSM tree to be re-opened the next time it is accessed and
+ * the last good version of the metadata will be used, resulting
+ * in a valid (not truncated) tree.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_readlock --
+ * Acquire a shared lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_RET(__wt_readlock(session, lsm_tree->rwlock));
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for
+ * an operation, we should already have it.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_readunlock --
+ * Release a shared lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_writelock --
+ * Acquire an exclusive lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_RET(__wt_writelock(session, lsm_tree->rwlock));
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for
+ * an operation, we should already have it.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_writeunlock --
+ * Release an exclusive lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_compact --
+ * Compact an LSM tree called via __wt_schema_worker.
+ */
+int
+__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ time_t begin, end;
+ uint64_t progress;
+ int i, compacting, flushing, locked, ref;
+
+ compacting = flushing = locked = ref = 0;
+ chunk = NULL;
+ /*
+ * This function is applied to all matching sources: ignore anything
+ * that is not an LSM tree.
+ */
+ if (!WT_PREFIX_MATCH(name, "lsm:"))
+ return (0);
+
+ /* Tell __wt_schema_worker not to look inside the LSM tree. */
+ *skip = 1;
+
+ WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));
+
+ if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
+ WT_ERR_MSG(session, EINVAL,
+ "LSM compaction requires active merge threads");
+
+ WT_ERR(__wt_seconds(session, &begin));
+
+ /*
+ * Compacting has two distinct phases.
+	 * 1. All in-memory chunks up to and including the current
+	 *    chunk must be flushed. Normally, the flush code
+ * does not flush the last, in-use chunk, so we set a force
+ * flag to include that last chunk. We monitor the state of the
+ * last chunk and periodically push another forced flush work
+ * unit until it is complete.
+ * 2. After all flushing is done, we move onto the merging
+ * phase for compaction. Again, we monitor the state and
+ * continue to push merge work units until all merging is done.
+ */
+
+ /* Lock the tree: single-thread compaction. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Clear any merge throttle: compact throws out that calculation. */
+ lsm_tree->merge_throttle = 0;
+ lsm_tree->merge_aggressiveness = 0;
+ progress = lsm_tree->merge_progressing;
+
+ /* If another thread started a compact on this tree, we're done. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ goto err;
+
+ /*
+ * Set the switch transaction on the current chunk, if it
+ * hasn't been set before. This prevents further writes, so it
+ * can be flushed by the checkpoint worker.
+ */
+ if (lsm_tree->nchunks > 0 &&
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
+ if (chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+ /*
+ * If we have a chunk, we want to look for it to be on-disk.
+ * So we need to add a reference to keep it available.
+ */
+ (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ ref = 1;
+ }
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (chunk != NULL) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact force flush %s flags 0x%" PRIx32
+ " chunk %u flags 0x%"
+ PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
+ flushing = 1;
+ /*
+		 * Make sure the in-memory chunk gets flushed; do not push a
+ * switch, because we don't want to create a new in-memory
+ * chunk if the tree is being used read-only now.
+ */
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
+ } else {
+ /*
+ * If there is no chunk to flush, go straight to the
+ * compacting state.
+ */
+ compacting = 1;
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "COMPACT: Start compacting %s", lsm_tree->name));
+ }
+
+ /* Wait for the work unit queues to drain. */
+ while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+ /*
+ * The flush flag is cleared when the chunk has been flushed.
+ * Continue to push forced flushes until the chunk is on disk.
+ * Once it is on disk move to the compacting phase.
+ */
+ if (flushing) {
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_ERR(__wt_verbose(session,
+ WT_VERB_LSM,
+ "Compact flush done %s chunk %u. "
+ "Start compacting progress %" PRIu64,
+ name, chunk->id,
+ lsm_tree->merge_progressing));
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ flushing = ref = 0;
+ compacting = 1;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ progress = lsm_tree->merge_progressing;
+ } else {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact flush retry %s chunk %u",
+ name, chunk->id));
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
+ lsm_tree));
+ }
+ }
+
+		/*
+		 * The compacting flag is cleared when no merges can be done.
+		 * Ensure that we push through some aggressive merges before
+		 * stopping; otherwise we might not do merges that would
+		 * span chunks with different generations.
+		 */
+ if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
+ if (lsm_tree->merge_aggressiveness < 10 ||
+ (progress < lsm_tree->merge_progressing) ||
+ lsm_tree->merge_syncing) {
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 10;
+ } else
+ break;
+ }
+ __wt_sleep(1, 0);
+ WT_ERR(__wt_seconds(session, &end));
+ if (session->compact->max_time > 0 &&
+ session->compact->max_time < (uint64_t)(end - begin)) {
+ WT_ERR(ETIMEDOUT);
+ }
+ /*
+ * Push merge operations while they are still getting work
+ * done. If we are pushing merges, make sure they are
+ * aggressive, to avoid duplicating effort.
+ */
+ if (compacting)
+#define COMPACT_PARALLEL_MERGES 5
+ for (i = lsm_tree->queue_ref;
+ i < COMPACT_PARALLEL_MERGES; i++) {
+ lsm_tree->merge_aggressiveness = 10;
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
+err:
+ /* Ensure anything we set is cleared. */
+ if (ref)
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ if (compacting) {
+ F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 0;
+ }
+ if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Compact %s complete, return %d", name, ret));
+
+ __wt_lsm_tree_release(session, lsm_tree);
+	return (ret);
+}
+
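+/*
+ * An illustrative sketch, not part of this change: __wt_lsm_compact is
+ * normally reached through __wt_schema_worker when an application compacts
+ * an LSM URI via the public API, and the "timeout" configuration feeds the
+ * session->compact->max_time check in the wait loop above. For example:
+ *
+ *	WT_SESSION *wt_session;
+ *	...
+ *	ret = wt_session->compact(wt_session, "lsm:example", "timeout=120");
+ */
+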
+/*
+ * __wt_lsm_tree_worker --
+ *	Run a schema worker operation on each level of an LSM tree.
+ */
+int
+__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int exclusive, locked;
+
+ locked = 0;
+ exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
+
+	/*
+	 * Mark that we're busy using the tree to coordinate with
+	 * merges, so that merging doesn't change the chunk array out
+	 * from underneath us.
+	 */
+ WT_ERR(exclusive ?
+ __wt_lsm_tree_writelock(session, lsm_tree) :
+ __wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (file_func == __wt_checkpoint &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ continue;
+ WT_ERR(__wt_schema_worker(session, chunk->uri,
+ file_func, name_func, cfg, open_flags));
+ if (name_func == __wt_backup_list_uri_append &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
+ file_func, name_func, cfg, open_flags));
+ }
+err: if (locked)
+ WT_TRET(exclusive ?
+ __wt_lsm_tree_writeunlock(session, lsm_tree) :
+ __wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
+}
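+
+/*
+ * Usage sketch (illustrative only): the schema layer fans per-chunk
+ * operations out through this helper; for example, checkpointing every
+ * file backing a tree might look like:
+ *
+ *	ret = __wt_lsm_tree_worker(
+ *	    session, "lsm:example", __wt_checkpoint, NULL, cfg, 0);
+ */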
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
new file mode 100644
index 00000000000..278c400070f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_bloom_create(
+ WT_SESSION_IMPL *, WT_LSM_TREE *, WT_LSM_CHUNK *, u_int);
+static int __lsm_discard_handle(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __lsm_copy_chunks --
+ * Take a copy of part of the LSM tree chunk array so that we can work on
+ * the contents without holding the LSM tree handle lock long term.
+ */
+static int
+__lsm_copy_chunks(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
+{
+ WT_DECL_RET;
+ u_int i, nchunks;
+ size_t alloc;
+
+ /* Always return zero chunks on error. */
+ cookie->nchunks = 0;
+
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /* Take a copy of the current state of the LSM tree. */
+ nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
+ alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;
+
+ /*
+ * If the tree array of active chunks is larger than our current buffer,
+ * increase the size of our current buffer to match.
+ */
+ if (cookie->chunk_alloc < alloc)
+ WT_ERR(__wt_realloc(session,
+ &cookie->chunk_alloc, alloc, &cookie->chunk_array));
+ if (nchunks > 0)
+ memcpy(cookie->chunk_array,
+ old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
+ nchunks * sizeof(*cookie->chunk_array));
+
+ /*
+ * Mark each chunk as active, so we don't drop it until after we know
+ * it's safe.
+ */
+ for (i = 0; i < nchunks; i++)
+ (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1);
+
+err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ if (ret == 0)
+ cookie->nchunks = nchunks;
+ return (ret);
+}
+
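+/*
+ * A minimal usage sketch, mirroring the callers later in this file: a
+ * chunk-array copy is paired with __lsm_unpin_chunks and freeing the
+ * array, including on error paths:
+ *
+ *	WT_CLEAR(cookie);
+ *	WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));
+ *	...operate on cookie.chunk_array[0 .. cookie.nchunks - 1]...
+ * err:	__lsm_unpin_chunks(session, &cookie);
+ *	__wt_free(session, cookie.chunk_array);
+ */
+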
+/*
+ * __wt_lsm_get_chunk_to_flush --
+ * Find and pin a chunk in the LSM tree that is likely to need flushing.
+ */
+int
+__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp)
+{
+ u_int i, end;
+
+ *chunkp = NULL;
+
+ WT_ASSERT(session, lsm_tree->queue_ref > 0);
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * Normally we don't want to force out the last chunk. But if we're
+ * doing a forced flush, likely from a compact call, then we want
+ * to include the final chunk.
+ */
+ end = force ? lsm_tree->nchunks : lsm_tree->nchunks - 1;
+ for (i = 0; i < end; i++) {
+ if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK)) {
+ (void)WT_ATOMIC_ADD4(lsm_tree->chunk[i]->refcnt, 1);
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Flush%s: return chunk %u of %u: %s",
+ force ? " w/ force" : "", i, end - 1,
+ lsm_tree->chunk[i]->uri));
+ *chunkp = lsm_tree->chunk[i];
+ break;
+ }
+ }
+
+ WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ return (0);
+}
+
+/*
+ * __lsm_unpin_chunks --
+ *	Decrement the reference count for a set of chunks, allowing those
+ *	chunks to be considered for deletion.
+ */
+static void
+__lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie)
+{
+ u_int i;
+
+ for (i = 0; i < cookie->nchunks; i++) {
+ if (cookie->chunk_array[i] == NULL)
+ continue;
+ WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1);
+ }
+ /* Ensure subsequent calls don't double decrement. */
+ cookie->nchunks = 0;
+}
+
+/*
+ * __wt_lsm_work_switch --
+ * Do a switch if the LSM tree needs one.
+ */
+int
+__wt_lsm_work_switch(
+ WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran)
+{
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+
+ /* We've become responsible for freeing the work unit. */
+ entry = *entryp;
+ *ran = 0;
+ *entryp = NULL;
+
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, entry->lsm_tree));
+ /* Failing to complete the switch is fine */
+ if (ret == EBUSY) {
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_SWITCH, 0, entry->lsm_tree));
+ ret = 0;
+ } else
+ *ran = 1;
+ }
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_work_bloom --
+ * Try to create a Bloom filter for the newest on-disk chunk that doesn't
+ * have one.
+ */
+int
+__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i, merge;
+
+ WT_CLEAR(cookie);
+
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));
+
+ /* Create bloom filters in all checkpointed chunks. */
+ merge = 0;
+ for (i = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
+
+ /*
+ * Skip if a thread is still active in the chunk or it
+ * isn't suitable.
+ */
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
+ chunk->generation > 0 ||
+ chunk->count == 0)
+ continue;
+
+ /*
+ * See if we win the race to switch on the "busy" flag and
+ * recheck that the chunk still needs a Bloom filter.
+ */
+ if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) {
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ ret = __lsm_bloom_create(
+ session, lsm_tree, chunk, (u_int)i);
+ /*
+ * Record if we were successful so that we can
+ * later push a merge work unit.
+ */
+ if (ret == 0)
+ merge = 1;
+ }
+ chunk->bloom_busy = 0;
+ break;
+ }
+ }
+ /*
+ * If we created any bloom filters, we push a merge work unit now.
+ */
+ if (merge)
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+
+err:
+ __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_checkpoint_chunk --
+ * Flush a single LSM chunk to disk.
+ */
+int
+__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ WT_DECL_RET;
+ WT_TXN_ISOLATION saved_isolation;
+
+ /*
+ * If the chunk is already checkpointed, make sure it is also evicted.
+ * Either way, there is no point trying to checkpoint it again.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
+ !chunk->evicted) {
+ if ((ret = __lsm_discard_handle(
+ session, chunk->uri, NULL)) == 0)
+ chunk->evicted = 1;
+ else if (ret == EBUSY)
+ ret = 0;
+ else
+ WT_RET_MSG(session, ret, "discard handle");
+ }
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s already on disk",
+ chunk->uri));
+ return (0);
+ }
+
+ /* Stop if a running transaction needs the chunk. */
+ __wt_txn_update_oldest(session);
+ if (chunk->switch_txn == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, chunk->switch_txn)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s: running transaction, return",
+ chunk->uri));
+ return (0);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
+ chunk->uri));
+
+ /*
+ * Flush the file before checkpointing: this is the expensive part in
+ * terms of I/O.
+ *
+ * Use the special eviction isolation level to avoid interfering with
+ * an application checkpoint: we have already checked that all of the
+ * updates in this chunk are globally visible.
+ *
+ * !!! We can wait here for checkpoints and fsyncs to complete, which
+ * can be a long time.
+ */
+ if ((ret = __wt_session_get_btree(
+ session, chunk->uri, NULL, NULL, 0)) == 0) {
+ saved_isolation = session->txn.isolation;
+ session->txn.isolation = TXN_ISO_EVICTION;
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+ session->txn.isolation = saved_isolation;
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ WT_RET(ret);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
+ chunk->uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, chunk->uri,
+ __wt_checkpoint, NULL, NULL, 0));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM checkpoint");
+
+ /* Now the file is written, get the chunk size. */
+ WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));
+
+ /* Update the flush timestamp to help track ongoing progress. */
+ WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+ /* Lock the tree, mark the chunk as on disk and update the metadata. */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM metadata write");
+
+ /*
+ * Clear the "cache resident" flag so the primary can be evicted and
+ * eventually closed. Only do this once the checkpoint has succeeded:
+ * otherwise, accessing the leaf page during the checkpoint can trigger
+ * forced eviction.
+ */
+ WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
+ __wt_btree_evictable(session, 1);
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Make sure we aren't pinning a transaction ID. */
+ __wt_txn_release_snapshot(session);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
+ chunk->uri));
+	/* Schedule a bloom filter create for our newly flushed chunk. */
+ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
+ else
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ return (0);
+}
+
+/*
+ * __lsm_bloom_create --
+ * Create a bloom filter for a chunk of the LSM tree that has been
+ * checkpointed but not yet been merged.
+ */
+static int
+__lsm_bloom_create(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *src;
+ WT_DECL_RET;
+ WT_ITEM key;
+ WT_SESSION *wt_session;
+ uint64_t insert_count;
+ int exist;
+
+ /*
+ * Normally, the Bloom URI is populated when the chunk struct is
+ * allocated. After an open, however, it may not have been.
+ * Deal with that here.
+ */
+ if (chunk->bloom_uri == NULL)
+ WT_RET(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ /*
+ * Drop the bloom filter first - there may be some content hanging over
+ * from an aborted merge or checkpoint.
+ */
+ wt_session = &session->iface;
+ WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
+ if (exist)
+ WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));
+
+ bloom = NULL;
+ /*
+ * This is merge-like activity, and we don't want compacts to give up
+ * because we are creating a bunch of bloom filters before merging.
+ */
+ ++lsm_tree->merge_progressing;
+ WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config, chunk->count,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));
+
+ /* Open a special merge cursor just on this chunk. */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));
+
+ F_SET(session, WT_SESSION_NO_CACHE);
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ WT_ERR(src->get_key(src, &key));
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ WT_TRET(src->close(src));
+
+ WT_TRET(__wt_bloom_finalize(bloom));
+ WT_ERR(ret);
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * Load the new Bloom filter into cache.
+ *
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ WT_CLEAR(key);
+ WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker created bloom filter %s. "
+ "Expected %" PRIu64 " items, got %" PRIu64,
+ chunk->bloom_uri, chunk->count, insert_count));
+
+ /* Ensure the bloom filter is in the metadata. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");
+
+err: if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
+
+/*
+ * __lsm_discard_handle --
+ * Try to discard a handle from cache.
+ */
+static int
+__lsm_discard_handle(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
+{
+ /* This will fail with EBUSY if the file is still in use. */
+ WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL,
+ WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+ F_SET(session->dhandle, WT_DHANDLE_DISCARD);
+ return (__wt_session_release_btree(session));
+}
+
+/*
+ * __lsm_drop_file --
+ * Helper function to drop part of an LSM tree.
+ */
+static int
+__lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_DECL_RET;
+ const char *drop_cfg[] = {
+ WT_CONFIG_BASE(session, session_drop), "remove_files=false", NULL
+ };
+
+ /*
+ * We need to grab the schema lock to drop the file, so first try to
+ * make sure there is minimal work to freeing space in the cache. Only
+ * bother trying to discard the checkpoint handle: the in-memory handle
+ * should have been closed already.
+ *
+ * This will fail with EBUSY if the file is still in use.
+ */
+ WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT));
+
+ /*
+	 * Take the schema lock for the drop operation: __wt_schema_drop
+	 * results in the hot backup lock being taken when it updates the
+	 * metadata (which would be too late to prevent our drop).
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_drop(session, uri, drop_cfg));
+
+ if (ret == 0)
+ ret = __wt_remove(session, uri + strlen("file:"));
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));
+
+ if (ret == EBUSY || ret == ENOENT)
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker drop of %s failed with %d", uri, ret));
+
+ return (ret);
+}
+
+/*
+ * __wt_lsm_free_chunks --
+ * Try to drop chunks from the tree that are no longer required.
+ */
+int
+__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i, skipped;
+ int flush_metadata, drop_ret;
+
+ flush_metadata = 0;
+
+ if (lsm_tree->nold_chunks == 0)
+ return (0);
+
+ /*
+ * Make sure only a single thread is freeing the old chunk array
+ * at any time.
+ */
+ if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1))
+ return (0);
+ /*
+ * Take a copy of the current state of the LSM tree and look for chunks
+ * to drop. We do it this way to avoid holding the LSM tree lock while
+ * doing I/O or waiting on the schema lock.
+ *
+ * This is safe because only one thread will be in this function at a
+ * time. Merges may complete concurrently, and the old_chunks array
+ * may be extended, but we shuffle down the pointers each time we free
+ * one to keep the non-NULL slots at the beginning of the array.
+ */
+ WT_CLEAR(cookie);
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
+ for (i = skipped = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
+ WT_ASSERT(session, chunk != NULL);
+ /* Skip the chunk if another worker is using it. */
+ if (chunk->refcnt > 1) {
+ ++skipped;
+ continue;
+ }
+
+ /*
+ * Don't remove files if a hot backup is in progress.
+ *
+ * The schema lock protects the set of live files, this check
+ * prevents us from removing a file that hot backup already
+ * knows about.
+ */
+ if (S2C(session)->hot_backup != 0)
+ break;
+
+		/*
+		 * Drop any bloom filters and chunks we can. Don't try to
+		 * drop a chunk if the bloom filter drop fails.
+		 * An EBUSY return indicates that a cursor is still open in
+		 * the tree - move to the next chunk in that case.
+		 * An ENOENT return indicates that the LSM tree metadata was
+		 * out of sync with the on-disk state; update the metadata
+		 * to match in that case.
+		 */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
+ if (drop_ret == EBUSY) {
+ ++skipped;
+ continue;
+ } else if (drop_ret != ENOENT)
+ WT_ERR(drop_ret);
+
+ flush_metadata = 1;
+ F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
+ }
+ if (chunk->uri != NULL) {
+ drop_ret = __lsm_drop_file(session, chunk->uri);
+ if (drop_ret == EBUSY) {
+ ++skipped;
+ continue;
+ } else if (drop_ret != ENOENT)
+ WT_ERR(drop_ret);
+ flush_metadata = 1;
+ }
+
+ /* Lock the tree to clear out the old chunk information. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ /*
+ * The chunk we are looking at should be the first one in the
+ * tree that we haven't already skipped over.
+ */
+ WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, lsm_tree->old_chunks[skipped]);
+
+ /* Shuffle down to keep all occupied slots at the beginning. */
+ if (--lsm_tree->nold_chunks > skipped) {
+ memmove(lsm_tree->old_chunks + skipped,
+ lsm_tree->old_chunks + skipped + 1,
+ (lsm_tree->nold_chunks - skipped) *
+ sizeof(WT_LSM_CHUNK *));
+ lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
+ }
+
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ /*
+ * Clear the chunk in the cookie so we don't attempt to
+ * decrement the reference count.
+ */
+ cookie.chunk_array[i] = NULL;
+ }
+
+err:	/* Flush the metadata unless the system is in a panic state. */
+ if (flush_metadata && ret != WT_PANIC) {
+ WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
+ WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ }
+ __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ lsm_tree->freeing_old_chunks = 0;
+
+ /* Returning non-zero means there is no work to do. */
+ if (!flush_metadata)
+ WT_TRET(WT_NOTFOUND);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
new file mode 100644
index 00000000000..f24e58148b1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_worker_general_op(
+ WT_SESSION_IMPL *, WT_LSM_WORKER_ARGS *, int *);
+static void * __lsm_worker(void *);
+
+/*
+ * __wt_lsm_worker_start --
+ * A wrapper around the LSM worker thread start.
+ */
+int
+__wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args)
+{
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Start LSM worker %d type 0x%x", args->id, args->type));
+ return (__wt_thread_create(session, &args->tid, __lsm_worker, args));
+}
+
+/*
+ * __lsm_worker_general_op --
+ * Execute a single bloom, drop or flush work unit.
+ */
+static int
+__lsm_worker_general_op(
+ WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *cookie, int *completed)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORK_UNIT *entry;
+ int force;
+
+ *completed = 0;
+ /*
+ * Return if this thread cannot process a bloom, drop or flush.
+ */
+ if (!FLD_ISSET(cookie->type,
+ WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH))
+ return (WT_NOTFOUND);
+
+ if ((ret = __wt_lsm_manager_pop_entry(session,
+ cookie->type, &entry)) != 0 || entry == NULL)
+ return (ret);
+
+ if (entry->type == WT_LSM_WORK_FLUSH) {
+ force = F_ISSET(entry, WT_LSM_WORK_FORCE);
+ F_CLR(entry, WT_LSM_WORK_FORCE);
+ WT_ERR(__wt_lsm_get_chunk_to_flush(session,
+ entry->lsm_tree, force, &chunk));
+ /*
+ * If we got a chunk to flush, checkpoint it.
+ */
+ if (chunk != NULL) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Flush%s chunk %d %s",
+ force ? " w/ force" : "",
+ chunk->id, chunk->uri));
+ ret = __wt_lsm_checkpoint_chunk(
+ session, entry->lsm_tree, chunk);
+ WT_ASSERT(session, chunk->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ WT_ERR(ret);
+ }
+ } else if (entry->type == WT_LSM_WORK_DROP)
+ WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree));
+ else if (entry->type == WT_LSM_WORK_BLOOM)
+ WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree));
+ *completed = 1;
+
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ return (ret);
+}
+
+/*
+ * __lsm_worker --
+ * A thread that executes work units for all open LSM trees.
+ */
+static void *
+__lsm_worker(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+ WT_LSM_WORKER_ARGS *cookie;
+ WT_SESSION_IMPL *session;
+ int progress, ran;
+
+ cookie = (WT_LSM_WORKER_ARGS *)arg;
+ session = cookie->session;
+ conn = S2C(session);
+
+ entry = NULL;
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(cookie, WT_LSM_WORKER_RUN)) {
+ progress = 0;
+
+		/*
+		 * Workers process the different LSM work queues. Some
+		 * workers can handle several or all work unit types, so the
+		 * code is ordered to make the most important operations
+		 * happen first. Switches are the highest priority.
+		 */
+ while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_SWITCH, &entry)) == 0 &&
+ entry != NULL)
+ WT_ERR(
+ __wt_lsm_work_switch(session, &entry, &progress));
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /*
+ * Next the general operations.
+ */
+ ret = __lsm_worker_general_op(session, cookie, &ran);
+ if (ret == EBUSY || ret == WT_NOTFOUND)
+ ret = 0;
+ WT_ERR(ret);
+ progress = progress || ran;
+
+ /*
+ * Finally see if there is any merge work we can do. This is
+ * last because the earlier operations may result in adding
+ * merge work to the queue.
+ */
+ if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_MERGE, &entry)) == 0 &&
+ entry != NULL) {
+ WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE);
+ ret = __wt_lsm_merge(session,
+ entry->lsm_tree, cookie->id);
+ if (ret == WT_NOTFOUND) {
+ F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING);
+ ret = 0;
+ } else if (ret == EBUSY)
+ ret = 0;
+ /* Clear any state */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ __wt_lsm_manager_free_work_unit(session, entry);
+ entry = NULL;
+ progress = 1;
+ }
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /* Don't busy wait if there was any work to do. */
+ if (!progress) {
+ WT_ERR(
+ __wt_cond_wait(session, cookie->work_cond, 10000));
+ continue;
+ }
+ }
+
+ if (ret != 0) {
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ __wt_err(session, ret,
+ "Error in LSM worker thread %d", cookie->id);
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c
new file mode 100644
index 00000000000..313516148c0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_apply.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_btree_apply --
+ * Apply a function to all files listed in the metadata, apart from the
+ * metadata file.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CURSOR *cursor;
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *uri;
+ int cmp, tret;
+
+ saved_dhandle = session->dhandle;
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, "file:");
+ if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ tret = cursor->next(cursor);
+ for (; tret == 0; tret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ continue;
+
+		/*
+		 * We need to pull the handle into the session handle cache
+		 * and make sure it's referenced to stop other internal code
+		 * dropping the handle (e.g., in LSM when cleaning up
+		 * obsolete chunks). Holding the metadata lock isn't enough.
+		 */
+ ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ if (WT_META_TRACKING(session))
+ WT_TRET(
+ __wt_meta_track_handle_lock(session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(session));
+ } else if (ret == EBUSY)
+ ret = __wt_conn_btree_apply_single(
+ session, uri, NULL, func, cfg);
+ WT_ERR(ret);
+ }
+
+ if (tret != WT_NOTFOUND)
+ WT_TRET(tret);
+err: WT_TRET(cursor->close(cursor));
+ session->dhandle = saved_dhandle;
+ return (ret);
+}
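+
+/*
+ * Usage sketch (illustrative only): callers run a per-file callback with
+ * the schema lock held, e.g.:
+ *
+ *	WT_WITH_SCHEMA_LOCK(session,
+ *	    ret = __wt_meta_btree_apply(session, __wt_checkpoint, cfg));
+ */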
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
new file mode 100644
index 00000000000..998ae7e0d02
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -0,0 +1,528 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *);
+static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
+static int __ckpt_load(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *);
+static int __ckpt_named(
+ WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
+static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *);
+static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __wt_meta_checkpoint --
+ * Return a file's checkpoint information.
+ */
+int
+__wt_meta_checkpoint(WT_SESSION_IMPL *session,
+ const char *fname, const char *checkpoint, WT_CKPT *ckpt)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+	/*
+	 * Retrieve the named checkpoint or the last checkpoint.
+	 *
+	 * If we don't find a named checkpoint, we're done: named checkpoints
+	 * are read-only. If we don't find a default checkpoint, the file is
+	 * being created; return "no data" and let our caller handle it.
+	 */
+ if (checkpoint == NULL) {
+ if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) {
+ ret = 0;
+ ckpt->addr.data = ckpt->raw.data = NULL;
+ ckpt->addr.size = ckpt->raw.size = 0;
+ }
+ } else
+ WT_ERR(__ckpt_named(session, checkpoint, config, ckpt));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_last_name --
+ * Return the last unnamed checkpoint's name.
+ */
+int
+__wt_meta_checkpoint_last_name(
+ WT_SESSION_IMPL *session, const char *fname, const char **namep)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+ /* Retrieve the name of the last unnamed checkpoint. */
+ WT_ERR(__ckpt_last_name(session, config, namep));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_clear --
+ * Clear a file's checkpoint.
+ */
+int
+__wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname)
+{
+ /*
+ * If we are unrolling a failed create, we may have already removed the
+ * metadata entry. If no entry is found to update and we're trying to
+ * clear the checkpoint, just ignore it.
+ */
+ WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL));
+
+ return (0);
+}
+
+/*
+ * __ckpt_set --
+ * Set a file's checkpoint.
+ */
+static int
+__ckpt_set(WT_SESSION_IMPL *session, const char *fname, const char *v)
+{
+ WT_DECL_RET;
+ const char *config, *cfg[3], *newcfg;
+
+ config = newcfg = NULL;
+
+ /* Retrieve the metadata for this file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Replace the checkpoint entry. */
+ cfg[0] = config;
+ cfg[1] = v == NULL ? "checkpoint=()" : v;
+ cfg[2] = NULL;
+ WT_ERR(__wt_config_collapse(session, cfg, &newcfg));
+ WT_ERR(__wt_metadata_update(session, fname, newcfg));
+
+err: __wt_free(session, config);
+ __wt_free(session, newcfg);
+ return (ret);
+}
+
+/*
+ * __ckpt_named --
+ * Return the information associated with a file's named checkpoint.
+ */
+static int
+__ckpt_named(WT_SESSION_IMPL *session,
+ const char *checkpoint, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+
+ /*
+ * Take the first match: there should never be more than a single
+ * checkpoint of any name.
+ */
+ while (__wt_config_next(&ckptconf, &k, &v) == 0)
+ if (WT_STRING_MATCH(checkpoint, k.str, k.len))
+ return (__ckpt_load(session, &k, &v, ckpt));
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last --
+ * Return the information associated with the file's last checkpoint.
+ */
+static int
+__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ int64_t found;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+ for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+ /* Ignore checkpoints before the ones we've already seen. */
+ WT_RET(__wt_config_subgets(session, &v, "order", &a));
+ if (found) {
+ if (a.val < found)
+ continue;
+ __wt_meta_checkpoint_free(session, ckpt);
+ }
+ found = a.val;
+ WT_RET(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last_name --
+ * Return the name associated with the file's last unnamed checkpoint.
+ */
+static int
+__ckpt_last_name(
+ WT_SESSION_IMPL *session, const char *config, const char **namep)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ WT_DECL_RET;
+ int64_t found;
+
+ *namep = NULL;
+
+ WT_ERR(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_ERR(__wt_config_subinit(session, &ckptconf, &v));
+ for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+		/*
+		 * We only care about unnamed checkpoints; applications may
+		 * not use any matching prefix as a checkpoint name, so the
+		 * comparison is pretty simple.
+		 */
+ if (k.len < strlen(WT_CHECKPOINT) ||
+ strncmp(k.str, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0)
+ continue;
+
+ /* Ignore checkpoints before the ones we've already seen. */
+ WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+ if (found && a.val < found)
+ continue;
+
+ if (*namep != NULL)
+ __wt_free(session, *namep);
+ WT_ERR(__wt_strndup(session, k.str, k.len, namep));
+ found = a.val;
+ }
+ if (!found)
+ ret = WT_NOTFOUND;
+
+ if (0) {
+err:	__wt_free(session, *namep);
+ }
+ return (ret);
+}
+
+/*
+ * __ckpt_compare_order --
+ * Qsort comparison routine for the checkpoint list.
+ */
+static int
+__ckpt_compare_order(const void *a, const void *b)
+{
+ WT_CKPT *ackpt, *bckpt;
+
+ ackpt = (WT_CKPT *)a;
+ bckpt = (WT_CKPT *)b;
+
+ return (ackpt->order > bckpt->order ? 1 : -1);
+}
+
+/*
+ * __wt_meta_ckptlist_get --
+ * Load all available checkpoint information for a file.
+ */
+int
+__wt_meta_ckptlist_get(
+ WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
+{
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ size_t allocated, slot;
+ const char *config;
+
+ *ckptbasep = NULL;
+
+ ckptbase = NULL;
+ allocated = slot = 0;
+ config = NULL;
+
+ /* Retrieve the metadata information for the file. */
+ WT_RET(__wt_metadata_search(session, fname, &config));
+
+ /* Load any existing checkpoints into the array. */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ if (__wt_config_getones(session, config, "checkpoint", &v) == 0 &&
+ __wt_config_subinit(session, &ckptconf, &v) == 0)
+ for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) {
+ WT_ERR(__wt_realloc_def(
+ session, &allocated, slot + 1, &ckptbase));
+ ckpt = &ckptbase[slot];
+
+ WT_ERR(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ /*
+ * Allocate an extra slot for a new value, plus a slot to mark the end.
+ *
+ * This isn't very clean, but there's necessary cooperation between the
+ * schema layer (that maintains the list of checkpoints), the btree
+ * layer (that knows when the root page is written, creating a new
+ * checkpoint), and the block manager (which actually creates the
+ * checkpoint). All of that cooperation is handled in the WT_CKPT
+ * structure referenced from the WT_BTREE structure.
+ */
+ WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase));
+
+ /* Sort in creation-order. */
+ qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
+
+ /* Return the array to our caller. */
+ *ckptbasep = ckptbase;
+
+ if (0) {
+err: __wt_meta_ckptlist_free(session, ckptbase);
+ }
+ __wt_free(session, config);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_load --
+ * Load a single checkpoint's information into a WT_CKPT structure.
+ */
+static int
+__ckpt_load(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt)
+{
+ WT_CONFIG_ITEM a;
+ char timebuf[64];
+
+ /*
+ * Copy the name, address (raw and hex), order and time into the slot.
+ * If there's no address, it's a fake.
+ */
+ WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name));
+
+ WT_RET(__wt_config_subgets(session, v, "addr", &a));
+ WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len));
+ if (a.len == 0)
+ F_SET(ckpt, WT_CKPT_FAKE);
+ else
+ WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw));
+
+ WT_RET(__wt_config_subgets(session, v, "order", &a));
+ if (a.len == 0)
+ goto format;
+ ckpt->order = a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "time", &a));
+ if (a.len == 0 || a.len > sizeof(timebuf) - 1)
+ goto format;
+ memcpy(timebuf, a.str, a.len);
+ timebuf[a.len] = '\0';
+ if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
+ goto format;
+
+ WT_RET(__wt_config_subgets(session, v, "size", &a));
+ ckpt->ckpt_size = (uint64_t)a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "write_gen", &a));
+ if (a.len == 0)
+ goto format;
+ /*
+ * The largest value a WT_CONFIG_ITEM can handle is signed: this value
+ * appears on disk and I don't want to sign it there, so I'm casting it
+ * here instead.
+ */
+ ckpt->write_gen = (uint64_t)a.val;
+
+ return (0);
+
+format:
+ WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list");
+}
+
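+/*
+ * For reference, a sketch of the metadata value parsed here (the format is
+ * taken from __wt_meta_ckptlist_set below; the values are invented):
+ *
+ *	checkpoint=(WiredTigerCheckpoint.3=(addr="0181e4...",order=3,
+ *	    time=1400000000,size=8192,write_gen=7))
+ *
+ * __ckpt_load fills a WT_CKPT from one name/value pair in that list.
+ */
+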
+/*
+ * __wt_meta_ckptlist_set --
+ * Set a file's checkpoint value from the WT_CKPT list.
+ */
+int
+__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
+ const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
+{
+ WT_CKPT *ckpt;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ time_t secs;
+ int64_t maxorder;
+ const char *sep;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ maxorder = 0;
+ sep = "";
+ WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=("));
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Each internal checkpoint name is appended with a generation
+ * to make it a unique name. We're solving two problems: when
+ * two checkpoints are taken quickly, the timer may not be
+ * unique and/or we can even see time travel on the second
+ * checkpoint if we snapshot the time in-between nanoseconds
+ * rolling over. Second, if we reset the generational counter
+ * when new checkpoints arrive, we could logically re-create
+ * specific checkpoints, racing with cursors open on those
+ * checkpoints. I can't think of any way to return incorrect
+ * results by racing with those cursors, but it's simpler not
+ * to worry about it.
+ */
+ if (ckpt->order > maxorder)
+ maxorder = ckpt->order;
+
+ /* Skip deleted checkpoints. */
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
+ /*
+ * We fake checkpoints for handles in the middle of a
+ * bulk load. If there is a checkpoint, convert the
+ * raw cookie to a hex string.
+ */
+ if (ckpt->raw.size == 0)
+ ckpt->addr.size = 0;
+ else
+ WT_ERR(__wt_raw_to_hex(session,
+ ckpt->raw.data,
+ ckpt->raw.size, &ckpt->addr));
+
+ /* Set the order and timestamp. */
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->order = ++maxorder;
+
+			/*
+			 * XXX
+			 * Assumes a time_t fits into a uintmax_t, which
+			 * isn't guaranteed: a time_t has to be an arithmetic
+			 * type, but not necessarily an integral type.
+			 */
+ WT_ERR(__wt_seconds(session, &secs));
+ ckpt->sec = (uintmax_t)secs;
+ }
+ if (strcmp(ckpt->name, WT_CHECKPOINT) == 0)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name, ckpt->order,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ sep = ",";
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+ if (ckptlsn != NULL)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
+ ckptlsn->file, (uintmax_t)ckptlsn->offset));
+ WT_ERR(__ckpt_set(session, fname, buf->mem));
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_meta_ckptlist_free --
+ * Discard the checkpoint array.
+ */
+void
+__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+
+ if (ckptbase == NULL)
+ return;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ __wt_meta_checkpoint_free(session, ckpt);
+ __wt_free(session, ckptbase);
+}
+
+/*
+ * __wt_meta_checkpoint_free --
+ * Clean up a single checkpoint structure.
+ */
+void
+__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+ if (ckpt == NULL)
+ return;
+
+ __wt_free(session, ckpt->name);
+ __wt_buf_free(session, &ckpt->addr);
+ __wt_buf_free(session, &ckpt->raw);
+ __wt_free(session, ckpt->bpriv);
+
+ WT_CLEAR(*ckpt); /* Clear to prepare for re-use. */
+}
+
+/*
+ * __ckpt_version_chk --
+ * Check the version major/minor numbers.
+ */
+static int
+__ckpt_version_chk(
+ WT_SESSION_IMPL *session, const char *fname, const char *config)
+{
+ WT_CONFIG_ITEM a, v;
+ int majorv, minorv;
+
+ WT_RET(__wt_config_getones(session, config, "version", &v));
+ WT_RET(__wt_config_subgets(session, &v, "major", &a));
+ majorv = (int)a.val;
+ WT_RET(__wt_config_subgets(session, &v, "minor", &a));
+ minorv = (int)a.val;
+
+ if (majorv < WT_BTREE_MAJOR_VERSION_MIN ||
+ majorv > WT_BTREE_MAJOR_VERSION_MAX ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MIN &&
+ minorv < WT_BTREE_MINOR_VERSION_MIN) ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MAX &&
+ minorv > WT_BTREE_MINOR_VERSION_MAX))
+ WT_RET_MSG(session, EACCES,
+ "%s is an unsupported WiredTiger source file version %d.%d"
+ "; this WiredTiger build only supports versions from %d.%d "
+ "to %d.%d",
+ fname,
+ majorv, minorv,
+ WT_BTREE_MAJOR_VERSION_MIN,
+ WT_BTREE_MINOR_VERSION_MIN,
+ WT_BTREE_MAJOR_VERSION_MAX,
+ WT_BTREE_MINOR_VERSION_MAX);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c
new file mode 100644
index 00000000000..b68058a6e91
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ext.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_metadata_insert --
+ * Insert a row into the metadata (external API version).
+ */
+int
+__wt_ext_metadata_insert(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char *value)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_insert(session, key, value));
+}
+
+/*
+ * __wt_ext_metadata_remove --
+ * Remove a row from the metadata (external API version).
+ */
+int
+__wt_ext_metadata_remove(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_remove(session, key));
+}
+
+/*
+ * __wt_ext_metadata_search --
+ * Return a copied row from the metadata (external API version).
+ * The caller is responsible for freeing the allocated memory.
+ */
+int
+__wt_ext_metadata_search(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char **valuep)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_search(session, key, valuep));
+}
+
+/*
+ * __wt_ext_metadata_update --
+ * Update a row in the metadata (external API version).
+ */
+int
+__wt_ext_metadata_update(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char *value)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_update(session, key, value));
+}
+
+/*
+ * __wt_metadata_get_ckptlist --
+ * Public entry point to __wt_meta_ckptlist_get (for wt list).
+ */
+int
+__wt_metadata_get_ckptlist(
+ WT_SESSION *session, const char *name, WT_CKPT **ckptbasep)
+{
+ return (__wt_meta_ckptlist_get(
+ (WT_SESSION_IMPL *)session, name, ckptbasep));
+}
+
+/*
+ * __wt_metadata_free_ckptlist --
+ * Public entry point to __wt_meta_ckptlist_free (for wt list).
+ */
+void
+__wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase)
+{
+ __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase);
+}
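+
+/*
+ * Usage sketch (hedged: assumes these functions are wired into the
+ * WT_EXTENSION_API method table by the connection code, which isn't shown
+ * in this diff): an extension would call, e.g.:
+ *
+ *	ret = wt_api->metadata_insert(wt_api, wt_session, key, value);
+ */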
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
new file mode 100644
index 00000000000..e66ed609952
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_turtle --
+ * Return if a key's value should be taken from the turtle file.
+ */
+static int
+__metadata_turtle(const char *key)
+{
+ switch (key[0]) {
+ case 'f':
+ if (strcmp(key, WT_METAFILE_URI) == 0)
+ return (1);
+ break;
+ case 'W':
+ if (strcmp(key, "WiredTiger version") == 0)
+ return (1);
+ if (strcmp(key, "WiredTiger version string") == 0)
+ return (1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __wt_metadata_open --
+ *	Open the metadata file and set session->metafile.
+ */
+int
+__wt_metadata_open(WT_SESSION_IMPL *session)
+{
+ if (session->metafile != NULL)
+ return (0);
+
+ WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));
+
+ session->metafile = S2BT(session);
+ WT_ASSERT(session, session->metafile != NULL);
+
+ /* The metafile doesn't need to stay locked -- release it. */
+ return (__wt_session_release_btree(session));
+}
+
+/*
+ * __wt_metadata_cursor --
+ *	Open a cursor on the metadata.
+ */
+int
+__wt_metadata_cursor(
+ WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
+{
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_open_cursor), config, NULL };
+
+ saved_dhandle = session->dhandle;
+ WT_ERR(__wt_metadata_open(session));
+
+ WT_SET_BTREE_IN_SESSION(session, session->metafile);
+
+ /*
+ * We use the metadata a lot, so we have a handle cached; lock it and
+ * increment the in-use counter.
+ */
+ WT_ERR(__wt_session_lock_btree(session, 0));
+ __wt_session_dhandle_incr_use(session);
+
+ ret = __wt_curfile_create(session, NULL, cfg, 0, 0, cursorp);
+
+ /* Restore the caller's btree. */
+err: session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __wt_metadata_insert --
+ * Insert a row into the metadata.
+ */
+int
+__wt_metadata_insert(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Insert: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: insert not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_insert(session, key));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_update --
+ * Update a row in the metadata.
+ */
+int
+__wt_metadata_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Update: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_update(session, key, value));
+
+ if (WT_META_TRACKING(session))
+ WT_RET(__wt_meta_track_update(session, key));
+
+ WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_remove --
+ * Remove a row from the metadata.
+ */
+int
+__wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Remove: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: remove not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_update(session, key));
+ WT_ERR(cursor->remove(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_search --
+ * Return a copied row from the metadata.
+ * The caller is responsible for freeing the allocated memory.
+ */
+int
+__wt_metadata_search(
+ WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *value;
+
+ *valuep = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Search: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_read(session, key, valuep));
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &value));
+ WT_ERR(__wt_strdup(session, value, valuep));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
new file mode 100644
index 00000000000..55e61f8d1bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -0,0 +1,365 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
+ * maintained to make it easy to unroll simple metadata and filesystem
+ * operations.
+ */
+typedef struct __wt_meta_track {
+ enum {
+ WT_ST_EMPTY, /* Unused slot */
+ WT_ST_CHECKPOINT, /* Complete a checkpoint */
+ WT_ST_FILEOP, /* File operation */
+ WT_ST_LOCK, /* Lock a handle */
+ WT_ST_REMOVE, /* Remove a metadata entry */
+ WT_ST_SET /* Reset a metadata entry */
+ } op;
+ const char *a, *b; /* Strings */
+ WT_BTREE *btree; /* Locked handle */
+ int created; /* Handle on newly created file */
+} WT_META_TRACK;
+
+/*
+ * __meta_track_next --
+ * Extend the list of operations we're tracking, as necessary, and
+ * optionally return the next slot.
+ */
+static int
+__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
+{
+ size_t offset, sub_off;
+
+ if (session->meta_track_next == NULL)
+ session->meta_track_next = session->meta_track;
+
+ offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
+ sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
+ if (offset == session->meta_track_alloc) {
+ WT_RET(__wt_realloc(session, &session->meta_track_alloc,
+ WT_MAX(2 * session->meta_track_alloc,
+ 20 * sizeof(WT_META_TRACK)), &session->meta_track));
+
+ /* Maintain positions in the new chunk of memory. */
+ session->meta_track_next =
+ (uint8_t *)session->meta_track + offset;
+ if (session->meta_track_sub != NULL)
+ session->meta_track_sub =
+ (uint8_t *)session->meta_track + sub_off;
+ }
+
+ WT_ASSERT(session, session->meta_track_next != NULL);
+
+ if (trkp != NULL) {
+ *trkp = session->meta_track_next;
+ session->meta_track_next = *trkp + 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_discard --
+ * Cleanup metadata tracking when closing a session.
+ */
+void
+__wt_meta_track_discard(WT_SESSION_IMPL *session)
+{
+ __wt_free(session, session->meta_track);
+ session->meta_track_next = NULL;
+ session->meta_track_alloc = 0;
+}
+
+/*
+ * __wt_meta_track_on --
+ * Turn on metadata operation tracking.
+ */
+int
+__wt_meta_track_on(WT_SESSION_IMPL *session)
+{
+ if (session->meta_track_nest++ == 0)
+ WT_RET(__meta_track_next(session, NULL));
+
+ return (0);
+}
+
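+/*
+ * Usage sketch (assumed caller pattern, inferred from the on/off/unroll
+ * API in this file): schema operations bracket their metadata and file
+ * operations so that a failure unrolls them:
+ *
+ *	WT_RET(__wt_meta_track_on(session));
+ *	ret = ...schema and metadata work...;
+ *	WT_TRET(__wt_meta_track_off(session, ret != 0));
+ */
+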
+/*
+ * __meta_track_apply --
+ * Apply the changes in a metadata tracking record.
+ */
+static int
+__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ int tret;
+
+ /*
+ * Unlock handles and complete checkpoints regardless of whether we are
+ * unrolling.
+ */
+ if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
+ goto free;
+
+ switch (trk->op) {
+ case WT_ST_EMPTY: /* Unused slot */
+ break;
+ case WT_ST_CHECKPOINT: /* Checkpoint, see above */
+ if (!unroll) {
+ bm = trk->btree->bm;
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(bm->checkpoint_resolve(bm, session)));
+ }
+ break;
+ case WT_ST_LOCK: /* Handle lock, see above */
+ if (unroll && trk->created)
+ F_SET(trk->btree->dhandle, WT_DHANDLE_DISCARD);
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(__wt_session_release_btree(session)));
+ break;
+ case WT_ST_FILEOP: /* File operation */
+ /*
+ * For renames, both a and b are set.
+ * For creates, a is NULL.
+ * For removes, b is NULL.
+ */
+ if (trk->a != NULL && trk->b != NULL &&
+ (tret = __wt_rename(session,
+ trk->b + strlen("file:"),
+ trk->a + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll rename %s to %s",
+ trk->b, trk->a);
+ WT_TRET(tret);
+ } else if (trk->a == NULL) {
+ if ((tret = __wt_remove(session,
+ trk->b + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll create %s",
+ trk->b);
+ WT_TRET(tret);
+ }
+ }
+ /*
+ * We can't undo removes yet: that would imply
+ * some kind of temporary rename and remove in
+ * roll forward.
+ */
+ break;
+ case WT_ST_REMOVE: /* Remove trk.a */
+ if ((tret = __wt_metadata_remove(session, trk->a)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll remove: %s",
+ trk->a);
+ WT_TRET(tret);
+ }
+ break;
+ case WT_ST_SET: /* Set trk.a to trk.b */
+ if ((tret = __wt_metadata_update(
+ session, trk->a, trk->b)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll update %s to %s",
+ trk->a, trk->b);
+ WT_TRET(tret);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+free:	trk->op = WT_ST_EMPTY;
+ __wt_free(session, trk->a);
+ __wt_free(session, trk->b);
+ trk->btree = NULL;
+
+ return (ret);
+}
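+
+/*
+ * A sketch of how the file-operation cases above are recorded (the file
+ * names are illustrative): renames track both names, creates track only
+ * the new name, removes track only the old name.
+ *
+ *     __wt_meta_track_fileop(session, "file:a.wt", "file:b.wt");  rename
+ *     __wt_meta_track_fileop(session, NULL, "file:b.wt");         create
+ *     __wt_meta_track_fileop(session, "file:a.wt", NULL);         remove
+ */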
+
+/*
+ * __wt_meta_track_off --
+ * Turn off metadata operation tracking, unrolling on error.
+ */
+int
+__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ WT_ASSERT(session,
+ WT_META_TRACKING(session) && session->meta_track_nest > 0);
+
+ trk_orig = session->meta_track;
+ trk = session->meta_track_next;
+
+ /* If it was a nested transaction, there is nothing to do. */
+ if (--session->meta_track_nest != 0)
+ return (0);
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ /*
+ * If there were no operations logged, return now and avoid unnecessary
+ * metadata checkpoints. For example, this happens if attempting to
+ * create a data source that already exists (or drop one that doesn't).
+ */
+ if (trk == trk_orig)
+ return (0);
+
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, unroll));
+
+ /*
+ * If the operation succeeded and we aren't relying on the log for
+ * durability, checkpoint the metadata.
+ */
+ if (!unroll && ret == 0 && session->metafile != NULL &&
+ !S2C(session)->logging)
+ WT_WITH_BTREE(session, session->metafile,
+ ret = __wt_checkpoint(session, NULL));
+
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_sub_on --
+ * Start a group of operations that can be committed independent of the
+ * main transaction.
+ */
+int
+__wt_meta_track_sub_on(WT_SESSION_IMPL *session)
+{
+ WT_ASSERT(session, session->meta_track_sub == NULL);
+ session->meta_track_sub = session->meta_track_next;
+ return (0);
+}
+
+/*
+ * __wt_meta_track_sub_off --
+ * Commit a group of operations independent of the main transaction.
+ */
+int
+__wt_meta_track_sub_off(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ if (!WT_META_TRACKING(session) || session->meta_track_sub == NULL)
+ return (0);
+
+ trk_orig = session->meta_track_sub;
+ trk = session->meta_track_next;
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, 0));
+
+ session->meta_track_next = trk_orig;
+ return (ret);
+}
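+
+/*
+ * A minimal sketch of a sub-group (the update call is illustrative):
+ * operations tracked between sub_on and sub_off are resolved when
+ * sub_off returns, so the enclosing __wt_meta_track_off can no longer
+ * unroll them.
+ *
+ *     WT_RET(__wt_meta_track_sub_on(session));
+ *     WT_RET(__wt_metadata_update(session, key, value));
+ *     WT_RET(__wt_meta_track_sub_off(session));
+ */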
+
+/*
+ * __wt_meta_track_checkpoint --
+ * Track a handle involved in a checkpoint.
+ */
+int
+__wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_CHECKPOINT;
+ trk->btree = S2BT(session);
+ return (0);
+}
+
+/*
+ * __wt_meta_track_insert --
+ * Track an insert operation.
+ */
+int
+__wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_REMOVE;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_update --
+ * Track a metadata update operation.
+ */
+int
+__wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_SET;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ /*
+ * If there was a previous value, keep it around -- if not, then this
+ * "update" is really an insert.
+ */
+ if ((ret =
+ __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) {
+ trk->op = WT_ST_REMOVE;
+ ret = 0;
+ }
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_fileop --
+ * Track a filesystem operation.
+ */
+int
+__wt_meta_track_fileop(
+ WT_SESSION_IMPL *session, const char *olduri, const char *newuri)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_FILEOP;
+ if (olduri != NULL)
+ WT_RET(__wt_strdup(session, olduri, &trk->a));
+ if (newuri != NULL)
+ WT_RET(__wt_strdup(session, newuri, &trk->b));
+ return (0);
+}
+
+/*
+ * __wt_meta_track_handle_lock --
+ * Track a locked handle.
+ */
+int
+__wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_LOCK;
+ trk->btree = S2BT(session);
+ trk->created = created;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
new file mode 100644
index 00000000000..d6060ebf47b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -0,0 +1,318 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_config --
+ * Return the default configuration information for the metadata file.
+ */
+static int
+__metadata_config(WT_SESSION_IMPL *session, const char **metaconfp)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ const char *cfg[] = { WT_CONFIG_BASE(session, file_meta), NULL, NULL };
+ const char *metaconf;
+
+ *metaconfp = NULL;
+
+ metaconf = NULL;
+
+ /* Create a turtle file with default values. */
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "key_format=S,value_format=S,id=%d,version=(major=%d,minor=%d)",
+ WT_METAFILE_ID,
+ WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+ cfg[1] = buf->data;
+ WT_ERR(__wt_config_collapse(session, cfg, &metaconf));
+
+ *metaconfp = metaconf;
+
+ if (0) {
+err: __wt_free(session, metaconf);
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __metadata_init --
+ * Create the metadata file.
+ */
+static int
+__metadata_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /*
+ * We're single-threaded, but acquire the schema lock regardless: the
+ * lower level code checks that it is appropriately synchronized.
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_create(session, WT_METAFILE_URI, NULL));
+
+ return (ret);
+}
+
+/*
+ * __metadata_load_hot_backup --
+ * Load the contents of any hot backup file.
+ */
+static int
+__metadata_load_hot_backup(WT_SESSION_IMPL *session)
+{
+ FILE *fp;
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(value);
+ WT_DECL_RET;
+ char *path;
+
+ fp = NULL;
+ path = NULL;
+
+ /* Look for a hot backup file: if we find it, load it. */
+ WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path));
+ fp = fopen(path, "r");
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (0);
+
+ /* Read line pairs and load them into the metadata file. */
+ WT_ERR(__wt_scr_alloc(session, 512, &key));
+ WT_ERR(__wt_scr_alloc(session, 512, &value));
+ for (;;) {
+ WT_ERR(__wt_getline(session, key, fp));
+ if (key->size == 0)
+ break;
+ WT_ERR(__wt_getline(session, value, fp));
+ if (value->size == 0)
+ WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
+ WT_ERR(__wt_metadata_update(session, key->data, value->data));
+ }
+
+ F_SET(S2C(session), WT_CONN_WAS_BACKUP);
+
+err: if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ __wt_scr_free(&key);
+ __wt_scr_free(&value);
+ return (ret);
+}
+
+/*
+ * __metadata_load_bulk --
+ * Create any bulk-loaded file stubs.
+ */
+static int
+__metadata_load_bulk(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint32_t allocsize;
+ int exist;
+ const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL };
+ const char *key;
+
+ /*
+ * If a file was being bulk-loaded during the hot backup, it will appear
+ * in the metadata file, but the file won't exist. Create on demand.
+ */
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &key));
+ if (!WT_PREFIX_SKIP(key, "file:"))
+ continue;
+
+ /* If the file exists, it's all good. */
+ WT_ERR(__wt_exist(session, key, &exist));
+ if (exist)
+ continue;
+
+ /*
+ * If the file doesn't exist, assume it's a bulk-loaded file;
+ * retrieve the allocation size and re-create the file.
+ */
+ WT_ERR(__wt_direct_io_size_check(
+ session, filecfg, "allocation_size", &allocsize));
+ WT_ERR(__wt_block_manager_create(session, key, allocsize));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_turtle_init --
+ * Check the turtle file and create if necessary.
+ */
+int
+__wt_turtle_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ int exist;
+ const char *metaconf;
+
+ metaconf = NULL;
+
+ /*
+ * Discard any turtle setup file left-over from previous runs. This
+ * doesn't matter for correctness, it's just cleaning up random files.
+ */
+ WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &exist));
+ if (exist)
+ WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+
+ /*
+ * We could die after creating the turtle file and before creating the
+ * metadata file, or worse, the metadata file might be in some random
+ * state. Make sure that doesn't happen: if we don't find the turtle
+ * file, first create the metadata file, load any hot backup, and then
+ * create the turtle file. No matter what happens, if metadata file
+ * creation doesn't fully complete, we won't have a turtle file and we
+ * will repeat the process until we succeed.
+ *
+ * If there's already a turtle file, we're done.
+ */
+ WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist));
+ if (exist)
+ return (0);
+
+ /* Create the metadata file. */
+ WT_RET(__metadata_init(session));
+
+ /* Load any hot-backup information. */
+ WT_RET(__metadata_load_hot_backup(session));
+
+ /* Create any bulk-loaded file stubs. */
+ WT_RET(__metadata_load_bulk(session));
+
+ /* Create the turtle file. */
+ WT_RET(__metadata_config(session, &metaconf));
+ WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf));
+
+ /* Remove the backup file if it exists, we'll never read it again. */
+ WT_ERR(__wt_exist(session, WT_METADATA_BACKUP, &exist));
+ if (exist)
+ WT_ERR(__wt_remove(session, WT_METADATA_BACKUP));
+
+err: __wt_free(session, metaconf);
+ return (ret);
+}
+
+/*
+ * __wt_turtle_read --
+ * Read the turtle file.
+ */
+int
+__wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+ FILE *fp;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ int match;
+ char *path;
+
+ *valuep = NULL;
+
+ fp = NULL;
+ path = NULL;
+
+ /*
+ * Open the turtle file; there's one case where we won't find the turtle
+ * file, yet still succeed. We create the metadata file before creating
+ * the turtle file, and that means returning the default configuration
+ * string for the metadata file.
+ */
+ WT_RET(__wt_filename(session, WT_METADATA_TURTLE, &path));
+ if ((fp = fopen(path, "r")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (strcmp(key, WT_METAFILE_URI) == 0 ?
+ __metadata_config(session, valuep) : ret);
+
+ /* Search for the key. */
+ WT_ERR(__wt_scr_alloc(session, 512, &buf));
+ for (match = 0;;) {
+ WT_ERR(__wt_getline(session, buf, fp));
+ if (buf->size == 0)
+ WT_ERR(WT_NOTFOUND);
+ if (strcmp(key, buf->data) == 0)
+ match = 1;
+
+ /* Key matched: read the subsequent line for the value. */
+ WT_ERR(__wt_getline(session, buf, fp));
+ if (buf->size == 0)
+ WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE));
+ if (match)
+ break;
+ }
+
+ /* Copy the value for the caller. */
+ WT_ERR(__wt_strdup(session, buf->data, valuep));
+
+err: if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_turtle_update --
+ * Update the turtle file.
+ */
+int
+__wt_turtle_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ FILE *fp;
+ WT_DECL_RET;
+ int vmajor, vminor, vpatch;
+ const char *version;
+ char *path;
+
+ fp = NULL;
+ path = NULL;
+
+ /*
+ * Create the turtle setup file: we currently re-write it from scratch
+ * every time.
+ */
+ WT_RET(__wt_filename(session, WT_METADATA_TURTLE_SET, &path));
+ if ((fp = fopen(path, "w")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (ret);
+
+ version = wiredtiger_version(&vmajor, &vminor, &vpatch);
+ WT_ERR_TEST((fprintf(fp,
+ "%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
+ WT_METADATA_VERSION_STR, version,
+ WT_METADATA_VERSION, vmajor, vminor, vpatch,
+ key, value) < 0), __wt_errno());
+
+ ret = fclose(fp);
+ fp = NULL;
+ WT_ERR_TEST(ret == EOF, __wt_errno());
+
+ WT_ERR(
+ __wt_rename(session, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE));
+
+ if (0) {
+err: WT_TRET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+ }
+
+ if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ return (ret);
+}
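+
+/*
+ * The turtle file is plain line-oriented text: a version-string pair, a
+ * version-number pair, then the metadata key and value, each on its own
+ * line. A sketch of the layout (the header strings and numbers shown are
+ * illustrative):
+ *
+ *     WiredTiger version string
+ *     WiredTiger 2.3.0
+ *     WiredTiger version
+ *     major=2,minor=3,patch=0
+ *     file:WiredTiger.wt
+ *     key_format=S,value_format=S,...
+ */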
diff --git a/src/third_party/wiredtiger/src/os_posix/os_abort.c b/src/third_party/wiredtiger/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..3d99ffe20b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_abort.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_abort --
+ * Abort the process, dropping core.
+ */
+void
+__wt_abort(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_ATTRIBUTE((noreturn))
+{
+ __wt_errx(session, "aborting WiredTiger library");
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_attach(session);
+#endif
+
+ abort();
+ /* NOTREACHED */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..f7344032a15
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's no malloc interface; WiredTiger never calls malloc.
+ *
+ * The problem is an application might allocate memory, write secret stuff in
+ * it, free the memory, then WiredTiger allocates the memory and uses it for a
+ * file page or log record, then writes it to disk, without having overwritten
+ * it fully. That results in the secret stuff being protected by WiredTiger's
+ * permission mechanisms, potentially inappropriate for the secret stuff.
+ */
+
+/*
+ * __wt_calloc --
+ * ANSI calloc function.
+ */
+int
+__wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ WT_ASSERT(session, number != 0 && size != 0);
+
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+ if ((p = calloc(number, size)) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_realloc --
+ * ANSI realloc function.
+ */
+int
+__wt_realloc(WT_SESSION_IMPL *session,
+ size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+ void *p;
+ size_t bytes_allocated;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
+
+ if (session != NULL) {
+ if (p == NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+ else
+ WT_STAT_FAST_CONN_INCR(session, memory_grow);
+ }
+
+ if ((p = realloc(p, bytes_to_allocate)) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+ /*
+ * Clear the allocated memory -- an application might: allocate memory,
+ * write secret stuff into it, free the memory, then we re-allocate the
+ * memory and use it for a file page or log record, and then write it to
+ * disk. That would result in the secret stuff being protected by the
+ * WiredTiger permission mechanisms, potentially inappropriate for the
+ * secret stuff.
+ */
+ memset((uint8_t *)
+ p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+ *(void **)retp = p;
+ return (0);
+}
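+
+/*
+ * A minimal sketch of the growth pattern this interface supports (the
+ * names are illustrative): the caller owns both the pointer and the
+ * allocated size, and bytes added by each call arrive zeroed.
+ *
+ *     size_t alloc_bytes = 0;
+ *     char *buf = NULL;
+ *     WT_RET(__wt_realloc(session, &alloc_bytes, 512, &buf));
+ *     WT_RET(__wt_realloc(session, &alloc_bytes, 4096, &buf));
+ */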
+
+/*
+ * __wt_realloc_aligned --
+ * ANSI realloc function that aligns to buffer boundaries, configured with
+ * the "buffer_alignment" key to wiredtiger_open.
+ */
+int
+__wt_realloc_aligned(WT_SESSION_IMPL *session,
+ size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+ WT_DECL_RET;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL && S2C(session)->buffer_alignment > 0) {
+ void *p, *newp;
+ size_t bytes_allocated;
+
+ /*
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
+
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+ if ((ret = posix_memalign(&newp,
+ S2C(session)->buffer_alignment,
+ bytes_to_allocate)) != 0)
+ WT_RET_MSG(session, ret, "memory allocation");
+
+ if (p != NULL)
+ memcpy(newp, p, bytes_allocated);
+ __wt_free(session, p);
+ p = newp;
+
+ /* Clear the allocated memory (see above). */
+ memset((uint8_t *)p + bytes_allocated, 0,
+ bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+ *(void **)retp = p;
+ return (0);
+ }
+#endif
+ /*
+ * If there is no posix_memalign function, or no alignment configured,
+ * fall back to realloc.
+ *
+ * Windows note: Visual C CRT memalign does not match Posix behavior
+ * and would also double each allocation, so it is bad for memory use.
+ */
+ return (__wt_realloc(
+ session, bytes_allocated_ret, bytes_to_allocate, retp));
+}
+
+/*
+ * __wt_strndup --
+ * Duplicate a byte string of a given length (and NUL-terminate).
+ */
+int
+__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
+{
+ void *p;
+
+ if (str == NULL) {
+ *(void **)retp = NULL;
+ return (0);
+ }
+
+ WT_RET(__wt_calloc(session, len + 1, 1, &p));
+
+ /*
+ * Don't change this to strncpy, we rely on this function to duplicate
+ * "strings" that contain nul bytes.
+ */
+ memcpy(p, str, len);
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_strdup --
+ * ANSI strdup function.
+ */
+int
+__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
+{
+ return (__wt_strndup(
+ session, str, (str == NULL) ? 0 : strlen(str), retp));
+}
+
+/*
+ * __wt_free_int --
+ * ANSI free function.
+ */
+void
+__wt_free_int(WT_SESSION_IMPL *session, const void *p_arg)
+{
+ void *p;
+
+ p = *(void **)p_arg;
+ if (p == NULL) /* ANSI C free semantics */
+ return;
+
+ /*
+ * If there's a serialization bug we might race with another thread.
+ * We can't avoid the race (and we aren't willing to flush memory),
+ * but we minimize the window by clearing the free address, hoping a
+ * racing thread will see, and won't free, a NULL pointer.
+ */
+ *(void **)p_arg = NULL;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_free);
+
+ free(p);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c
new file mode 100644
index 00000000000..98b2d4926cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+/* This include likely needs a configure-time check. */
+#include <dirent.h>
+
+/*
+ * __wt_dirlist --
+ * Get a list of files from a directory, optionally filtered by
+ * a given prefix.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+ uint32_t flags, char ***dirlist, u_int *countp)
+{
+ struct dirent *dp;
+ DIR *dirp;
+ WT_DECL_RET;
+ size_t dirallocsz;
+ u_int count, dirsz;
+ int match;
+ char **entries, *path;
+
+ *dirlist = NULL;
+ *countp = 0;
+
+ WT_RET(__wt_filename(session, dir, &path));
+
+ dirp = NULL;
+ dirallocsz = 0;
+ dirsz = 0;
+ entries = NULL;
+ if (flags == 0)
+ LF_SET(WT_DIRLIST_INCLUDE);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+ "wt_dirlist of %s %s prefix %s",
+ path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+ prefix == NULL ? "all" : prefix));
+
+ WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "%s: opendir", path);
+ for (dirsz = 0, count = 0; (dp = readdir(dirp)) != NULL;) {
+ /*
+ * Skip . and ..
+ */
+ if (strcmp(dp->d_name, ".") == 0 ||
+ strcmp(dp->d_name, "..") == 0)
+ continue;
+ match = 0;
+ if (prefix != NULL &&
+ ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+ WT_PREFIX_MATCH(dp->d_name, prefix)) ||
+ (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+ !WT_PREFIX_MATCH(dp->d_name, prefix))))
+ match = 1;
+ if (prefix == NULL || match) {
+ /*
+ * We have a file name we want to return.
+ */
+ count++;
+ if (count > dirsz) {
+ dirsz += WT_DIR_ENTRY;
+ WT_ERR(__wt_realloc_def(
+ session, &dirallocsz, dirsz, &entries));
+ }
+ WT_ERR(__wt_strdup(
+ session, dp->d_name, &entries[count-1]));
+ }
+ }
+ if (count > 0)
+ *dirlist = entries;
+ *countp = count;
+err:
+ if (dirp != NULL)
+ (void)closedir(dirp);
+ __wt_free(session, path);
+
+ if (ret == 0)
+ return (0);
+
+ if (entries != NULL) {
+ for (count = dirsz; count > 0; count--)
+ __wt_free(session, entries[count - 1]);
+ __wt_free(session, entries);
+ }
+ WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix);
+}
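+
+/*
+ * A minimal sketch of listing files by prefix (the directory and prefix
+ * are illustrative); the caller frees the entries and then the array.
+ *
+ *     char **list;
+ *     u_int count, i;
+ *     WT_RET(__wt_dirlist(session,
+ *         ".", "WiredTiger", WT_DIRLIST_INCLUDE, &list, &count));
+ *     for (i = 0; i < count; ++i)
+ *         __wt_free(session, list[i]);
+ *     __wt_free(session, list);
+ */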
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
new file mode 100644
index 00000000000..91410c54c04
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ * Open a dynamic library.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+ WT_DECL_RET;
+ WT_DLH *dlh;
+
+ WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_ERR(__wt_strdup(session, path, &dlh->name));
+
+ if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL)
+ WT_ERR_MSG(
+ session, __wt_errno(), "dlopen(%s): %s", path, dlerror());
+
+ *dlhp = dlh;
+ if (0) {
+err: __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ * Lookup a symbol in a dynamic library.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+ WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+ void *sym;
+
+ *(void **)sym_ret = NULL;
+ if ((sym = dlsym(dlh->handle, name)) == NULL) {
+ if (fail)
+ WT_RET_MSG(session, __wt_errno(),
+ "dlsym(%s in %s): %s", name, dlh->name, dlerror());
+ return (0);
+ }
+
+ *(void **)sym_ret = sym;
+ return (0);
+}
+
+/*
+ * __wt_dlclose --
+ * Close a dynamic library
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+ WT_DECL_RET;
+
+ /*
+ * FreeBSD dies inside __cxa_finalize when closing handles.
+ *
+ * For now, just skip the dlclose: this may leak some resources until
+ * the process exits, but that is preferable to hard-to-debug crashes
+ * during exit.
+ */
+#ifndef __FreeBSD__
+ if (dlclose(dlh->handle) != 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "dlclose: %s", dlerror());
+ }
+#endif
+
+ __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_errno.c b/src/third_party/wiredtiger/src/os_posix/os_errno.c
new file mode 100644
index 00000000000..9290f7d651f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_errno.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ * Return errno, or WT_ERROR if errno not set.
+ */
+int
+__wt_errno(void)
+{
+ /*
+ * Called when we know an error occurred, and we want the system
+ * error code, but there's some chance it's not set.
+ */
+ return (errno == 0 ? WT_ERROR : errno);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_exist.c b/src/third_party/wiredtiger/src/os_posix/os_exist.c
new file mode 100644
index 00000000000..723f07026e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_exist.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ * Return if the file exists.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+ struct stat sb;
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0) {
+ *existp = 1;
+ return (0);
+ }
+ if (ret == ENOENT) {
+ *existp = 0;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
new file mode 100644
index 00000000000..28cd1979c77
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -0,0 +1,97 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if defined(HAVE_FALLOCATE)
+#include <linux/falloc.h>
+#endif
+
+/*
+ * __wt_fallocate_config --
+ * Configure fallocate behavior for a file handle.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_UNUSED(session);
+
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+
+#ifdef __linux__
+ /*
+ * We've seen Linux systems where posix_fallocate corrupts existing data
+ * (even though that is explicitly disallowed by POSIX). We've not seen
+ * problems with fallocate, so it's left unlocked for now.
+ */
+#if defined(HAVE_FALLOCATE)
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#elif defined(HAVE_POSIX_FALLOCATE)
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 1;
+#endif
+#elif defined(HAVE_POSIX_FALLOCATE)
+ /*
+ * FreeBSD and Solaris support posix_fallocate, and so far we've seen
+ * no problems leaving it unlocked.
+ */
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#endif
+}
+
+/*
+ * __wt_fallocate --
+ * Allocate space for a file handle.
+ */
+int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+
+#if defined(HAVE_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+ WT_SYSCALL_RETRY(
+ fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Linux returns ENOTSUP for fallocate on some file systems; we return
+ * ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != ENOTSUP)
+ WT_RET_MSG(session, ret, "%s: fallocate", fh->name);
+#elif defined(HAVE_POSIX_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: posix_fallocate", fh->name));
+ WT_SYSCALL_RETRY(posix_fallocate(fh->fd, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Solaris returns EINVAL for posix_fallocate on some file systems; we
+ * return ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != EINVAL)
+ WT_RET_MSG(session, ret, "%s: posix_fallocate", fh->name);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+ WT_UNUSED(offset);
+ WT_UNUSED(len);
+ WT_UNUSED(ret);
+#endif
+
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+ return (ENOTSUP);
+}
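+
+/*
+ * A minimal caller-side sketch (the extension decision is illustrative):
+ * ENOTSUP also clears fallocate_available, so later calls are skipped
+ * rather than retried.
+ *
+ *     if (fh->fallocate_available &&
+ *         (ret = __wt_fallocate(session, fh, offset, len)) != 0 &&
+ *         ret != ENOTSUP)
+ *         return (ret);
+ */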
diff --git a/src/third_party/wiredtiger/src/os_posix/os_filesize.c b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
new file mode 100644
index 00000000000..3692b135d73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ * Get the size of a file in bytes.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+ struct stat sb;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fstat", fh->name));
+
+ WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret);
+ if (ret == 0) {
+ *sizep = sb.st_size;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: fstat", fh->name);
+}
+
+/*
+ * __wt_filesize_name --
+ * Return the size of a file in bytes, given a file name.
+ */
+int
+__wt_filesize_name(
+ WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+ struct stat sb;
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0) {
+ *sizep = sb.st_size;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_flock.c b/src/third_party/wiredtiger/src/os_posix/os_flock.c
new file mode 100644
index 00000000000..e9e653d73e6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_flock.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ * Lock/unlock a byte in a file.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+ struct flock fl;
+ WT_DECL_RET;
+
+ /*
+ * WiredTiger requires this function be able to acquire locks past
+ * the end of file.
+ *
+ * Note we're using fcntl(2) locking: all fcntl locks associated with a
+ * file for a given process are removed when any file descriptor for the
+ * file is closed by the process, even if a lock was never requested for
+ * that file descriptor.
+ */
+ fl.l_start = byte;
+ fl.l_len = 1;
+ fl.l_type = lock ? F_WRLCK : F_UNLCK;
+ fl.l_whence = SEEK_SET;
+
+ WT_SYSCALL_RETRY(fcntl(fhp->fd, F_SETLK, &fl), ret);
+
+ return (ret);
+}
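+
+/*
+ * A minimal sketch of pairing the lock and unlock calls, for example
+ * around a single-writer lock file (the byte offset is illustrative):
+ *
+ *     WT_RET(__wt_bytelock(fh, (wt_off_t)0, 1));     acquire
+ *     WT_RET(__wt_bytelock(fh, (wt_off_t)0, 0));     release
+ */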
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fsync.c b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
new file mode 100644
index 00000000000..c181809df95
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name));
+
+#ifdef HAVE_FDATASYNC
+ WT_SYSCALL_RETRY(fdatasync(fh->fd), ret);
+#else
+ WT_SYSCALL_RETRY(fsync(fh->fd), ret);
+#endif
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s fsync error", fh->name);
+
+ return (0);
+}
+
+/*
+ * __wt_fsync_async --
+ * Flush a file handle and don't wait for the result.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+#ifdef HAVE_SYNC_FILE_RANGE
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name));
+
+ if ((ret = sync_file_range(fh->fd,
+ (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) == 0)
+ return (0);
+ WT_RET_MSG(session, __wt_errno(), "%s: sync_file_range", fh->name);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+ return (0);
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
new file mode 100644
index 00000000000..3f3034de551
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ * Truncate a file.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+ WT_DECL_RET;
+
+ WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
+ if (ret == 0) {
+ fh->size = fh->extend_size = len;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s ftruncate error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getline.c b/src/third_party/wiredtiger/src/os_posix/os_getline.c
new file mode 100644
index 00000000000..7ef4065ac3b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getline.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_getline --
+ * Get a line from a stream.
+ *
+ * Implementation of the POSIX getline or BSD fgetln functions (finding the
+ * function in a portable way is hard; it's simple enough to write it instead).
+ *
+ * Note: Unlike the standard getline calls, this function doesn't include the
+ * trailing newline character in the returned buffer and discards empty lines
+ * (so the caller's EOF marker is a returned line length of 0).
+ */
+int
+__wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp)
+{
+ int c;
+
+ /*
+ * We always NUL-terminate the returned string (even if it's empty),
+ * make sure there's buffer space for a trailing NUL in all cases.
+ */
+ WT_RET(__wt_buf_init(session, buf, 100));
+
+ while ((c = fgetc(fp)) != EOF) {
+ /* Leave space for a trailing NUL. */
+ WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
+ if (c == '\n') {
+ if (buf->size == 0)
+ continue;
+ break;
+ }
+ ((char *)buf->mem)[buf->size++] = (char)c;
+ }
+ if (c == EOF && ferror(fp))
+ WT_RET_MSG(session, __wt_errno(), "file read");
+
+ ((char *)buf->mem)[buf->size] = '\0';
+
+ return (0);
+}
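+
+/*
+ * A minimal sketch of the reader loop implied by the conventions above
+ * (the consume call is illustrative); a returned length of 0 marks EOF.
+ *
+ *     for (;;) {
+ *         WT_RET(__wt_getline(session, buf, fp));
+ *         if (buf->size == 0)
+ *             break;
+ *         consume(buf->data, buf->size);
+ *     }
+ */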
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getopt.c b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
new file mode 100644
index 00000000000..1c25521dacd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
@@ -0,0 +1,150 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* $NetBSD: getopt.c,v 1.26 2003/08/07 16:43:40 agc Exp $ */
+
+/*
+ * Copyright (c) 1987, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "wt_internal.h"
+
+extern int __wt_opterr, __wt_optind, __wt_optopt, __wt_optreset;
+int __wt_opterr = 1, /* if error message should be printed */
+ __wt_optind = 1, /* index into parent argv vector */
+ __wt_optopt, /* character checked for validity */
+ __wt_optreset; /* reset getopt */
+
+extern char *__wt_optarg;
+char *__wt_optarg; /* argument associated with option */
+
+#define BADCH (int)'?'
+#define BADARG (int)':'
+#define EMSG ""
+
+/*
+ * __wt_getopt --
+ * Parse argc/argv argument vector.
+ */
+int
+__wt_getopt(
+ const char *progname, int nargc, char * const *nargv, const char *ostr)
+{
+ static const char *place = EMSG; /* option letter processing */
+ const char *oli; /* option letter list index */
+
+ if (__wt_optreset || *place == 0) { /* update scanning pointer */
+ __wt_optreset = 0;
+ place = nargv[__wt_optind];
+ if (__wt_optind >= nargc || *place++ != '-') {
+ /* Argument is absent or is not an option */
+ place = EMSG;
+ return (-1);
+ }
+ __wt_optopt = *place++;
+ if (__wt_optopt == '-' && *place == 0) {
+ /* "--" => end of options */
+ ++__wt_optind;
+ place = EMSG;
+ return (-1);
+ }
+ if (__wt_optopt == 0) {
+ /* Solitary '-', treat as a '-' option
+ if the program (eg su) is looking for it. */
+ place = EMSG;
+ if (strchr(ostr, '-') == NULL)
+ return (-1);
+ __wt_optopt = '-';
+ }
+ } else
+ __wt_optopt = *place++;
+
+ /* See if option letter is one the caller wanted... */
+ if (__wt_optopt == ':' || (oli = strchr(ostr, __wt_optopt)) == NULL) {
+ if (*place == 0)
+ ++__wt_optind;
+ if (__wt_opterr && *ostr != ':')
+ (void)fprintf(stderr,
+ "%s: illegal option -- %c\n", progname,
+ __wt_optopt);
+ return (BADCH);
+ }
+
+ /* Does this option need an argument? */
+ if (oli[1] != ':') {
+ /* don't need argument */
+ __wt_optarg = NULL;
+ if (*place == 0)
+ ++__wt_optind;
+ } else {
+ /* Option-argument is either the rest of this argument or the
+ entire next argument. */
+ if (*place)
+ __wt_optarg = (char *)place;
+ else if (nargc > ++__wt_optind)
+ __wt_optarg = nargv[__wt_optind];
+ else {
+ /* option-argument absent */
+ place = EMSG;
+ if (*ostr == ':')
+ return (BADARG);
+ if (__wt_opterr)
+ (void)fprintf(stderr,
+ "%s: option requires an argument -- %c\n",
+ progname, __wt_optopt);
+ return (BADCH);
+ }
+ place = EMSG;
+ ++__wt_optind;
+ }
+ return (__wt_optopt); /* return option letter */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c
new file mode 100644
index 00000000000..be4d27e96a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_map.c
@@ -0,0 +1,136 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ * Map a file into memory.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session,
+ WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie)
+{
+ void *map;
+ size_t orig_size;
+
+ WT_UNUSED(mappingcookie);
+
+ /*
+ * Record the current size, map only that much, and return that size as
+ * the length: the file size could change between the map call and when
+ * we set the return length.
+ * For the same reason we could actually map past the end of the file;
+ * we don't read bytes past the end of the file though, so as long as
+ * the map call succeeds, it's all OK.
+ */
+ orig_size = (size_t)fh->size;
+ if ((map = mmap(NULL, orig_size,
+ PROT_READ,
+#ifdef MAP_NOCORE
+ MAP_NOCORE |
+#endif
+ MAP_PRIVATE,
+ fh->fd, (wt_off_t)0)) == MAP_FAILED) {
+ WT_RET_MSG(session, __wt_errno(),
+ "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+ }
+ (void)__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: map %p: %" WT_SIZET_FMT " bytes", fh->name, map, orig_size);
+
+ *(void **)mapp = map;
+ *lenp = orig_size;
+ return (0);
+}
+
+#define WT_VM_PAGESIZE 4096
+
+/*
+ * __wt_mmap_preload --
+ * Cause a section of a memory map to be faulted in.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+#ifdef HAVE_POSIX_MADVISE
+ /* Linux requires the address be aligned to a 4KB boundary. */
+ WT_BM *bm = S2BT(session)->bm;
+ WT_DECL_RET;
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ size += WT_PTRDIFF(p, blk);
+
+ /* XXX proxy for "am I doing a scan?" -- manual read-ahead */
+ if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
+ /* Read in 2MB blocks every 1MB of data. */
+ if (((uintptr_t)((uint8_t *)blk + size) &
+ (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
+ return (0);
+ size = WT_MIN(WT_MAX(20 * size, 2 << 20),
+ WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
+ }
+
+ /*
+ * Manual pages aren't clear on whether alignment is required for the
+ * size, so we will be conservative.
+ */
+ size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+
+ if (size > WT_VM_PAGESIZE &&
+ (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
+ WT_RET_MSG(session, ret, "posix_madvise will need");
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+#endif
+
+ return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ * Discard a chunk of the memory map.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+#ifdef HAVE_POSIX_MADVISE
+ /* Linux requires the address be aligned to a 4KB boundary. */
+ WT_DECL_RET;
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ size += WT_PTRDIFF(p, blk);
+
+ if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0)
+ WT_RET_MSG(session, ret, "posix_madvise don't need");
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+#endif
+ return (0);
+}
+
+/*
+ * __wt_munmap --
+ * Remove a memory mapping.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session,
+ WT_FH *fh, void *map, size_t len, void **mappingcookie)
+{
+ WT_UNUSED(mappingcookie);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: unmap %p: %" WT_SIZET_FMT " bytes", fh->name, map, len));
+
+ if (munmap(map, len) == 0)
+ return (0);
+
+ WT_RET_MSG(session, __wt_errno(),
+ "%s unmap error: failed to unmap %" WT_SIZET_FMT " bytes",
+ fh->name, len);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
new file mode 100644
index 00000000000..3a76cceb3f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ * Allocate and initialize a condition variable.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+ const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+
+ WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
+
+ /* Initialize the condition variable to permit self-blocking. */
+ WT_ERR(pthread_cond_init(&cond->cond, NULL));
+
+ cond->name = name;
+ cond->waiters = is_signalled ? -1 : 0;
+
+ *condp = cond;
+ return (0);
+
+err: __wt_free(session, cond);
+ return (ret);
+}
+
+/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+ struct timespec ts;
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+ WT_ASSERT(session, usecs >= 0);
+
+ /*
+ * Fast path if already signalled: a waiters count of -1 means
+ * signalled with no waiters, so incrementing it to 0 consumes the
+ * pending wakeup without blocking.
+ */
+ if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ return (0);
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "wait %s cond (%p)", cond->name, cond));
+ WT_STAT_FAST_CONN_INCR(session, cond_wait);
+ }
+
+ WT_ERR(pthread_mutex_lock(&cond->mtx));
+ locked = 1;
+
+ if (usecs > 0) {
+ WT_ERR(__wt_epoch(session, &ts));
+ ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
+ ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
+ ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
+ } else
+ ret = pthread_cond_wait(&cond->cond, &cond->mtx);
+
+ /*
+ * Check pthread_cond_wait() return for EINTR, ETIME and
+ * ETIMEDOUT, some systems return these errors.
+ */
+ if (ret == EINTR ||
+#ifdef ETIME
+ ret == ETIME ||
+#endif
+ ret == ETIMEDOUT)
+ ret = 0;
+
+ (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+
+err: if (locked)
+ WT_TRET(pthread_mutex_unlock(&cond->mtx));
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_cond_wait");
+}
+
+/*
+ * __wt_cond_signal --
+ * Signal a waiting thread.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL)
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "signal %s cond (%p)", cond->name, cond));
+
+ /* Fast path if already signalled. */
+ if (cond->waiters == -1)
+ return (0);
+
+ if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ WT_ERR(pthread_mutex_lock(&cond->mtx));
+ locked = 1;
+ WT_ERR(pthread_cond_broadcast(&cond->cond));
+ }
+
+err: if (locked)
+ WT_TRET(pthread_mutex_unlock(&cond->mtx));
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_cond_broadcast");
+}
+
+/*
+ * __wt_cond_destroy --
+ * Destroy a condition variable.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ cond = *condp;
+ if (cond == NULL)
+ return (0);
+
+ ret = pthread_cond_destroy(&cond->cond);
+ WT_TRET(pthread_mutex_destroy(&cond->mtx));
+ __wt_free(session, *condp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
new file mode 100644
index 00000000000..1a692f71dce
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -0,0 +1,227 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * http://locklessinc.com/articles/locks/
+ *
+ * Dr. Fuerst further credits:
+ * There exists a form of the ticket lock that is designed for read-write
+ * locks. An example written in assembly was posted to the Linux kernel mailing
+ * list in 2002 by David Howells from RedHat. This was a highly optimized
+ * version of a read-write ticket lock developed at IBM in the early 90's by
+ * Joseph Seigh. Note that a similar (but not identical) algorithm was published
+ * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
+ * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ * Allocate and initialize a read/write lock.
+ */
+int
+__wt_rwlock_alloc(
+ WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+ WT_RWLOCK *rwlock;
+
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+ WT_RET(__wt_calloc_def(session, 1, &rwlock));
+
+ rwlock->name = name;
+
+ *rwlockp = rwlock;
+ return (0);
+}
+
+/*
+ * __wt_try_readlock --
+ * Try to get a shared lock, fail immediately if unavailable.
+ */
+int
+__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, users, writers;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ users = l->s.users;
+ writers = l->s.writers;
+ old = (pad << 48) + (users << 32) + (users << 16) + writers;
+ new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
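+
+/*
+ * A sketch of the 64-bit word the arithmetic above packs and compares,
+ * with one 16-bit ticket field per position:
+ *
+ *     63 ...... 48 47 ...... 32 31 ...... 16 15 ....... 0
+ *         pad          users        readers      writers
+ *
+ * A reader can take a ticket and grant it in one compare-and-swap only
+ * when readers equals users, that is, when no earlier ticket is still
+ * waiting to be served.
+ */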
+
+/*
+ * __wt_readlock --
+ * Get a shared lock.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
+ val = (uint16_t)(me >> 32);
+ while (val != l->s.readers)
+ WT_PAUSE();
+
+ ++l->s.readers;
+
+ return (0);
+}
+
+/*
+ * __wt_readunlock --
+ * Release a shared lock.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+
+ l = &rwlock->rwlock;
+ WT_ATOMIC_ADD2(l->s.writers, 1);
+
+ return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ * Try to get an exclusive lock, fail immediately if unavailable.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, readers, users;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ readers = l->s.readers;
+ users = l->s.users;
+ old = (pad << 48) + (users << 32) + (readers << 16) + users;
+ new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
+
+/*
+ * __wt_writelock --
+ * Wait to get an exclusive lock.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the count
+ * of writers will wrap and two lockers will simultaneously be granted
+ * the write lock.
+ */
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
+ val = (uint16_t)(me >> 32);
+ while (val != l->s.writers)
+ WT_PAUSE();
+
+ return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ * Release an exclusive lock.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l, copy;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+ l = &rwlock->rwlock;
+
+ copy = *l;
+
+ WT_BARRIER();
+
+ ++copy.s.writers;
+ ++copy.s.readers;
+
+ l->us = copy.us;
+ return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ * Destroy a read/write lock.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+ WT_RWLOCK *rwlock;
+
+ rwlock = *rwlockp; /* Clear our caller's reference. */
+ if (rwlock == NULL)
+ return (0);
+ *rwlockp = NULL;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name));
+
+ __wt_free(session, rwlock);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_once.c b/src/third_party/wiredtiger/src/os_posix/os_once.c
new file mode 100644
index 00000000000..22eaf5f0ee5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_once.c
@@ -0,0 +1,20 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_once --
+ * One-time initialization per process.
+ */
+int
+__wt_once(void (*init_routine)(void))
+{
+ static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+
+ return (pthread_once(&once_control, init_routine));
+}
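+
+/*
+ * A minimal sketch (the init routine is illustrative): all callers share
+ * the same pthread_once control, so the routine runs at most once per
+ * process no matter how many threads race here.
+ *
+ *     static void process_init(void);
+ *     WT_RET(__wt_once(process_init));
+ */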
diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c
new file mode 100644
index 00000000000..a1bc3feb7d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_open.c
@@ -0,0 +1,253 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __open_directory_sync --
+ * Fsync the directory in which we created the file.
+ */
+static int
+__open_directory_sync(WT_SESSION_IMPL *session, char *path)
+{
+#ifdef __linux__
+ WT_DECL_RET;
+ int fd, tret;
+ char *dir;
+
+ /*
+ * According to the Linux fsync man page:
+ * Calling fsync() does not necessarily ensure that the entry in
+ * the directory containing the file has also reached disk. For
+ * that an explicit fsync() on a file descriptor for the directory
+ * is also needed.
+ *
+ * Open the WiredTiger home directory and sync it; the rest of the
+ * system shouldn't have to wonder whether opening a file durably
+ * created it.
+ */
+ if ((dir = strrchr(path, '/')) == NULL)
+ path = (char *)".";
+ else
+ *dir = '\0';
+ WT_SYSCALL_RETRY(((fd =
+ open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret);
+ if (dir != NULL)
+ *dir = '/';
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s: open", path);
+
+ WT_SYSCALL_RETRY(fsync(fd), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "%s: fsync", path);
+
+err: WT_SYSCALL_RETRY(close(fd), tret);
+ if (tret != 0) {
+ __wt_err(session, tret, "%s: close", path);
+ WT_TRET(tret);
+ }
+ return (ret);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(path);
+ return (0);
+#endif
+}
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+ const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh, *tfh;
+ mode_t mode;
+ int direct_io, f, fd, matched;
+ char *path;
+
+ conn = S2C(session);
+ fh = NULL;
+ fd = -1;
+ path = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched)
+ return (0);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ f = O_RDWR;
+#ifdef O_BINARY
+ /* Windows clones: we always want to treat the file as a binary. */
+ f |= O_BINARY;
+#endif
+#ifdef O_CLOEXEC
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ */
+ f |= O_CLOEXEC;
+#endif
+#ifdef O_NOATIME
+ /* Avoid updating metadata for read-only workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ f |= O_NOATIME;
+#endif
+
+ if (ok_create) {
+ f |= O_CREAT;
+ if (exclusive)
+ f |= O_EXCL;
+ mode = 0666;
+ } else
+ mode = 0;
+
+ direct_io = 0;
+#ifdef O_DIRECT
+ if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+ f |= O_DIRECT;
+ direct_io = 1;
+ }
+#endif
+ if (dio_type == WT_FILE_TYPE_LOG &&
+ FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
+#ifdef O_DSYNC
+ f |= O_DSYNC;
+#elif defined(O_SYNC)
+ f |= O_SYNC;
+#else
+ WT_ERR_MSG(session, ENOTSUP,
+ "Unsupported log sync mode requested");
+#endif
+ WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret,
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" : "%s", path);
+
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles. There's an obvious
+ * race here, so we prefer the flag to open if available.
+ */
+ if ((f = fcntl(fd, F_GETFD)) == -1 ||
+ fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
+ WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
+#endif
+
+#if defined(HAVE_POSIX_FADVISE)
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
+#endif
+
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__open_directory_sync(session, path));
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_strdup(session, name, &fh->name));
+ fh->fd = fd;
+ fh->ref = 1;
+ fh->direct_io = direct_io;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+ /* Configure file extension. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ fh->extend_len = conn->data_extend_len;
+
+ /* Configure fallocate/posix_fallocate calls. */
+ __wt_fallocate_config(session, fh);
+
+ /*
+ * Repeat the check for a match, but then link onto the database's list
+ * of files.
+ */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ if (!matched) {
+ TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_INCR(session, file_open);
+
+ *fhp = fh;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched) {
+err: if (fh != NULL) {
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ }
+ if (fd != -1)
+ (void)close(fd);
+ }
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ __wt_spin_lock(session, &conn->fh_lock);
+ if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
+ __wt_spin_unlock(session, &conn->fh_lock);
+ return (0);
+ }
+
+ /* Remove from the list. */
+ TAILQ_REMOVE(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_DECR(session, file_open);
+
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ /* Discard the memory. */
+ if (close(fh->fd) != 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "close: %s", fh->name);
+ }
+
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ return (ret);
+}
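
A sketch of the reference-counting contract implied by the handle queue above (the file name and wrapper are hypothetical): a second open of the same name returns the cached WT_FH with its count bumped, and every open must be balanced by a close.

	static int
	open_twice(WT_SESSION_IMPL *session)
	{
		WT_FH *fh1, *fh2;

		WT_RET(__wt_open(
		    session, "test.wt", 1, 0, WT_FILE_TYPE_DATA, &fh1));
		WT_RET(__wt_open(
		    session, "test.wt", 0, 0, WT_FILE_TYPE_DATA, &fh2));

		/* The second open returned the cached handle: fh1 == fh2. */
		WT_RET(__wt_close(session, fh2));	/* ref 2 -> 1 */
		return (__wt_close(session, fh1));	/* handle discarded */
	}
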
diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c
new file mode 100644
index 00000000000..aed99d1d027
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_path.c
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ * Return if a filename is an absolute path.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+ return (path[0] == '/' ? 1 : 0);
+}
+
+/*
+ * __wt_path_separator --
+ * Return the path separator string.
+ */
+const char *
+__wt_path_separator(void)
+{
+ return ("/");
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_priv.c b/src/third_party/wiredtiger/src/os_posix/os_priv.c
new file mode 100644
index 00000000000..7d56359da4f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ * Return if the process has special privileges, defined as having
+ *	different effective and real UIDs or GIDs.
+ */
+int
+__wt_has_priv(void)
+{
+ return (getuid() != geteuid() || getgid() != getegid());
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_remove.c b/src/third_party/wiredtiger/src/os_posix/os_remove.c
new file mode 100644
index 00000000000..a52a4db6bc7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_remove.c
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ * Check if the file is currently open before removing it.
+ */
+static void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifdef HAVE_DIAGNOSTIC
+ WT_CONNECTION_IMPL *conn;
+ WT_FH *fh;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /*
+ * Check if the file is open: it's an error if it is, since a higher
+ * level should have closed it before removing.
+ */
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (strcmp(name, fh->name) == 0)
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ WT_ASSERT(session, fh == NULL);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ * Remove a file.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+ __remove_file_check(session, name);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ WT_SYSCALL_RETRY(remove(path), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0 || ret == ENOENT)
+ return (0);
+
+ WT_RET_MSG(session, ret, "%s: remove", name);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rename.c b/src/third_party/wiredtiger/src/os_posix/os_rename.c
new file mode 100644
index 00000000000..ddbb59aaf37
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rename.c
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ * Rename a file.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ char *from_path, *to_path;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+ from_path = to_path = NULL;
+
+ WT_RET(__wt_filename(session, from, &from_path));
+ WT_TRET(__wt_filename(session, to, &to_path));
+
+ if (ret == 0)
+ WT_SYSCALL_RETRY(rename(from_path, to_path), ret);
+
+ __wt_free(session, from_path);
+ __wt_free(session, to_path);
+
+ if (ret == 0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "rename %s to %s", from, to);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rw.c b/src/third_party/wiredtiger/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..4247fb30fd1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rw.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+ size_t chunk;
+ ssize_t nr;
+ uint8_t *addr;
+
+ WT_STAT_FAST_CONN_INCR(session, read_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break reads larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+ chunk = WT_MIN(len, WT_GIGABYTE);
+ if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0)
+ WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
+ "%s read error: failed to read %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+ size_t chunk;
+ ssize_t nw;
+ const uint8_t *addr;
+
+ WT_STAT_FAST_CONN_INCR(session, write_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break writes larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
+ chunk = WT_MIN(len, WT_GIGABYTE);
+ if ((nw = pwrite(fh->fd, addr, chunk, offset)) < 0)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s write error: failed to write %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
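
The direct I/O assertion in both functions encodes the same predicate; here is a standalone restatement (a hypothetical helper, assuming the configured alignment is a power of two, as WiredTiger requires):

	static int
	aligned_for_direct_io(const void *buf, size_t len, size_t align)
	{
		/* No configured alignment means no constraint. */
		if (align == 0)
			return (1);

		/* Aligned address, and a length that's a multiple of it. */
		return (((uintptr_t)buf & (align - 1)) == 0 &&
		    len >= align && len % align == 0);
	}
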
diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..665330a26e7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ struct timeval t;
+
+ t.tv_sec = seconds + micro_seconds / 1000000;
+ t.tv_usec = (suseconds_t)(micro_seconds % 1000000);
+
+ (void)select(0, NULL, NULL, NULL, &t);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_strtouq.c b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
new file mode 100644
index 00000000000..97f9759f76f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_strtouq --
+ * Convert a string to an unsigned quad integer.
+ */
+uint64_t
+__wt_strtouq(const char *nptr, char **endptr, int base)
+{
+#if defined(HAVE_STRTOUQ)
+ return (strtouq(nptr, endptr, base));
+#else
+ WT_STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long long));
+
+ return (strtoull(nptr, endptr, base));
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..7c447710b46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+ wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+ WT_DECL_RET;
+
+ /* Spawn a new thread of control. */
+ if ((ret = pthread_create(tidret, NULL, func, arg)) == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_create");
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+ WT_DECL_RET;
+
+ if ((ret = pthread_join(tid, NULL)) == 0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "pthread_join");
+}
+
+/*
+ * __wt_thread_id --
+ * Fill in a printable version of the process and thread IDs.
+ */
+void
+__wt_thread_id(char *buf, size_t buflen)
+{
+ pthread_t self;
+
+ /*
+ * POSIX 1003.1 allows pthread_t to be an opaque type, but on systems
+ * where it's a pointer, we'd rather print out the pointer and match
+ * gdb output. Since we don't yet run on any systems where pthread_t
+ * is not a pointer, do it that way for now.
+ */
+ self = pthread_self();
+ (void)snprintf(buf, buflen,
+ "%" PRIu64 ":%p", (uint64_t)getpid(), (void *)self);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
new file mode 100644
index 00000000000..56f688a1e14
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec;
+
+ return (0);
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ WT_DECL_RET;
+
+#if defined(HAVE_CLOCK_GETTIME)
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "clock_gettime");
+#elif defined(HAVE_GETTIMEOFDAY)
+ struct timeval v;
+
+ WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
+ if (ret == 0) {
+ tsp->tv_sec = v.tv_sec;
+ tsp->tv_nsec = v.tv_usec * 1000;
+ return (0);
+ }
+ WT_RET_MSG(session, ret, "gettimeofday");
+#else
+ NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c
+#endif
+}
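
A small interval-timing sketch built on __wt_epoch (the helper is hypothetical); the subtraction is done in signed nanoseconds so a smaller tv_nsec in the later sample can't underflow:

	static int
	elapsed_ms(WT_SESSION_IMPL *session,
	    const struct timespec *start, uint64_t *msp)
	{
		struct timespec stop;
		int64_t ns;

		WT_RET(__wt_epoch(session, &stop));

		ns = (int64_t)(stop.tv_sec - start->tv_sec) * 1000000000 +
		    (stop.tv_nsec - start->tv_nsec);
		*msp = (uint64_t)(ns / 1000000);
		return (0);
	}
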
diff --git a/src/third_party/wiredtiger/src/os_posix/os_yield.c b/src/third_party/wiredtiger/src/os_posix/os_yield.c
new file mode 100644
index 00000000000..6af30803e81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+ sched_yield();
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_dir.c b/src/third_party/wiredtiger/src/os_win/os_dir.c
new file mode 100644
index 00000000000..076c64670d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dir.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dirlist --
+ * Get a list of files from a directory, optionally filtered by
+ * a given prefix.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+ uint32_t flags, char ***dirlist, u_int *countp)
+{
+ HANDLE findhandle;
+ WIN32_FIND_DATA finddata;
+ WT_DECL_ITEM(pathbuf);
+ WT_DECL_RET;
+ size_t dirallocsz, pathlen;
+ u_int count, dirsz;
+ int match;
+ char **entries, *path;
+
+ *dirlist = NULL;
+ *countp = 0;
+
+ findhandle = INVALID_HANDLE_VALUE;
+ count = 0;
+
+ WT_RET(__wt_filename(session, dir, &path));
+
+ pathlen = strlen(path);
+	if (path[pathlen - 1] == '\\')
+		path[pathlen - 1] = '\0';
+
+ WT_ERR(__wt_scr_alloc(session, 0, &pathbuf));
+ WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path));
+
+ dirallocsz = 0;
+ dirsz = 0;
+ entries = NULL;
+ if (flags == 0)
+ LF_SET(WT_DIRLIST_INCLUDE);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+ "wt_dirlist of %s %s prefix %s",
+ pathbuf->data, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+ prefix == NULL ? "all" : prefix));
+
+ findhandle = FindFirstFileA(pathbuf->data, &finddata);
+
+ if (INVALID_HANDLE_VALUE == findhandle)
+ WT_ERR_MSG(session, __wt_errno(), "%s: FindFirstFile",
+ pathbuf->data);
+ else {
+ do {
+ /*
+ * Skip . and ..
+ */
+ if (strcmp(finddata.cFileName, ".") == 0 ||
+ strcmp(finddata.cFileName, "..") == 0)
+ continue;
+ match = 0;
+ if (prefix != NULL &&
+ ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+ WT_PREFIX_MATCH(finddata.cFileName, prefix)) ||
+ (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+ !WT_PREFIX_MATCH(finddata.cFileName, prefix))))
+ match = 1;
+ if (prefix == NULL || match) {
+ /*
+ * We have a file name we want to return.
+ */
+ count++;
+ if (count > dirsz) {
+ dirsz += WT_DIR_ENTRY;
+ WT_ERR(__wt_realloc_def(session,
+ &dirallocsz, dirsz, &entries));
+ }
+ WT_ERR(__wt_strdup(session,
+ finddata.cFileName, &entries[count - 1]));
+ }
+ } while (FindNextFileA(findhandle, &finddata) != 0);
+ }
+
+ if (count > 0)
+ *dirlist = entries;
+ *countp = count;
+
+err:
+ if (findhandle != INVALID_HANDLE_VALUE)
+ (void)FindClose(findhandle);
+ __wt_free(session, path);
+ __wt_buf_free(session, pathbuf);
+
+ if (ret == 0)
+ return (0);
+
+	if (entries != NULL) {
+		for (count = dirsz; count > 0; count--)
+			__wt_free(session, entries[count - 1]);
+		__wt_free(session, entries);
+	}
+
+ WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix);
+}
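
A hypothetical caller showing the ownership rules for the list returned above: the caller frees each name and then the array itself.

	static int
	list_wt_files(WT_SESSION_IMPL *session, const char *dir)
	{
		u_int count, i;
		char **files;

		WT_RET(__wt_dirlist(session,
		    dir, "WiredTiger", WT_DIRLIST_INCLUDE, &files, &count));

		/* ... use files[0] through files[count - 1] ... */

		for (i = 0; i < count; i++)
			__wt_free(session, files[i]);
		__wt_free(session, files);
		return (0);
	}
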
diff --git a/src/third_party/wiredtiger/src/os_win/os_dlopen.c b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
new file mode 100644
index 00000000000..ebc90edd2b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ * Open a dynamic library.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+ WT_DECL_RET;
+ WT_DLH *dlh;
+
+ WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_ERR(__wt_strdup(session, path, &dlh->name));
+
+	/* A NULL path means load from the current binary. */
+	if (path == NULL) {
+		ret = GetModuleHandleExA(0, NULL, &dlh->handle);
+		if (ret == FALSE)
+			WT_ERR_MSG(session,
+			    __wt_errno(), "GetModuleHandleEx");
+	} else {
+		/* TODO: load the DLL here. */
+		DebugBreak();
+	}
+
+ /* Windows returns 0 on failure, WT expects 0 on success */
+ ret = !ret;
+
+ *dlhp = dlh;
+ if (0) {
+err: __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ * Lookup a symbol in a dynamic library.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+ WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+ void *sym;
+
+ *(void **)sym_ret = NULL;
+
+ sym = GetProcAddress(dlh->handle, name);
+	if (sym == NULL && fail)
+		WT_RET_MSG(session, __wt_errno(),
+		    "GetProcAddress(%s in %s)", name, dlh->name);
+
+ *(void **)sym_ret = sym;
+ return (0);
+}
+
+/*
+ * __wt_dlclose --
+ * Close a dynamic library
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+ WT_DECL_RET;
+
+ if ((ret = FreeLibrary(dlh->handle)) == FALSE) {
+ __wt_err(session, __wt_errno(), "FreeLibrary");
+ }
+
+ /* Windows returns 0 on failure, WT expects 0 on success */
+ ret = !ret;
+
+ __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c
new file mode 100644
index 00000000000..ce50106b0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_errno.c
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ * Return errno, or WT_ERROR if errno not set.
+ */
+int
+__wt_errno(void)
+{
+ /*
+ * Called when we know an error occurred, and we want the system
+ * error code, but there's some chance it's not set.
+ */
+ DWORD err = GetLastError();
+
+ /* GetLastError should only be called if we hit an actual error */
+ WT_ASSERT(NULL, err != ERROR_SUCCESS);
+
+ return (err == ERROR_SUCCESS ? WT_ERROR : err);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_exist.c b/src/third_party/wiredtiger/src/os_win/os_exist.c
new file mode 100644
index 00000000000..ab3805f19df
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_exist.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ * Return if the file exists.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ ret = GetFileAttributesA(path);
+
+ __wt_free(session, path);
+
+ if (ret != INVALID_FILE_ATTRIBUTES)
+ *existp = 1;
+ else
+ *existp = 0;
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fallocate.c b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
new file mode 100644
index 00000000000..bd71c780dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fallocate_config --
+ * Configure fallocate behavior for a file handle.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ fh->fallocate_available = 1;
+
+ /*
+ * We use a separate handle for file size changes, so there's no need
+ * for locking.
+ */
+ fh->fallocate_requires_locking = 0;
+}
+
+/*
+ * __wt_fallocate --
+ * Allocate space for a file handle.
+ */
+int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER largeint;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+
+ largeint.QuadPart = offset + len;
+
+ if ((ret = SetFilePointerEx(
+ fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+ WT_RET_MSG(session,
+ __wt_errno(), "%s SetFilePointerEx error", fh->name);
+
+ if ((ret = SetEndOfFile(fh->filehandle_secondary)) != FALSE) {
+		fh->size = fh->extend_size = offset + len;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_filesize.c b/src/third_party/wiredtiger/src/os_win/os_filesize.c
new file mode 100644
index 00000000000..309ee1db40b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_filesize.c
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ * Get the size of a file in bytes.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER size;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: GetFileSizeEx", fh->name));
+
+ if ((ret = GetFileSizeEx(fh->filehandle, &size)) != 0) {
+ *sizep = size.QuadPart;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s: GetFileSizeEx", fh->name);
+}
+
+/*
+ * __wt_filesize_name --
+ * Return the size of a file in bytes, given a file name.
+ */
+int
+__wt_filesize_name(
+ WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+ WT_DECL_RET;
+ WIN32_FILE_ATTRIBUTE_DATA data;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ ret = GetFileAttributesExA(path, GetFileExInfoStandard, &data);
+
+ __wt_free(session, path);
+
+ if (ret != 0) {
+ *sizep =
+ ((int64_t)data.nFileSizeHigh << 32) | data.nFileSizeLow;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s: GetFileAttributesEx", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_flock.c b/src/third_party/wiredtiger/src/os_win/os_flock.c
new file mode 100644
index 00000000000..4b3ca34d65f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_flock.c
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ * Lock/unlock a byte in a file.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+ WT_DECL_RET;
+
+ /*
+ * WiredTiger requires this function be able to acquire locks past
+ * the end of file.
+ *
+ * Note we're using fcntl(2) locking: all fcntl locks associated with a
+ * file for a given process are removed when any file descriptor for the
+ * file is closed by the process, even if a lock was never requested for
+ * that file descriptor.
+ *
+ * http://msdn.microsoft.com/
+ * en-us/library/windows/desktop/aa365202%28v=vs.85%29.aspx
+ *
+ * You can lock bytes that are beyond the end of the current file.
+ * This is useful to coordinate adding records to the end of a file.
+ */
+ if (lock) {
+ ret = LockFile(fhp->filehandle, UINT32_MAX & byte,
+ UINT32_MAX & (byte >> 32), 1, 0);
+ } else {
+ ret = UnlockFile(fhp->filehandle, UINT32_MAX & byte,
+ UINT32_MAX & (byte >> 32), 1, 0);
+ }
+
+ if (ret == FALSE)
+ WT_RET_MSG(NULL, __wt_errno(), "%s: LockFile", fhp->name);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fsync.c b/src/third_party/wiredtiger/src/os_win/os_fsync.c
new file mode 100644
index 00000000000..cd509131649
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fsync.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers",
+ fh->name));
+
+ if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE)
+ WT_RET_MSG(session,
+ __wt_errno(), "%s FlushFileBuffers error", fh->name);
+
+ return (0);
+}
+
+/*
+ * __wt_fsync_async --
+ * Flush a file handle and don't wait for the result.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
new file mode 100644
index 00000000000..5d87f1ce06a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ * Truncate a file.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER largeint;
+ uint32_t lasterror;
+
+ largeint.QuadPart = len;
+
+ if ((ret = SetFilePointerEx(
+ fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+ WT_RET_MSG(session, __wt_errno(), "%s SetFilePointerEx error",
+ fh->name);
+
+ ret = SetEndOfFile(fh->filehandle_secondary);
+ if (ret != FALSE) {
+ fh->size = fh->extend_size = len;
+ return (0);
+ }
+
+ lasterror = GetLastError();
+
+	if (lasterror == ERROR_USER_MAPPED_FILE)
+ return (EBUSY);
+
+ WT_RET_MSG(session, lasterror, "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_map.c b/src/third_party/wiredtiger/src/os_win/os_map.c
new file mode 100644
index 00000000000..b3b4f0f7501
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_map.c
@@ -0,0 +1,106 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ * Map a file into memory.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp,
+ void** mappingcookie)
+{
+ void *map;
+ size_t orig_size;
+
+ /*
+	 * Record the current size and map only that many bytes, since the
+	 * size could change between the map call and when we set the return
+	 * length. For the same reason we could actually map past the end of
+	 * the file; we don't read bytes past the end of the file, though, so
+	 * as long as the map call succeeds, it's all OK.
+ */
+ orig_size = (size_t)fh->size;
+ *mappingcookie =
+ CreateFileMappingA(fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (*mappingcookie == NULL)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s CreateFileMapping error: failed to map %"
+ WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+
+ if ((map = MapViewOfFile(
+ *mappingcookie, FILE_MAP_READ, 0, 0, orig_size)) == NULL) {
+ CloseHandle(*mappingcookie);
+ *mappingcookie = NULL;
+
+ WT_RET_MSG(session, __wt_errno(),
+ "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+ }
+ (void)__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: MapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+ fh->name, map, orig_size);
+
+ *(void **)mapp = map;
+ *lenp = orig_size;
+ return (0);
+}
+
+/*
+ * __wt_mmap_preload --
+ * Cause a section of a memory map to be faulted in.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+
+ return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ * Discard a chunk of the memory map.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+ return (0);
+}
+
+/*
+ * __wt_munmap --
+ * Remove a memory mapping.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len,
+ void** mappingcookie)
+{
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: UnmapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+ fh->name, map, len));
+
+ if (UnmapViewOfFile(map) == 0) {
+ WT_RET_MSG(session, __wt_errno(),
+ "%s UnmapViewOfFile error: failed to unmap %" WT_SIZET_FMT
+ " bytes",
+ fh->name, len);
+ }
+
+ CloseHandle(*mappingcookie);
+
+ *mappingcookie = 0;
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
new file mode 100644
index 00000000000..9c9907bd8be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ * Allocate and initialize a condition variable.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+ const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+
+ InitializeCriticalSection(&cond->mtx);
+
+ /* Initialize the condition variable to permit self-blocking. */
+ InitializeConditionVariable(&cond->cond);
+
+ cond->name = name;
+ cond->waiters = is_signalled ? -1 : 0;
+
+ *condp = cond;
+ return (0);
+}
+
+/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+ WT_DECL_RET;
+	int lasterror, locked, milliseconds;
+
+	locked = 0;
+	WT_ASSERT(session, usecs >= 0);
+
+ /* Fast path if already signalled. */
+ if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ return (0);
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "wait %s cond (%p)", cond->name, cond));
+ WT_STAT_FAST_CONN_INCR(session, cond_wait);
+ }
+
+ EnterCriticalSection(&cond->mtx);
+ locked = 1;
+
+ if (usecs > 0) {
+ milliseconds = usecs / 1000;
+ /*
+		 * A timeout of zero would turn the wait into a non-blocking
+		 * check, which we do not want.
+ */
+ if (milliseconds == 0)
+ milliseconds = 1;
+ ret = SleepConditionVariableCS(
+ &cond->cond, &cond->mtx, milliseconds);
+ } else
+ ret = SleepConditionVariableCS(
+ &cond->cond, &cond->mtx, INFINITE);
+
+	/* SleepConditionVariableCS returns zero on failure or timeout. */
+	if (ret == 0) {
+		lasterror = GetLastError();
+		if (lasterror == ERROR_TIMEOUT)
+			ret = 1;	/* Timeouts are not errors here. */
+	}
+
+	(void)WT_ATOMIC_SUB4(cond->waiters, 1);
+
+	if (locked)
+		LeaveCriticalSection(&cond->mtx);
+	if (ret != 0)
+		return (0);
+	WT_RET_MSG(session, lasterror, "SleepConditionVariableCS");
+}
+
+/*
+ * __wt_cond_signal --
+ * Signal a waiting thread.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL)
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "signal %s cond (%p)", cond->name, cond));
+
+ /* Fast path if already signalled. */
+ if (cond->waiters == -1)
+ return (0);
+
+ if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ EnterCriticalSection(&cond->mtx);
+ locked = 1;
+ WakeAllConditionVariable(&cond->cond);
+ }
+
+ if (locked)
+ LeaveCriticalSection(&cond->mtx);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "WakeAllConditionVariable");
+}
+
+/*
+ * __wt_cond_destroy --
+ * Destroy a condition variable.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ cond = *condp;
+ if (cond == NULL)
+ return (0);
+
+	/* Windows condition variables need no explicit destruction. */
+ DeleteCriticalSection(&cond->mtx);
+ __wt_free(session, *condp);
+
+ return (ret);
+}
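
A lifecycle sketch for this condition-variable API (the wrapper is hypothetical). It exercises the signalled fast path: a signal delivered with no waiters parks the counter at -1, so the next wait returns immediately without entering the critical section.

	static int
	cond_lifecycle(WT_SESSION_IMPL *session)
	{
		WT_CONDVAR *cond;

		WT_RET(__wt_cond_alloc(session, "example", 0, &cond));

		WT_RET(__wt_cond_signal(session, cond));	/* waiters -> -1 */
		WT_RET(__wt_cond_wait(session, cond, 0));	/* fast path */

		return (__wt_cond_destroy(session, &cond));
	}
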
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
new file mode 100644
index 00000000000..ec0894a2f29
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ * Allocate and initialize a read/write lock.
+ */
+int
+__wt_rwlock_alloc(
+ WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+ WT_RWLOCK *rwlock;
+
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+ WT_RET(__wt_calloc_def(session, 1, &rwlock));
+
+ rwlock->name = name;
+ InitializeSRWLock(&rwlock->rwlock);
+
+ *rwlockp = rwlock;
+ return (0);
+}
+
+/*
+ * __wt_readlock --
+ * Get a shared lock.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ AcquireSRWLockShared(&rwlock->rwlock);
+
+ return (0);
+}
+
+/*
+ * __wt_readunlock --
+ * Release a shared lock.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+
+ ReleaseSRWLockShared(&rwlock->rwlock);
+ return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ * Try to get an exclusive lock, fail immediately if unavailable.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ return (TryAcquireSRWLockExclusive(&rwlock->rwlock) == 0 ? EBUSY : 0);
+}
+
+/*
+ * __wt_writelock --
+ * Wait to get an exclusive lock.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ AcquireSRWLockExclusive(&rwlock->rwlock);
+
+ return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ * Release an exclusive lock.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+ ReleaseSRWLockExclusive(&rwlock->rwlock);
+ return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ * Destroy a read/write lock.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+ WT_RWLOCK *rwlock;
+
+ rwlock = *rwlockp; /* Clear our caller's reference. */
+ if (rwlock == NULL)
+ return (0);
+ *rwlockp = NULL;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name));
+
+ /* Nothing to delete for Slim Reader Writer lock */
+
+ __wt_free(session, rwlock);
+ return (0);
+}
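
A usage sketch for the SRW-backed lock API above (the function is hypothetical): try the non-blocking acquire first, falling back to the blocking variant when the lock is contended.

	static int
	update_shared_state(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
	{
		/* Take the exclusive lock, blocking only if necessary. */
		if (__wt_try_writelock(session, rwlock) != 0)
			WT_RET(__wt_writelock(session, rwlock));

		/* ... modify the shared state ... */

		return (__wt_writeunlock(session, rwlock));
	}
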
diff --git a/src/third_party/wiredtiger/src/os_win/os_once.c b/src/third_party/wiredtiger/src/os_win/os_once.c
new file mode 100644
index 00000000000..40640acf129
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_once.c
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * _wt_init_once_callback --
+ * Global initialization, run once.
+ */
+BOOL CALLBACK _wt_init_once_callback(
+ _Inout_ PINIT_ONCE InitOnce,
+ _Inout_opt_ PVOID Parameter,
+ _Out_opt_ PVOID *Context
+ )
+{
+ void(*init_routine)(void) = Parameter;
+
+ init_routine();
+
+ return (TRUE);
+}
+
+/*
+ * __wt_once --
+ *	One-time initialization per process.
+ */
+int
+__wt_once(void(*init_routine)(void))
+{
+	static INIT_ONCE once_control = INIT_ONCE_STATIC_INIT;
+	PVOID lpContext = NULL;
+
+	return (!InitOnceExecuteOnce(&once_control,
+	    _wt_init_once_callback, init_routine, lpContext));
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_open.c b/src/third_party/wiredtiger/src/os_win/os_open.c
new file mode 100644
index 00000000000..7be98b604ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_open.c
@@ -0,0 +1,219 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+ const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+ DWORD dwCreationDisposition;
+ HANDLE filehandle, filehandle_secondary;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh, *tfh;
+ int direct_io, f, matched, share_mode;
+ char *path;
+
+ conn = S2C(session);
+ fh = NULL;
+ path = NULL;
+ filehandle = INVALID_HANDLE_VALUE;
+ filehandle_secondary = INVALID_HANDLE_VALUE;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched)
+ return (0);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ *
+ * TODO: Set tighter file permissions but set bInheritHandle to false
+ * to prevent inheritance
+ */
+
+ f = FILE_ATTRIBUTE_NORMAL;
+
+ dwCreationDisposition = 0;
+ if (ok_create) {
+ dwCreationDisposition = CREATE_NEW;
+ if (exclusive)
+ dwCreationDisposition = CREATE_ALWAYS;
+ } else
+ dwCreationDisposition = OPEN_EXISTING;
+
+ direct_io = 0;
+
+ if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+ f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+ direct_io = 1;
+ }
+
+ if (dio_type == WT_FILE_TYPE_LOG &&
+ FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
+ f |= FILE_FLAG_WRITE_THROUGH;
+ }
+
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ f |= FILE_FLAG_RANDOM_ACCESS;
+
+ filehandle = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ dwCreationDisposition,
+ f,
+ NULL);
+ if (filehandle == INVALID_HANDLE_VALUE) {
+ if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
+ filehandle = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ OPEN_EXISTING,
+ f,
+ NULL);
+
+ if (filehandle == INVALID_HANDLE_VALUE)
+ WT_ERR_MSG(session, __wt_errno(),
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" :
+ "%s", path);
+ }
+
+ /*
+ * Open a second handle to file to support allocation/truncation
+ * concurrently with reads on the file. Writes would also move the file
+ * pointer.
+ */
+ filehandle_secondary = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ OPEN_EXISTING,
+ f,
+ NULL);
+	if (filehandle_secondary == INVALID_HANDLE_VALUE)
+ WT_ERR_MSG(session, __wt_errno(),
+ "open failed for secondary handle: %s", path);
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_strdup(session, name, &fh->name));
+ fh->filehandle = filehandle;
+ fh->filehandle_secondary = filehandle_secondary;
+ fh->ref = 1;
+ fh->direct_io = direct_io;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+ /* Configure file extension. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ fh->extend_len = conn->data_extend_len;
+
+ /* Configure fallocate/posix_fallocate calls. */
+ __wt_fallocate_config(session, fh);
+
+ /*
+ * Repeat the check for a match, but then link onto the database's list
+ * of files.
+ */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ if (!matched) {
+ TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_INCR(session, file_open);
+
+ *fhp = fh;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched) {
+err: if (fh != NULL) {
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ }
+ if (filehandle != INVALID_HANDLE_VALUE)
+ (void)CloseHandle(filehandle);
+ if (filehandle_secondary != INVALID_HANDLE_VALUE)
+ (void)CloseHandle(filehandle_secondary);
+ }
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ __wt_spin_lock(session, &conn->fh_lock);
+ if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
+ __wt_spin_unlock(session, &conn->fh_lock);
+ return (0);
+ }
+
+ /* Remove from the list. */
+ TAILQ_REMOVE(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_DECR(session, file_open);
+
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ /* Discard the memory. */
+	if (CloseHandle(fh->filehandle) == 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "CloseHandle: %s", fh->name);
+ }
+
+	if (CloseHandle(fh->filehandle_secondary) == 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "CloseHandle: secondary: %s", fh->name);
+ }
+
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c
new file mode 100644
index 00000000000..9f6b79c565c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_path.c
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ * Return if a filename is an absolute path.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+ /*
+ * Check for a drive name (for example, "D:"), allow both forward and
+ * backward slashes.
+ */
+ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
+ path += 2;
+ return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+}
+
+/*
+ * __wt_path_separator --
+ * Return the path separator string.
+ */
+const char *
+__wt_path_separator(void)
+{
+ return ("\\");
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_priv.c b/src/third_party/wiredtiger/src/os_win/os_priv.c
new file mode 100644
index 00000000000..7b5152b4652
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ * Return if the process has special privileges, defined as having
+ *	different effective and real UIDs or GIDs.
+ */
+int
+__wt_has_priv(void)
+{
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_remove.c b/src/third_party/wiredtiger/src/os_win/os_remove.c
new file mode 100644
index 00000000000..d15ee929c00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_remove.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ * Check if the file is currently open before removing it.
+ */
+static inline void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifdef HAVE_DIAGNOSTIC
+ WT_CONNECTION_IMPL *conn;
+ WT_FH *fh;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /*
+ * Check if the file is open: it's an error if it is, since a higher
+ * level should have closed it before removing.
+ */
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (strcmp(name, fh->name) == 0)
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ WT_ASSERT(session, fh == NULL);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ * Remove a file.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ char *path;
+ uint32_t lasterror;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+ __remove_file_check(session, name);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ if ((ret = DeleteFileA(path)) == FALSE)
+ lasterror = __wt_errno();
+
+ __wt_free(session, path);
+
+ if (ret != FALSE)
+ return (0);
+
+ WT_RET_MSG(session, lasterror, "%s: remove", name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rename.c b/src/third_party/wiredtiger/src/os_win/os_rename.c
new file mode 100644
index 00000000000..092f5d62a40
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rename.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ * Rename a file.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ uint32_t lasterror;
+ char *from_path, *to_path;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+ from_path = to_path = NULL;
+
+ WT_RET(__wt_filename(session, from, &from_path));
+ WT_TRET(__wt_filename(session, to, &to_path));
+
+ /*
+	 * Check whether the target exists: unlike POSIX rename, Windows
+	 * MoveFile does not overwrite an existing file.
+ */
+ if ((ret = GetFileAttributesA(to_path)) != INVALID_FILE_ATTRIBUTES) {
+ if ((ret = DeleteFileA(to_path)) == FALSE) {
+ lasterror = GetLastError();
+ goto err;
+ }
+ }
+
+	if ((ret = MoveFileA(from_path, to_path)) == FALSE)
+ lasterror = GetLastError();
+
+err:
+ __wt_free(session, from_path);
+ __wt_free(session, to_path);
+
+ if (ret != FALSE)
+ return (0);
+
+ WT_RET_MSG(session, lasterror, "MoveFile %s to %s", from, to);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rw.c b/src/third_party/wiredtiger/src/os_win/os_rw.c
new file mode 100644
index 00000000000..291533bc6bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rw.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+ DWORD chunk;
+ DWORD nr;
+ uint8_t *addr;
+ OVERLAPPED overlapped = { 0 };
+
+ nr = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, read_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break reads larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+ chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+ overlapped.Offset = UINT32_MAX & offset;
+ overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+ if (!ReadFile(fh->filehandle, addr, chunk, &nr, &overlapped))
+ WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
+ "%s read error: failed to read %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+ DWORD chunk;
+ DWORD nw;
+ const uint8_t *addr;
+ OVERLAPPED overlapped = { 0 };
+
+ nw = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, write_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break writes larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
+ chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+ overlapped.Offset = UINT32_MAX & offset;
+ overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+ if (!WriteFile(fh->filehandle, addr, chunk, &nw, &overlapped))
+ WT_RET_MSG(session, __wt_errno(),
+ "%s write error: failed to write %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
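
A standalone restatement of the offset split used in both loops above (a hypothetical helper): Windows positional I/O passes the 64-bit file offset through the two 32-bit OVERLAPPED fields.

	static void
	overlapped_set_offset(OVERLAPPED *overlapped, wt_off_t offset)
	{
		/* Low and high halves of the 64-bit file offset. */
		overlapped->Offset = UINT32_MAX & offset;
		overlapped->OffsetHigh = UINT32_MAX & (offset >> 32);
	}
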
diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c
new file mode 100644
index 00000000000..b9a8cc2e545
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ Sleep(seconds * 1000 + micro_seconds / 1000);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c
new file mode 100644
index 00000000000..4d8cf89f264
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_thread.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+ wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+ /* Spawn a new thread of control. */
+ *tidret = CreateThread(NULL, 0, func, arg, 0, NULL);
+ if (*tidret != NULL)
+ return (0);
+
+ WT_RET_MSG(session, __wt_errno(), "CreateThread");
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+ WT_DECL_RET;
+
+ if ((ret = WaitForSingleObject(tid, INFINITE)) == WAIT_OBJECT_0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "WaitForSingleObject");
+}
+
+/*
+ * __wt_thread_id --
+ * Fill in a printable version of the process and thread IDs.
+ */
+void
+__wt_thread_id(char *buf, size_t buflen)
+{
+ (void)snprintf(buf, buflen,
+ "%" PRIu64 ":%" PRIu64,
+	    (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId());
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
new file mode 100644
index 00000000000..b49b738fe54
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec;
+
+ return (0);
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ uint64_t ns100;
+
+ FILETIME time;
+ GetSystemTimeAsFileTime(&time);
+
+ ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime)
+ - 116444736000000000LL;
+ tsp->tv_sec = ns100 / 10000000;
+ tsp->tv_nsec = (long)((ns100 % 10000000) * 100);
+
+ return (0);
+}
+
+/*
+ * localtime_r --
+ * Return the current local time.
+ */
+struct tm *
+localtime_r(const time_t *timer, struct tm *result)
+{
+ errno_t err;
+
+ err = localtime_s(result, timer);
+ if (err != 0) {
+ __wt_err(NULL, err, "localtime_s");
+ return (NULL);
+ }
+
+ return (result);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
new file mode 100644
index 00000000000..1058203e326
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#undef vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList)
+{
+ int len;
+
+	len = vsnprintf(_DstBuf, _MaxCount, _Format, _ArgList);
+
+ /*
+	 * The MSVC implementation returns -1 on truncation instead of the
+	 * number of bytes that would have been written. Rather than grow
+	 * the buffer iteratively, ask how large a buffer is required.
+ */
+ if (len == -1)
+ len = _vscprintf(_Format, _ArgList) + 1;
+
+ return (len);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_yield.c b/src/third_party/wiredtiger/src/os_win/os_yield.c
new file mode 100644
index 00000000000..970bfa139d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+ SwitchToThread();
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_api.c b/src/third_party/wiredtiger/src/packing/pack_api.c
new file mode 100644
index 00000000000..c0c1e53c8ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_api.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_struct_pack --
+ * Pack a byte string (extension API).
+ */
+int
+wiredtiger_struct_pack(WT_SESSION *wt_session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
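+
+/*
+ * Example usage (an illustrative sketch, not part of the original
+ * change), assuming an open WT_SESSION *session and an int ret; error
+ * handling is omitted for brevity:
+ *
+ *	char buf[64];
+ *	size_t size;
+ *
+ *	ret = wiredtiger_struct_size(session, &size, "Si", "hello", 42);
+ *	if (ret == 0 && size <= sizeof(buf))
+ *		ret = wiredtiger_struct_pack(
+ *		    session, buf, sizeof(buf), "Si", "hello", 42);
+ */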
+
+/*
+ * wiredtiger_struct_size --
+ * Calculate the size of a packed byte string (extension API).
+ */
+int
+wiredtiger_struct_size(WT_SESSION *wt_session,
+ size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * wiredtiger_struct_unpack --
+ * Unpack a byte string (extension API).
+ */
+int
+wiredtiger_struct_unpack(WT_SESSION *wt_session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_pack --
+ * Pack a byte string (extension API).
+ */
+int
+__wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_size --
+ * Calculate the size of a packed byte string (extension API).
+ */
+int
+__wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_unpack --
+ * Unpack a byte string (extension API).
+ */
+int
+__wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c
new file mode 100644
index 00000000000..12b1582e6d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_impl.c
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_struct_check --
+ * Check that the specified packing format is valid, and whether it fits
+ * into a fixed-sized bitfield.
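+ * For example, the single-field format "8t" describes a fixed-length,
+ * 8-bit bitfield, while a format such as "SS" is not fixed-size.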
+ */
+int
+__wt_struct_check(WT_SESSION_IMPL *session,
+ const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ int fields;
+
+ WT_RET(__pack_initn(session, &pack, fmt, len));
+ for (fields = 0; (ret = __pack_next(&pack, &pv)) == 0; fields++)
+ ;
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ if (fixedp != NULL && fixed_lenp != NULL) {
+ if (fields == 0) {
+ *fixedp = 1;
+ *fixed_lenp = 0;
+ } else if (fields == 1 && pv.type == 't') {
+ *fixedp = 1;
+ *fixed_lenp = pv.size;
+ } else
+ *fixedp = 0;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_struct_size --
+ * Calculate the size of a packed byte string.
+ */
+int
+__wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_pack --
+ * Pack a byte string.
+ */
+int
+__wt_struct_pack(WT_SESSION_IMPL *session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_unpack --
+ * Unpack a byte string.
+ */
+int
+__wt_struct_unpack(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_stream.c b/src/third_party/wiredtiger/src/packing/pack_stream.c
new file mode 100644
index 00000000000..efbbd5d9adb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_stream.c
@@ -0,0 +1,296 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ */
+struct __wt_pack_stream {
+ WT_PACK pack;
+ uint8_t *end, *p, *start;
+};
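+
+/*
+ * Example usage (an illustrative sketch, not part of the original
+ * change), assuming an open WT_SESSION *session and an int ret; error
+ * handling is omitted for brevity:
+ *
+ *	WT_PACK_STREAM *ps;
+ *	uint8_t buf[64];
+ *	size_t used;
+ *	int64_t i;
+ *	const char *s;
+ *
+ *	ret = wiredtiger_pack_start(session, "iS", buf, sizeof(buf), &ps);
+ *	ret = wiredtiger_pack_int(ps, 42);
+ *	ret = wiredtiger_pack_str(ps, "forty-two");
+ *	ret = wiredtiger_pack_close(ps, &used);
+ *
+ *	ret = wiredtiger_unpack_start(session, "iS", buf, used, &ps);
+ *	ret = wiredtiger_unpack_int(ps, &i);
+ *	ret = wiredtiger_unpack_str(ps, &s);
+ *	ret = wiredtiger_pack_close(ps, NULL);
+ */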
+
+/*
+ * wiredtiger_pack_start --
+ * Open a stream for packing.
+ */
+int
+wiredtiger_pack_start(WT_SESSION *wt_session,
+ const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
+{
+ WT_DECL_RET;
+ WT_PACK_STREAM *ps;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ WT_RET(__wt_calloc_def(session, 1, &ps));
+ WT_ERR(__pack_init(session, &ps->pack, format));
+ ps->p = ps->start = buffer;
+ ps->end = ps->p + len;
+ *psp = ps;
+
+ if (0) {
+err: (void)wiredtiger_pack_close(ps, NULL);
+ }
+ return (ret);
+}
+
+/*
+ * wiredtiger_unpack_start --
+ * Open a stream for unpacking.
+ */
+int
+wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format,
+ const void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
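+ /*
+ * Discarding the const qualifier is safe here: an unpack stream
+ * only reads from the buffer it is given.
+ */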
+ return (wiredtiger_pack_start(
+ wt_session, format, (void *)buffer, size, psp));
+}
+
+/*
+ * wiredtiger_pack_close --
+ * Close a packing stream.
+ */
+int
+wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp)
+{
+ /* Guard against a NULL stream before touching its fields. */
+ if (ps == NULL)
+ return (0);
+
+ if (usedp != NULL)
+ *usedp = WT_PTRDIFF(ps->p, ps->start);
+
+ __wt_free(ps->pack.session, ps);
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_item --
+ * Pack an item.
+ */
+int
+wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ pv.u.item.data = item->data;
+ pv.u.item.size = item->size;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_int --
+ * Pack a signed integer.
+ */
+int
+wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ pv.u.i = i;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_str --
+ * Pack a string.
+ */
+int
+wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ pv.u.s = s;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_uint --
+ * Pack an unsigned int.
+ */
+int
+wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ pv.u.u = u;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_item --
+ * Unpack an item.
+ */
+int
+wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ item->data = pv.u.item.data;
+ item->size = pv.u.item.size;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_int --
+ * Unpack a signed integer.
+ */
+int
+wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *ip = pv.u.i;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_str --
+ * Unpack a string.
+ */
+int
+wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *sp = pv.u.s;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_uint --
+ * Unpack an unsigned integer.
+ */
+int
+wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *up = pv.u.u;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
new file mode 100644
index 00000000000..398fea4476f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_direct_io_size_check --
+ * Return a size from the configuration, complaining if it's insufficient
+ * for direct I/O.
+ */
+int
+__wt_direct_io_size_check(WT_SESSION_IMPL *session,
+ const char **cfg, const char *config_name, uint32_t *allocsizep)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ int64_t align;
+
+ *allocsizep = 0;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, config_name, &cval));
+
+ /*
+ * This function exists as a place to hang this comment: if direct I/O
+ * is configured, page sizes must be at least as large as any buffer
+ * alignment as well as a multiple of the alignment. Linux gets unhappy
+ * if you configure direct I/O and then don't do I/O in alignments and
+ * units of its happy place.
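+ *
+ * For example (illustrative): with a 4KB buffer_alignment, an
+ * allocation_size of 8KB is accepted, while 6KB is rejected because
+ * it is not a multiple of the alignment.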
+ */
+ if (FLD_ISSET(conn->direct_io,
+ WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA)) {
+ align = (int64_t)conn->buffer_alignment;
+ if (align != 0 && (cval.val < align || cval.val % align != 0))
+ WT_RET_MSG(session, EINVAL,
+ "when direct I/O is configured, the %s size must "
+ "be at least as large as the buffer alignment as "
+ "well as a multiple of the buffer alignment",
+ config_name);
+ }
+ *allocsizep = (uint32_t)cval.val;
+ return (0);
+}
+
+/*
+ * __create_file --
+ * Create a new 'file:' object.
+ */
+static int
+__create_file(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_DECL_ITEM(val);
+ WT_DECL_RET;
+ uint32_t allocsize;
+ int is_metadata;
+ const char *fileconf, *filename;
+ const char **p, *filecfg[] =
+ { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
+
+ fileconf = NULL;
+
+ is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;
+
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);
+
+ /* Check if the file already exists. */
+ if (!is_metadata && (ret =
+ __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
+ if (exclusive)
+ WT_TRET(EEXIST);
+ goto err;
+ }
+
+ /* Sanity check the allocation size. */
+ WT_RET(__wt_direct_io_size_check(
+ session, filecfg, "allocation_size", &allocsize));
+
+ /* Create the file. */
+ WT_ERR(__wt_block_manager_create(session, filename, allocsize));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
+
+ /*
+ * If creating an ordinary file, append the file ID and current version
+ * numbers to the passed-in configuration and insert the resulting
+ * configuration into the metadata.
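+ * For example (values illustrative), the appended fragment is
+ * "id=5,version=(major=1,minor=1)".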
+ */
+ if (!is_metadata) {
+ WT_ERR(__wt_scr_alloc(session, 0, &val));
+ WT_ERR(__wt_buf_fmt(session, val,
+ "id=%" PRIu32 ",version=(major=%d,minor=%d)",
+ ++S2C(session)->next_file_id,
+ WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+ for (p = filecfg; *p != NULL; ++p)
+ ;
+ *p = val->data;
+ WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
+ WT_ERR(__wt_metadata_insert(session, uri, fileconf));
+ }
+
+ /*
+ * Open the file to check that it was set up correctly. We don't need
+ * to pass the configuration: we just wrote the collapsed configuration
+ * into the metadata file, and it's going to be read/used by underlying
+ * functions.
+ *
+ * Keep the handle exclusive until it is released at the end of the
+ * call, otherwise we could race with a drop.
+ */
+ WT_ERR(__wt_session_get_btree(
+ session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 1));
+ else
+ WT_ERR(__wt_session_release_btree(session));
+
+err: __wt_scr_free(&val);
+ __wt_free(session, fileconf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_colgroup_source --
+ * Get the URI of the data source for a column group.
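+ * For example (illustrative), the default source for "table:main" is
+ * "file:main.wt"; for its column group "c1" it is "file:main_c1.wt".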
+ */
+int
+__wt_schema_colgroup_source(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ size_t len;
+ const char *prefix, *suffix, *tablename;
+
+ tablename = table->name + strlen("table:");
+ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ !WT_STRING_MATCH("file", cval.str, cval.len)) {
+ prefix = cval.str;
+ len = cval.len;
+ suffix = "";
+ } else {
+ prefix = "file";
+ len = strlen(prefix);
+ suffix = ".wt";
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (cgname == NULL)
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s%s",
+ (int)len, prefix, tablename, suffix));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+ (int)len, prefix, tablename, cgname, suffix));
+
+ return (0);
+}
+
+/*
+ * __create_colgroup --
+ * Create a column group.
+ */
+static int
+__create_colgroup(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_ITEM confbuf, fmt, namebuf;
+ WT_TABLE *table;
+ size_t tlen;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char **cfgp;
+ const char *cgconf, *cgname, *sourceconf, *oldconf;
+ const char *source, *tablename;
+
+ cgconf = sourceconf = oldconf = NULL;
+ WT_CLEAR(fmt);
+ WT_CLEAR(confbuf);
+ WT_CLEAR(namebuf);
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (EINVAL);
+ cgname = strchr(tablename, ':');
+ if (cgname != NULL) {
+ tlen = (size_t)(cgname - tablename);
+ ++cgname;
+ } else
+ tlen = strlen(tablename);
+
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
+ "Can't create '%s' for non-existent table '%.*s'",
+ name, (int)tlen, tablename);
+
+ /* Make sure the column group is referenced from the table. */
+ if (cgname != NULL && (ret =
+ __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Column group '%s' not found in table '%.*s'",
+ cgname, (int)tlen, tablename);
+
+ /* Find the first NULL entry in the cfg stack. */
+ for (cfgp = &cfg[1]; *cfgp; cfgp++)
+ ;
+
+ /* Add the source to the colgroup config before collapsing. */
+ if (__wt_config_getones(
+ session, config, "source", &cval) == 0 && cval.len != 0) {
+ WT_ERR(__wt_buf_fmt(
+ session, &namebuf, "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, cgname, config, &namebuf));
+ source = namebuf.data;
+ WT_ERR(__wt_buf_fmt(
+ session, &confbuf, "source=\"%s\"", source));
+ *cfgp++ = confbuf.data;
+ }
+
+ /* Calculate the key/value formats: these go into the source config. */
+ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
+ if (cgname == NULL)
+ WT_ERR(__wt_buf_catfmt
+ (session, &fmt, ",value_format=%s", table->value_format));
+ else {
+ if (__wt_config_getones(session, config, "columns", &cval) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+ WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
+ WT_ERR(__wt_struct_reformat(session,
+ table, cval.str, cval.len, NULL, 1, &fmt));
+ }
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ WT_ERR(__wt_config_collapse(session, cfg, &cgconf));
+ if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+
+err: __wt_free(session, cgconf);
+ __wt_free(session, sourceconf);
+ __wt_free(session, oldconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_index_source --
+ * Get the URI of the data source for an index.
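+ * For example (illustrative), the default source for index "i1" on
+ * "table:main" is "file:main_i1.wti".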
+ */
+int
+__wt_schema_index_source(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ size_t len;
+ const char *prefix, *suffix, *tablename;
+
+ tablename = table->name + strlen("table:");
+ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ !WT_STRING_MATCH("file", cval.str, cval.len)) {
+ prefix = cval.str;
+ len = cval.len;
+ suffix = "_idx";
+ } else {
+ prefix = "file";
+ len = strlen(prefix);
+ suffix = ".wti";
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+ (int)len, prefix, tablename, idxname, suffix));
+
+ return (0);
+}
+
+/*
+ * __create_index --
+ * Create an index.
+ */
+static int
+__create_index(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG pkcols;
+ WT_CONFIG_ITEM ckey, cval, icols;
+ WT_DECL_RET;
+ WT_ITEM confbuf, extra_cols, fmt, namebuf;
+ WT_TABLE *table;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char *sourceconf, *source, *idxconf, *idxname;
+ const char *tablename;
+ size_t tlen;
+ u_int i;
+
+ idxconf = sourceconf = NULL;
+ WT_CLEAR(confbuf);
+ WT_CLEAR(fmt);
+ WT_CLEAR(extra_cols);
+ WT_CLEAR(namebuf);
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "index:"))
+ return (EINVAL);
+ idxname = strchr(tablename, ':');
+ if (idxname == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid index name, "
+ "should be <table name>:<index name>: %s", name);
+
+ tlen = (size_t)(idxname++ - tablename);
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, ret,
+ "Can't create an index for a non-existent table: %.*s",
+ (int)tlen, tablename);
+
+ if (__wt_config_getones(session, config, "source", &cval) == 0) {
+ WT_ERR(__wt_buf_fmt(session, &namebuf,
+ "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ WT_ERR(__wt_schema_index_source(
+ session, table, idxname, config, &namebuf));
+ source = namebuf.data;
+
+ /* Add the source name to the index config before collapsing. */
+ WT_ERR(__wt_buf_catfmt(session, &confbuf,
+ ",source=\"%s\"", source));
+ }
+
+ /* Calculate the key/value formats. */
+ if (__wt_config_getones(session, config, "columns", &icols) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the source's
+ * key_format, which we are calculating now, but not part of an index
+ * cursor's key_format.
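+ *
+ * For example (illustrative): given a table with key_format=S and
+ * columns=(id,name,addr), an index on (name) gets a source
+ * key_format of "SS" (name plus the hidden primary key column id),
+ * while the index cursor's key_format is just "S".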
+ */
+ WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Index values are normally empty: all columns are packed into the
+ * index key. The exception is LSM, which (currently) reserves empty
+ * values as tombstones. Use a single padding byte in that case.
+ */
+ if (WT_PREFIX_MATCH(source, "lsm:"))
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=x,"));
+ else
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,"));
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));
+ WT_ERR(__wt_struct_reformat(session, table,
+ icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt));
+
+ /* Check for a record number index key, which makes no sense. */
+ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
+ if (cval.len == 1 && cval.str[0] == 'r')
+ WT_ERR_MSG(session, EINVAL,
+ "column-store index may not use the record number as its "
+ "index key");
+
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ cfg[1] = sourceconf;
+ cfg[2] = confbuf.data;
+ WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
+ if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+err: __wt_free(session, idxconf);
+ __wt_free(session, sourceconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &extra_cols);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __create_table --
+ * Create a table.
+ */
+static int
+__create_table(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cgkey, cgval, cval;
+ WT_DECL_RET;
+ WT_TABLE *table;
+ size_t cgsize;
+ int ncolgroups;
+ char *cgname;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
+ const char *tableconf, *tablename;
+
+ cgname = NULL;
+ table = NULL;
+ tableconf = NULL;
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "table:"))
+ return (EINVAL);
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table)) == 0) {
+ __wt_schema_release_table(session, table);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval));
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (ncolgroups = 0;
+ (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
+ ncolgroups++)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_collapse(session, cfg, &tableconf));
+ if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ /* Attempt to open the table now to catch any errors. */
+ WT_ERR(__wt_schema_get_table(
+ session, tablename, strlen(tablename), 1, &table));
+
+ if (ncolgroups == 0) {
+ cgsize = strlen("colgroup:") + strlen(tablename) + 1;
+ WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
+ snprintf(cgname, cgsize, "colgroup:%s", tablename);
+ WT_ERR(__create_colgroup(session, cgname, exclusive, config));
+ }
+
+ if (0) {
+err: if (table != NULL) {
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+ }
+ }
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ __wt_free(session, cgname);
+ __wt_free(session, tableconf);
+ return (ret);
+}
+
+/*
+ * __create_data_source --
+ * Create a custom data source.
+ */
+static int
+__create_data_source(WT_SESSION_IMPL *session,
+ const char *uri, const char *config, WT_DATA_SOURCE *dsrc)
+{
+ WT_CONFIG_ITEM cval;
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_create), config, NULL };
+
+ /*
+ * Check to be sure the key/value formats are legal: the underlying
+ * data source doesn't have access to the functions that check.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+
+ /*
+ * User-specified collators aren't supported for data-source objects.
+ */
+ if (__wt_config_getones(
+ session, config, "collator", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "WT_DATA_SOURCE objects do not support WT_COLLATOR "
+ "ordering");
+
+ return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg));
+}
+
+/*
+ * __wt_schema_create --
+ * Process a WT_SESSION::create operation for all supported types.
+ */
+int
+__wt_schema_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int exclusive;
+
+ exclusive = (
+ __wt_config_getones(session, config, "exclusive", &cval) == 0 &&
+ cval.val != 0);
+
+ /*
+ * We track create operations: if we fail in the middle of creating a
+ * complex object, we want to back it all out.
+ */
+ WT_RET(__wt_meta_track_on(session));
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __create_colgroup(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __create_file(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_create(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __create_index(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __create_table(session, uri, exclusive, config);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->create == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __create_data_source(session, uri, config, dsrc);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ session->dhandle = NULL;
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
new file mode 100644
index 00000000000..6df7e6930c9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __drop_file --
+ * Drop a file.
+ */
+static int
+__drop_file(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int exist, remove_files;
+ const char *filename;
+
+ WT_RET(__wt_config_gets(session, cfg, "remove_files", &cval));
+ remove_files = (cval.val != 0);
+
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+
+ /* Close all btree handles associated with this file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, uri, force));
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_TRET(__wt_metadata_remove(session, uri));
+ if (force && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (!remove_files)
+ return (ret);
+
+ /* Remove the underlying physical file. */
+ exist = 0;
+ WT_TRET(__wt_exist(session, filename, &exist));
+ if (exist) {
+ /*
+ * There is no point tracking this operation: there is no going
+ * back from here.
+ */
+ WT_TRET(__wt_remove(session, filename));
+ }
+
+ return (ret);
+}
+
+/*
+ * __drop_colgroup --
+ * WT_SESSION::drop for a colgroup.
+ */
+static int
+__drop_colgroup(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ /* If we can get the colgroup, detach it from the table. */
+ if ((ret = __wt_schema_get_colgroup(
+ session, uri, &table, &colgroup)) == 0) {
+ table->cg_complete = 0;
+ WT_TRET(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_index --
+ * WT_SESSION::drop for an index.
+ */
+static int
+__drop_index(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_INDEX *idx;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ /* If we can get the index, detach it from the table. */
+ if ((ret = __wt_schema_get_index(session, uri, &table, &idx)) == 0) {
+ table->idx_complete = 0;
+ WT_TRET(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_table --
+ * WT_SESSION::drop for a table.
+ */
+static int
+__drop_table(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *name;
+ u_int i;
+
+ name = uri;
+ (void)WT_PREFIX_SKIP(name, "table:");
+
+ table = NULL;
+ WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table));
+
+ /* Drop the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if ((colgroup = table->cgroups[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, colgroup->name));
+ WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ /* Drop the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ if ((idx = table->indices[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, idx->name));
+ WT_ERR(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_ERR(__wt_metadata_remove(session, uri));
+
+err: if (force && ret == WT_NOTFOUND)
+ ret = 0;
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_drop --
+ * Process a WT_SESSION::drop operation for all supported types.
+ */
+int
+__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int force;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = (cval.val != 0);
+
+ WT_RET(__wt_meta_track_on(session));
+
+ /* Be careful to ignore any btree handle in our caller. */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __drop_colgroup(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __drop_file(session, uri, force, cfg);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __drop_index(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_drop(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __drop_table(session, uri, force, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->drop == NULL ?
+ __wt_object_unsupported(session, uri) :
+ dsrc->drop(
+ dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /*
+ * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the
+ * assumption WT_NOTFOUND means there was no metadata entry. The
+ * underlying drop functions should handle this case (we passed them
+ * the "force" value), but better safe than sorry.
+ */
+ if (ret == WT_NOTFOUND)
+ ret = force ? 0 : ENOENT;
+
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c
new file mode 100644
index 00000000000..05421283bf6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_list.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __schema_add_table --
+ * Add a table handle to the session's cache.
+ */
+static int
+__schema_add_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *table;
+
+ WT_RET(__wt_schema_open_table(session, name, namelen, &table));
+
+ /* Copy the schema generation into the new table. */
+ table->schema_gen = S2C(session)->schema_gen;
+
+ TAILQ_INSERT_HEAD(&session->tables, table, q);
+ *tablep = table;
+
+ return (0);
+}
+
+/*
+ * __schema_find_table --
+ * Find the table handle for the named table in the session cache.
+ */
+static int
+__schema_find_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *table;
+ const char *tablename;
+
+restart:
+ TAILQ_FOREACH(table, &session->tables, q) {
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ if (WT_STRING_MATCH(tablename, name, namelen)) {
+ /*
+ * Ignore stale tables.
+ *
+ * XXX: should be managed the same as btree handles,
+ * with a local cache in each session and a shared list
+ * in the connection. There is still a race here
+ * between checking the generation and opening the
+ * first column group.
+ */
+ if (table->schema_gen != S2C(session)->schema_gen) {
+ if (table->refcnt == 0) {
+ __wt_schema_remove_table(
+ session, table);
+ goto restart;
+ }
+ continue;
+ }
+ *tablep = table;
+ return (0);
+ }
+ }
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_schema_get_table --
+ * Get the table handle for the named table.
+ */
+int
+__wt_schema_get_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep)
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ *tablep = table = NULL;
+ ret = __schema_find_table(session, name, namelen, &table);
+
+ if (ret == WT_NOTFOUND)
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __schema_add_table(session, name, namelen, &table));
+
+ if (ret == 0) {
+ if (!ok_incomplete && !table->cg_complete)
+ WT_RET_MSG(session, EINVAL, "'%s' cannot be used "
+ "until all column groups are created",
+ table->name);
+
+ ++table->refcnt;
+ *tablep = table;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_schema_release_table --
+ * Release a table handle.
+ */
+void
+__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt > 0);
+ --table->refcnt;
+}
+
+/*
+ * __wt_schema_destroy_colgroup --
+ * Free a column group handle.
+ */
+void
+__wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup)
+{
+ __wt_free(session, colgroup->name);
+ __wt_free(session, colgroup->source);
+ __wt_free(session, colgroup->config);
+ __wt_free(session, colgroup);
+}
+
+/*
+ * __wt_schema_destroy_index --
+ * Free an index handle.
+ */
+void
+__wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx)
+{
+ __wt_free(session, idx->name);
+ __wt_free(session, idx->source);
+ __wt_free(session, idx->config);
+ __wt_free(session, idx->key_format);
+ __wt_free(session, idx->key_plan);
+ __wt_free(session, idx->value_plan);
+ __wt_free(session, idx->idxkey_format);
+ __wt_free(session, idx);
+}
+
+/*
+ * __wt_schema_destroy_table --
+ * Free a table handle.
+ */
+void
+__wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_COLGROUP *colgroup;
+ WT_INDEX *idx;
+ u_int i;
+
+ __wt_free(session, table->name);
+ __wt_free(session, table->config);
+ __wt_free(session, table->plan);
+ __wt_free(session, table->key_format);
+ __wt_free(session, table->value_format);
+ if (table->cgroups != NULL) {
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if ((colgroup = table->cgroups[i]) == NULL)
+ continue;
+ __wt_schema_destroy_colgroup(session, colgroup);
+ }
+ __wt_free(session, table->cgroups);
+ }
+ if (table->indices != NULL) {
+ for (i = 0; i < table->nindices; i++) {
+ if ((idx = table->indices[i]) == NULL)
+ continue;
+ __wt_schema_destroy_index(session, idx);
+ }
+ __wt_free(session, table->indices);
+ }
+ __wt_free(session, table);
+}
+
+/*
+ * __wt_schema_remove_table --
+ * Remove the table handle from the session, closing if necessary.
+ */
+void
+__wt_schema_remove_table(
+ WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt <= 1);
+
+ TAILQ_REMOVE(&session->tables, table, q);
+ __wt_schema_destroy_table(session, table);
+}
+
+/*
+ * __wt_schema_close_tables --
+ * Close all of the tables in a session.
+ */
+void
+__wt_schema_close_tables(WT_SESSION_IMPL *session)
+{
+ WT_TABLE *table;
+
+ while ((table = TAILQ_FIRST(&session->tables)) != NULL)
+ __wt_schema_remove_table(session, table);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
new file mode 100644
index 00000000000..0332569a8e3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_colgroup_name --
+ * Get the URI for a column group. This is used for metadata lookups.
+ * The only complexity here is that simple tables (with a single column
+ * group) use a simpler naming scheme.
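+ * For example (illustrative): "colgroup:main" for a simple table,
+ * versus "colgroup:main:c1" for a named column group.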
+ */
+int
+__wt_schema_colgroup_name(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf)
+{
+ const char *tablename;
+
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+
+ return ((table->ncolgroups == 0) ?
+ __wt_buf_fmt(session, buf, "colgroup:%s", tablename) :
+ __wt_buf_fmt(session, buf, "colgroup:%s:%.*s",
+ tablename, (int)len, cgname));
+}
+
+/*
+ * __wt_schema_open_colgroups --
+ * Open the column groups for a table.
+ */
+int
+__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_RET;
+ WT_DECL_ITEM(buf);
+ const char *cgconfig;
+ u_int i;
+
+ if (table->cg_complete)
+ return (0);
+
+ colgroup = NULL;
+ cgconfig = NULL;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+
+ /* Open each column group. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if (table->ncolgroups > 0)
+ WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
+ else
+ WT_CLEAR(ckey);
+
+ /*
+ * Always open from scratch: we may have failed part of the way
+ * through opening a table, or column groups may have changed.
+ */
+ if (table->cgroups[i] != NULL) {
+ __wt_schema_destroy_colgroup(
+ session, table->cgroups[i]);
+ table->cgroups[i] = NULL;
+ }
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_schema_colgroup_name(session, table,
+ ckey.str, ckey.len, buf));
+ if ((ret = __wt_metadata_search(
+ session, buf->data, &cgconfig)) != 0) {
+ /* It is okay if the table is incomplete. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_calloc_def(session, 1, &colgroup));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->name));
+ colgroup->config = cgconfig;
+ cgconfig = NULL;
+ WT_ERR(__wt_config_getones(session,
+ colgroup->config, "columns", &colgroup->colconf));
+ WT_ERR(__wt_config_getones(
+ session, colgroup->config, "source", &cval));
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->source));
+ table->cgroups[i] = colgroup;
+ colgroup = NULL;
+ }
+
+ if (!table->is_simple) {
+ WT_ERR(__wt_table_check(session, table));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &table->plan));
+ }
+
+ table->cg_complete = 1;
+
+err: __wt_scr_free(&buf);
+ if (colgroup != NULL)
+ __wt_schema_destroy_colgroup(session, colgroup);
+ if (cgconfig != NULL)
+ __wt_free(session, cgconfig);
+ return (ret);
+}
+
+/*
+ * __open_index --
+ * Open an index.
+ */
+static int
+__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
+{
+ WT_CONFIG colconf;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(plan);
+ WT_DECL_RET;
+ u_int cursor_key_cols, i;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Get the data source from the index config. */
+ WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->source));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->key_format));
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the file's key.
+ *
+ * The file's key_format is stored persistently, we need to calculate
+ * the index cursor key format (which will usually omit some of those
+ * keys).
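+ *
+ * For example (illustrative), a persistent key_format of "SS" with
+ * one declared index column is truncated below to an idxkey_format
+ * of "S".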
+ */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(
+ session, idx->config, "columns", &idx->colconf));
+
+ /* Start with the declared index columns. */
+ WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
+ cursor_key_cols = 0;
+ while ((ret = __wt_config_next(&colconf, &ckey, &cval)) == 0) {
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ ++cursor_key_cols;
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Now add any primary key columns from the table that are not
+ * already part of the index key.
+ */
+ WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(
+ session, &idx->colconf, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &plan));
+ WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan));
+
+ /* Set up the cursor key format (the visible columns). */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_truncate(session,
+ idx->key_format, cursor_key_cols, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &idx->idxkey_format));
+
+ /* By default, index cursor values are the table value columns. */
+ /* TODO Optimize to use index columns in preference to table lookups. */
+ WT_ERR(__wt_buf_init(session, plan, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan));
+
+err: __wt_scr_free(&buf);
+ __wt_scr_free(&plan);
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_index --
+ * Open one or more indices for a table.
+ */
+int
+__wt_schema_open_index(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ u_int i;
+ int cmp, match;
+ const char *idxconf, *name, *tablename, *uri;
+
+ /* Check if we've already done the work. */
+ if (idxname == NULL && table->idx_complete)
+ return (0);
+
+ cursor = NULL;
+ idx = NULL;
+
+ /* Build a search key. */
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));
+
+ /* Find matching indices. */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tmp->data);
+ if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ ret = cursor->next(cursor);
+ for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ name = uri;
+ if (!WT_PREFIX_SKIP(name, tmp->data))
+ break;
+
+ /* Is this the index we are looking for? */
+ match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);
+
+ /*
+ * Ensure there is space, including if we have to make room for
+ * a new entry in the middle of the list.
+ */
+ WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
+ WT_MAX(i, table->nindices) + 1, &table->indices));
+
+ /* Keep the in-memory list in sync with the metadata. */
+ cmp = 0;
+ while (table->indices[i] != NULL &&
+ (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
+ /* Index no longer exists, remove it. */
+ __wt_schema_destroy_index(session, table->indices[i]);
+ memmove(&table->indices[i], &table->indices[i + 1],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[--table->nindices] = NULL;
+ }
+ if (cmp < 0) {
+ /* Make room for a new index. */
+ memmove(&table->indices[i + 1], &table->indices[i],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[i] = NULL;
+ ++table->nindices;
+ }
+
+ if (!match)
+ continue;
+
+ if (table->indices[i] == NULL) {
+ WT_ERR(cursor->get_value(cursor, &idxconf));
+ WT_ERR(__wt_calloc_def(session, 1, &idx));
+ WT_ERR(__wt_strdup(session, uri, &idx->name));
+ WT_ERR(__wt_strdup(session, idxconf, &idx->config));
+ WT_ERR(__open_index(session, table, idx));
+
+ table->indices[i] = idx;
+ idx = NULL;
+ }
+
+ /* If we were looking for a single index, we're done. */
+ if (indexp != NULL)
+ *indexp = table->indices[i];
+ if (idxname != NULL)
+ break;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If we did a full pass, we won't need to do it again. */
+ if (idxname == NULL) {
+ table->nindices = i;
+ table->idx_complete = 1;
+ }
+
+err: __wt_scr_free(&tmp);
+ if (idx != NULL)
+ __wt_schema_destroy_index(session, idx);
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_indices --
+ * Open the indices for a table.
+ */
+int
+__wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ return (__wt_schema_open_index(session, table, NULL, 0, NULL));
+}
+
+/*
+ * __wt_schema_open_table --
+ * Open a named table.
+ */
+int
+__wt_schema_open_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_TABLE *table;
+ const char *tconfig;
+ char *tablename;
+
+ cursor = NULL;
+ table = NULL;
+ tablename = NULL;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));
+
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tablename);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &tconfig));
+
+ WT_ERR(__wt_calloc_def(session, 1, &table));
+ table->name = tablename;
+ tablename = NULL;
+
+ WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval));
+
+ WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format));
+ WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format));
+ WT_ERR(__wt_strdup(session, tconfig, &table->config));
+
+ /* Point to some items in the copy to save re-parsing. */
+ WT_ERR(__wt_config_getones(session, table->config,
+ "columns", &table->colconf));
+
+ /*
+ * Count the number of columns: tables are "simple" if the columns
+ * are not named.
+ */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf));
+ table->is_simple = 1;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ table->is_simple = 0;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ /* Check that the columns match the key and value formats. */
+ if (!table->is_simple)
+ WT_ERR(__wt_schema_colcheck(session,
+ table->key_format, table->value_format, &table->colconf,
+ &table->nkey_columns, NULL));
+
+ WT_ERR(__wt_config_getones(session, table->config,
+ "colgroups", &table->cgconf));
+
+ /* Count the number of column groups. */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+ table->ncolgroups = 0;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ ++table->ncolgroups;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups));
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+ *tablep = table;
+
+ if (0) {
+err: if (table != NULL)
+ __wt_schema_destroy_table(session, table);
+ }
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+
+ __wt_free(session, tablename);
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_get_colgroup --
+ * Find a column group by URI.
+ */
+int
+__wt_schema_get_colgroup(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp)
+{
+ WT_COLGROUP *colgroup;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int i;
+
+ *colgroupp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (__wt_bad_object_type(session, uri));
+
+ if ((tend = strchr(tablename, ':')) == NULL)
+ tend = tablename + strlen(tablename);
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ colgroup = table->cgroups[i];
+ if (strcmp(colgroup->name, uri) == 0) {
+ *colgroupp = colgroup;
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
+ return (0);
+ }
+ }
+
+ __wt_schema_release_table(session, table);
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
+
+/*
+ * __wt_schema_get_index --
+ * Find an index by URI.
+ */
+int
+__wt_schema_get_index(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_INDEX **indexp)
+{
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int i;
+
+ *indexp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (tend = strchr(tablename, ':')) == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ /* Try to find the index in the table. */
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ if (strcmp(idx->name, uri) == 0) {
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
+ *indexp = idx;
+ return (0);
+ }
+ }
+
+ /* Otherwise, open it. */
+ WT_ERR(__wt_schema_open_index(
+ session, table, tend + 1, strlen(tend + 1), indexp));
+
+err: __wt_schema_release_table(session, table);
+ WT_RET(ret);
+
+ if (*indexp != NULL)
+ return (0);
+
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
new file mode 100644
index 00000000000..5abe0dd67d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __find_next_col --
+ * Find the next column to use for a plan.
+ */
+static int
+__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
+ WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_RET;
+ u_int cg, col, foundcg, foundcol, matchcg, matchcol;
+ int getnext;
+
+ foundcg = foundcol = UINT_MAX;
+ matchcg = *cgnump;
+ matchcol = (*coltype == WT_PROJ_KEY) ?
+ *colnump : *colnump + table->nkey_columns;
+
+ getnext = 1;
+ for (colgroup = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
+ colgroup = table->cgroups[cg];
+
+ /*
+ * If there is only one column group, we just scan through all
+ * of the columns. For tables with multiple column groups, we
+ * look at the key columns once, then go through the value
+ * columns for each group.
+ */
+ if (cg == 0) {
+ cval = table->colconf;
+ col = 0;
+ } else {
+cgcols: cval = colgroup->colconf;
+ col = table->nkey_columns;
+ }
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (; (ret = __wt_config_next(&conf, &k, &v)) == 0; col++) {
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (getnext) {
+ foundcg = cg;
+ foundcol = col;
+ }
+ getnext = (cg == matchcg && col == matchcol);
+ }
+ if (cg == 0 && table->ncolgroups > 0 &&
+ col == table->nkey_columns - 1)
+ goto cgcols;
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ colgroup = NULL;
+ }
+
+ if (foundcg == UINT_MAX)
+ return (WT_NOTFOUND);
+
+ *cgnump = foundcg;
+ if (foundcol < table->nkey_columns) {
+ *coltype = WT_PROJ_KEY;
+ *colnump = foundcol;
+ } else {
+ *coltype = WT_PROJ_VALUE;
+ *colnump = foundcol - table->nkey_columns;
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_colcheck --
+ * Check that a list of columns matches a (key,value) format pair.
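+ * For example, key_format "S" plus value_format "Si" matches either
+ * an empty columns list or exactly three named columns.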
+ */
+int
+__wt_schema_colcheck(WT_SESSION_IMPL *session,
+ const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf,
+ u_int *kcolsp, u_int *vcolsp)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ u_int kcols, ncols, vcols;
+
+ WT_RET(__pack_init(session, &pack, key_format));
+ for (kcols = 0; (ret = __pack_next(&pack, &pv)) == 0; kcols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ WT_RET(__pack_init(session, &pack, value_format));
+ for (vcols = 0; (ret = __pack_next(&pack, &pv)) == 0; vcols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ /* Walk through the named columns. */
+ WT_RET(__wt_config_subinit(session, &conf, colconf));
+ for (ncols = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; ncols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ if (ncols != 0 && ncols != kcols + vcols)
+ WT_RET_MSG(session, EINVAL, "Number of columns in '%.*s' "
+ "does not match key format '%s' plus value format '%s'",
+ (int)colconf->len, colconf->str, key_format, value_format);
+
+ if (kcolsp != NULL)
+ *kcolsp = kcols;
+ if (vcolsp != NULL)
+ *vcolsp = vcols;
+
+ return (0);
+}
+
+/*
+ * __wt_table_check --
+ * Make sure all columns appear in a column group.
+ */
+int
+__wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ u_int cg, col, i;
+ char coltype;
+
+ if (table->is_simple)
+ return (0);
+
+ /* Walk through the columns. */
+ WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
+
+ /* Skip over the key columns. */
+ for (i = 0; i < table->nkey_columns; i++)
+ WT_RET(__wt_config_next(&conf, &k, &v));
+ cg = col = 0;
+ coltype = 0;
+ while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
+ if (__find_next_col(
+ session, table, &k, &cg, &col, &coltype) != 0)
+ WT_RET_MSG(session, EINVAL,
+ "Column '%.*s' in '%s' does not appear in a "
+ "column group",
+ (int)k.len, k.str, table->name);
+ /*
+ * Column groups can't store key columns in their value:
+ * __wt_struct_reformat should have already detected this case.
+ */
+ WT_ASSERT(session, coltype == WT_PROJ_VALUE);
+	}
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_plan --
+ * Given a table cursor containing a complete table, build the "projection
+ * plan" to distribute the columns to dependent stores. A string
+ * representing the plan will be appended to the plan buffer.
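+ *
+ *	An illustrative example (assuming the usual single-character
+ *	WT_PROJ_* encodings 'k', 'v', 'n', 'r' and 's'): a plan such as
+ *	"0kn0vnn" means go to cursor 0's key and copy one column, then go
+ *	to cursor 0's value and copy the next two columns.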
+ */
+int
+__wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, int value_only, WT_ITEM *plan)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ u_int cg, col, current_cg, current_col, i, start_cg, start_col;
+ int have_it;
+ char coltype, current_coltype;
+
+ start_cg = start_col = UINT_MAX; /* -Wuninitialized */
+
+ /* Work through the value columns by skipping over the key columns. */
+ WT_RET(__wt_config_initn(session, &conf, columns, len));
+ if (value_only)
+ for (i = 0; i < table->nkey_columns; i++)
+ WT_RET(__wt_config_next(&conf, &k, &v));
+
+ current_cg = cg = 0;
+ current_col = col = INT_MAX;
+ current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */
+ for (i = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; i++) {
+ have_it = 0;
+
+ while (__find_next_col(session, table,
+ &k, &cg, &col, &coltype) == 0 &&
+ (!have_it || cg != start_cg || col != start_col)) {
+ /*
+ * First we move to the column. If that is in a
+ * different column group to the last column we
+ * accessed, or before the last column in the same
+ * column group, or moving from the key to the value,
+ * we need to switch column groups or rewind.
+ */
+ if (current_cg != cg || current_col > col ||
+ current_coltype != coltype) {
+ WT_ASSERT(session, !value_only ||
+ coltype == WT_PROJ_VALUE);
+ WT_RET(__wt_buf_catfmt(
+ session, plan, "%d%c", cg, coltype));
+
+ /*
+ * Set the current column group and column
+ * within the table.
+ */
+ current_cg = cg;
+ current_col = 0;
+ current_coltype = coltype;
+ }
+ /* Now move to the column we want. */
+ if (current_col < col) {
+ if (col - current_col > 1)
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%d", col - current_col));
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_SKIP));
+ }
+ /*
+ * Now copy the value in / out. In the common case,
+ * where each value is used in one column, we do a
+ * "next" operation. If the value is used again, we do
+ * a "reuse" operation to avoid making another copy.
+ */
+ if (!have_it) {
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_NEXT));
+
+ start_cg = cg;
+ start_col = col;
+ have_it = 1;
+ } else
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_REUSE));
+ current_col = col + 1;
+ }
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ /* Special case empty plans. */
+ if (i == 0 && plan->size == 0)
+ WT_RET(__wt_buf_set(session, plan, "", 1));
+
+ return (0);
+}
+
+/*
+ * __find_column_format --
+ * Find the format of the named column.
+ */
+static int
+__find_column_format(WT_SESSION_IMPL *session,
+ WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ WT_PACK pack;
+ int inkey;
+
+ WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
+ WT_RET(__pack_init(session, &pack, table->key_format));
+ inkey = 1;
+
+ while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
+ if ((ret = __pack_next(&pack, pv)) == WT_NOTFOUND && inkey) {
+ ret = __pack_init(session, &pack, table->value_format);
+ if (ret == 0)
+ ret = __pack_next(&pack, pv);
+ inkey = 0;
+ }
+ if (ret != 0)
+ return (ret);
+
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (value_only && inkey)
+ return (EINVAL);
+ return (0);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_reformat --
+ * Given a table and a list of columns (which could be values in a column
+ * group or index keys), calculate the resulting new format string.
+ * The result will be appended to the format buffer.
+ */
+int
+__wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, const char *extra_cols, int value_only,
+ WT_ITEM *format)
+{
+ WT_CONFIG config;
+ WT_CONFIG_ITEM k, next_k, next_v;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ int have_next;
+
+ WT_RET(__wt_config_initn(session, &config, columns, len));
+ /*
+ * If an empty column list is specified, this will fail with
+	 * WT_NOTFOUND; that's okay.
+ */
+ WT_RET_NOTFOUND_OK(ret = __wt_config_next(&config, &next_k, &next_v));
+ if (ret == WT_NOTFOUND) {
+ if (format->size == 0)
+ WT_RET(__wt_buf_set(session, format, "", 1));
+ return (0);
+ }
+ do {
+ k = next_k;
+ ret = __wt_config_next(&config, &next_k, &next_v);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ have_next = (ret == 0);
+
+ if (!have_next && extra_cols != NULL) {
+ WT_RET(__wt_config_init(session, &config, extra_cols));
+ WT_RET(__wt_config_next(&config, &next_k, &next_v));
+ have_next = 1;
+ extra_cols = NULL;
+ }
+
+ if ((ret = __find_column_format(session,
+ table, &k, value_only, &pv)) != 0) {
+ if (value_only && ret == EINVAL)
+ WT_RET_MSG(session, EINVAL,
+ "A column group cannot store key column "
+ "'%.*s' in its value", (int)k.len, k.str);
+ WT_RET_MSG(session, EINVAL,
+ "Column '%.*s' not found", (int)k.len, k.str);
+ }
+
+ /*
+ * Check whether we're moving an unsized WT_ITEM from the end
+ * to the middle, or vice-versa. This determines whether the
+ * size needs to be prepended. This is the only case where the
+ * destination size can be larger than the source size.
+ */
+ if (pv.type == 'u' && !pv.havesize && have_next)
+ pv.type = 'U';
+ else if (pv.type == 'U' && !have_next)
+ pv.type = 'u';
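+		/*
+		 * An illustrative case (not from the original source): with
+		 * value_format "iu", an index that reorders the columns to
+		 * put the raw column first reformats to "Ui" -- the raw
+		 * column is no longer last, so its size is prepended.
+		 */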
+
+ if (pv.havesize)
+ WT_RET(__wt_buf_catfmt(
+ session, format, "%d%c", (int)pv.size, pv.type));
+ else
+ WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
+ } while (have_next);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_truncate --
+ * Return a packing string for the first N columns in a value.
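+ *
+ *	For example (illustrative): truncating the format "5sHq" to its
+ *	first two columns appends "5sH" to the output buffer.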
+ */
+int
+__wt_struct_truncate(WT_SESSION_IMPL *session,
+ const char *input_fmt, u_int ncols, WT_ITEM *format)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+
+ WT_RET(__pack_init(session, &pack, input_fmt));
+ while (ncols-- > 0) {
+ WT_RET(__pack_next(&pack, &pv));
+ if (pv.havesize)
+ WT_RET(__wt_buf_catfmt(
+ session, format, "%d%c", (int)pv.size, pv.type));
+ else
+ WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c
new file mode 100644
index 00000000000..9aff4c8dded
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_project.c
@@ -0,0 +1,474 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_project_in --
+ *	Given a list of cursors and a projection, read columns from the
+ * application into the dependent cursors.
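+ *
+ *	A hypothetical example: given the plan "0kn0vnn" and the va_list
+ *	arguments (10, "sales", "bob"), the key 10 is packed into cursor
+ *	0's key and the two remaining columns into cursor 0's value.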
+ */
+int
+__wt_schema_project_in(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+ WT_CURSOR *c;
+ WT_DECL_ITEM(buf);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK(pack);
+ WT_PACK_VALUE old_pv;
+ size_t len, offset, old_len;
+ u_long arg;
+ char *proj;
+ uint8_t *p, *end;
+ const uint8_t *next;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ /* Reset any of the buffers we will be setting. */
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_KEY) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->key, 0));
+ } else if (*proj == WT_PROJ_VALUE) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->value, 0));
+ }
+ }
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /* We have to get a key or value before any operations. */
+ WT_ASSERT(session, buf != NULL);
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_SKIP:
+ WT_RET(__pack_next(&pack, &pv));
+ /*
+ * A nasty case: if we are inserting
+ * out-of-order, we may reach the end of the
+ * data. That's okay: we want to append in
+ * that case, and we're positioned to do that.
+ */
+ if (p == end) {
+ /* Set up an empty value. */
+ WT_CLEAR(pv.u);
+ if (pv.type == 'S' || pv.type == 's')
+ pv.u.s = "";
+
+ len = __pack_size(session, &pv);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__pack_write(
+ session, &pv, &p, len));
+ buf->size += len;
+ end = (uint8_t *)buf->mem + buf->size;
+ } else if (*proj == WT_PROJ_SKIP)
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&p,
+ (size_t)(end - p)));
+ break;
+
+ case WT_PROJ_NEXT:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_PACK_GET(session, pv, ap);
+ /* FALLTHROUGH */
+
+ case WT_PROJ_REUSE:
+ /* Read the item we're about to overwrite. */
+ next = p;
+ if (p < end) {
+ old_pv = pv;
+ WT_RET(__unpack_read(session, &old_pv,
+ &next, (size_t)(end - p)));
+ }
+ old_len = (size_t)(next - p);
+
+ len = __pack_size(session, &pv);
+ offset = WT_PTRDIFF(p, buf->mem);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->mem + offset;
+ end = (uint8_t *)buf->mem + buf->size + len;
+ /* Make room if we're inserting out-of-order. */
+ if (offset + old_len < buf->size)
+ memmove(p + len, p + old_len,
+ buf->size - (offset + old_len));
+ WT_RET(__pack_write(session, &pv, &p, len));
+ buf->size += len;
+ break;
+
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unexpected projection plan: %c",
+ (int)*proj);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_out --
+ *	Given a list of cursors and a projection, read columns from the
+ * dependent cursors and return them to the application.
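+ *
+ *	The mirror of __wt_schema_project_in: for example (illustrative),
+ *	the plan "0vnn" unpacks two columns from cursor 0's value into
+ *	the application's supplied pointers.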
+ */
+int
+__wt_schema_project_out(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+ WT_CURSOR *c;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ u_long arg;
+ char *proj;
+ uint8_t *p, *end;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ p = (uint8_t *)c->key.data;
+ end = p + c->key.size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ p = (uint8_t *)c->value.data;
+ end = p + c->value.size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ (const uint8_t **)&p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+ WT_UNPACK_PUT(session, pv, ap);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_slice --
+ *	Given a list of cursors and a projection, read columns from
+ *	a raw buffer.
+ */
+int
+__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
+ const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_DECL_ITEM(buf);
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ uint8_t *end, *p;
+ const uint8_t *next, *vp, *vend;
+ size_t len, offset, old_len;
+ int skip;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__pack_init(session, &vpack, vformat));
+ vp = value->data;
+ vend = vp + value->size;
+
+ /* Reset any of the buffers we will be setting. */
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_KEY) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->key, 0));
+ } else if (*proj == WT_PROJ_VALUE && !key_only) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->value, 0));
+ }
+ }
+
+ skip = key_only;
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ skip = 0;
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ if ((skip = key_only) != 0)
+ continue;
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /* We have to get a key or value before any operations. */
+ WT_ASSERT(session, skip || buf != NULL);
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_SKIP:
+ if (skip)
+ break;
+ WT_RET(__pack_next(&pack, &pv));
+
+ /*
+ * A nasty case: if we are inserting
+ * out-of-order, append a zero value to keep
+ * the buffer in the correct format.
+ */
+ if (p == end) {
+ /* Set up an empty value. */
+ WT_CLEAR(pv.u);
+ if (pv.type == 'S' || pv.type == 's')
+ pv.u.s = "";
+
+ len = __pack_size(session, &pv);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->data + buf->size;
+ WT_RET(__pack_write(
+ session, &pv, &p, len));
+ end = p;
+ buf->size += len;
+ } else
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&p,
+ (size_t)(end - p)));
+ break;
+
+ case WT_PROJ_NEXT:
+ WT_RET(__pack_next(&vpack, &vpv));
+ WT_RET(__unpack_read(session, &vpv,
+ &vp, (size_t)(vend - vp)));
+ /* FALLTHROUGH */
+
+ case WT_PROJ_REUSE:
+ if (skip)
+ break;
+
+ /*
+ * Read the item we're about to overwrite.
+ *
+ * There is subtlety here: the value format
+ * may not exactly match the cursor's format.
+ * In particular, we need lengths with raw
+ * columns in the middle of a packed struct,
+ * but not if they are at the end of a struct.
+ */
+ WT_RET(__pack_next(&pack, &pv));
+
+ next = p;
+ if (p < end)
+ WT_RET(__unpack_read(session, &pv,
+ &next, (size_t)(end - p)));
+ old_len = (size_t)(next - p);
+
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ pv.u = vpv.u;
+
+ len = __pack_size(session, &pv);
+ offset = WT_PTRDIFF(p, buf->data);
+ /*
+ * Avoid growing the buffer if the value fits.
+ * This is not just a performance issue: it
+ * covers the case of record number keys, which
+ * have to be written to cursor->recno.
+ */
+ if (len > old_len)
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len - old_len));
+ p = (uint8_t *)buf->data + offset;
+ /* Make room if we're inserting out-of-order. */
+ if (offset + old_len < buf->size)
+ memmove(p + len, p + old_len,
+ buf->size - (offset + old_len));
+ WT_RET(__pack_write(session, &pv, &p, len));
+ buf->size += len - old_len;
+ end = (uint8_t *)buf->data + buf->size;
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unexpected projection plan: %c",
+ (int)*proj);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_merge --
+ *	Given a list of cursors and a projection, build a buffer containing the
+ * column values read from the cursors.
+ */
+int
+__wt_schema_project_merge(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_ITEM *buf;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ const uint8_t *p, *end;
+ uint8_t *vp;
+ size_t len;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__wt_buf_init(session, value, 0));
+ WT_RET(__pack_init(session, &vpack, vformat));
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ &p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+
+ WT_RET(__pack_next(&vpack, &vpv));
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ vpv.u = pv.u;
+ len = __pack_size(session, &vpv);
+ WT_RET(__wt_buf_grow(session,
+ value, value->size + len));
+ vp = (uint8_t *)value->mem + value->size;
+ WT_RET(__pack_write(session, &vpv, &vp, len));
+ value->size += len;
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
new file mode 100644
index 00000000000..8605ea41c80
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rename_file --
+ * WT_SESSION::rename for a file.
+ */
+static int
+__rename_file(
+ WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+ WT_DECL_RET;
+ int exist;
+ const char *filename, *newfile, *newvalue, *oldvalue;
+
+ newvalue = oldvalue = NULL;
+
+ filename = uri;
+ newfile = newuri;
+ if (!WT_PREFIX_SKIP(filename, "file:") ||
+ !WT_PREFIX_SKIP(newfile, "file:"))
+ return (EINVAL);
+
+ /* Close any btree handles in the file. */
+ WT_ERR(__wt_conn_dhandle_close_all(session, uri, 0));
+
+ /*
+ * First, check if the file being renamed exists in the system. Doing
+ * this check first matches the table rename behavior because we return
+ * WT_NOTFOUND when the renamed file doesn't exist (subsequently mapped
+ * to ENOENT by the session layer).
+ */
+ WT_ERR(__wt_metadata_search(session, uri, &oldvalue));
+
+ /*
+ * Check to see if the proposed name is already in use, in either the
+ * metadata or the filesystem.
+ */
+ switch (ret = __wt_metadata_search(session, newuri, &newvalue)) {
+ case 0:
+ WT_ERR_MSG(session, EEXIST, "%s", newuri);
+ /* NOTREACHED */
+ case WT_NOTFOUND:
+ break;
+ default:
+ WT_ERR(ret);
+ }
+ WT_ERR(__wt_exist(session, newfile, &exist));
+ if (exist)
+ WT_ERR_MSG(session, EEXIST, "%s", newfile);
+
+ /* Replace the old file entries with new file entries. */
+ WT_ERR(__wt_metadata_remove(session, uri));
+ WT_ERR(__wt_metadata_insert(session, newuri, oldvalue));
+
+ /* Rename the underlying file. */
+ WT_ERR(__wt_rename(session, filename, newfile));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_fileop(session, uri, newuri));
+
+err: __wt_free(session, newvalue);
+ __wt_free(session, oldvalue);
+ return (ret);
+}
+
+/*
+ * __rename_tree --
+ * Rename an index or colgroup reference.
+ */
+static int
+__rename_tree(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *newuri, const char *name, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(nn);
+ WT_DECL_ITEM(ns);
+ WT_DECL_ITEM(nv);
+ WT_DECL_ITEM(os);
+ WT_DECL_RET;
+ const char *newname, *olduri, *suffix, *value;
+ int is_colgroup;
+
+ olduri = table->name;
+ value = NULL;
+
+ newname = newuri;
+ (void)WT_PREFIX_SKIP(newname, "table:");
+
+ /*
+ * Create the new data source URI and update the schema value.
+ *
+ * 'name' has the format (colgroup|index):<tablename>[:<suffix>];
+ * we need the suffix.
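+	 *
+	 * For example (illustrative): when renaming "table:old" to
+	 * "table:new", "index:old:bucket" becomes "index:new:bucket".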
+ */
+ is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
+ if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
+ WT_ERR_MSG(session, EINVAL,
+ "expected a 'colgroup:' or 'index:' source: '%s'", name);
+
+ suffix = strchr(name, ':');
+ /* An existing table should have a well formed name. */
+ WT_ASSERT(session, suffix != NULL);
+ suffix = strchr(suffix + 1, ':');
+
+ WT_ERR(__wt_scr_alloc(session, 0, &nn));
+ WT_ERR(__wt_buf_fmt(session, nn, "%s%s%s",
+ is_colgroup ? "colgroup:" : "index:",
+ newname,
+ (suffix == NULL) ? "" : suffix));
+
+ /* Skip the colon, if any. */
+ if (suffix != NULL)
+ ++suffix;
+
+ /* Read the old schema value. */
+ WT_ERR(__wt_metadata_search(session, name, &value));
+
+ /*
+ * Calculate the new data source URI. Use the existing table structure
+ * and substitute the new name temporarily.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &ns));
+ table->name = newuri;
+ if (is_colgroup)
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, suffix, value, ns));
+ else
+ WT_ERR(__wt_schema_index_source(
+ session, table, suffix, value, ns));
+
+ if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "index or column group has no data source: %s", value);
+
+ /* Take a copy of the old data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &os));
+ WT_ERR(__wt_buf_fmt(session, os, "%.*s", (int)cval.len, cval.str));
+
+ /* Overwrite it with the new data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &nv));
+ WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s",
+ (int)WT_PTRDIFF(cval.str, value), value,
+ (const char *)ns->data,
+ cval.str + cval.len));
+
+ /*
+ * Remove the old metadata entry.
+ * Insert the new metadata entry.
+ */
+ WT_ERR(__wt_metadata_remove(session, name));
+ WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));
+
+ /* Rename the file. */
+ WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));
+
+err: __wt_scr_free(&nn);
+ __wt_scr_free(&ns);
+ __wt_scr_free(&nv);
+ __wt_scr_free(&os);
+ __wt_free(session, value);
+ table->name = olduri;
+ return (ret);
+}
+
+/*
+ * __metadata_rename --
+ * Rename an entry in the metadata table.
+ */
+static int
+__metadata_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+ WT_DECL_RET;
+ const char *value;
+
+ WT_RET(__wt_metadata_search(session, uri, &value));
+ WT_ERR(__wt_metadata_remove(session, uri));
+ WT_ERR(__wt_metadata_insert(session, newuri, value));
+
+err: __wt_free(session, value);
+ return (ret);
+}
+
+/*
+ * __rename_table --
+ * WT_SESSION::rename for a table.
+ */
+static int
+__rename_table(WT_SESSION_IMPL *session,
+ const char *uri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int i;
+ const char *oldname;
+
+ oldname = uri;
+ (void)WT_PREFIX_SKIP(oldname, "table:");
+
+ WT_RET(__wt_schema_get_table(
+ session, oldname, strlen(oldname), 0, &table));
+
+ /* Rename the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->cgroups[i]->name, cfg));
+
+ /* Rename the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->indices[i]->name, cfg));
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Rename the table. */
+ WT_ERR(__metadata_rename(session, uri, newuri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_rename --
+ * WT_SESSION::rename.
+ */
+int
+__wt_schema_rename(WT_SESSION_IMPL *session,
+ const char *uri, const char *newuri, const char *cfg[])
+{
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *p, *t;
+
+ /* The target type must match the source type. */
+ for (p = uri, t = newuri; *p == *t && *p != ':'; ++p, ++t)
+ ;
+ if (*p != ':' || *t != ':')
+ WT_RET_MSG(session, EINVAL,
+ "rename target type must match URI: %s to %s", uri, newuri);
+
+ /*
+ * We track rename operations, if we fail in the middle, we want to
+ * back it all out.
+ */
+ WT_RET(__wt_meta_track_on(session));
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __rename_file(session, uri, newuri);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __rename_table(session, uri, newuri, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->rename == NULL ?
+ __wt_object_unsupported(session, uri) :
+ dsrc->rename(dsrc,
+ &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ /* If we didn't find a metadata entry, map that error to ENOENT. */
+ return (ret == WT_NOTFOUND ? ENOENT : ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
new file mode 100644
index 00000000000..cb8e7f6c418
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -0,0 +1,114 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_curstat_colgroup_init --
+ * Initialize the statistics for a column group.
+ */
+int
+__wt_curstat_colgroup_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ WT_RET(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", colgroup->source));
+ ret = __wt_curstat_init(session, buf->data, cfg, cst);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_curstat_index_init --
+ * Initialize the statistics for an index.
+ */
+int
+__wt_curstat_index_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+
+ WT_RET(__wt_schema_get_index(session, uri, NULL, &idx));
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", idx->source));
+ ret = __wt_curstat_init(session, buf->data, cfg, cst);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_curstat_table_init --
+ * Initialize the statistics for a table.
+ */
+int
+__wt_curstat_table_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_TABLE *table;
+ u_int i;
+ const char *name;
+
+ name = uri + strlen("table:");
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /*
+ * Process the column groups.
+ *
+ * Set the cursor to reference the data source statistics; we don't
+	 * initialize it; instead we copy (rather than aggregate) the first
+	 * column group's statistics, which has the same effect.
+ */
+ stats = &cst->u.dsrc_stats;
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->cgroups[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ /* Process the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->indices[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ __wt_curstat_dsrc_final(cst);
+
+err: __wt_schema_release_table(session, table);
+
+ __wt_scr_free(&buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
new file mode 100644
index 00000000000..1da3b103f10
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __truncate_file --
+ * WT_SESSION::truncate for a file.
+ */
+static int
+__truncate_file(WT_SESSION_IMPL *session, const char *name)
+{
+ const char *filename;
+ uint32_t allocsize;
+
+ filename = name;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+
+ /* Open and lock the file. */
+ WT_RET(__wt_session_get_btree(
+ session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+
+ /* Get the allocation size. */
+ allocsize = S2BT(session)->allocsize;
+
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Close any btree handles in the file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, name, 0));
+
+ /* Delete the root address and truncate the file. */
+ WT_RET(__wt_meta_checkpoint_clear(session, name));
+ WT_RET(__wt_block_manager_truncate(session, filename, allocsize));
+
+ return (0);
+}
+
+/*
+ * __truncate_table --
+ * WT_SESSION::truncate for a table.
+ */
+static int
+__truncate_table(WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int i;
+
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ /* Truncate the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++)
+ WT_ERR(__wt_schema_truncate(
+ session, table->cgroups[i]->source, cfg));
+
+ /* Truncate the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++)
+ WT_ERR(__wt_schema_truncate(
+ session, table->indices[i]->source, cfg));
+
+err: __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __truncate_dsrc --
+ * WT_SESSION::truncate for a data-source without a truncate operation.
+ */
+static int
+__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *cfg[2];
+
+ /* Open a cursor and traverse the object, removing every entry. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+ while ((ret = cursor->next(cursor)) == 0)
+ WT_ERR(cursor->remove(cursor));
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_truncate --
+ * WT_SESSION::truncate without a range.
+ */
+int
+__wt_schema_truncate(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *tablename;
+
+ tablename = uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ ret = __truncate_file(session, uri);
+ } else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_truncate(session, uri, cfg);
+ else if (WT_PREFIX_SKIP(tablename, "table:"))
+ ret = __truncate_table(session, tablename, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->truncate == NULL ?
+ __truncate_dsrc(session, uri) :
+ dsrc->truncate(
+ dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /* If we didn't find a metadata entry, map that error to ENOENT. */
+ return (ret == WT_NOTFOUND ? ENOENT : ret);
+}
+
+/*
+ * __wt_range_truncate --
+ * Truncate of a cursor range, default implementation.
+ */
+int
+__wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_DECL_RET;
+ int cmp;
+
+ if (start == NULL) {
+ do {
+ WT_RET(stop->remove(stop));
+ } while ((ret = stop->prev(stop)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ } else {
+ cmp = -1;
+ do {
+ if (stop != NULL)
+ WT_RET(start->compare(start, stop, &cmp));
+ WT_RET(start->remove(start));
+ } while (cmp < 0 && (ret = start->next(start)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_range_truncate --
+ * WT_SESSION::truncate with a range.
+ */
+int
+__wt_schema_range_truncate(
+ WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_CURSOR *cursor;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *uri;
+
+ cursor = (start != NULL) ? start : stop;
+ uri = cursor->internal_uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree,
+ ret = __wt_btcur_range_truncate(
+ (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __wt_table_range_truncate(
+ (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL &&
+ dsrc->range_truncate != NULL)
+ ret = dsrc->range_truncate(dsrc, &session->iface, start, stop);
+ else
+ ret = __wt_range_truncate(start, stop);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
new file mode 100644
index 00000000000..263f56f1c41
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_get_source --
+ *	Find a matching data source, if any; callers report an error on NULL.
+ */
+WT_DATA_SOURCE *
+__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_NAMED_DATA_SOURCE *ndsrc;
+
+ TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q)
+ if (WT_PREFIX_MATCH(name, ndsrc->prefix))
+ return (ndsrc->dsrc);
+ return (NULL);
+}
+
+/*
+ * __wt_str_name_check --
+ * Disallow any use of the WiredTiger name space.
+ */
+int
+__wt_str_name_check(WT_SESSION_IMPL *session, const char *str)
+{
+ const char *name, *sep;
+ int skipped;
+
+ /*
+ * Check if name is somewhere in the WiredTiger name space: it would be
+	 * "bad" if the application truncated the metadata file.  Skip any
+	 * leading URI prefix and check, then skip over a table name and
+	 * check again.
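+	 *
+	 * For example (illustrative): "table:WiredTigerX" and
+	 * "index:mytable:WiredTigerIdx" are both rejected.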
+ */
+ name = str;
+ for (skipped = 0; skipped < 2; skipped++) {
+ if ((sep = strchr(name, ':')) == NULL)
+ break;
+
+ name = sep + 1;
+ if (WT_PREFIX_MATCH(name, "WiredTiger"))
+ WT_RET_MSG(session, EINVAL,
+ "%s: the \"WiredTiger\" name space may not be "
+ "used by applications", name);
+ }
+
+ /*
+ * Disallow JSON quoting characters -- the config string parsing code
+ * supports quoted strings, but there's no good reason to use them in
+ * names and we're not going to do the testing.
+ */
+ if (strpbrk(name, "{},:[]\\\"'") != NULL)
+ WT_RET_MSG(session, EINVAL,
+ "%s: WiredTiger objects should not include grouping "
+ "characters in their names",
+ name);
+
+ return (0);
+}
+
+/*
+ * __wt_name_check --
+ * Disallow any use of the WiredTiger name space.
+ */
+int
+__wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len)
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+
+ WT_RET(__wt_scr_alloc(session, len, &tmp));
+
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)len, str));
+
+ ret = __wt_str_name_check(session, tmp->data);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
new file mode 100644
index 00000000000..8e7ed3925f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -0,0 +1,134 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_worker --
+ * Get Btree handles for the object and cycle through calls to an
+ * underlying worker function with each handle.
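+ *
+ *	For example (illustrative): verifying "table:main" opens the
+ *	table, recurses on each column group and index source URI, and
+ *	finally runs the worker function on the underlying "file:" btree
+ *	handles.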
+ */
+int
+__wt_schema_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION *wt_session;
+ WT_TABLE *table;
+ const char *tablename;
+ u_int i;
+ int skip;
+
+ table = NULL;
+ tablename = uri;
+
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, uri, &skip));
+
+ /* If the callback said to skip this object, we're done. */
+ if (skip)
+ return (0);
+
+ /* Get the btree handle(s) and call the underlying function. */
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ if (file_func != NULL) {
+ /*
+ * If the operation requires exclusive access, close
+ * any open file handles, including checkpoints.
+ */
+ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE))
+ WT_ERR(__wt_conn_dhandle_close_all(
+ session, uri, 0));
+
+ WT_ERR(__wt_session_get_btree_ckpt(
+ session, uri, cfg, open_flags));
+ ret = file_func(session, cfg);
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ } else if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
+ WT_ERR(__wt_schema_worker(session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "index:")) {
+ idx = NULL;
+ WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx));
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_MATCH(uri, "lsm:")) {
+ /*
+ * LSM compaction is handled elsewhere, but if we get here
+ * trying to compact files, don't descend into an LSM tree.
+ */
+ if (file_func != __wt_compact)
+ WT_ERR(__wt_lsm_tree_worker(session,
+ uri, file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "table:")) {
+ WT_ERR(__wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * We could make a recursive call for each colgroup or index
+ * URI, but since we have already opened the table, we can take
+	 * a shortcut and skip straight to the sources.  If we have a
+ * name function, it needs to know about the intermediate URIs.
+ */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ colgroup = table->cgroups[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(
+ session, colgroup->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(
+ session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ }
+
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, idx->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ }
+ } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) {
+ wt_session = (WT_SESSION *)session;
+ if (file_func == __wt_compact && dsrc->compact != NULL)
+ WT_ERR(dsrc->compact(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_salvage && dsrc->salvage != NULL)
+ WT_ERR(dsrc->salvage(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_verify && dsrc->verify != NULL)
+ WT_ERR(dsrc->verify(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_checkpoint)
+ ;
+ else if (file_func == __wt_checkpoint_list)
+ ;
+ else if (file_func == __wt_checkpoint_sync)
+ ;
+ else
+ WT_ERR(__wt_object_unsupported(session, uri));
+ } else
+ WT_ERR(__wt_bad_object_type(session, uri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
new file mode 100644
index 00000000000..39b9dd0de61
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __session_checkpoint(WT_SESSION *, const char *);
+static int __session_rollback_transaction(WT_SESSION *, const char *);
+
+/*
+ * __wt_session_reset_cursors --
+ * Reset all open cursors.
+ */
+int
+__wt_session_reset_cursors(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ TAILQ_FOREACH(cursor, &session->cursors, q) {
+ /* Stop when there are no positioned cursors. */
+ if (session->ncursors == 0)
+ break;
+ WT_TRET(cursor->reset(cursor));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_session_copy_values --
+ * Copy values into all positioned cursors, so that they don't keep
+ * transaction IDs pinned.
+ */
+int
+__wt_session_copy_values(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ TAILQ_FOREACH(cursor, &session->cursors, q)
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ WT_RET(__wt_buf_set(session, &cursor->value,
+ cursor->value.data, cursor->value.size));
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ }
+
+ return (ret);
+}
+
+/*
+ * __session_clear --
+ * Clear a session structure.
+ */
+static void
+__session_clear(WT_SESSION_IMPL *session)
+{
+ /*
+ * There's no serialization support around the review of the hazard
+ * array, which means threads checking for hazard pointers first check
+ * the active field (which may be 0) and then use the hazard pointer
+ * (which cannot be NULL).
+ *
+ * Additionally, the session structure can include information that
+ * persists past the session's end-of-life, stored as part of page
+ * splits.
+ *
+ * For these reasons, be careful when clearing the session structure.
+ */
+ memset(session, 0, WT_SESSION_CLEAR_SIZE(session));
+ session->hazard_size = 0;
+ session->nhazard = 0;
+}
+
+/*
+ * __session_close --
+ * WT_SESSION->close method.
+ */
+static int
+__session_close(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_session->connection;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, close, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Rollback any active transaction. */
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_TRET(__session_rollback_transaction(wt_session, NULL));
+
+ /*
+ * Also release any pinned transaction ID from a non-transactional
+ * operation.
+ */
+ if (conn->txn_global.states != NULL)
+ __wt_txn_release_snapshot(session);
+
+ /* Close all open cursors. */
+ while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL) {
+ /*
+ * Notify the user that we are closing the cursor handle
+ * via the registered close callback.
+ */
+ if (session->event_handler->handle_close != NULL)
+ WT_TRET(session->event_handler->handle_close(
+ session->event_handler, wt_session, cursor));
+ WT_TRET(cursor->close(cursor));
+ }
+
+ WT_ASSERT(session, session->ncursors == 0);
+
+ /* Discard cached handles. */
+ __wt_session_close_cache(session);
+
+ /* Close all tables. */
+ __wt_schema_close_tables(session);
+
+ /* Discard metadata tracking. */
+ __wt_meta_track_discard(session);
+
+ /* Discard scratch buffers. */
+ __wt_scr_discard(session);
+
+ /* Free transaction information. */
+ __wt_txn_destroy(session);
+
+ /* Confirm we're not holding any hazard pointers. */
+ __wt_hazard_close(session);
+
+ /* Cleanup */
+ if (session->block_manager_cleanup != NULL)
+ WT_TRET(session->block_manager_cleanup(session));
+ if (session->reconcile_cleanup != NULL)
+ WT_TRET(session->reconcile_cleanup(session));
+
+ /* Free the eviction exclusive-lock information. */
+ __wt_free(session, session->excl);
+
+ /* Destroy the thread's mutex. */
+ WT_TRET(__wt_cond_destroy(session, &session->cond));
+
+ /* The API lock protects opening and closing of sessions. */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /* Decrement the count of open sessions. */
+ WT_STAT_FAST_CONN_DECR(session, session_open);
+
+ /*
+ * Sessions are re-used, clear the structure: the clear sets the active
+ * field to 0, which will exclude the hazard array from review by the
+ * eviction thread. Because some session fields are accessed by other
+ * threads, the structure must be cleared carefully.
+ *
+	 * We don't need to publish here: regardless of whether the active
+	 * field is non-zero, the hazard pointer is always valid.
+ */
+ __session_clear(session);
+ session = conn->default_session;
+
+ /*
+ * Decrement the count of active sessions if that's possible: a session
+	 * being closed may or may not be at the end of the array; step toward
+ * the beginning of the array until we reach an active session.
+ */
+ while (conn->sessions[conn->session_cnt - 1].active == 0)
+ if (--conn->session_cnt == 0)
+ break;
+
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_reconfigure --
+ * WT_SESSION->reconfigure method.
+ */
+static int
+__session_reconfigure(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, reconfigure, config, cfg);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "transaction in progress");
+
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_ERR(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ if (cval.len != 0)
+ session->isolation = session->txn.isolation =
+ WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ TXN_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
+ TXN_ISO_READ_UNCOMMITTED : TXN_ISO_READ_COMMITTED;
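+	/*
+	 * For example (illustrative):
+	 *	wt_session->reconfigure(wt_session, "isolation=snapshot");
+	 * makes subsequent transactions in this session run at snapshot
+	 * isolation.
+	 */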
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_open_cursor --
+ * Internal version of WT_SESSION::open_cursor.
+ */
+int
+__wt_open_cursor(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * Open specific cursor types we know about, or call the generic data
+ * source open function.
+ *
+ * Unwind a set of string comparisons into a switch statement hoping
+ * the compiler can make it fast, but list the common choices first
+ * instead of sorting so if/else patterns are still fast.
+ */
+ switch (uri[0]) {
+ /*
+ * Common cursor types.
+ */
+ case 't':
+ if (WT_PREFIX_MATCH(uri, "table:"))
+ WT_RET(__wt_curtable_open(session, uri, cfg, cursorp));
+ break;
+ case 'c':
+ if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ /*
+ * Column groups are a special case: open a cursor on
+ * the underlying data source.
+ */
+ WT_RET(__wt_schema_get_colgroup(
+ session, uri, NULL, &colgroup));
+ WT_RET(__wt_open_cursor(
+ session, colgroup->source, owner, cfg, cursorp));
+ } else if (WT_PREFIX_MATCH(uri, "config:"))
+ WT_RET(__wt_curconfig_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 'i':
+ if (WT_PREFIX_MATCH(uri, "index:"))
+ WT_RET(__wt_curindex_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'l':
+ if (WT_PREFIX_MATCH(uri, "lsm:"))
+ WT_RET(__wt_clsm_open(
+ session, uri, owner, cfg, cursorp));
+ else if (WT_PREFIX_MATCH(uri, "log:"))
+ WT_RET(__wt_curlog_open(session, uri, cfg, cursorp));
+ break;
+
+ /*
+ * Less common cursor types.
+ */
+ case 'f':
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_RET(__wt_curfile_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'm':
+ if (WT_PREFIX_MATCH(uri, WT_METADATA_URI))
+ WT_RET(__wt_curmetadata_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'b':
+ if (WT_PREFIX_MATCH(uri, "backup:"))
+ WT_RET(__wt_curbackup_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 's':
+ if (WT_PREFIX_MATCH(uri, "statistics:"))
+ WT_RET(__wt_curstat_open(session, uri, cfg, cursorp));
+ break;
+ default:
+ break;
+ }
+
+ if (*cursorp == NULL &&
+ (dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ WT_RET(dsrc->open_cursor == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp));
+
+ if (*cursorp == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ /*
+ * When opening simple tables, the table code calls this function on the
+ * underlying data source, in which case the application's URI has been
+ * copied.
+ */
+ if ((*cursorp)->uri == NULL &&
+ (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0)
+ WT_TRET((*cursorp)->close(*cursorp));
+
+ return (ret);
+}
+
+/*
+ * __session_open_cursor --
+ * WT_SESSION->open_cursor method.
+ */
+static int
+__session_open_cursor(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = *cursorp = NULL;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, open_cursor, config, cfg);
+
+ if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL))
+ WT_ERR_MSG(session, EINVAL,
+ "should be passed either a URI or a cursor to duplicate, "
+ "but not both");
+
+ if (to_dup != NULL) {
+ uri = to_dup->uri;
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, WT_METADATA_URI) &&
+ !WT_PREFIX_MATCH(uri, "table:") &&
+ __wt_schema_get_source(session, uri) == NULL)
+ WT_ERR(__wt_bad_object_type(session, uri));
+ }
+
+ WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+ if (to_dup != NULL)
+ WT_ERR(__wt_cursor_dup_position(to_dup, cursor));
+
+ *cursorp = cursor;
+
+ if (0) {
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ }
+
+ /*
+	 * Opening a cursor on a non-existent data source will set ret to
+	 * either ENOENT or WT_NOTFOUND at this point.  However,
+ * applications may reasonably do this inside a transaction to check
+ * for the existence of a table or index.
+ *
+ * Prefer WT_NOTFOUND here: that does not force running transactions to
+ * roll back. It will be mapped back to ENOENT.
+ */
+ if (ret == ENOENT)
+ ret = WT_NOTFOUND;
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_session_create_strip --
+ * Discard any configuration information from a schema entry that is not
+ *	applicable to a session.create call; used by the wt dump command utility,
+ * which only wants to dump the schema information needed for load.
+ */
+int
+__wt_session_create_strip(WT_SESSION *wt_session,
+ const char *v1, const char *v2, const char **value_ret)
+{
+ WT_SESSION_IMPL *session = (WT_SESSION_IMPL *)wt_session;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_create), v1, v2, NULL };
+
+ return (__wt_config_collapse(session, cfg, value_ret));
+}
+
+/*
+ * __session_create --
+ * WT_SESSION->create method.
+ */
+static int
+__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, create, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ /*
+ * Type configuration only applies to tables, column groups and indexes.
+ * We don't want applications to attempt to layer LSM on top of their
+ * extended data-sources, and the fact we allow LSM as a valid URI is an
+ * invitation to that mistake: nip it in the bud.
+ */
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ /*
+		 * We can't disallow type entirely: a configuration string
+		 * might innocently include it, for example, in a dump/load
+		 * pair.  If the URI type prefix and the type are the same,
+		 * let it go.
+ */
+ if ((ret =
+ __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ (strncmp(uri, cval.str, cval.len) != 0 ||
+ uri[cval.len] != ':'))
+ WT_ERR_MSG(session, EINVAL,
+ "%s: unsupported type configuration", uri);
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_create(session, uri, config));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_log_printf --
+ * WT_SESSION->log_printf method.
+ */
+static int
+__session_log_printf(WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_SESSION_IMPL *session;
+ WT_DECL_RET;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, log_printf);
+
+ va_start(ap, fmt);
+ ret = __wt_log_vprintf(session, fmt, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rename --
+ * WT_SESSION->rename method.
+ */
+static int
+__session_rename(WT_SESSION *wt_session,
+ const char *uri, const char *newuri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rename, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+ WT_ERR(__wt_str_name_check(session, newuri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_rename(session, uri, newuri, cfg));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_compact --
+ * WT_SESSION->compact method.
+ */
+static int
+__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_RET(__wt_str_name_check(session, uri));
+
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_bad_object_type(session, uri));
+
+ return (__wt_session_compact(wt_session, uri, config));
+}
+
+/*
+ * __session_drop --
+ * WT_SESSION->drop method.
+ */
+static int
+__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, drop, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg));
+
+err: /* Note: drop operations cannot be unrolled (yet?). */
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_salvage --
+ * WT_SESSION->salvage method.
+ */
+static int
+__session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, salvage, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_salvage,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_truncate --
+ * WT_SESSION->truncate method.
+ */
+static int
+__session_truncate(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *cursor;
+ int cmp;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_TXN_API_CALL(session, truncate, config, cfg);
+
+ /*
+ * If the URI is specified, we don't need start/stop cursors; if
+ * start/stop cursors are specified, we don't need a URI.
+ *
+ * If no URI is specified, and both cursors are specified, start/stop
+ * must reference the same object.
+ *
+ * Any specified cursor must have been initialized.
+ */
+ if ((uri == NULL && start == NULL && stop == NULL) ||
+ (uri != NULL && (start != NULL || stop != NULL)))
+ WT_ERR_MSG(session, EINVAL,
+ "the truncate method should be passed either a URI or "
+ "start/stop cursors, but not both");
+
+ if (uri != NULL) {
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_truncate(session, uri, cfg));
+ goto done;
+ }
+
+ /*
+ * Cursor truncate is only supported for some objects, check for the
+ * supporting methods we need, range_truncate and compare.
+ */
+ cursor = start == NULL ? stop : start;
+ if (cursor->compare == NULL)
+ WT_ERR(__wt_bad_object_type(session, cursor->uri));
+
+ /*
+ * If both cursors are set, check they're correctly ordered with
+ * respect to each other. We have to test this before any search,
+ * since a search can change the initial cursor position.
+ *
+ * Rather happily, the compare routine will also confirm the cursors
+ * reference the same object and the keys are set.
+ */
+ if (start != NULL && stop != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ WT_ERR_MSG(session, EINVAL,
+ "the start cursor position is after the stop "
+ "cursor position");
+ }
+
+ /*
+ * Truncate does not require keys actually exist so that applications
+ * can discard parts of the object's name space without knowing exactly
+ * what records currently appear in the object. For this reason, do a
+ * search-near, rather than a search. Additionally, we have to correct
+ * after calling search-near, to position the start/stop cursors on the
+ * next record greater than/less than the original key. If the cursors
+ * hit the beginning/end of the object, or the start/stop keys cross,
+ * we're done, the range must be empty.
+ */
+ if (start != NULL) {
+ WT_ERR(start->search_near(start, &cmp));
+ if (cmp < 0 && (ret = start->next(start)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+ }
+ if (stop != NULL) {
+ WT_ERR(stop->search_near(stop, &cmp));
+ if (cmp > 0 && (ret = stop->prev(stop)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+
+ if (start != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ goto done;
+ }
+ }
+
+ WT_ERR(__wt_schema_range_truncate(session, start, stop));
+
+done:
+err: TXN_API_END_RETRY(session, ret, 0);
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret));
+}
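
To make the cursor-positioning rules above concrete, here is a hedged
application-level sketch of a range truncate. The table URI, string key
format, and function name are assumptions; cursor cleanup on the error paths
is omitted for brevity.

    #include <wiredtiger.h>

    /*
     * Sketch only: truncate the key range [low, high]. Neither key need
     * exist; truncate positions the cursors with search-near internally.
     */
    static int
    example_truncate_range(
        WT_SESSION *session, const char *low, const char *high)
    {
        WT_CURSOR *start, *stop;
        int ret;

        if ((ret = session->open_cursor(
            session, "table:access", NULL, NULL, &start)) != 0)
            return (ret);
        start->set_key(start, low);
        if ((ret = session->open_cursor(
            session, "table:access", NULL, NULL, &stop)) != 0)
            return (ret);
        stop->set_key(stop, high);

        /* No URI: the range is defined entirely by the two cursors. */
        return (session->truncate(session, NULL, start, stop, NULL));
    }
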
+
+/*
+ * __session_upgrade --
+ * WT_SESSION->upgrade method.
+ */
+static int
+__session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, upgrade, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_upgrade,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_verify --
+ * WT_SESSION->verify method.
+ */
+static int
+__session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, verify, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_verify,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_begin_transaction --
+ * WT_SESSION->begin_transaction method.
+ */
+static int
+__session_begin_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, begin_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_begin);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "Transaction already running");
+
+ /*
+ * There is no transaction active in this thread; check if the cache is
+ * full, if we have to block for eviction, this is the best time to do
+ * it.
+ */
+ WT_ERR(__wt_cache_full_check(session));
+
+ ret = __wt_txn_begin(session, cfg);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_commit_transaction --
+ * WT_SESSION->commit_transaction method.
+ */
+static int
+__session_commit_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, commit_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_commit);
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_ERROR)) {
+ __wt_errx(session, "failed transaction requires rollback");
+ ret = EINVAL;
+ }
+
+ if (ret == 0)
+ ret = __wt_txn_commit(session, cfg);
+ else {
+ WT_TRET(__wt_session_reset_cursors(session));
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rollback_transaction --
+ * WT_SESSION->rollback_transaction method.
+ */
+static int
+__session_rollback_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rollback_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_rollback);
+
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_TRET(__wt_txn_rollback(session, cfg));
+
+err: API_END_RET(session, ret);
+}
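
The commit/rollback pairing is easiest to see from the application side. A
minimal sketch follows, assuming an already-open session and a cursor on a
table with string key/value formats; it shows why a failed operation must be
rolled back rather than committed (committing a failed transaction returns
EINVAL, per the check above).

    #include <wiredtiger.h>

    /* Sketch only: update a record in a snapshot-isolation transaction. */
    static int
    example_update(WT_SESSION *session, WT_CURSOR *cursor)
    {
        int ret;

        if ((ret = session->begin_transaction(
            session, "isolation=snapshot")) != 0)
            return (ret);

        cursor->set_key(cursor, "key");
        cursor->set_value(cursor, "value");
        if ((ret = cursor->update(cursor)) != 0) {
            /* A failed operation requires rollback, not commit. */
            (void)session->rollback_transaction(session, NULL);
            return (ret);
        }
        return (session->commit_transaction(session, NULL));
    }
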
+
+/*
+ * __session_transaction_pinned_range --
+ * WT_SESSION->transaction_pinned_range method.
+ */
+static int
+__session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN_STATE *txn_state;
+ uint64_t pinned;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, pinned_range);
+
+ txn_state = WT_SESSION_TXN_STATE(session);
+
+ /* Assign pinned to the lesser of id and snap_min. */
+ if (txn_state->id != WT_TXN_NONE &&
+ TXNID_LT(txn_state->id, txn_state->snap_min))
+ pinned = txn_state->id;
+ else
+ pinned = txn_state->snap_min;
+
+ if (pinned == WT_TXN_NONE)
+ *prange = 0;
+ else
+ *prange = S2C(session)->txn_global.current - pinned;
+
+err: API_END_RET(session, ret);
+}
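
A quick sketch of how an application might sample this diagnostic; the
function name and reporting threshold are illustrative assumptions.

    #include <inttypes.h>
    #include <stdio.h>
    #include <wiredtiger.h>

    /* Sketch only: report how many transaction IDs this session pins. */
    static int
    example_report_pinned(WT_SESSION *session)
    {
        uint64_t range;
        int ret;

        if ((ret = session->transaction_pinned_range(session, &range)) != 0)
            return (ret);
        if (range > 1000)
            printf("pinning %" PRIu64 " transaction IDs\n", range);
        return (0);
    }
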
+
+/*
+ * __session_checkpoint --
+ * WT_SESSION->checkpoint method.
+ */
+static int
+__session_checkpoint(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ txn = &session->txn;
+
+ WT_STAT_FAST_CONN_INCR(session, txn_checkpoint);
+ SESSION_API_CALL(session, checkpoint, config, cfg);
+
+ /*
+ * Checkpoints require a snapshot to write a transactionally consistent
+ * snapshot of the data.
+ *
+ * We can't use an application's transaction: if it has uncommitted
+ * changes, they will be written in the checkpoint and may appear after
+ * a crash.
+ *
+ * Use a real snapshot transaction: we don't want any chance of the
+ * snapshot being updated during the checkpoint. Eviction is prevented
+ * from evicting anything newer than this because we track the oldest
+ * transaction ID in the system that is not visible to all readers.
+ */
+ if (F_ISSET(txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL,
+ "Checkpoint not permitted in a transaction");
+
+ /*
+ * Reset open cursors. Do this explicitly, even though it will happen
+ * implicitly in the call to begin_transaction for the checkpoint: the
+ * checkpoint code will acquire the schema lock before we do that, and
+ * some implementations of WT_CURSOR::reset might need the schema lock.
+ */
+ WT_ERR(__wt_session_reset_cursors(session));
+
+ /*
+ * Don't hijack the session checkpoint thread for eviction.
+ *
+ * Application threads are not generally available for potentially slow
+ * operations, but checkpoint does enough I/O that it may be called upon
+ * to perform slow operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+ /*
+ * Only one checkpoint can be active at a time, and checkpoints must run
+ * in the same order as they update the metadata. It's probably a bad
+ * idea to run checkpoints out of multiple threads, but serialize them
+ * here to ensure we don't get into trouble.
+ */
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
+ __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
+
+ ret = __wt_txn_checkpoint(session, cfg);
+
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
+ __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+
+err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
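
The constraints above (no active transaction, one checkpoint at a time) apply
to the public call. A hedged sketch of common configurations follows; the
table URI and checkpoint name are illustrative.

    #include <wiredtiger.h>

    /* Sketch only: three common checkpoint configurations. */
    static int
    example_checkpoints(WT_SESSION *session)
    {
        int ret;

        /* Full database checkpoint. */
        if ((ret = session->checkpoint(session, NULL)) != 0)
            return (ret);
        /* Checkpoint a single object. */
        if ((ret = session->checkpoint(
            session, "target=(\"table:access\")")) != 0)
            return (ret);
        /* Named checkpoint; note named checkpoints can block compaction. */
        return (session->checkpoint(session, "name=midday"));
    }
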
+
+/*
+ * __wt_open_internal_session --
+ * Allocate a session for WiredTiger's use.
+ */
+int
+__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
+ int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp)
+{
+ WT_SESSION_IMPL *session;
+
+ *sessionp = NULL;
+
+ WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ session->name = name;
+
+ /*
+ * Public sessions are automatically closed during WT_CONNECTION->close.
+ * If the session handles for internal threads were to go on the public
+ * list, there would be complex ordering issues during close. Set a
+ * flag to avoid this: internal sessions are not closed automatically.
+ */
+ F_SET(session, WT_SESSION_INTERNAL);
+
+ /*
+ * Some internal threads must keep running after we close all data
+ * handles. Make sure these threads don't open their own handles.
+ */
+ if (!uses_dhandles)
+ F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+ /*
+ * Acquiring the metadata handle requires the schema lock; we've seen
+ * problems in the past where a worker thread has acquired the schema
+ * lock unexpectedly, relatively late in the run, and deadlocked. Be
+ * defensive: get it now. The metadata file may not exist when the
+ * connection first creates its default session or the shared cache
+ * pool creates its sessions, so let our caller decline this work.
+ */
+ if (open_metadata) {
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_RET(__wt_metadata_open(session));
+ }
+
+ *sessionp = session;
+ return (0);
+}
+
+/*
+ * __wt_open_session --
+ * Allocate a session handle. The internal parameter is used for sessions
+ * opened by WiredTiger for its own use.
+ */
+int
+__wt_open_session(WT_CONNECTION_IMPL *conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION_IMPL **sessionp)
+{
+ static const WT_SESSION stds = {
+ NULL,
+ __session_close,
+ __session_reconfigure,
+ __session_open_cursor,
+ __session_create,
+ __session_compact,
+ __session_drop,
+ __session_log_printf,
+ __session_rename,
+ __session_salvage,
+ __session_truncate,
+ __session_upgrade,
+ __session_verify,
+ __session_begin_transaction,
+ __session_commit_transaction,
+ __session_rollback_transaction,
+ __session_checkpoint,
+ __session_transaction_pinned_range
+ };
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+ uint32_t i;
+
+ *sessionp = NULL;
+
+ session = conn->default_session;
+ session_ret = NULL;
+
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /*
+ * Make sure we don't try to open a new session after the application
+ * closes the connection. This is particularly intended to catch
+ * cases where server threads open sessions.
+ */
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN));
+
+ /* Find the first inactive session slot. */
+ for (session_ret = conn->sessions,
+ i = 0; i < conn->session_size; ++session_ret, ++i)
+ if (!session_ret->active)
+ break;
+ if (i == conn->session_size)
+ WT_ERR_MSG(session, ENOMEM,
+ "only configured to support %" PRIu32 " sessions"
+ " (including %" PRIu32 " internal)",
+ conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+
+ /*
+ * If the active session count is increasing, update it. We don't worry
+ * about correcting the session count on error: as long as we don't mark
+ * this session as active, we'll clean it up on close.
+ */
+ if (i >= conn->session_cnt) /* Defend against off-by-one errors. */
+ conn->session_cnt = i + 1;
+
+ session_ret->id = i;
+ session_ret->iface = stds;
+ session_ret->iface.connection = &conn->iface;
+
+ WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond));
+
+ __wt_random_init(session_ret->rnd);
+
+ __wt_event_handler_set(session_ret,
+ event_handler == NULL ? session->event_handler : event_handler);
+
+ TAILQ_INIT(&session_ret->cursors);
+ SLIST_INIT(&session_ret->dhandles);
+
+ /* Initialize transaction support: default to read-committed. */
+ session_ret->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__wt_txn_init(session_ret));
+
+ /*
+ * The session's hazard pointer memory isn't discarded during normal
+ * session close because access to it isn't serialized. Allocate the
+ * first time we open this session.
+ */
+ if (session_ret->hazard == NULL)
+ WT_ERR(__wt_calloc_def(
+ session, conn->hazard_max, &session_ret->hazard));
+
+ /*
+ * Set an initial size for the hazard array. It will be grown as
+ * required up to hazard_max. The hazard_size is reset on close, since
+ * __wt_hazard_close ensures the array is cleared - so it is safe to
+ * reset the starting size on each open.
+ */
+ session_ret->hazard_size = WT_HAZARD_INCR;
+
+ /*
+ * Configuration: currently, the configuration for open_session is the
+ * same as session.reconfigure, so use that function.
+ */
+ if (config != NULL)
+ WT_ERR(
+ __session_reconfigure((WT_SESSION *)session_ret, config));
+
+ session_ret->name = NULL;
+
+ /*
+ * Publish: make the entry visible to server threads. There must be a
+ * barrier for two reasons, to ensure structure fields are set before
+ * any other thread will consider the session, and to push the session
+ * count to ensure the eviction thread can't review too few slots.
+ */
+ WT_PUBLISH(session_ret->active, 1);
+
+ WT_STATIC_ASSERT(offsetof(WT_SESSION_IMPL, iface) == 0);
+ *sessionp = session_ret;
+
+ WT_STAT_FAST_CONN_INCR(session, session_open);
+
+err: __wt_spin_unlock(session, &conn->api_lock);
+ return (ret);
+}
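
From the application side, this function sits behind
WT_CONNECTION->open_session. A minimal usage sketch, with the database path
purely illustrative and error handling abbreviated:

    #include <wiredtiger.h>

    /* Sketch only: open a connection and a session, then close both. */
    static int
    example_open(void)
    {
        WT_CONNECTION *conn;
        WT_SESSION *session;
        int ret;

        if ((ret = wiredtiger_open(
            "/tmp/example", NULL, "create", &conn)) != 0)
            return (ret);
        if ((ret = conn->open_session(
            conn, NULL, "isolation=read-committed", &session)) != 0) {
            (void)conn->close(conn, NULL);
            return (ret);
        }
        /* ... do work ... */
        (void)session->close(session, NULL);
        return (conn->close(conn, NULL));
    }
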
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
new file mode 100644
index 00000000000..6eca8a58d13
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Compaction is the place where the underlying block manager becomes visible
+ * in the higher engine btree and API layers. As there is currently only one
+ * block manager, this code is written with it in mind: other block managers
+ * may need changes to support compaction, and a smart block manager might need
+ * far less support from the engine.
+ *
+ * First, the default block manager cannot entirely own compaction because it
+ * has no way to find a block after it moves other than a request from the
+ * btree layer with the new address. In other words, if internal page X points
+ * to leaf page Y, and page Y moves, the address of page Y has to be updated in
+ * page X. Generally, this is solved by building a translation layer in the
+ * block manager so internal pages don't require updates to relocate blocks:
+ * however, the translation table must be durable, has its own garbage
+ * collection issues and might be slower, all of which have their own problems.
+ *
+ * Second, the btree layer cannot entirely own compaction because page
+ * addresses are opaque: it cannot know where a page is in the file from the
+ * address cookie.
+ *
+ * For these reasons, compaction is a cooperative process between the btree
+ * layer and the block manager. The btree layer walks files, and asks the
+ * block manager if rewriting a particular block would reduce the file
+ * footprint: if writing the page will help, the page is marked dirty so it
+ * will eventually be written. As pages are written, the original page
+ * potentially becomes available for reuse and if enough pages at the end of
+ * the file are available for reuse, the file can be truncated, and compaction
+ * succeeds.
+ *
+ * However, writing a page is not by itself sufficient to make a page available
+ * for reuse. The original version of the page is still referenced by at least
+ * the most recent checkpoint in the file. To make a page available for reuse,
+ * we have to checkpoint the file so we can discard the checkpoint referencing
+ * the original version of the block; once no checkpoint references a block, it
+ * becomes available for reuse.
+ *
+ * Compaction is not necessarily possible in WiredTiger, even in a file with
+ * lots of available space. If a block at the end of the file is referenced by
+ * a named checkpoint, there is nothing we can do to compact the file, no
+ * matter how many times we rewrite the block, the named checkpoint can't be
+ * discarded and so the reference count on the original block will never go to
+ * zero. What's worse, because the block manager doesn't reference count
+ * blocks, it can't easily know this is the case, and so we'll waste a lot of
+ * effort trying to compact files that can't be compacted.
+ *
+ * Now, to the actual process. First, we checkpoint the high-level object
+ * (which is potentially composed of multiple files): there are potentially
+ * many dirty blocks in the cache, and we want to write them out and then
+ * discard previous checkpoints so we have as many blocks as possible on the
+ * file's "available for reuse" list when we start compaction.
+ *
+ * Then, we compact the high-level object.
+ *
+ * Compacting the object is done 10% at a time; that is, we try to move blocks
+ * from the last 10% of the file into the beginning of the file (the 10% is
+ * hard coded in the block manager). The reason for this is because we are
+ * walking the file in logical order, not block offset order, and we can fail
+ * to compact a file if we write the wrong blocks first.
+ *
+ * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
+ * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
+ * file. If we were to rewrite blocks from more than the last 10% of the file,
+ * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
+ * 10 of them without ever rewriting the blocks from the end of the file which
+ * would allow us to compact the file. So, we compact the last 10% of the
+ * file, and if that works, we compact the last 10% of the file again, and so
+ * on. Note the block manager uses a first-fit block selection algorithm
+ * during compaction to maximize block movement.
+ *
+ * After each 10% compaction, we checkpoint two more times (seriously, twice).
+ * The second and third checkpoints are because the block manager checkpoints
+ * in two steps: blocks made available for reuse during a checkpoint are put on
+ * a special checkpoint-available list and only moved to the real available
+ * list after the metadata has been updated with the new checkpoint's
+ * information. (Otherwise it is possible to allocate a rewritten block, crash
+ * before the metadata is updated, and see corruption.) For this reason,
+ * blocks allocated to write the checkpoint itself cannot be taken from the
+ * blocks made available by the checkpoint.
+ *
+ * To say it another way, the second checkpoint puts the blocks from the end of
+ * the file that were made available by compaction onto the checkpoint-available
+ * list, but then potentially writes the checkpoint itself at the end of the
+ * file, which would prevent any file truncation. When the metadata is updated
+ * for the second checkpoint, the blocks freed by compaction become available
+ * for the third checkpoint, so the third checkpoint's blocks are written
+ * towards the beginning of the file, and then the file can be truncated.
+ */
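
From the API layer, all of the above is driven by a single call. A hedged
usage sketch follows; the URI is illustrative, and the timeout value is in
seconds, per the configuration parsed below.

    #include <wiredtiger.h>

    /* Sketch only: request compaction, giving up after about two minutes. */
    static int
    example_compact(WT_SESSION *session)
    {
        return (session->compact(session, "table:access", "timeout=120"));
    }
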
+
+/*
+ * __wt_compact_uri_analyze --
+ * Extract information relevant to deciding what work compact needs to
+ * do from a URI that is part of a table schema.
+ * Called via the schema_worker function.
+ */
+int
+__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip)
+{
+ /*
+ * Add references to schema URI objects to the list of objects to be
+ * compacted. Skip over LSM trees or we will get false positives on
+ * the "file:" URIs for the chunks.
+ */
+ if (WT_PREFIX_MATCH(uri, "lsm:")) {
+ session->compact->lsm_count++;
+ *skip = 1;
+ } else if (WT_PREFIX_MATCH(uri, "file:"))
+ session->compact->file_count++;
+
+ return (0);
+}
+
+/*
+ * __session_compact_check_timeout --
+ * Check if the timeout has been exceeded.
+ */
+static int
+__session_compact_check_timeout(
+ WT_SESSION_IMPL *session, struct timespec begin)
+{
+ struct timespec end;
+
+ if (session->compact->max_time == 0)
+ return (0);
+
+ WT_RET(__wt_epoch(session, &end));
+ if (session->compact->max_time <
+ WT_TIMEDIFF(end, begin) / WT_BILLION)
+ WT_RET(ETIMEDOUT);
+ return (0);
+}
+
+/*
+ * __compact_file --
+ * Function to alternate between checkpoints and compaction calls.
+ */
+static int
+__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(t);
+ WT_SESSION *wt_session;
+ WT_TXN *txn;
+ int i;
+ struct timespec start_time;
+
+ txn = &session->txn;
+ wt_session = &session->iface;
+
+ /*
+ * File compaction requires checkpoints, which will fail in a
+ * transactional context. Check now so the error message isn't
+ * confusing.
+ */
+ if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL,
+ " File compaction not permitted in a transaction");
+
+ /*
+ * Force the checkpoint: we don't want to skip it because the work we
+ * need to have done is done in the underlying block manager.
+ */
+ WT_ERR(__wt_scr_alloc(session, 128, &t));
+ WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));
+
+ WT_ERR(__wt_epoch(session, &start_time));
+
+ /*
+ * We compact 10% of the file on each pass, try 10 times (which is
+ * probably overkill), and quit if we make no progress. Check for a
+ * timeout each time through the loop.
+ */
+ for (i = 0; i < 10; ++i) {
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+
+ session->compaction = 0;
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(
+ session, uri, __wt_compact, NULL, cfg, 0));
+ WT_ERR(ret);
+ if (!session->compaction)
+ break;
+
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(__session_compact_check_timeout(session, start_time));
+ }
+
+err: __wt_scr_free(&t);
+ return (ret);
+}
+
+/*
+ * __wt_session_compact --
+ *	Worker function for the WT_SESSION->compact method.
+ */
+int
+__wt_session_compact(
+ WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_COMPACT compact;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, compact, config, cfg);
+
+ /* Setup the structure in the session handle */
+ memset(&compact, 0, sizeof(WT_COMPACT));
+ session->compact = &compact;
+
+ WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
+ session->compact->max_time = (uint64_t)cval.val;
+
+ /* Find out what types of data sources are being compacted. */
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ session, uri, NULL, __wt_compact_uri_analyze, cfg, 0));
+ WT_ERR(ret);
+
+ if (session->compact->lsm_count != 0)
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_lsm_compact, cfg, 0));
+ if (session->compact->file_count != 0)
+ WT_ERR(__compact_file(session, uri, cfg));
+
+err: session->compact = NULL;
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
new file mode 100644
index 00000000000..0c07e5fa259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -0,0 +1,478 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_session_dhandle_incr_use --
+ * Increment the session data source's in-use counter.
+ */
+void
+__wt_session_dhandle_incr_use(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+
+ dhandle = session->dhandle;
+
+ (void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1);
+}
+
+/*
+ * __session_dhandle_decr_use --
+ * Decrement the session data source's in-use counter.
+ */
+static int
+__session_dhandle_decr_use(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+
+ /*
+ * Decrement the in-use count on the underlying data-source -- if we're
+ * the last reference, set the time-of-death timestamp.
+ */
+ WT_ASSERT(session, dhandle->session_inuse > 0);
+ if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0)
+ WT_TRET(__wt_seconds(session, &dhandle->timeofdeath));
+ return (0);
+}
+
+/*
+ * __session_add_btree --
+ * Add a handle to the session's cache.
+ */
+static int
+__session_add_btree(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+ WT_RET(__wt_calloc_def(session, 1, &dhandle_cache));
+ dhandle_cache->dhandle = session->dhandle;
+
+ SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l);
+
+ if (dhandle_cachep != NULL)
+ *dhandle_cachep = dhandle_cache;
+
+ return (0);
+}
+
+/*
+ * __wt_session_lock_btree --
+ * Lock a btree handle.
+ */
+int
+__wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ uint32_t special_flags;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+ locked = NOLOCK;
+
+ /*
+ * Special operation flags will cause the handle to be reopened.
+ * For example, a handle opened with WT_BTREE_BULK cannot use the same
+ * internal data structures as a handle opened for ordinary access.
+ */
+ special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
+ WT_ASSERT(session,
+ special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE));
+
+ if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ /*
+ * Try to get an exclusive handle lock and fail immediately if
+ * it's unavailable. We don't expect exclusive operations on
+ * trees to be mixed with ordinary cursor access, but if there
+ * is a use case in the future, we could make blocking here
+ * configurable.
+ *
+ * Special flags will cause the handle to be reopened, which
+ * will get the necessary lock, so don't bother here.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) {
+ WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ locked = WRITELOCK;
+ }
+ } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
+ else {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ locked = READLOCK;
+ }
+
+ /*
+ * At this point, we have the requested lock -- if that is all that was
+ * required, we're done. Otherwise, check that the handle is open and
+ * that no special flags are required.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0))
+ return (0);
+
+ /*
+ * The handle needs to be opened. If we locked the handle above,
+ * unlock it before returning.
+ */
+ switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ /* Treat an unopened handle just like a non-existent handle. */
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_session_release_btree --
+ * Unlock a btree handle.
+ */
+int
+__wt_session_release_btree(WT_SESSION_IMPL *session)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+
+ /*
+ * Decrement the data-source's in-use counter. We ignore errors because
+ * they're insignificant and handling them complicates error handling in
+ * this function more than I'm willing to live with.
+ */
+ (void)__session_dhandle_decr_use(session);
+
+ locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK;
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_CLOSE)) {
+ /*
+ * If configured to discard on last close, trade any read lock
+ * for an exclusive lock. If the exchange succeeds, set up for
+ * discard. It is expected that acquiring an exclusive lock will
+ * fail sometimes, since the handle may still be in use: in that
+ * case we're done.
+ */
+ if (locked == READLOCK) {
+ locked = NOLOCK;
+ WT_ERR(__wt_readunlock(session, dhandle->rwlock));
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret != 0) {
+ if (ret == EBUSY)
+ ret = 0;
+ goto err;
+ }
+ locked = WRITELOCK;
+ F_CLR(dhandle, WT_DHANDLE_DISCARD_CLOSE);
+ F_SET(dhandle,
+ WT_DHANDLE_DISCARD | WT_DHANDLE_EXCLUSIVE);
+ }
+ }
+
+ /*
+ * If we had special flags set, close the handle so that future access
+ * can get a handle without special flags.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
+ F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_CLR(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE))
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+
+err: switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_TRET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_session_get_btree_ckpt --
+ * Check the configuration strings for a checkpoint name, get a btree
+ * handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], uint32_t flags)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int last_ckpt;
+ const char *checkpoint;
+
+ last_ckpt = 0;
+ checkpoint = NULL;
+
+ /*
+ * This function exists to handle checkpoint configuration. Callers
+ * that never open a checkpoint call the underlying function directly.
+ */
+ WT_RET_NOTFOUND_OK(
+ __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ /*
+ * The internal checkpoint name is special, find the last
+ * unnamed checkpoint of the object.
+ */
+ if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ last_ckpt = 1;
+retry: WT_RET(__wt_meta_checkpoint_last_name(
+ session, uri, &checkpoint));
+ } else
+ WT_RET(__wt_strndup(
+ session, cval.str, cval.len, &checkpoint));
+ }
+
+ ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags);
+
+ __wt_free(session, checkpoint);
+
+ /*
+ * There's a potential race: we get the name of the most recent unnamed
+ * checkpoint, but if it's discarded (or locked so it can be discarded)
+ * by the time we try to open it, we'll fail the open. Retry in those
+ * cases: a new "last" checkpoint should surface, and we can't return an
+ * error because the application would be justifiably upset if we
+ * couldn't open the last checkpoint instance of an object.
+ *
+ * The check against WT_NOTFOUND is correct: if there was no checkpoint
+ * for the object (that is, the object has never been in a checkpoint),
+ * we returned immediately after the call to search for that name.
+ */
+ if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
+ goto retry;
+ return (ret);
+}
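
For context, the "checkpoint" configuration handled here arrives through the
public cursor-open call; a hedged sketch with an illustrative table URI:

    #include <wiredtiger.h>

    /* Sketch only: read from the most recent internal checkpoint. */
    static int
    example_checkpoint_cursor(WT_SESSION *session, WT_CURSOR **cursorp)
    {
        return (session->open_cursor(session, "table:access", NULL,
            "checkpoint=WiredTigerCheckpoint", cursorp));
    }
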
+
+/*
+ * __session_discard_btree --
+ * Discard our reference to the btree.
+ */
+static void
+__session_discard_btree(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+ WT_DATA_HANDLE *saved_dhandle;
+
+ SLIST_REMOVE(
+ &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
+
+ saved_dhandle = session->dhandle;
+ session->dhandle = dhandle_cache->dhandle;
+
+ __wt_overwrite_and_free(session, dhandle_cache);
+ __wt_conn_btree_close(session);
+
+ /* Restore the original handle in the session. */
+ session->dhandle = saved_dhandle;
+}
+
+/*
+ * __wt_session_close_cache --
+ * Close any cached handles in a session.
+ */
+void
+__wt_session_close_cache(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+ while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL)
+ __session_discard_btree(session, dhandle_cache);
+}
+
+/*
+ * __session_dhandle_sweep --
+ * Discard any session dhandles that are not open.
+ */
+static int
+__session_dhandle_sweep(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache, *dhandle_cache_next;
+ time_t now;
+
+ /*
+ * Check the local flag WT_DHANDLE_LOCK_ONLY: a common caller with that
+ * flag is in the path to discard the handle, so don't sweep in that case.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY))
+ return (0);
+
+ /*
+ * Periodically sweep for dead handles; if we've swept recently, don't
+ * do it again.
+ */
+ WT_RET(__wt_seconds(session, &now));
+ if (now - session->last_sweep < WT_DHANDLE_SWEEP_PERIOD)
+ return (0);
+ session->last_sweep = now;
+
+ WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
+
+ dhandle_cache = SLIST_FIRST(&session->dhandles);
+ while (dhandle_cache != NULL) {
+ dhandle_cache_next = SLIST_NEXT(dhandle_cache, l);
+ dhandle = dhandle_cache->dhandle;
+ if (dhandle != session->dhandle &&
+ dhandle->session_inuse == 0 &&
+ now - dhandle->timeofdeath > WT_DHANDLE_SWEEP_WAIT) {
+ WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
+ __session_discard_btree(session, dhandle_cache);
+ }
+ dhandle_cache = dhandle_cache_next;
+ }
+ return (0);
+}
+
+/*
+ * __wt_session_get_btree --
+ * Get a btree handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+ WT_DECL_RET;
+ uint64_t hash;
+ int candidate;
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
+
+ dhandle = NULL;
+ candidate = 0;
+
+ hash = __wt_hash_city64(uri, strlen(uri));
+ SLIST_FOREACH(dhandle_cache, &session->dhandles, l) {
+ dhandle = dhandle_cache->dhandle;
+ if (hash != dhandle->name_hash ||
+ strcmp(uri, dhandle->name) != 0)
+ continue;
+ if (checkpoint == NULL && dhandle->checkpoint == NULL)
+ break;
+ if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0)
+ break;
+ }
+
+ if (dhandle_cache != NULL) {
+ candidate = 1;
+ /* We found the data handle, don't try to get it again. */
+ LF_SET(WT_DHANDLE_HAVE_REF);
+ session->dhandle = dhandle;
+
+ /*
+ * Try to lock the file; if we succeed, our "exclusive" state
+ * must match.
+ */
+ ret = __wt_session_lock_btree(session, flags);
+ if (ret == WT_NOTFOUND)
+ dhandle_cache = NULL;
+ else
+ WT_RET(ret);
+ }
+
+ if (dhandle_cache == NULL) {
+ /* Sweep the handle list to remove any dead handles. */
+ WT_RET(__session_dhandle_sweep(session, flags));
+
+ /*
+ * Acquire the schema lock if we don't already hold it, find
+ * and/or open the handle.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_get(session, uri, checkpoint, cfg, flags));
+ WT_RET(ret);
+
+ if (!candidate)
+ WT_RET(__session_add_btree(session, NULL));
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ F_ISSET(session->dhandle, WT_DHANDLE_OPEN));
+ }
+
+ /* Increment the data-source's in-use counter. */
+ __wt_session_dhandle_incr_use(session);
+
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_SET(session->dhandle, LF_ISSET(WT_DHANDLE_DISCARD_CLOSE));
+
+ return (0);
+}
+
+/*
+ * __wt_session_lock_checkpoint --
+ * Lock the btree handle for the given checkpoint name.
+ */
+int
+__wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
+{
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+
+ saved_dhandle = session->dhandle;
+
+ /*
+ * Get the checkpoint handle exclusive, so no one else can access it
+ * while we are creating the new checkpoint.
+ */
+ WT_ERR(__wt_session_get_btree(session, saved_dhandle->name,
+ checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * Flush any pages in this checkpoint from the cache (we are about to
+ * re-write the checkpoint which will mean cached pages no longer have
+ * valid contents). This is especially noticeable with memory-mapped
+ * files, since changes to the underlying file are visible to the
+ * in-memory pages.
+ */
+ WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+
+ /*
+ * We lock checkpoint handles that we are overwriting, so the handle
+ * must be closed when we release it.
+ */
+ dhandle = session->dhandle;
+ F_SET(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_ASSERT(session, WT_META_TRACKING(session));
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /* Restore the original btree in the session. */
+err: session->dhandle = saved_dhandle;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_salvage.c b/src/third_party/wiredtiger/src/session/session_salvage.c
new file mode 100644
index 00000000000..1512c6515ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_salvage.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_salvage --
+ * Salvage a single file.
+ */
+int
+__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CKPT *ckptbase;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+
+ /*
+ * XXX
+ * The salvage process reads and discards previous checkpoints, so the
+ * underlying block manager has to ignore any previous checkpoint
+ * entries when creating a new checkpoint. In other words, we can't use
+ * the metadata checkpoint list: it has all of those checkpoints listed
+ * and we don't care about them. Build a clean checkpoint list and use
+ * it instead.
+ *
+ * Don't first clear the metadata checkpoint list and call the function
+ * to get a list of checkpoints: a crash between clearing the metadata
+ * checkpoint list and creating a new checkpoint list would look like a
+ * create or open of a file without a checkpoint to roll-forward from,
+ * and the contents of the file would be discarded.
+ */
+ WT_RET(__wt_calloc_def(session, 2, &ckptbase));
+ WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase[0].name));
+ F_SET(&ckptbase[0], WT_CKPT_ADD);
+
+ WT_ERR(__wt_bt_salvage(session, ckptbase, cfg));
+
+ /*
+ * If no checkpoint was created, well, it's probably bad news, but there
+ * is nothing to do but clear any recorded checkpoints for the file. If
+ * a checkpoint was created, life is good: replace any existing list of
+ * checkpoints with the single new one.
+ */
+ if (ckptbase[0].raw.data == NULL)
+ WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name));
+ else
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, NULL));
+
+err: __wt_meta_ckptlist_free(session, ckptbase);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/support/cksum.c b/src/third_party/wiredtiger/src/support/cksum.c
new file mode 100644
index 00000000000..1eaa345d1fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/cksum.c
@@ -0,0 +1,1306 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file contains two implementations for computing CRC: one that uses
+ * hardware CRC instructions, available on newer x86_64/amd64, and one that uses
+ * a fast software algorithm. __wt_cksum() provides a common entry point that
+ * indirects to one of these two methods.
+ */
+static uint32_t (*__wt_cksum_func)(const void *chunk, size_t len);
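
The declaration above supports a classic function-pointer dispatch: select the
implementation once at startup, then have __wt_cksum() jump through the
pointer. The initialization sketch below is an assumption for illustration
only, using GCC's __builtin_cpu_supports rather than whatever detection this
file actually performs; __wt_cksum_hw and __wt_cksum_sw stand for the two
implementations.

    /*
     * Hypothetical initialization sketch, not this file's actual detection
     * code: select the hardware CRC32 implementation when SSE4.2 is
     * available, otherwise fall back to the software tables below.
     */
    static uint32_t __wt_cksum_hw(const void *chunk, size_t len);
    static uint32_t __wt_cksum_sw(const void *chunk, size_t len);

    static void
    __cksum_init_example(void)
    {
    #if defined(__GNUC__) && defined(__x86_64__)
        __wt_cksum_func = __builtin_cpu_supports("sse4.2") ?
            __wt_cksum_hw : __wt_cksum_sw;
    #else
        __wt_cksum_func = __wt_cksum_sw;    /* Portable fallback. */
    #endif
    }
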
+
+/*
+ * The CRC slicing tables are used by __wt_cksum_sw.
+ */
+static const uint32_t g_crc_slicing[8][256] = {
+#ifdef WORDS_BIGENDIAN
+ /*
+ * Big endian tables have entries that are byte reversed from little
+ * endian tables.
+ */
+ {
+ 0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013,
+ 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4,
+ 0xcf58d98a, 0xccdbb278, 0x3828e26b, 0x3bab8999,
+ 0xd0cf434d, 0xd34c28bf, 0x27bf78ac, 0x243c135e,
+ 0x6fc75e10, 0x6c4435e2, 0x98b765f1, 0x9b340e03,
+ 0x7050c4d7, 0x73d3af25, 0x8720ff36, 0x84a394c4,
+ 0xa09f879a, 0xa31cec68, 0x57efbc7b, 0x546cd789,
+ 0xbf081d5d, 0xbc8b76af, 0x487826bc, 0x4bfb4d4e,
+ 0xde8ebd20, 0xdd0dd6d2, 0x29fe86c1, 0x2a7ded33,
+ 0xc11927e7, 0xc29a4c15, 0x36691c06, 0x35ea77f4,
+ 0x11d664aa, 0x12550f58, 0xe6a65f4b, 0xe52534b9,
+ 0x0e41fe6d, 0x0dc2959f, 0xf931c58c, 0xfab2ae7e,
+ 0xb149e330, 0xb2ca88c2, 0x4639d8d1, 0x45bab323,
+ 0xaede79f7, 0xad5d1205, 0x59ae4216, 0x5a2d29e4,
+ 0x7e113aba, 0x7d925148, 0x8961015b, 0x8ae26aa9,
+ 0x6186a07d, 0x6205cb8f, 0x96f69b9c, 0x9575f06e,
+ 0xbc1d7b41, 0xbf9e10b3, 0x4b6d40a0, 0x48ee2b52,
+ 0xa38ae186, 0xa0098a74, 0x54fada67, 0x5779b195,
+ 0x7345a2cb, 0x70c6c939, 0x8435992a, 0x87b6f2d8,
+ 0x6cd2380c, 0x6f5153fe, 0x9ba203ed, 0x9821681f,
+ 0xd3da2551, 0xd0594ea3, 0x24aa1eb0, 0x27297542,
+ 0xcc4dbf96, 0xcfced464, 0x3b3d8477, 0x38beef85,
+ 0x1c82fcdb, 0x1f019729, 0xebf2c73a, 0xe871acc8,
+ 0x0315661c, 0x00960dee, 0xf4655dfd, 0xf7e6360f,
+ 0x6293c661, 0x6110ad93, 0x95e3fd80, 0x96609672,
+ 0x7d045ca6, 0x7e873754, 0x8a746747, 0x89f70cb5,
+ 0xadcb1feb, 0xae487419, 0x5abb240a, 0x59384ff8,
+ 0xb25c852c, 0xb1dfeede, 0x452cbecd, 0x46afd53f,
+ 0x0d549871, 0x0ed7f383, 0xfa24a390, 0xf9a7c862,
+ 0x12c302b6, 0x11406944, 0xe5b33957, 0xe63052a5,
+ 0xc20c41fb, 0xc18f2a09, 0x357c7a1a, 0x36ff11e8,
+ 0xdd9bdb3c, 0xde18b0ce, 0x2aebe0dd, 0x29688b2f,
+ 0x783bf682, 0x7bb89d70, 0x8f4bcd63, 0x8cc8a691,
+ 0x67ac6c45, 0x642f07b7, 0x90dc57a4, 0x935f3c56,
+ 0xb7632f08, 0xb4e044fa, 0x401314e9, 0x43907f1b,
+ 0xa8f4b5cf, 0xab77de3d, 0x5f848e2e, 0x5c07e5dc,
+ 0x17fca892, 0x147fc360, 0xe08c9373, 0xe30ff881,
+ 0x086b3255, 0x0be859a7, 0xff1b09b4, 0xfc986246,
+ 0xd8a47118, 0xdb271aea, 0x2fd44af9, 0x2c57210b,
+ 0xc733ebdf, 0xc4b0802d, 0x3043d03e, 0x33c0bbcc,
+ 0xa6b54ba2, 0xa5362050, 0x51c57043, 0x52461bb1,
+ 0xb922d165, 0xbaa1ba97, 0x4e52ea84, 0x4dd18176,
+ 0x69ed9228, 0x6a6ef9da, 0x9e9da9c9, 0x9d1ec23b,
+ 0x767a08ef, 0x75f9631d, 0x810a330e, 0x828958fc,
+ 0xc97215b2, 0xcaf17e40, 0x3e022e53, 0x3d8145a1,
+ 0xd6e58f75, 0xd566e487, 0x2195b494, 0x2216df66,
+ 0x062acc38, 0x05a9a7ca, 0xf15af7d9, 0xf2d99c2b,
+ 0x19bd56ff, 0x1a3e3d0d, 0xeecd6d1e, 0xed4e06ec,
+ 0xc4268dc3, 0xc7a5e631, 0x3356b622, 0x30d5ddd0,
+ 0xdbb11704, 0xd8327cf6, 0x2cc12ce5, 0x2f424717,
+ 0x0b7e5449, 0x08fd3fbb, 0xfc0e6fa8, 0xff8d045a,
+ 0x14e9ce8e, 0x176aa57c, 0xe399f56f, 0xe01a9e9d,
+ 0xabe1d3d3, 0xa862b821, 0x5c91e832, 0x5f1283c0,
+ 0xb4764914, 0xb7f522e6, 0x430672f5, 0x40851907,
+ 0x64b90a59, 0x673a61ab, 0x93c931b8, 0x904a5a4a,
+ 0x7b2e909e, 0x78adfb6c, 0x8c5eab7f, 0x8fddc08d,
+ 0x1aa830e3, 0x192b5b11, 0xedd80b02, 0xee5b60f0,
+ 0x053faa24, 0x06bcc1d6, 0xf24f91c5, 0xf1ccfa37,
+ 0xd5f0e969, 0xd673829b, 0x2280d288, 0x2103b97a,
+ 0xca6773ae, 0xc9e4185c, 0x3d17484f, 0x3e9423bd,
+ 0x756f6ef3, 0x76ec0501, 0x821f5512, 0x819c3ee0,
+ 0x6af8f434, 0x697b9fc6, 0x9d88cfd5, 0x9e0ba427,
+ 0xba37b779, 0xb9b4dc8b, 0x4d478c98, 0x4ec4e76a,
+ 0xa5a02dbe, 0xa623464c, 0x52d0165f, 0x51537dad
+ },{
+ 0x00000000, 0x7798a213, 0xee304527, 0x99a8e734,
+ 0xdc618a4e, 0xabf9285d, 0x3251cf69, 0x45c96d7a,
+ 0xb8c3149d, 0xcf5bb68e, 0x56f351ba, 0x216bf3a9,
+ 0x64a29ed3, 0x133a3cc0, 0x8a92dbf4, 0xfd0a79e7,
+ 0x81f1c53f, 0xf669672c, 0x6fc18018, 0x1859220b,
+ 0x5d904f71, 0x2a08ed62, 0xb3a00a56, 0xc438a845,
+ 0x3932d1a2, 0x4eaa73b1, 0xd7029485, 0xa09a3696,
+ 0xe5535bec, 0x92cbf9ff, 0x0b631ecb, 0x7cfbbcd8,
+ 0x02e38b7f, 0x757b296c, 0xecd3ce58, 0x9b4b6c4b,
+ 0xde820131, 0xa91aa322, 0x30b24416, 0x472ae605,
+ 0xba209fe2, 0xcdb83df1, 0x5410dac5, 0x238878d6,
+ 0x664115ac, 0x11d9b7bf, 0x8871508b, 0xffe9f298,
+ 0x83124e40, 0xf48aec53, 0x6d220b67, 0x1abaa974,
+ 0x5f73c40e, 0x28eb661d, 0xb1438129, 0xc6db233a,
+ 0x3bd15add, 0x4c49f8ce, 0xd5e11ffa, 0xa279bde9,
+ 0xe7b0d093, 0x90287280, 0x098095b4, 0x7e1837a7,
+ 0x04c617ff, 0x735eb5ec, 0xeaf652d8, 0x9d6ef0cb,
+ 0xd8a79db1, 0xaf3f3fa2, 0x3697d896, 0x410f7a85,
+ 0xbc050362, 0xcb9da171, 0x52354645, 0x25ade456,
+ 0x6064892c, 0x17fc2b3f, 0x8e54cc0b, 0xf9cc6e18,
+ 0x8537d2c0, 0xf2af70d3, 0x6b0797e7, 0x1c9f35f4,
+ 0x5956588e, 0x2ecefa9d, 0xb7661da9, 0xc0febfba,
+ 0x3df4c65d, 0x4a6c644e, 0xd3c4837a, 0xa45c2169,
+ 0xe1954c13, 0x960dee00, 0x0fa50934, 0x783dab27,
+ 0x06259c80, 0x71bd3e93, 0xe815d9a7, 0x9f8d7bb4,
+ 0xda4416ce, 0xaddcb4dd, 0x347453e9, 0x43ecf1fa,
+ 0xbee6881d, 0xc97e2a0e, 0x50d6cd3a, 0x274e6f29,
+ 0x62870253, 0x151fa040, 0x8cb74774, 0xfb2fe567,
+ 0x87d459bf, 0xf04cfbac, 0x69e41c98, 0x1e7cbe8b,
+ 0x5bb5d3f1, 0x2c2d71e2, 0xb58596d6, 0xc21d34c5,
+ 0x3f174d22, 0x488fef31, 0xd1270805, 0xa6bfaa16,
+ 0xe376c76c, 0x94ee657f, 0x0d46824b, 0x7ade2058,
+ 0xf9fac3fb, 0x8e6261e8, 0x17ca86dc, 0x605224cf,
+ 0x259b49b5, 0x5203eba6, 0xcbab0c92, 0xbc33ae81,
+ 0x4139d766, 0x36a17575, 0xaf099241, 0xd8913052,
+ 0x9d585d28, 0xeac0ff3b, 0x7368180f, 0x04f0ba1c,
+ 0x780b06c4, 0x0f93a4d7, 0x963b43e3, 0xe1a3e1f0,
+ 0xa46a8c8a, 0xd3f22e99, 0x4a5ac9ad, 0x3dc26bbe,
+ 0xc0c81259, 0xb750b04a, 0x2ef8577e, 0x5960f56d,
+ 0x1ca99817, 0x6b313a04, 0xf299dd30, 0x85017f23,
+ 0xfb194884, 0x8c81ea97, 0x15290da3, 0x62b1afb0,
+ 0x2778c2ca, 0x50e060d9, 0xc94887ed, 0xbed025fe,
+ 0x43da5c19, 0x3442fe0a, 0xadea193e, 0xda72bb2d,
+ 0x9fbbd657, 0xe8237444, 0x718b9370, 0x06133163,
+ 0x7ae88dbb, 0x0d702fa8, 0x94d8c89c, 0xe3406a8f,
+ 0xa68907f5, 0xd111a5e6, 0x48b942d2, 0x3f21e0c1,
+ 0xc22b9926, 0xb5b33b35, 0x2c1bdc01, 0x5b837e12,
+ 0x1e4a1368, 0x69d2b17b, 0xf07a564f, 0x87e2f45c,
+ 0xfd3cd404, 0x8aa47617, 0x130c9123, 0x64943330,
+ 0x215d5e4a, 0x56c5fc59, 0xcf6d1b6d, 0xb8f5b97e,
+ 0x45ffc099, 0x3267628a, 0xabcf85be, 0xdc5727ad,
+ 0x999e4ad7, 0xee06e8c4, 0x77ae0ff0, 0x0036ade3,
+ 0x7ccd113b, 0x0b55b328, 0x92fd541c, 0xe565f60f,
+ 0xa0ac9b75, 0xd7343966, 0x4e9cde52, 0x39047c41,
+ 0xc40e05a6, 0xb396a7b5, 0x2a3e4081, 0x5da6e292,
+ 0x186f8fe8, 0x6ff72dfb, 0xf65fcacf, 0x81c768dc,
+ 0xffdf5f7b, 0x8847fd68, 0x11ef1a5c, 0x6677b84f,
+ 0x23bed535, 0x54267726, 0xcd8e9012, 0xba163201,
+ 0x471c4be6, 0x3084e9f5, 0xa92c0ec1, 0xdeb4acd2,
+ 0x9b7dc1a8, 0xece563bb, 0x754d848f, 0x02d5269c,
+ 0x7e2e9a44, 0x09b63857, 0x901edf63, 0xe7867d70,
+ 0xa24f100a, 0xd5d7b219, 0x4c7f552d, 0x3be7f73e,
+ 0xc6ed8ed9, 0xb1752cca, 0x28ddcbfe, 0x5f4569ed,
+ 0x1a8c0497, 0x6d14a684, 0xf4bc41b0, 0x8324e3a3
+ },{
+ 0x00000000, 0x7e9241a5, 0x0d526f4f, 0x73c02eea,
+ 0x1aa4de9e, 0x64369f3b, 0x17f6b1d1, 0x6964f074,
+ 0xc53e5138, 0xbbac109d, 0xc86c3e77, 0xb6fe7fd2,
+ 0xdf9a8fa6, 0xa108ce03, 0xd2c8e0e9, 0xac5aa14c,
+ 0x8a7da270, 0xf4efe3d5, 0x872fcd3f, 0xf9bd8c9a,
+ 0x90d97cee, 0xee4b3d4b, 0x9d8b13a1, 0xe3195204,
+ 0x4f43f348, 0x31d1b2ed, 0x42119c07, 0x3c83dda2,
+ 0x55e72dd6, 0x2b756c73, 0x58b54299, 0x2627033c,
+ 0x14fb44e1, 0x6a690544, 0x19a92bae, 0x673b6a0b,
+ 0x0e5f9a7f, 0x70cddbda, 0x030df530, 0x7d9fb495,
+ 0xd1c515d9, 0xaf57547c, 0xdc977a96, 0xa2053b33,
+ 0xcb61cb47, 0xb5f38ae2, 0xc633a408, 0xb8a1e5ad,
+ 0x9e86e691, 0xe014a734, 0x93d489de, 0xed46c87b,
+ 0x8422380f, 0xfab079aa, 0x89705740, 0xf7e216e5,
+ 0x5bb8b7a9, 0x252af60c, 0x56ead8e6, 0x28789943,
+ 0x411c6937, 0x3f8e2892, 0x4c4e0678, 0x32dc47dd,
+ 0xd98065c7, 0xa7122462, 0xd4d20a88, 0xaa404b2d,
+ 0xc324bb59, 0xbdb6fafc, 0xce76d416, 0xb0e495b3,
+ 0x1cbe34ff, 0x622c755a, 0x11ec5bb0, 0x6f7e1a15,
+ 0x061aea61, 0x7888abc4, 0x0b48852e, 0x75dac48b,
+ 0x53fdc7b7, 0x2d6f8612, 0x5eafa8f8, 0x203de95d,
+ 0x49591929, 0x37cb588c, 0x440b7666, 0x3a9937c3,
+ 0x96c3968f, 0xe851d72a, 0x9b91f9c0, 0xe503b865,
+ 0x8c674811, 0xf2f509b4, 0x8135275e, 0xffa766fb,
+ 0xcd7b2126, 0xb3e96083, 0xc0294e69, 0xbebb0fcc,
+ 0xd7dfffb8, 0xa94dbe1d, 0xda8d90f7, 0xa41fd152,
+ 0x0845701e, 0x76d731bb, 0x05171f51, 0x7b855ef4,
+ 0x12e1ae80, 0x6c73ef25, 0x1fb3c1cf, 0x6121806a,
+ 0x47068356, 0x3994c2f3, 0x4a54ec19, 0x34c6adbc,
+ 0x5da25dc8, 0x23301c6d, 0x50f03287, 0x2e627322,
+ 0x8238d26e, 0xfcaa93cb, 0x8f6abd21, 0xf1f8fc84,
+ 0x989c0cf0, 0xe60e4d55, 0x95ce63bf, 0xeb5c221a,
+ 0x4377278b, 0x3de5662e, 0x4e2548c4, 0x30b70961,
+ 0x59d3f915, 0x2741b8b0, 0x5481965a, 0x2a13d7ff,
+ 0x864976b3, 0xf8db3716, 0x8b1b19fc, 0xf5895859,
+ 0x9ceda82d, 0xe27fe988, 0x91bfc762, 0xef2d86c7,
+ 0xc90a85fb, 0xb798c45e, 0xc458eab4, 0xbacaab11,
+ 0xd3ae5b65, 0xad3c1ac0, 0xdefc342a, 0xa06e758f,
+ 0x0c34d4c3, 0x72a69566, 0x0166bb8c, 0x7ff4fa29,
+ 0x16900a5d, 0x68024bf8, 0x1bc26512, 0x655024b7,
+ 0x578c636a, 0x291e22cf, 0x5ade0c25, 0x244c4d80,
+ 0x4d28bdf4, 0x33bafc51, 0x407ad2bb, 0x3ee8931e,
+ 0x92b23252, 0xec2073f7, 0x9fe05d1d, 0xe1721cb8,
+ 0x8816eccc, 0xf684ad69, 0x85448383, 0xfbd6c226,
+ 0xddf1c11a, 0xa36380bf, 0xd0a3ae55, 0xae31eff0,
+ 0xc7551f84, 0xb9c75e21, 0xca0770cb, 0xb495316e,
+ 0x18cf9022, 0x665dd187, 0x159dff6d, 0x6b0fbec8,
+ 0x026b4ebc, 0x7cf90f19, 0x0f3921f3, 0x71ab6056,
+ 0x9af7424c, 0xe46503e9, 0x97a52d03, 0xe9376ca6,
+ 0x80539cd2, 0xfec1dd77, 0x8d01f39d, 0xf393b238,
+ 0x5fc91374, 0x215b52d1, 0x529b7c3b, 0x2c093d9e,
+ 0x456dcdea, 0x3bff8c4f, 0x483fa2a5, 0x36ade300,
+ 0x108ae03c, 0x6e18a199, 0x1dd88f73, 0x634aced6,
+ 0x0a2e3ea2, 0x74bc7f07, 0x077c51ed, 0x79ee1048,
+ 0xd5b4b104, 0xab26f0a1, 0xd8e6de4b, 0xa6749fee,
+ 0xcf106f9a, 0xb1822e3f, 0xc24200d5, 0xbcd04170,
+ 0x8e0c06ad, 0xf09e4708, 0x835e69e2, 0xfdcc2847,
+ 0x94a8d833, 0xea3a9996, 0x99fab77c, 0xe768f6d9,
+ 0x4b325795, 0x35a01630, 0x466038da, 0x38f2797f,
+ 0x5196890b, 0x2f04c8ae, 0x5cc4e644, 0x2256a7e1,
+ 0x0471a4dd, 0x7ae3e578, 0x0923cb92, 0x77b18a37,
+ 0x1ed57a43, 0x60473be6, 0x1387150c, 0x6d1554a9,
+ 0xc14ff5e5, 0xbfddb440, 0xcc1d9aaa, 0xb28fdb0f,
+ 0xdbeb2b7b, 0xa5796ade, 0xd6b94434, 0xa82b0591
+ },{
+ 0x00000000, 0xb8aa45dd, 0x812367bf, 0x39892262,
+ 0xf331227b, 0x4b9b67a6, 0x721245c4, 0xcab80019,
+ 0xe66344f6, 0x5ec9012b, 0x67402349, 0xdfea6694,
+ 0x1552668d, 0xadf82350, 0x94710132, 0x2cdb44ef,
+ 0x3db164e9, 0x851b2134, 0xbc920356, 0x0438468b,
+ 0xce804692, 0x762a034f, 0x4fa3212d, 0xf70964f0,
+ 0xdbd2201f, 0x637865c2, 0x5af147a0, 0xe25b027d,
+ 0x28e30264, 0x904947b9, 0xa9c065db, 0x116a2006,
+ 0x8b1425d7, 0x33be600a, 0x0a374268, 0xb29d07b5,
+ 0x782507ac, 0xc08f4271, 0xf9066013, 0x41ac25ce,
+ 0x6d776121, 0xd5dd24fc, 0xec54069e, 0x54fe4343,
+ 0x9e46435a, 0x26ec0687, 0x1f6524e5, 0xa7cf6138,
+ 0xb6a5413e, 0x0e0f04e3, 0x37862681, 0x8f2c635c,
+ 0x45946345, 0xfd3e2698, 0xc4b704fa, 0x7c1d4127,
+ 0x50c605c8, 0xe86c4015, 0xd1e56277, 0x694f27aa,
+ 0xa3f727b3, 0x1b5d626e, 0x22d4400c, 0x9a7e05d1,
+ 0xe75fa6ab, 0x5ff5e376, 0x667cc114, 0xded684c9,
+ 0x146e84d0, 0xacc4c10d, 0x954de36f, 0x2de7a6b2,
+ 0x013ce25d, 0xb996a780, 0x801f85e2, 0x38b5c03f,
+ 0xf20dc026, 0x4aa785fb, 0x732ea799, 0xcb84e244,
+ 0xdaeec242, 0x6244879f, 0x5bcda5fd, 0xe367e020,
+ 0x29dfe039, 0x9175a5e4, 0xa8fc8786, 0x1056c25b,
+ 0x3c8d86b4, 0x8427c369, 0xbdaee10b, 0x0504a4d6,
+ 0xcfbca4cf, 0x7716e112, 0x4e9fc370, 0xf63586ad,
+ 0x6c4b837c, 0xd4e1c6a1, 0xed68e4c3, 0x55c2a11e,
+ 0x9f7aa107, 0x27d0e4da, 0x1e59c6b8, 0xa6f38365,
+ 0x8a28c78a, 0x32828257, 0x0b0ba035, 0xb3a1e5e8,
+ 0x7919e5f1, 0xc1b3a02c, 0xf83a824e, 0x4090c793,
+ 0x51fae795, 0xe950a248, 0xd0d9802a, 0x6873c5f7,
+ 0xa2cbc5ee, 0x1a618033, 0x23e8a251, 0x9b42e78c,
+ 0xb799a363, 0x0f33e6be, 0x36bac4dc, 0x8e108101,
+ 0x44a88118, 0xfc02c4c5, 0xc58be6a7, 0x7d21a37a,
+ 0x3fc9a052, 0x8763e58f, 0xbeeac7ed, 0x06408230,
+ 0xccf88229, 0x7452c7f4, 0x4ddbe596, 0xf571a04b,
+ 0xd9aae4a4, 0x6100a179, 0x5889831b, 0xe023c6c6,
+ 0x2a9bc6df, 0x92318302, 0xabb8a160, 0x1312e4bd,
+ 0x0278c4bb, 0xbad28166, 0x835ba304, 0x3bf1e6d9,
+ 0xf149e6c0, 0x49e3a31d, 0x706a817f, 0xc8c0c4a2,
+ 0xe41b804d, 0x5cb1c590, 0x6538e7f2, 0xdd92a22f,
+ 0x172aa236, 0xaf80e7eb, 0x9609c589, 0x2ea38054,
+ 0xb4dd8585, 0x0c77c058, 0x35fee23a, 0x8d54a7e7,
+ 0x47eca7fe, 0xff46e223, 0xc6cfc041, 0x7e65859c,
+ 0x52bec173, 0xea1484ae, 0xd39da6cc, 0x6b37e311,
+ 0xa18fe308, 0x1925a6d5, 0x20ac84b7, 0x9806c16a,
+ 0x896ce16c, 0x31c6a4b1, 0x084f86d3, 0xb0e5c30e,
+ 0x7a5dc317, 0xc2f786ca, 0xfb7ea4a8, 0x43d4e175,
+ 0x6f0fa59a, 0xd7a5e047, 0xee2cc225, 0x568687f8,
+ 0x9c3e87e1, 0x2494c23c, 0x1d1de05e, 0xa5b7a583,
+ 0xd89606f9, 0x603c4324, 0x59b56146, 0xe11f249b,
+ 0x2ba72482, 0x930d615f, 0xaa84433d, 0x122e06e0,
+ 0x3ef5420f, 0x865f07d2, 0xbfd625b0, 0x077c606d,
+ 0xcdc46074, 0x756e25a9, 0x4ce707cb, 0xf44d4216,
+ 0xe5276210, 0x5d8d27cd, 0x640405af, 0xdcae4072,
+ 0x1616406b, 0xaebc05b6, 0x973527d4, 0x2f9f6209,
+ 0x034426e6, 0xbbee633b, 0x82674159, 0x3acd0484,
+ 0xf075049d, 0x48df4140, 0x71566322, 0xc9fc26ff,
+ 0x5382232e, 0xeb2866f3, 0xd2a14491, 0x6a0b014c,
+ 0xa0b30155, 0x18194488, 0x219066ea, 0x993a2337,
+ 0xb5e167d8, 0x0d4b2205, 0x34c20067, 0x8c6845ba,
+ 0x46d045a3, 0xfe7a007e, 0xc7f3221c, 0x7f5967c1,
+ 0x6e3347c7, 0xd699021a, 0xef102078, 0x57ba65a5,
+ 0x9d0265bc, 0x25a82061, 0x1c210203, 0xa48b47de,
+ 0x88500331, 0x30fa46ec, 0x0973648e, 0xb1d92153,
+ 0x7b61214a, 0xc3cb6497, 0xfa4246f5, 0x42e80328
+ },{
+ 0x00000000, 0xac6f1138, 0x58df2270, 0xf4b03348,
+ 0xb0be45e0, 0x1cd154d8, 0xe8616790, 0x440e76a8,
+ 0x910b67c5, 0x3d6476fd, 0xc9d445b5, 0x65bb548d,
+ 0x21b52225, 0x8dda331d, 0x796a0055, 0xd505116d,
+ 0xd361228f, 0x7f0e33b7, 0x8bbe00ff, 0x27d111c7,
+ 0x63df676f, 0xcfb07657, 0x3b00451f, 0x976f5427,
+ 0x426a454a, 0xee055472, 0x1ab5673a, 0xb6da7602,
+ 0xf2d400aa, 0x5ebb1192, 0xaa0b22da, 0x066433e2,
+ 0x57b5a81b, 0xfbdab923, 0x0f6a8a6b, 0xa3059b53,
+ 0xe70bedfb, 0x4b64fcc3, 0xbfd4cf8b, 0x13bbdeb3,
+ 0xc6becfde, 0x6ad1dee6, 0x9e61edae, 0x320efc96,
+ 0x76008a3e, 0xda6f9b06, 0x2edfa84e, 0x82b0b976,
+ 0x84d48a94, 0x28bb9bac, 0xdc0ba8e4, 0x7064b9dc,
+ 0x346acf74, 0x9805de4c, 0x6cb5ed04, 0xc0dafc3c,
+ 0x15dfed51, 0xb9b0fc69, 0x4d00cf21, 0xe16fde19,
+ 0xa561a8b1, 0x090eb989, 0xfdbe8ac1, 0x51d19bf9,
+ 0xae6a5137, 0x0205400f, 0xf6b57347, 0x5ada627f,
+ 0x1ed414d7, 0xb2bb05ef, 0x460b36a7, 0xea64279f,
+ 0x3f6136f2, 0x930e27ca, 0x67be1482, 0xcbd105ba,
+ 0x8fdf7312, 0x23b0622a, 0xd7005162, 0x7b6f405a,
+ 0x7d0b73b8, 0xd1646280, 0x25d451c8, 0x89bb40f0,
+ 0xcdb53658, 0x61da2760, 0x956a1428, 0x39050510,
+ 0xec00147d, 0x406f0545, 0xb4df360d, 0x18b02735,
+ 0x5cbe519d, 0xf0d140a5, 0x046173ed, 0xa80e62d5,
+ 0xf9dff92c, 0x55b0e814, 0xa100db5c, 0x0d6fca64,
+ 0x4961bccc, 0xe50eadf4, 0x11be9ebc, 0xbdd18f84,
+ 0x68d49ee9, 0xc4bb8fd1, 0x300bbc99, 0x9c64ada1,
+ 0xd86adb09, 0x7405ca31, 0x80b5f979, 0x2cdae841,
+ 0x2abedba3, 0x86d1ca9b, 0x7261f9d3, 0xde0ee8eb,
+ 0x9a009e43, 0x366f8f7b, 0xc2dfbc33, 0x6eb0ad0b,
+ 0xbbb5bc66, 0x17daad5e, 0xe36a9e16, 0x4f058f2e,
+ 0x0b0bf986, 0xa764e8be, 0x53d4dbf6, 0xffbbcace,
+ 0x5cd5a26e, 0xf0bab356, 0x040a801e, 0xa8659126,
+ 0xec6be78e, 0x4004f6b6, 0xb4b4c5fe, 0x18dbd4c6,
+ 0xcddec5ab, 0x61b1d493, 0x9501e7db, 0x396ef6e3,
+ 0x7d60804b, 0xd10f9173, 0x25bfa23b, 0x89d0b303,
+ 0x8fb480e1, 0x23db91d9, 0xd76ba291, 0x7b04b3a9,
+ 0x3f0ac501, 0x9365d439, 0x67d5e771, 0xcbbaf649,
+ 0x1ebfe724, 0xb2d0f61c, 0x4660c554, 0xea0fd46c,
+ 0xae01a2c4, 0x026eb3fc, 0xf6de80b4, 0x5ab1918c,
+ 0x0b600a75, 0xa70f1b4d, 0x53bf2805, 0xffd0393d,
+ 0xbbde4f95, 0x17b15ead, 0xe3016de5, 0x4f6e7cdd,
+ 0x9a6b6db0, 0x36047c88, 0xc2b44fc0, 0x6edb5ef8,
+ 0x2ad52850, 0x86ba3968, 0x720a0a20, 0xde651b18,
+ 0xd80128fa, 0x746e39c2, 0x80de0a8a, 0x2cb11bb2,
+ 0x68bf6d1a, 0xc4d07c22, 0x30604f6a, 0x9c0f5e52,
+ 0x490a4f3f, 0xe5655e07, 0x11d56d4f, 0xbdba7c77,
+ 0xf9b40adf, 0x55db1be7, 0xa16b28af, 0x0d043997,
+ 0xf2bff359, 0x5ed0e261, 0xaa60d129, 0x060fc011,
+ 0x4201b6b9, 0xee6ea781, 0x1ade94c9, 0xb6b185f1,
+ 0x63b4949c, 0xcfdb85a4, 0x3b6bb6ec, 0x9704a7d4,
+ 0xd30ad17c, 0x7f65c044, 0x8bd5f30c, 0x27bae234,
+ 0x21ded1d6, 0x8db1c0ee, 0x7901f3a6, 0xd56ee29e,
+ 0x91609436, 0x3d0f850e, 0xc9bfb646, 0x65d0a77e,
+ 0xb0d5b613, 0x1cbaa72b, 0xe80a9463, 0x4465855b,
+ 0x006bf3f3, 0xac04e2cb, 0x58b4d183, 0xf4dbc0bb,
+ 0xa50a5b42, 0x09654a7a, 0xfdd57932, 0x51ba680a,
+ 0x15b41ea2, 0xb9db0f9a, 0x4d6b3cd2, 0xe1042dea,
+ 0x34013c87, 0x986e2dbf, 0x6cde1ef7, 0xc0b10fcf,
+ 0x84bf7967, 0x28d0685f, 0xdc605b17, 0x700f4a2f,
+ 0x766b79cd, 0xda0468f5, 0x2eb45bbd, 0x82db4a85,
+ 0xc6d53c2d, 0x6aba2d15, 0x9e0a1e5d, 0x32650f65,
+ 0xe7601e08, 0x4b0f0f30, 0xbfbf3c78, 0x13d02d40,
+ 0x57de5be8, 0xfbb14ad0, 0x0f017998, 0xa36e68a0
+ },{
+ 0x00000000, 0x196b30ef, 0xc3a08cdb, 0xdacbbc34,
+ 0x7737f5b2, 0x6e5cc55d, 0xb4977969, 0xadfc4986,
+ 0x1f180660, 0x0673368f, 0xdcb88abb, 0xc5d3ba54,
+ 0x682ff3d2, 0x7144c33d, 0xab8f7f09, 0xb2e44fe6,
+ 0x3e300cc0, 0x275b3c2f, 0xfd90801b, 0xe4fbb0f4,
+ 0x4907f972, 0x506cc99d, 0x8aa775a9, 0x93cc4546,
+ 0x21280aa0, 0x38433a4f, 0xe288867b, 0xfbe3b694,
+ 0x561fff12, 0x4f74cffd, 0x95bf73c9, 0x8cd44326,
+ 0x8d16f485, 0x947dc46a, 0x4eb6785e, 0x57dd48b1,
+ 0xfa210137, 0xe34a31d8, 0x39818dec, 0x20eabd03,
+ 0x920ef2e5, 0x8b65c20a, 0x51ae7e3e, 0x48c54ed1,
+ 0xe5390757, 0xfc5237b8, 0x26998b8c, 0x3ff2bb63,
+ 0xb326f845, 0xaa4dc8aa, 0x7086749e, 0x69ed4471,
+ 0xc4110df7, 0xdd7a3d18, 0x07b1812c, 0x1edab1c3,
+ 0xac3efe25, 0xb555ceca, 0x6f9e72fe, 0x76f54211,
+ 0xdb090b97, 0xc2623b78, 0x18a9874c, 0x01c2b7a3,
+ 0xeb5b040e, 0xf23034e1, 0x28fb88d5, 0x3190b83a,
+ 0x9c6cf1bc, 0x8507c153, 0x5fcc7d67, 0x46a74d88,
+ 0xf443026e, 0xed283281, 0x37e38eb5, 0x2e88be5a,
+ 0x8374f7dc, 0x9a1fc733, 0x40d47b07, 0x59bf4be8,
+ 0xd56b08ce, 0xcc003821, 0x16cb8415, 0x0fa0b4fa,
+ 0xa25cfd7c, 0xbb37cd93, 0x61fc71a7, 0x78974148,
+ 0xca730eae, 0xd3183e41, 0x09d38275, 0x10b8b29a,
+ 0xbd44fb1c, 0xa42fcbf3, 0x7ee477c7, 0x678f4728,
+ 0x664df08b, 0x7f26c064, 0xa5ed7c50, 0xbc864cbf,
+ 0x117a0539, 0x081135d6, 0xd2da89e2, 0xcbb1b90d,
+ 0x7955f6eb, 0x603ec604, 0xbaf57a30, 0xa39e4adf,
+ 0x0e620359, 0x170933b6, 0xcdc28f82, 0xd4a9bf6d,
+ 0x587dfc4b, 0x4116cca4, 0x9bdd7090, 0x82b6407f,
+ 0x2f4a09f9, 0x36213916, 0xecea8522, 0xf581b5cd,
+ 0x4765fa2b, 0x5e0ecac4, 0x84c576f0, 0x9dae461f,
+ 0x30520f99, 0x29393f76, 0xf3f28342, 0xea99b3ad,
+ 0xd6b7081c, 0xcfdc38f3, 0x151784c7, 0x0c7cb428,
+ 0xa180fdae, 0xb8ebcd41, 0x62207175, 0x7b4b419a,
+ 0xc9af0e7c, 0xd0c43e93, 0x0a0f82a7, 0x1364b248,
+ 0xbe98fbce, 0xa7f3cb21, 0x7d387715, 0x645347fa,
+ 0xe88704dc, 0xf1ec3433, 0x2b278807, 0x324cb8e8,
+ 0x9fb0f16e, 0x86dbc181, 0x5c107db5, 0x457b4d5a,
+ 0xf79f02bc, 0xeef43253, 0x343f8e67, 0x2d54be88,
+ 0x80a8f70e, 0x99c3c7e1, 0x43087bd5, 0x5a634b3a,
+ 0x5ba1fc99, 0x42cacc76, 0x98017042, 0x816a40ad,
+ 0x2c96092b, 0x35fd39c4, 0xef3685f0, 0xf65db51f,
+ 0x44b9faf9, 0x5dd2ca16, 0x87197622, 0x9e7246cd,
+ 0x338e0f4b, 0x2ae53fa4, 0xf02e8390, 0xe945b37f,
+ 0x6591f059, 0x7cfac0b6, 0xa6317c82, 0xbf5a4c6d,
+ 0x12a605eb, 0x0bcd3504, 0xd1068930, 0xc86db9df,
+ 0x7a89f639, 0x63e2c6d6, 0xb9297ae2, 0xa0424a0d,
+ 0x0dbe038b, 0x14d53364, 0xce1e8f50, 0xd775bfbf,
+ 0x3dec0c12, 0x24873cfd, 0xfe4c80c9, 0xe727b026,
+ 0x4adbf9a0, 0x53b0c94f, 0x897b757b, 0x90104594,
+ 0x22f40a72, 0x3b9f3a9d, 0xe15486a9, 0xf83fb646,
+ 0x55c3ffc0, 0x4ca8cf2f, 0x9663731b, 0x8f0843f4,
+ 0x03dc00d2, 0x1ab7303d, 0xc07c8c09, 0xd917bce6,
+ 0x74ebf560, 0x6d80c58f, 0xb74b79bb, 0xae204954,
+ 0x1cc406b2, 0x05af365d, 0xdf648a69, 0xc60fba86,
+ 0x6bf3f300, 0x7298c3ef, 0xa8537fdb, 0xb1384f34,
+ 0xb0faf897, 0xa991c878, 0x735a744c, 0x6a3144a3,
+ 0xc7cd0d25, 0xdea63dca, 0x046d81fe, 0x1d06b111,
+ 0xafe2fef7, 0xb689ce18, 0x6c42722c, 0x752942c3,
+ 0xd8d50b45, 0xc1be3baa, 0x1b75879e, 0x021eb771,
+ 0x8ecaf457, 0x97a1c4b8, 0x4d6a788c, 0x54014863,
+ 0xf9fd01e5, 0xe096310a, 0x3a5d8d3e, 0x2336bdd1,
+ 0x91d2f237, 0x88b9c2d8, 0x52727eec, 0x4b194e03,
+ 0xe6e50785, 0xff8e376a, 0x25458b5e, 0x3c2ebbb1
+ },{
+ 0x00000000, 0xc82c0368, 0x905906d0, 0x587505b8,
+ 0xd1c5e0a5, 0x19e9e3cd, 0x419ce675, 0x89b0e51d,
+ 0x53fd2d4e, 0x9bd12e26, 0xc3a42b9e, 0x0b8828f6,
+ 0x8238cdeb, 0x4a14ce83, 0x1261cb3b, 0xda4dc853,
+ 0xa6fa5b9c, 0x6ed658f4, 0x36a35d4c, 0xfe8f5e24,
+ 0x773fbb39, 0xbf13b851, 0xe766bde9, 0x2f4abe81,
+ 0xf50776d2, 0x3d2b75ba, 0x655e7002, 0xad72736a,
+ 0x24c29677, 0xecee951f, 0xb49b90a7, 0x7cb793cf,
+ 0xbd835b3d, 0x75af5855, 0x2dda5ded, 0xe5f65e85,
+ 0x6c46bb98, 0xa46ab8f0, 0xfc1fbd48, 0x3433be20,
+ 0xee7e7673, 0x2652751b, 0x7e2770a3, 0xb60b73cb,
+ 0x3fbb96d6, 0xf79795be, 0xafe29006, 0x67ce936e,
+ 0x1b7900a1, 0xd35503c9, 0x8b200671, 0x430c0519,
+ 0xcabce004, 0x0290e36c, 0x5ae5e6d4, 0x92c9e5bc,
+ 0x48842def, 0x80a82e87, 0xd8dd2b3f, 0x10f12857,
+ 0x9941cd4a, 0x516dce22, 0x0918cb9a, 0xc134c8f2,
+ 0x7a07b77a, 0xb22bb412, 0xea5eb1aa, 0x2272b2c2,
+ 0xabc257df, 0x63ee54b7, 0x3b9b510f, 0xf3b75267,
+ 0x29fa9a34, 0xe1d6995c, 0xb9a39ce4, 0x718f9f8c,
+ 0xf83f7a91, 0x301379f9, 0x68667c41, 0xa04a7f29,
+ 0xdcfdece6, 0x14d1ef8e, 0x4ca4ea36, 0x8488e95e,
+ 0x0d380c43, 0xc5140f2b, 0x9d610a93, 0x554d09fb,
+ 0x8f00c1a8, 0x472cc2c0, 0x1f59c778, 0xd775c410,
+ 0x5ec5210d, 0x96e92265, 0xce9c27dd, 0x06b024b5,
+ 0xc784ec47, 0x0fa8ef2f, 0x57ddea97, 0x9ff1e9ff,
+ 0x16410ce2, 0xde6d0f8a, 0x86180a32, 0x4e34095a,
+ 0x9479c109, 0x5c55c261, 0x0420c7d9, 0xcc0cc4b1,
+ 0x45bc21ac, 0x8d9022c4, 0xd5e5277c, 0x1dc92414,
+ 0x617eb7db, 0xa952b4b3, 0xf127b10b, 0x390bb263,
+ 0xb0bb577e, 0x78975416, 0x20e251ae, 0xe8ce52c6,
+ 0x32839a95, 0xfaaf99fd, 0xa2da9c45, 0x6af69f2d,
+ 0xe3467a30, 0x2b6a7958, 0x731f7ce0, 0xbb337f88,
+ 0xf40e6ef5, 0x3c226d9d, 0x64576825, 0xac7b6b4d,
+ 0x25cb8e50, 0xede78d38, 0xb5928880, 0x7dbe8be8,
+ 0xa7f343bb, 0x6fdf40d3, 0x37aa456b, 0xff864603,
+ 0x7636a31e, 0xbe1aa076, 0xe66fa5ce, 0x2e43a6a6,
+ 0x52f43569, 0x9ad83601, 0xc2ad33b9, 0x0a8130d1,
+ 0x8331d5cc, 0x4b1dd6a4, 0x1368d31c, 0xdb44d074,
+ 0x01091827, 0xc9251b4f, 0x91501ef7, 0x597c1d9f,
+ 0xd0ccf882, 0x18e0fbea, 0x4095fe52, 0x88b9fd3a,
+ 0x498d35c8, 0x81a136a0, 0xd9d43318, 0x11f83070,
+ 0x9848d56d, 0x5064d605, 0x0811d3bd, 0xc03dd0d5,
+ 0x1a701886, 0xd25c1bee, 0x8a291e56, 0x42051d3e,
+ 0xcbb5f823, 0x0399fb4b, 0x5becfef3, 0x93c0fd9b,
+ 0xef776e54, 0x275b6d3c, 0x7f2e6884, 0xb7026bec,
+ 0x3eb28ef1, 0xf69e8d99, 0xaeeb8821, 0x66c78b49,
+ 0xbc8a431a, 0x74a64072, 0x2cd345ca, 0xe4ff46a2,
+ 0x6d4fa3bf, 0xa563a0d7, 0xfd16a56f, 0x353aa607,
+ 0x8e09d98f, 0x4625dae7, 0x1e50df5f, 0xd67cdc37,
+ 0x5fcc392a, 0x97e03a42, 0xcf953ffa, 0x07b93c92,
+ 0xddf4f4c1, 0x15d8f7a9, 0x4dadf211, 0x8581f179,
+ 0x0c311464, 0xc41d170c, 0x9c6812b4, 0x544411dc,
+ 0x28f38213, 0xe0df817b, 0xb8aa84c3, 0x708687ab,
+ 0xf93662b6, 0x311a61de, 0x696f6466, 0xa143670e,
+ 0x7b0eaf5d, 0xb322ac35, 0xeb57a98d, 0x237baae5,
+ 0xaacb4ff8, 0x62e74c90, 0x3a924928, 0xf2be4a40,
+ 0x338a82b2, 0xfba681da, 0xa3d38462, 0x6bff870a,
+ 0xe24f6217, 0x2a63617f, 0x721664c7, 0xba3a67af,
+ 0x6077affc, 0xa85bac94, 0xf02ea92c, 0x3802aa44,
+ 0xb1b24f59, 0x799e4c31, 0x21eb4989, 0xe9c74ae1,
+ 0x9570d92e, 0x5d5cda46, 0x0529dffe, 0xcd05dc96,
+ 0x44b5398b, 0x8c993ae3, 0xd4ec3f5b, 0x1cc03c33,
+ 0xc68df460, 0x0ea1f708, 0x56d4f2b0, 0x9ef8f1d8,
+ 0x174814c5, 0xdf6417ad, 0x87111215, 0x4f3d117d
+ },{
+ 0x00000000, 0x277d3c49, 0x4efa7892, 0x698744db,
+ 0x6d821d21, 0x4aff2168, 0x237865b3, 0x040559fa,
+ 0xda043b42, 0xfd79070b, 0x94fe43d0, 0xb3837f99,
+ 0xb7862663, 0x90fb1a2a, 0xf97c5ef1, 0xde0162b8,
+ 0xb4097684, 0x93744acd, 0xfaf30e16, 0xdd8e325f,
+ 0xd98b6ba5, 0xfef657ec, 0x97711337, 0xb00c2f7e,
+ 0x6e0d4dc6, 0x4970718f, 0x20f73554, 0x078a091d,
+ 0x038f50e7, 0x24f26cae, 0x4d752875, 0x6a08143c,
+ 0x9965000d, 0xbe183c44, 0xd79f789f, 0xf0e244d6,
+ 0xf4e71d2c, 0xd39a2165, 0xba1d65be, 0x9d6059f7,
+ 0x43613b4f, 0x641c0706, 0x0d9b43dd, 0x2ae67f94,
+ 0x2ee3266e, 0x099e1a27, 0x60195efc, 0x476462b5,
+ 0x2d6c7689, 0x0a114ac0, 0x63960e1b, 0x44eb3252,
+ 0x40ee6ba8, 0x679357e1, 0x0e14133a, 0x29692f73,
+ 0xf7684dcb, 0xd0157182, 0xb9923559, 0x9eef0910,
+ 0x9aea50ea, 0xbd976ca3, 0xd4102878, 0xf36d1431,
+ 0x32cb001a, 0x15b63c53, 0x7c317888, 0x5b4c44c1,
+ 0x5f491d3b, 0x78342172, 0x11b365a9, 0x36ce59e0,
+ 0xe8cf3b58, 0xcfb20711, 0xa63543ca, 0x81487f83,
+ 0x854d2679, 0xa2301a30, 0xcbb75eeb, 0xecca62a2,
+ 0x86c2769e, 0xa1bf4ad7, 0xc8380e0c, 0xef453245,
+ 0xeb406bbf, 0xcc3d57f6, 0xa5ba132d, 0x82c72f64,
+ 0x5cc64ddc, 0x7bbb7195, 0x123c354e, 0x35410907,
+ 0x314450fd, 0x16396cb4, 0x7fbe286f, 0x58c31426,
+ 0xabae0017, 0x8cd33c5e, 0xe5547885, 0xc22944cc,
+ 0xc62c1d36, 0xe151217f, 0x88d665a4, 0xafab59ed,
+ 0x71aa3b55, 0x56d7071c, 0x3f5043c7, 0x182d7f8e,
+ 0x1c282674, 0x3b551a3d, 0x52d25ee6, 0x75af62af,
+ 0x1fa77693, 0x38da4ada, 0x515d0e01, 0x76203248,
+ 0x72256bb2, 0x555857fb, 0x3cdf1320, 0x1ba22f69,
+ 0xc5a34dd1, 0xe2de7198, 0x8b593543, 0xac24090a,
+ 0xa82150f0, 0x8f5c6cb9, 0xe6db2862, 0xc1a6142b,
+ 0x64960134, 0x43eb3d7d, 0x2a6c79a6, 0x0d1145ef,
+ 0x09141c15, 0x2e69205c, 0x47ee6487, 0x609358ce,
+ 0xbe923a76, 0x99ef063f, 0xf06842e4, 0xd7157ead,
+ 0xd3102757, 0xf46d1b1e, 0x9dea5fc5, 0xba97638c,
+ 0xd09f77b0, 0xf7e24bf9, 0x9e650f22, 0xb918336b,
+ 0xbd1d6a91, 0x9a6056d8, 0xf3e71203, 0xd49a2e4a,
+ 0x0a9b4cf2, 0x2de670bb, 0x44613460, 0x631c0829,
+ 0x671951d3, 0x40646d9a, 0x29e32941, 0x0e9e1508,
+ 0xfdf30139, 0xda8e3d70, 0xb30979ab, 0x947445e2,
+ 0x90711c18, 0xb70c2051, 0xde8b648a, 0xf9f658c3,
+ 0x27f73a7b, 0x008a0632, 0x690d42e9, 0x4e707ea0,
+ 0x4a75275a, 0x6d081b13, 0x048f5fc8, 0x23f26381,
+ 0x49fa77bd, 0x6e874bf4, 0x07000f2f, 0x207d3366,
+ 0x24786a9c, 0x030556d5, 0x6a82120e, 0x4dff2e47,
+ 0x93fe4cff, 0xb48370b6, 0xdd04346d, 0xfa790824,
+ 0xfe7c51de, 0xd9016d97, 0xb086294c, 0x97fb1505,
+ 0x565d012e, 0x71203d67, 0x18a779bc, 0x3fda45f5,
+ 0x3bdf1c0f, 0x1ca22046, 0x7525649d, 0x525858d4,
+ 0x8c593a6c, 0xab240625, 0xc2a342fe, 0xe5de7eb7,
+ 0xe1db274d, 0xc6a61b04, 0xaf215fdf, 0x885c6396,
+ 0xe25477aa, 0xc5294be3, 0xacae0f38, 0x8bd33371,
+ 0x8fd66a8b, 0xa8ab56c2, 0xc12c1219, 0xe6512e50,
+ 0x38504ce8, 0x1f2d70a1, 0x76aa347a, 0x51d70833,
+ 0x55d251c9, 0x72af6d80, 0x1b28295b, 0x3c551512,
+ 0xcf380123, 0xe8453d6a, 0x81c279b1, 0xa6bf45f8,
+ 0xa2ba1c02, 0x85c7204b, 0xec406490, 0xcb3d58d9,
+ 0x153c3a61, 0x32410628, 0x5bc642f3, 0x7cbb7eba,
+ 0x78be2740, 0x5fc31b09, 0x36445fd2, 0x1139639b,
+ 0x7b3177a7, 0x5c4c4bee, 0x35cb0f35, 0x12b6337c,
+ 0x16b36a86, 0x31ce56cf, 0x58491214, 0x7f342e5d,
+ 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e,
+ 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f
+ }
+#else
+ {
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+ 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+ 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+ 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+ 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+ 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+ 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+ 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+ 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+ 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+ 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+ 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+ 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+ 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+ 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+ 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+ 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+ 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+ 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+ 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+ 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+ 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+ 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+ 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+ 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+ 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+ 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+ 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+ 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+ 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+ 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+ 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+ 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+ 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+ 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+ 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+ 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+ 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+ 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+ 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+ 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+ },{
+ 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+ 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+ 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+ 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+ 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+ 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+ 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+ 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+ 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+ 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+ 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+ 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+ 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+ 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+ 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+ 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+ 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+ 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+ 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+ 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+ 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+ 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+ 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+ 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+ 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+ 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+ 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+ 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+ 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+ 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+ 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+ 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+ 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+ 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+ 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+ 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+ 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+ 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+ 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+ 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+ 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+ 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+ 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+ 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+ 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+ 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+ 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+ 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+ 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+ 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+ 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+ 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+ 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+ 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+ 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+ 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+ 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+ 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+ 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+ 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+ 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+ 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+ 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+ },{
+ 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+ 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+ 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+ 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+ 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+ 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+ 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+ 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+ 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+ 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+ 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+ 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+ 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+ 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+ 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+ 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+ 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+ 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+ 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+ 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+ 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+ 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+ 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+ 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+ 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+ 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+ 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+ 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+ 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+ 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+ 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+ 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+ 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+ 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+ 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+ 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+ 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+ 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+ 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+ 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+ 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+ 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+ 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+ 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+ 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+ 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+ 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+ 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+ 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+ 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+ 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+ 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+ 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+ 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+ 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+ 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+ 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+ 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+ 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+ 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+ 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+ 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+ 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+ },{
+ 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+ 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+ 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+ 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+ 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+ 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+ 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+ 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+ 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+ 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+ 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+ 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+ 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+ 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+ 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+ 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+ 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+ 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+ 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+ 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+ 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+ 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+ 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+ 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+ 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+ 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+ 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+ 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+ 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+ 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+ 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+ 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+ 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+ 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+ 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+ 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+ 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+ 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+ 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+ 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+ 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+ 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+ 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+ 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+ 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+ 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+ 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+ 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+ 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+ 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+ 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+ 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+ 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+ 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+ 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+ 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+ 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+ 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+ 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+ 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+ 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+ 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+ 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+ 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+ },{
+ 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4,
+ 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44,
+ 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65,
+ 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5,
+ 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127,
+ 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97,
+ 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6,
+ 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406,
+ 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3,
+ 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13,
+ 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32,
+ 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082,
+ 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470,
+ 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0,
+ 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1,
+ 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151,
+ 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a,
+ 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea,
+ 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb,
+ 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b,
+ 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89,
+ 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539,
+ 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018,
+ 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8,
+ 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d,
+ 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd,
+ 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c,
+ 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c,
+ 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede,
+ 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e,
+ 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f,
+ 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff,
+ 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8,
+ 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18,
+ 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39,
+ 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089,
+ 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b,
+ 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb,
+ 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea,
+ 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a,
+ 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff,
+ 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f,
+ 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e,
+ 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de,
+ 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c,
+ 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c,
+ 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd,
+ 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d,
+ 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06,
+ 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6,
+ 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497,
+ 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27,
+ 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5,
+ 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065,
+ 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544,
+ 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4,
+ 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51,
+ 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1,
+ 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0,
+ 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70,
+ 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82,
+ 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532,
+ 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013,
+ 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3
+ },{
+ 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda,
+ 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad,
+ 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5,
+ 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2,
+ 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4,
+ 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93,
+ 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb,
+ 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c,
+ 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57,
+ 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20,
+ 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548,
+ 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f,
+ 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69,
+ 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e,
+ 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576,
+ 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201,
+ 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031,
+ 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746,
+ 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e,
+ 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59,
+ 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f,
+ 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778,
+ 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810,
+ 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67,
+ 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc,
+ 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb,
+ 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3,
+ 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4,
+ 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682,
+ 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5,
+ 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d,
+ 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea,
+ 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c,
+ 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b,
+ 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413,
+ 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364,
+ 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32,
+ 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45,
+ 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d,
+ 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a,
+ 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81,
+ 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6,
+ 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e,
+ 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9,
+ 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf,
+ 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8,
+ 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0,
+ 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7,
+ 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7,
+ 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090,
+ 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8,
+ 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f,
+ 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9,
+ 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae,
+ 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6,
+ 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1,
+ 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a,
+ 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d,
+ 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975,
+ 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02,
+ 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154,
+ 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623,
+ 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b,
+ 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c
+ },{
+ 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558,
+ 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089,
+ 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b,
+ 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda,
+ 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe,
+ 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f,
+ 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad,
+ 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c,
+ 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5,
+ 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334,
+ 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6,
+ 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67,
+ 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43,
+ 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992,
+ 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110,
+ 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1,
+ 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222,
+ 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3,
+ 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71,
+ 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0,
+ 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884,
+ 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55,
+ 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7,
+ 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006,
+ 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f,
+ 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e,
+ 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc,
+ 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d,
+ 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39,
+ 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8,
+ 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a,
+ 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb,
+ 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac,
+ 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d,
+ 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff,
+ 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e,
+ 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a,
+ 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db,
+ 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59,
+ 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988,
+ 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811,
+ 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0,
+ 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542,
+ 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093,
+ 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7,
+ 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766,
+ 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4,
+ 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35,
+ 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6,
+ 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907,
+ 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185,
+ 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454,
+ 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670,
+ 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1,
+ 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23,
+ 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2,
+ 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b,
+ 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba,
+ 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238,
+ 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9,
+ 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd,
+ 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c,
+ 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e,
+ 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f
+ },{
+ 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769,
+ 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504,
+ 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3,
+ 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de,
+ 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd,
+ 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0,
+ 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07,
+ 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a,
+ 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0,
+ 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d,
+ 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a,
+ 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447,
+ 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44,
+ 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929,
+ 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e,
+ 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3,
+ 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b,
+ 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36,
+ 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881,
+ 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec,
+ 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef,
+ 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782,
+ 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135,
+ 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358,
+ 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2,
+ 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf,
+ 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18,
+ 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75,
+ 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076,
+ 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b,
+ 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac,
+ 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1,
+ 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d,
+ 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360,
+ 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7,
+ 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba,
+ 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9,
+ 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4,
+ 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63,
+ 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e,
+ 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494,
+ 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9,
+ 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e,
+ 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223,
+ 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20,
+ 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d,
+ 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa,
+ 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97,
+ 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f,
+ 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852,
+ 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5,
+ 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88,
+ 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b,
+ 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6,
+ 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751,
+ 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c,
+ 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6,
+ 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb,
+ 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c,
+ 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911,
+ 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612,
+ 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f,
+ 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8,
+ 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5
+ }
+#endif
+};
+
+/*
+ * __wt_cksum_sw --
+ * Return a checksum for a chunk of memory, computed in software.
+ *
+ * Slicing-by-8 algorithm by Michael E. Kounavis and Frank L. Berry from
+ * Intel Corp.:
+ * http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
+ *
+ * Based on Peter Kankowski's posting:
+ * http://www.strchr.com/crc32_popcnt
+ *
+ * The big endian version calculates the same result at each step, except the
+ * value of the crc is byte reversed from what it would be at that step for
+ * little endian.
+ */
+static uint32_t
+__wt_cksum_sw(const void *chunk, size_t len)
+{
+ uint32_t crc, next;
+ size_t nqwords;
+ const uint8_t *p;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len)
+#ifdef WORDS_BIGENDIAN
+ crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+#else
+ crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+ crc ^= *(uint32_t *)p;
+ p += sizeof(uint32_t);
+ next = *(uint32_t *)p;
+ p += sizeof(uint32_t);
+ crc =
+#ifdef WORDS_BIGENDIAN
+ g_crc_slicing[4][(crc ) & 0xFF] ^
+ g_crc_slicing[5][(crc >> 8) & 0xFF] ^
+ g_crc_slicing[6][(crc >> 16) & 0xFF] ^
+ g_crc_slicing[7][(crc >> 24)] ^
+ g_crc_slicing[0][(next ) & 0xFF] ^
+ g_crc_slicing[1][(next >> 8) & 0xFF] ^
+ g_crc_slicing[2][(next >> 16) & 0xFF] ^
+ g_crc_slicing[3][(next >> 24)];
+#else
+ g_crc_slicing[7][(crc ) & 0xFF] ^
+ g_crc_slicing[6][(crc >> 8) & 0xFF] ^
+ g_crc_slicing[5][(crc >> 16) & 0xFF] ^
+ g_crc_slicing[4][(crc >> 24)] ^
+ g_crc_slicing[3][(next ) & 0xFF] ^
+ g_crc_slicing[2][(next >> 8) & 0xFF] ^
+ g_crc_slicing[1][(next >> 16) & 0xFF] ^
+ g_crc_slicing[0][(next >> 24)];
+#endif
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+#ifdef WORDS_BIGENDIAN
+ for (len &= 0x7; len > 0; ++p, len--)
+ crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+
+ /* Do final byte swap to produce a result identical to little endian */
+ crc =
+ ((crc << 24) & 0xFF000000) |
+ ((crc << 8) & 0x00FF0000) |
+ ((crc >> 8) & 0x0000FF00) |
+ ((crc >> 24) & 0x000000FF);
+#else
+ for (len &= 0x7; len > 0; ++p, len--)
+ crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+ return (~crc);
+}
+
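+/*
+ * Editor's sketch, not part of the original patch: the tables above are
+ * pre-computed, but the little-endian tables can be regenerated with the
+ * standard slicing-by-8 construction for the reflected CRC-32C polynomial
+ * 0x82f63b78 (the big-endian tables hold the same entries byte-reversed).
+ */
+#if 0
+static void
+__crc32c_gen_tables(uint32_t table[8][256])
+{
+	uint32_t crc;
+	int i, j, k;
+
+	/* Table 0 is the classic bit-at-a-time lookup table. */
+	for (i = 0; i < 256; i++) {
+		for (crc = (uint32_t)i, j = 0; j < 8; j++)
+			crc = (crc & 1) ?
+			    (crc >> 1) ^ 0x82f63b78 : crc >> 1;
+		table[0][i] = crc;	/* E.g., table[0][1] == 0xf26b8303. */
+	}
+	/* Each further table advances the CRC by one more byte per lookup. */
+	for (k = 1; k < 8; k++)
+		for (i = 0; i < 256; i++)
+			table[k][i] = (table[k - 1][i] >> 8) ^
+			    table[0][table[k - 1][i] & 0xff];
+}
+#endif
+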
+#if (defined(__amd64) || defined(__x86_64))
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for a chunk of memory, computed in hardware
+ * using 8-byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+ uint32_t crc;
+ size_t nqwords;
+ const uint8_t *p;
+ const uint64_t *p64;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len) {
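+		/*
+		 * Editor's note: the .byte sequence below hand-encodes the
+		 * SSE4.2 instruction "crc32b %cl, %esi", presumably so the
+		 * file still assembles with toolchains that predate the
+		 * mnemonic.
+		 */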
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+
+ p64 = (const uint64_t *)p;
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
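+		/*
+		 * Editor's note: the .byte sequence below hand-encodes
+		 * "crc32q %rcx, %rsi", folding 8 bytes of data into the
+		 * checksum per instruction.
+		 */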
+ __asm__ __volatile__ (
+ ".byte 0xF2, 0x48, 0x0F, 0x38, 0xF1, 0xF1"
+ : "=S"(crc)
+ : "0"(crc), "c" (*p64));
+ p64++;
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+ p = (const uint8_t *)p64;
+ for (len &= 0x7; len > 0; ++p, len--) {
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+ return (~crc);
+}
+#endif
+
+#if defined(_M_AMD64)
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for a chunk of memory, computed in hardware
+ * using 8-byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+ uint32_t crc;
+ size_t nqwords;
+ const uint8_t *p;
+ const uint64_t *p64;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len) {
+ crc = _mm_crc32_u8(crc, *p);
+ }
+
+ p64 = (const uint64_t *)p;
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+ crc = (uint32_t)_mm_crc32_u64(crc, *p64);
+ p64++;
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+ p = (const uint8_t *)p64;
+ for (len &= 0x7; len > 0; ++p, len--) {
+ crc = _mm_crc32_u8(crc, *p);
+ }
+
+ return (~crc);
+}
+#endif
+
+/*
+ * __wt_cksum --
+ * Return a checksum for a chunk of memory using the fastest method
+ * available.
+ */
+uint32_t
+__wt_cksum(const void *chunk, size_t len)
+{
+	return ((*__wt_cksum_func)(chunk, len));
+}
+
+/*
+ * __wt_cksum_init --
+ * Detect CRC hardware and set the checksum function.
+ */
+void
+__wt_cksum_init(void)
+{
+#define CPUID_ECX_HAS_SSE42 (1 << 20)
+
+#if (defined(__amd64) || defined(__x86_64))
+ unsigned int eax, ebx, ecx, edx;
+
+ __asm__ __volatile__ (
+ "cpuid"
+ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+ : "a" (1));
+
+ if (ecx & CPUID_ECX_HAS_SSE42)
+ __wt_cksum_func = __wt_cksum_hw;
+ else
+ __wt_cksum_func = __wt_cksum_sw;
+
+#elif defined(_M_AMD64)
+ int cpuInfo[4];
+
+ __cpuid(cpuInfo, 1);
+
+ if (cpuInfo[2] & CPUID_ECX_HAS_SSE42)
+ __wt_cksum_func = __wt_cksum_hw;
+ else
+ __wt_cksum_func = __wt_cksum_sw;
+#else
+ __wt_cksum_func = __wt_cksum_sw;
+#endif
+}
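+
+/*
+ * Editor's sketch, not part of the original patch: expected call sequence,
+ * using the standard CRC-32C check value as a sanity test.
+ */
+#if 0
+	uint32_t crc;
+
+	__wt_cksum_init();	/* Select the hw or sw implementation once. */
+	crc = __wt_cksum("123456789", 9);
+	/* CRC-32C of "123456789" is the well-known check value 0xe3069283. */
+#endif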
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
new file mode 100644
index 00000000000..3e874078fbf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -0,0 +1,527 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __handle_error_default --
+ * Default WT_EVENT_HANDLER->handle_error implementation: send to stderr.
+ */
+static int
+__handle_error_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *errmsg)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(error);
+
+ return (fprintf(stderr, "%s\n", errmsg) >= 0 &&
+ fflush(stderr) == 0 ? 0 : __wt_errno());
+}
+
+/*
+ * __handle_message_default --
+ * Default WT_EVENT_HANDLER->handle_message implementation: send to stdout.
+ */
+static int
+__handle_message_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (printf("%s\n", message) >= 0 &&
+ fflush(stdout) == 0 ? 0 : __wt_errno());
+}
+
+/*
+ * __handle_progress_default --
+ * Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
+ */
+static int
+__handle_progress_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(operation);
+ WT_UNUSED(progress);
+
+ return (0);
+}
+
+/*
+ * __handle_close_default --
+ * Default WT_EVENT_HANDLER->handle_close implementation: ignore.
+ */
+static int
+__handle_close_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, WT_CURSOR *cursor)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(cursor);
+
+ return (0);
+}
+
+static WT_EVENT_HANDLER __event_handler_default = {
+ __handle_error_default,
+ __handle_message_default,
+ __handle_progress_default,
+ __handle_close_default
+};
+
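+/*
+ * Editor's sketch, not part of the original patch: an application would
+ * typically supply its own handler through the public wiredtiger_open API;
+ * NULL error/message/progress slots are later filled in with the defaults
+ * above by __wt_event_handler_set.
+ */
+#if 0
+static int
+my_handle_error(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, int error, const char *errmsg)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(session);
+
+	return (fprintf(stderr,
+	    "app error %d: %s\n", error, errmsg) >= 0 ? 0 : EIO);
+}
+
+static WT_EVENT_HANDLER my_event_handler = {
+	my_handle_error, NULL, NULL, NULL
+};
+
+/* wiredtiger_open(home, &my_event_handler, "create", &conn); */
+#endif
+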
+/*
+ * __handler_failure --
+ * Report the failure of an application-configured event handler.
+ */
+static void
+__handler_failure(WT_SESSION_IMPL *session,
+ int error, const char *which, int error_handler_failed)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[256];
+
+ (void)snprintf(s, sizeof(s),
+ "application %s event handler failed: %s",
+ which, wiredtiger_strerror(error));
+
+ /*
+ * Use the error handler to report the failure, unless it was the error
+ * handler that failed. If it was the error handler that failed, or a
+ * call to the error handler fails, use the default error handler.
+ */
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (!error_handler_failed &&
+ handler->handle_error != __handle_error_default &&
+ handler->handle_error(handler, wt_session, error, s) == 0)
+ return;
+
+ (void)__handle_error_default(NULL, wt_session, error, s);
+}
+
+/*
+ * __wt_event_handler_set --
+ * Set an event handler, filling in any NULL methods with the defaults.
+ */
+void
+__wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler)
+{
+ if (handler == NULL)
+ handler = &__event_handler_default;
+ else {
+ if (handler->handle_error == NULL)
+ handler->handle_error = __handle_error_default;
+ if (handler->handle_message == NULL)
+ handler->handle_message = __handle_message_default;
+ if (handler->handle_progress == NULL)
+ handler->handle_progress = __handle_progress_default;
+ }
+
+ session->event_handler = handler;
+}
+
+/*
+ * __wt_eventv --
+ * Report a message to an event handler.
+ */
+int
+__wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error,
+ const char *file_name, int line_number, const char *fmt, va_list ap)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ struct timespec ts;
+ size_t len, remain, wlen;
+ int prefix_cnt;
+ const char *err, *prefix;
+ char *end, *p, tid[128];
+
+ /*
+ * We're using a stack buffer because we want error messages no matter
+ * what, and allocating a WT_ITEM, or the memory it needs, might fail.
+ *
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * Without a session, we don't have event handlers or prefixes for the
+ * error message. Write the error to stderr and call it a day. (It's
+ * almost impossible for that to happen given how early we allocate the
+ * first session, but if the allocation of the first session fails, for
+ * example, we can end up here without a session.)
+ */
+ if (session == NULL)
+ return (fprintf(stderr, "WiredTiger Error%s%s\n",
+ error == 0 ? "" : ": ",
+ error == 0 ? "" : wiredtiger_strerror(error)) >= 0 &&
+ fflush(stderr) == 0 ? 0 : __wt_errno());
+
+ p = s;
+ end = s + sizeof(s);
+
+ /*
+ * We have several prefixes for the error message:
+ * a timestamp and the process and thread ids, the database error
+ * prefix, the data-source's name, and the session's name. Write them
+ * as a comma-separated list, followed by a colon.
+ */
+ prefix_cnt = 0;
+ if (__wt_epoch(session, &ts) == 0) {
+ __wt_thread_id(tid, sizeof(tid));
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "[%" PRIuMAX ":%" PRIuMAX "][%s]",
+ (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = S2C(session)->error_prefix) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ prefix = session->dhandle == NULL ? NULL : session->dhandle->name;
+ if (prefix != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = session->name) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if (prefix_cnt != 0) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain, ": ");
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ if (file_name != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)
+ snprintf(p, remain, "%s, %d: ", file_name, line_number);
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)vsnprintf(p, remain, fmt, ap);
+ p = wlen >= remain ? end : p + wlen;
+
+ if (error != 0) {
+ /*
+ * When the engine calls __wt_err on error, it often outputs an
+ * error message including the string associated with the error
+ * it's returning. We could change the calls to call __wt_errx,
+ * but it's simpler to not append an error string if all we are
+ * doing is duplicating an existing error string.
+ *
+ * Use strcmp to compare: both strings are nul-terminated, and
+ * we don't want to run past the end of the buffer.
+ */
+ err = wiredtiger_strerror(error);
+ len = strlen(err);
+ if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) {
+ remain = WT_PTRDIFF(end, p);
+ (void)snprintf(p, remain, ": %s", err);
+ }
+ }
+
+ /*
+ * If a handler fails, return the error status: if we're in the process
+ * of handling an error, any return value we provide will be ignored by
+ * our caller; the caller presumably already has an error value it will
+ * be returning.
+ *
+ * If an application-specified or default informational message handler
+ * fails, complain using the application-specified or default error
+ * handler.
+ *
+ * If an application-specified error message handler fails, complain
+ * using the default error handler. If the default error handler fails,
+ * there's nothing to do.
+ */
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (msg_event) {
+ ret = handler->handle_message(handler, wt_session, s);
+ if (ret != 0)
+ __handler_failure(session, ret, "message", 0);
+ } else {
+ ret = handler->handle_error(handler, wt_session, error, s);
+ if (ret != 0 && handler->handle_error != __handle_error_default)
+ __handler_failure(session, ret, "error", 1);
+ }
+
+ return (ret);
+}
+
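+/*
+ * Editor's note, not part of the original patch: a fully-prefixed message
+ * assembled above looks roughly like
+ *	[1403633508:123456][26347:0x7f1a], eviction-server, file:test.wt:
+ *	    example message: No such file or directory
+ * where every value shown is purely illustrative.
+ */
+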
+/*
+ * __wt_err --
+ * Report an error.
+ */
+void
+__wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers; we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_errx --
+ * Report an error with no error code.
+ */
+void
+__wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers; we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_ext_err_printf --
+ * Extension API call to print to the error stream.
+ */
+int
+__wt_ext_err_printf(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * info_msg --
+ * Informational message.
+ */
+static int
+info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ (void)vsnprintf(s, sizeof(s), fmt, ap);
+
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ return (handler->handle_message(handler, wt_session, s));
+}
+
+/*
+ * __wt_msg --
+ * Informational message.
+ */
+int
+__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = info_msg(session, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_msg_printf --
+ * Extension API call to print to the message stream.
+ */
+int
+__wt_ext_msg_printf(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = info_msg(session, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_progress --
+ * Progress message.
+ */
+int
+__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
+{
+ WT_DECL_RET;
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (handler != NULL && handler->handle_progress != NULL)
+ if ((ret = handler->handle_progress(handler,
+ wt_session, s == NULL ? session->name : s, v)) != 0)
+ __handler_failure(session, ret, "progress", 0);
+ return (0);
+}
+
+/*
+ * __wt_assert --
+ * Report assertion and other unexpected failures; includes file/line
+ * information for debugging.
+ */
+void
+__wt_assert(WT_SESSION_IMPL *session,
+ int error, const char *file_name, int line_number, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, file_name, line_number, fmt, ap);
+ va_end(ap);
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+#endif
+}
+
+/*
+ * __wt_panic --
+ * A standard error message when we panic.
+ */
+int
+__wt_panic(WT_SESSION_IMPL *session)
+{
+ F_SET(S2C(session), WT_CONN_PANIC);
+ __wt_errx(session, "%s",
+ "the WiredTiger library cannot continue; the process must exit "
+ "and restart");
+
+#if !defined(HAVE_DIAGNOSTIC)
+ /*
+ * Chaos reigns within.
+ * Reflect, repent, and reboot.
+ * Order shall return.
+ */
+ return (WT_PANIC);
+#endif
+
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_illegal_value --
+ * A standard error message when we detect an illegal value.
+ */
+int
+__wt_illegal_value(WT_SESSION_IMPL *session, const char *name)
+{
+ __wt_errx(session, "%s%s%s",
+ name == NULL ? "" : name, name == NULL ? "" : ": ",
+ "encountered an illegal file format or internal value");
+
+#if !defined(HAVE_DIAGNOSTIC)
+ return (__wt_panic(session));
+#endif
+
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_object_unsupported --
+ * Print a standard error message for an object that doesn't support a
+ * particular operation.
+ */
+int
+__wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_RET_MSG(session, ENOTSUP, "unsupported object operation: %s", uri);
+}
+
+/*
+ * __wt_bad_object_type --
+ * Print a standard error message when given an unknown or unsupported
+ * object type.
+ */
+int
+__wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
+{
+ if (WT_PREFIX_MATCH(uri, "backup:") ||
+ WT_PREFIX_MATCH(uri, "colgroup:") ||
+ WT_PREFIX_MATCH(uri, "config:") ||
+ WT_PREFIX_MATCH(uri, "file:") ||
+ WT_PREFIX_MATCH(uri, "index:") ||
+ WT_PREFIX_MATCH(uri, "log:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
+ WT_PREFIX_MATCH(uri, "statistics:") ||
+ WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_object_unsupported(session, uri));
+
+ WT_RET_MSG(session, ENOTSUP, "unknown object type: %s", uri);
+}
diff --git a/src/third_party/wiredtiger/src/support/filename.c b/src/third_party/wiredtiger/src/support/filename.c
new file mode 100644
index 00000000000..bd5d03fa633
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/filename.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filename --
+ * Build a file name in a scratch buffer, automatically calculating the
+ * length of the file name.
+ */
+int
+__wt_filename(WT_SESSION_IMPL *session, const char *name, char **path)
+{
+ return (__wt_nfilename(session, name, strlen(name), path));
+}
+
+/*
+ * __wt_nfilename --
+ * Build a file name in a scratch buffer. If the name is already an
+ * absolute path, duplicate it; otherwise, generate a path relative to the
+ * connection home directory.
+ */
+int
+__wt_nfilename(
+ WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
+{
+ WT_CONNECTION_IMPL *conn;
+ size_t len;
+ char *buf;
+
+ conn = S2C(session);
+ *path = NULL;
+
+ if (__wt_absolute_path(name))
+ WT_RET(__wt_strndup(session, name, namelen, path));
+ else {
+ len = strlen(conn->home) + 1 + namelen + 1;
+ WT_RET(__wt_calloc(session, 1, len, &buf));
+ snprintf(buf, len, "%s%s%.*s",
+ conn->home, __wt_path_separator(), (int)namelen, name);
+ *path = buf;
+ }
+
+ return (0);
+}
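+
+/*
+ * Illustrative sketch, not part of the original source ("WiredTiger.wt" is
+ * a made-up name): with a connection home of "/data/db",
+ *
+ *	char *path;
+ *	WT_RET(__wt_filename(session, "WiredTiger.wt", &path));
+ *
+ * sets path to "/data/db/WiredTiger.wt"; the allocation belongs to the
+ * caller, who eventually releases it with __wt_free(session, path).
+ */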
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
new file mode 100644
index 00000000000..10f718d57f7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+WT_PROCESS __wt_process; /* Per-process structure */
+static int __wt_pthread_once_failed; /* If initialization failed */
+
+/*
+ * __system_is_little_endian --
+ * Check if the system is little endian.
+ */
+static int
+__system_is_little_endian(void)
+{
+ uint64_t v;
+ int little;
+
+ v = 1;
+ little = *((uint8_t *)&v) == 0 ? 0 : 1;
+
+ if (little)
+ return (0);
+
+ fprintf(stderr,
+ "This release of the WiredTiger data engine does not support "
+ "big-endian systems; contact WiredTiger for more information.\n");
+ return (EINVAL);
+}
+
+/*
+ * __wt_global_once --
+ * Global initialization, run once.
+ */
+static void
+__wt_global_once(void)
+{
+ WT_DECL_RET;
+
+ if ((ret = __system_is_little_endian()) != 0) {
+ __wt_pthread_once_failed = ret;
+ return;
+ }
+
+ if ((ret =
+ __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
+ __wt_pthread_once_failed = ret;
+ return;
+ }
+
+ __wt_cksum_init();
+
+ TAILQ_INIT(&__wt_process.connqh);
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Load debugging code the compiler might optimize out. */
+ (void)__wt_breakpoint();
+#endif
+}
+
+/*
+ * __wt_library_init --
+ * Some things to do, before we do anything else.
+ */
+int
+__wt_library_init(void)
+{
+ static int first = 1;
+ WT_DECL_RET;
+
+ /*
+ * Do per-process initialization before anything else, but do it only
+ * once. I don't know how heavy-weight the function (pthread_once, in
+ * the POSIX world) might be, so I'm front-ending it with a local
+ * static and only using that function to avoid a race.
+ */
+ if (first) {
+ if ((ret = __wt_once(__wt_global_once)) != 0)
+ __wt_pthread_once_failed = ret;
+ first = 0;
+ }
+ return (__wt_pthread_once_failed);
+}
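+
+/*
+ * Illustrative sketch, an assumption about the POSIX build and not part of
+ * the original source: __wt_once is expected to be a thin wrapper over
+ * pthread_once, roughly:
+ *
+ *	int
+ *	__wt_once(void (*init_routine)(void))
+ *	{
+ *		static pthread_once_t once = PTHREAD_ONCE_INIT;
+ *		return (pthread_once(&once, init_routine));
+ *	}
+ */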
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_breakpoint --
+ * A simple place to put a breakpoint, if you need one.
+ */
+int
+__wt_breakpoint(void)
+{
+ return (0);
+}
+
+/*
+ * __wt_attach --
+ * A routine to wait for the debugging to attach.
+ */
+void
+__wt_attach(WT_SESSION_IMPL *session)
+{
+#ifdef HAVE_ATTACH
+ __wt_errx(session, "process ID %" PRIdMAX
+ ": waiting for debugger...", (intmax_t)getpid());
+
+ /* Sleep forever, the debugger will interrupt us when it attaches. */
+ for (;;)
+ __wt_sleep(100, 0);
+#else
+ WT_UNUSED(session);
+#endif
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c
new file mode 100644
index 00000000000..c6978f6bfe6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_city.c
@@ -0,0 +1,323 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * CityHash, by Geoff Pike and Jyrki Alakuijala
+ *
+ * This file provides CityHash64() and related functions.
+ *
+ * It's probably possible to create even faster hash functions by
+ * writing a program that systematically explores some of the space of
+ * possible hash functions, by using SIMD instructions, or by
+ * compromising on hash quality.
+ */
+
+#include <string.h>
+#include "wt_internal.h"
+
+/*
+ * Google City Hash implementation. Based on source code from:
+ * http://code.google.com/p/cityhash/
+ */
+
+typedef struct _uint128 uint128;
+struct _uint128 {
+ uint64_t first;
+ uint64_t second;
+};
+
+#define Uint128Low64(x) (x).first
+#define Uint128High64(x) (x).second
+
+static uint64_t UNALIGNED_LOAD64(const char *p) {
+ uint64_t result;
+ memcpy(&result, p, sizeof(result));
+ return (result);
+}
+
+static uint32_t UNALIGNED_LOAD32(const char *p) {
+ uint32_t result;
+ memcpy(&result, p, sizeof(result));
+ return (result);
+}
+
+#if !defined(WORDS_BIGENDIAN)
+
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+
+#else
+
+#ifdef __APPLE__
+/* Mac OS X / Darwin features */
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#else
+#include <byteswap.h>
+#endif
+
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+
+#endif /* WORDS_BIGENDIAN */
+
+static uint64_t Fetch64(const char *p) {
+ return uint64_in_expected_order(UNALIGNED_LOAD64(p));
+}
+
+static uint32_t Fetch32(const char *p) {
+ return uint32_in_expected_order(UNALIGNED_LOAD32(p));
+}
+
+/* Some primes between 2^63 and 2^64 for various uses. */
+static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64_t k1 = 0xb492b66fbe98f273ULL;
+static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
+static const uint64_t k3 = 0xc949d7c7509e6557ULL;
+
+/*
+ * Hash 128 input bits down to 64 bits of output.
+ * This is intended to be a reasonably good hash function.
+ */
+static inline uint64_t Hash128to64(const uint128 x) {
+ /* Murmur-inspired hashing. */
+ const uint64_t kMul = 0x9ddfea08eb382d69ULL;
+ uint64_t a, b;
+
+ a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
+ a ^= (a >> 47);
+ b = (Uint128High64(x) ^ a) * kMul;
+ b ^= (b >> 47);
+ b *= kMul;
+ return (b);
+}
+
+/*
+ * Bitwise right rotate. Normally this will compile to a single
+ * instruction, especially if the shift is a manifest constant.
+ */
+static uint64_t Rotate(uint64_t val, int shift) {
+ /* Avoid shifting by 64: doing so yields an undefined result. */
+ return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
+}
+
+/*
+ * Equivalent to Rotate(), but requires the second arg to be non-zero.
+ * On x86-64, and probably others, it's possible for this to compile
+ * to a single instruction if both args are already in registers.
+ */
+static uint64_t RotateByAtLeast1(uint64_t val, int shift) {
+ return (val >> shift) | (val << (64 - shift));
+}
+
+static uint64_t ShiftMix(uint64_t val) {
+ return val ^ (val >> 47);
+}
+
+static uint64_t HashLen16(uint64_t u, uint64_t v) {
+ uint128 result;
+
+ result.first = u;
+ result.second = v;
+ return Hash128to64(result);
+}
+
+static uint64_t HashLen0to16(const char *s, size_t len) {
+ uint64_t a64, b64;
+ uint32_t y, z;
+ uint8_t a8, b8, c8;
+ if (len > 8) {
+ a64 = Fetch64(s);
+ b64 = Fetch64(s + len - 8);
+ return HashLen16(
+ a64, RotateByAtLeast1(b64 + len, (int)len)) ^ b64;
+ }
+ if (len >= 4) {
+ a64 = Fetch32(s);
+ return HashLen16(len + (a64 << 3), Fetch32(s + len - 4));
+ }
+ if (len > 0) {
+ a8 = (uint8_t)s[0];
+ b8 = (uint8_t)s[len >> 1];
+ c8 = (uint8_t)s[len - 1];
+ y = (uint32_t)(a8) + ((uint32_t)(b8) << 8);
+ z = (uint32_t)len + ((uint32_t)(c8) << 2);
+ return ShiftMix(y * k2 ^ z * k3) * k2;
+ }
+ return (k2);
+}
+
+/*
+ * This probably works well for 16-byte strings as well, but it may be overkill
+ * in that case.
+ */
+static uint64_t HashLen17to32(const char *s, size_t len) {
+ uint64_t a = Fetch64(s) * k1;
+ uint64_t b = Fetch64(s + 8);
+ uint64_t c = Fetch64(s + len - 8) * k2;
+ uint64_t d = Fetch64(s + len - 16) * k0;
+ return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+ a + Rotate(b ^ k3, 20) + len - c);
+}
+
+/*
+ * Return a 16-byte hash for 48 bytes. Quick and dirty.
+ * Callers do best to use "random-looking" values for a and b.
+ * static pair<uint64, uint64> WeakHashLen32WithSeeds(
+ */
+static void WeakHashLen32WithSeeds6(uint64_t w, uint64_t x,
+ uint64_t y, uint64_t z, uint64_t a, uint64_t b, uint128 *ret) {
+ uint64_t c;
+
+ a += w;
+ b = Rotate(b + a + z, 21);
+ c = a;
+ a += x;
+ a += y;
+ b += Rotate(a, 44);
+
+ ret->first = (uint64_t) (a + z);
+ ret->second = (uint64_t) (b + c);
+}
+
+/*
+ * Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty.
+ * static pair<uint64, uint64> WeakHashLen32WithSeeds(
+ */
+static void WeakHashLen32WithSeeds(
+ const char* s, uint64_t a, uint64_t b, uint128 *ret) {
+ WeakHashLen32WithSeeds6(Fetch64(s),
+ Fetch64(s + 8),
+ Fetch64(s + 16),
+ Fetch64(s + 24),
+ a,
+ b,
+ ret);
+}
+
+/* Return an 8-byte hash for 33 to 64 bytes. */
+static uint64_t HashLen33to64(const char *s, size_t len) {
+ uint64_t a, b, c, r, vf, vs, wf, ws, z;
+ z = Fetch64(s + 24);
+ a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + 8);
+ c += Rotate(a, 7);
+ a += Fetch64(s + 16);
+ vf = a + z;
+ vs = b + Rotate(a, 31) + c;
+ a = Fetch64(s + 16) + Fetch64(s + len - 32);
+ z = Fetch64(s + len - 8);
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + len - 24);
+ c += Rotate(a, 7);
+ a += Fetch64(s + len - 16);
+ wf = a + z;
+ ws = b + Rotate(a, 31) + c;
+ r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+ return ShiftMix(r * k0 + vs) * k2;
+}
+
+static inline uint64_t CityHash64(const char *s, size_t len) {
+ uint64_t temp, x, y, z;
+ uint128 v, w;
+
+ if (len <= 32) {
+ if (len <= 16) {
+ return HashLen0to16(s, len);
+ } else {
+ return HashLen17to32(s, len);
+ }
+ } else if (len <= 64) {
+ return HashLen33to64(s, len);
+ }
+
+ /*
+ * For strings over 64 bytes we hash the end first, and then as we
+ * loop we keep 56 bytes of state: v, w, x, y, and z.
+ */
+ x = Fetch64(s + len - 40);
+ y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
+ z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
+ WeakHashLen32WithSeeds(s + len - 64, len, z, &v);
+ WeakHashLen32WithSeeds(s + len - 32, y + k1, x, &w);
+ x = x * k1 + Fetch64(s);
+
+ /*
+ * Use len to count multiples of 64, and operate on 64-byte chunks.
+ */
+ for (len = (len - 1) >> 6; len != 0; len--) {
+ x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+ y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+ x ^= w.second;
+ y += v.first + Fetch64(s + 40);
+ z = Rotate(z + w.first, 33) * k1;
+ WeakHashLen32WithSeeds(s, v.second * k1, x + w.first, &v);
+ WeakHashLen32WithSeeds(
+ s + 32, z + w.second, y + Fetch64(s + 16), &w);
+ temp = z;
+ z = x;
+ x = temp;
+ s += 64;
+ }
+ return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+ HashLen16(v.second, w.second) + x);
+}
+
+/*
+ * __wt_hash_city64 --
+ * WiredTiger wrapper around third party hash implementation.
+ */
+uint64_t
+__wt_hash_city64(const void *s, size_t len)
+{
+ return (CityHash64(s, len));
+}
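+
+/*
+ * Illustrative sketch, not part of the original source (key, key_len and
+ * nbuckets are assumed variables): a typical use is bucketing into a
+ * power-of-two sized hash table:
+ *
+ *	uint64_t h;
+ *
+ *	h = __wt_hash_city64(key, key_len);
+ *	bucket = h & (nbuckets - 1);
+ */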
diff --git a/src/third_party/wiredtiger/src/support/hash_fnv.c b/src/third_party/wiredtiger/src/support/hash_fnv.c
new file mode 100644
index 00000000000..68f8537a4a0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_fnv.c
@@ -0,0 +1,161 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
+ *
+ * @(#) $Revision: 5.1 $
+ * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
+ * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
+ *
+ ***
+ *
+ * Fowler/Noll/Vo hash
+ *
+ * The basis of this hash algorithm was taken from an idea sent
+ * as reviewer comments to the IEEE POSIX P1003.2 committee by:
+ *
+ * Phong Vo (http://www.research.att.com/info/kpv/)
+ * Glenn Fowler (http://www.research.att.com/~gsf/)
+ *
+ * In a subsequent ballot round:
+ *
+ * Landon Curt Noll (http://www.isthe.com/chongo/)
+ *
+ * improved on their algorithm. Some people tried this hash
+ * and found that it worked rather well. In an EMail message
+ * to Landon, they named it the ``Fowler/Noll/Vo'' or FNV hash.
+ *
+ * FNV hashes are designed to be fast while maintaining a low
+ * collision rate. The FNV speed allows one to quickly hash lots
+ * of data while maintaining a reasonable collision rate. See:
+ *
+ * http://www.isthe.com/chongo/tech/comp/fnv/index.html
+ *
+ * for more details as well as other forms of the FNV hash.
+ *
+ ***
+ *
+ * To use the recommended 64 bit FNV-1a hash, pass FNV1A_64_INIT as the
+ * uint64_t hashval argument to fnv_64a_buf() or fnv_64a_str().
+ *
+ ***
+ *
+ * Please do not copyright this code. This code is in the public domain.
+ *
+ * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
+ * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ *
+ * By:
+ * chongo <Landon Curt Noll> /\oo/\
+ * http://www.isthe.com/chongo/
+ *
+ * Share and Enjoy! :-)
+ */
+
+#include <stdlib.h>
+#include "wt_internal.h"
+
+/*
+ * This file contains an implementation of the FNV-1a 64-bit hash function.
+ * The implementation is from a third party.
+ *
+ * The code has been updated to remove unnecessary content and better comply
+ * with WiredTiger coding standards. The original source code can be found at:
+ * FNV 1a 64 bit: http://www.isthe.com/chongo/src/fnv/hash_64a.c
+ */
+
+/*
+ * 64 bit FNV-1 non-zero initial basis
+ *
+ * The FNV-1 initial basis is the FNV-0 hash of the following 32 octets:
+ *
+ * chongo <Landon Curt Noll> /\../\
+ *
+ * NOTE: The \'s above are not back-slashing escape characters.
+ * They are literal ASCII backslash 0x5c characters.
+ *
+ * NOTE: The FNV-1a initial basis is the same value as FNV-1 by definition.
+ */
+#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
+
+/*
+ * fnv_64a_buf --
+ * Perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
+ *
+ * input:
+ * buf - start of buffer to hash
+ * len - length of buffer in octets
+ * hval - previous hash value or 0 if first call
+ *
+ * returns:
+ * 64 bit hash as a static hash type
+ *
+ * NOTE: To use the recommended 64 bit FNV-1a hash, use FNV1A_64_INIT as the
+ * hval arg on the first call to either fnv_64a_buf() or fnv_64a_str().
+ */
+static inline uint64_t
+fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
+{
+ const unsigned char *bp = buf; /* start of buffer */
+ const unsigned char *be = bp + len; /* beyond end of buffer */
+
+ /*
+ * FNV-1a hash each octet of the buffer
+ */
+ while (bp < be) {
+
+ /* xor the bottom with the current octet */
+ hval ^= (uint64_t)*bp++;
+
+ /*
+ * Multiply by the 64 bit FNV magic prime mod 2^64. The
+ * following shift operation is generally faster than
+ * a multiply operation.
+ */
+ hval += (hval << 1) + (hval << 4) + (hval << 5) +
+ (hval << 7) + (hval << 8) + (hval << 40);
+ }
+
+ /* return our new hash value */
+ return (hval);
+}
+
+/*
+ * __wt_hash_fnv64 --
+ * WiredTiger wrapper around third party hash implementation.
+ */
+uint64_t
+__wt_hash_fnv64(const void *string, size_t len)
+{
+ return (fnv_64a_buf(string, len, FNV1A_64_INIT));
+}
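+
+/*
+ * A worked note on the shift sequence above, not part of the original
+ * source: the 64-bit FNV prime is 0x100000001b3, and
+ *
+ *	2^40 + 2^8 + 2^7 + 2^5 + 2^4 + 2^1 + 2^0 = 0x100000001b3
+ *
+ * so "hval += (hval << 1) + ... + (hval << 40)" computes
+ * hval *= 0x100000001b3 (mod 2^64) without a multiply instruction.
+ */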
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
new file mode 100644
index 00000000000..12350ab52f4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __hazard_dump(WT_SESSION_IMPL *);
+#endif
+
+/*
+ * __wt_hazard_set --
+ * Set a hazard pointer.
+ */
+int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+ int restarts = 0;
+
+ btree = S2BT(session);
+ *busyp = 0;
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Do the dance:
+ *
+ * The memory location which makes a page "real" is the WT_REF's state
+ * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
+ * page eviction server.
+ *
+ * Add the WT_REF reference to the session's hazard list and flush the
+ * write, then see if the page's state is still valid. If so, we can
+ * use the page because the page eviction server will see our hazard
+ * pointer before it discards the page (the eviction server sets the
+ * state to WT_REF_LOCKED, then flushes memory and checks the hazard
+ * pointers).
+ *
+ * For sessions with many active hazard pointers, skip most of the
+ * active slots: there may be a free slot in there, but checking is
+ * expensive. Most hazard pointers are released quickly: optimize
+ * for that case.
+ */
+ for (hp = session->hazard + session->nhazard;; ++hp) {
+ /* Expand the number of hazard pointers if available. */
+ if (hp >= session->hazard + session->hazard_size) {
+ if (session->hazard_size >= S2C(session)->hazard_max)
+ break;
+ /* Restart the search. */
+ if (session->nhazard < session->hazard_size &&
+ restarts++ == 0) {
+ hp = session->hazard;
+ continue;
+ }
+ WT_PUBLISH(session->hazard_size,
+ WT_MIN(session->hazard_size + WT_HAZARD_INCR,
+ S2C(session)->hazard_max));
+ }
+
+ if (hp->page != NULL)
+ continue;
+
+ hp->page = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ hp->file = file;
+ hp->line = line;
+#endif
+ /* Publish the hazard pointer before reading page's state. */
+ WT_FULL_BARRIER();
+
+ /*
+ * Check if the page state is still valid, where valid means a
+ * state of WT_REF_MEM and the pointer is unchanged. (The
+ * pointer can change, it means the page was evicted between
+ * the time we set our hazard pointer and the publication. It
+ * would theoretically be possible for the page to be evicted
+ * and a different page read into the same memory, so the
+ * pointer hasn't changed but the contents have. That's OK, we
+ * found this page using the tree's key space, whatever page we
+ * find here is the page for us to use.)
+ */
+ if (ref->page == hp->page && ref->state == WT_REF_MEM) {
+ ++session->nhazard;
+ return (0);
+ }
+
+ /*
+ * The page isn't available, it's being considered for eviction
+ * (or being evicted, for all we know). If the eviction server
+ * sees our hazard pointer before evicting the page, it will
+ * return the page to use, no harm done, if it doesn't, it will
+ * go ahead and complete the eviction.
+ *
+ * We don't bother publishing this update: the worst case is we
+ * prevent some random page from being evicted.
+ */
+ hp->page = NULL;
+ *busyp = 1;
+ return (0);
+ }
+
+ __wt_errx(session, "session %p: hazard pointer table full", session);
+#ifdef HAVE_DIAGNOSTIC
+ __hazard_dump(session);
+#endif
+
+ return (ENOMEM);
+}
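+
+/*
+ * Illustrative call pattern, not part of the original source (real callers
+ * go through higher-level page-access functions, and diagnostic builds pass
+ * file/line through a macro): a reader pins a page before use and releases
+ * the pin afterward:
+ *
+ *	int busy;
+ *
+ *	WT_RET(__wt_hazard_set(session, ref, &busy));
+ *	if (busy)
+ *		retry the search;
+ *	read ref->page;
+ *	WT_RET(__wt_hazard_clear(session, ref->page));
+ */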
+
+/*
+ * __wt_hazard_clear --
+ * Clear a hazard pointer.
+ */
+int
+__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+
+ btree = S2BT(session);
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Clear the caller's hazard pointer.
+ * The common pattern is LIFO, so do a reverse search.
+ */
+ for (hp = session->hazard + session->hazard_size - 1;
+ hp >= session->hazard;
+ --hp)
+ if (hp->page == page) {
+ /*
+ * We don't publish the hazard pointer clear in the
+ * general case. It's not required for correctness;
+ * it would give an eviction thread faster access to the
+ * page were the page selected for eviction. But the
+ * generation number was just set, so it's unlikely the
+ * page will be selected for eviction.
+ */
+ hp->page = NULL;
+
+ /*
+ * If this was the last hazard pointer in the session,
+ * we may need to update our transactional context.
+ */
+ --session->nhazard;
+ return (0);
+ }
+
+ /*
+ * A serious error, we should always find the hazard pointer. Panic,
+ * because using a page we didn't have pinned down implies corruption.
+ */
+ WT_PANIC_RET(session, EINVAL,
+ "session %p: clear hazard pointer: %p: not found", session, page);
+}
+
+/*
+ * __wt_hazard_close --
+ * Verify that no hazard pointers are set.
+ */
+void
+__wt_hazard_close(WT_SESSION_IMPL *session)
+{
+ WT_HAZARD *hp;
+ int found;
+
+ /*
+ * Check for a set hazard pointer and complain if we find one. We could
+ * just check the session's hazard pointer count, but this is a useful
+ * diagnostic.
+ */
+ for (found = 0, hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL) {
+ found = 1;
+ break;
+ }
+ if (session->nhazard == 0 && !found)
+ return;
+
+ __wt_errx(session,
+ "session %p: close hazard pointer table: table not empty", session);
+
+#ifdef HAVE_DIAGNOSTIC
+ __hazard_dump(session);
+#endif
+
+ /*
+ * Clear any hazard pointers because it's not a correctness problem
+ * (any hazard pointer we find can't be real because the session is
+ * being closed when we're called). We do this work because session
+ * close isn't so common an operation that the check is expensive, and
+ * we don't want to let a hazard pointer lie around, keeping a page
+ * from being evicted.
+ *
+ * We don't panic: this shouldn't be a correctness issue (at least, I
+ * can't think of a reason it would be).
+ */
+ for (hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL) {
+ hp->page = NULL;
+ --session->nhazard;
+ }
+
+ if (session->nhazard != 0)
+ __wt_errx(session,
+ "session %p: close hazard pointer table: count didn't "
+ "match entries",
+ session);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __hazard_dump --
+ * Display the list of hazard pointers.
+ */
+static void
+__hazard_dump(WT_SESSION_IMPL *session)
+{
+ WT_HAZARD *hp;
+
+ for (hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL)
+ __wt_errx(session,
+ "session %p: hazard pointer %p: %s, line %d",
+ session, hp->page, hp->file, hp->line);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c
new file mode 100644
index 00000000000..9ee3e723fa2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hex.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static const u_char hex[] = "0123456789abcdef";
+
+/*
+ * __fill_hex --
+ * In-memory conversion of raw bytes to a hexadecimal representation.
+ */
+static inline void
+__fill_hex(const uint8_t *src, size_t src_max,
+ uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+ uint8_t *dest_orig;
+
+ dest_orig = dest;
+ if (dest_max > 0) /* save a byte for nul-termination */
+ --dest_max;
+ for (; src_max > 0 && dest_max > 1;
+ src_max -= 1, dest_max -= 2, ++src) {
+ *dest++ = hex[(*src & 0xf0) >> 4];
+ *dest++ = hex[*src & 0x0f];
+ }
+ *dest++ = '\0';
+ if (lenp != NULL)
+ *lenp = WT_PTRDIFF(dest, dest_orig);
+}
+
+/*
+ * __wt_raw_to_hex --
+ * Convert a chunk of data to a nul-terminated printable hex string.
+ */
+int
+__wt_raw_to_hex(
+ WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+ size_t len;
+
+ /*
+ * Every byte takes up 2 spaces, plus a trailing nul byte.
+ */
+ len = size * 2 + 1;
+ WT_RET(__wt_buf_init(session, to, len));
+
+ __fill_hex(from, size, to->mem, len, &to->size);
+ return (0);
+}
+
+/*
+ * __wt_raw_to_esc_hex --
+ * Convert a chunk of data to a nul-terminated printable string using
+ * escaped hex, as necessary.
+ */
+int
+__wt_raw_to_esc_hex(
+ WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+ size_t i;
+ const uint8_t *p;
+ u_char *t;
+
+ /*
+ * In the worst case, every character takes up 3 spaces, plus a
+ * trailing nul byte.
+ */
+ WT_RET(__wt_buf_init(session, to, size * 3 + 1));
+
+ for (p = from, t = to->mem, i = size; i > 0; --i, ++p)
+ if (isprint((int)*p)) {
+ if (*p == '\\')
+ *t++ = '\\';
+ *t++ = *p;
+ } else {
+ *t++ = '\\';
+ *t++ = hex[(*p & 0xf0) >> 4];
+ *t++ = hex[*p & 0x0f];
+ }
+ *t++ = '\0';
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
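+
+/*
+ * Worked example, not part of the original source: the three input bytes
+ * 'a', 0x01 and backslash encode as six characters -- 'a' passes through
+ * unchanged, 0x01 becomes backslash, '0', '1', and the backslash itself is
+ * doubled. __wt_esc_hex_to_raw below reverses the transformation.
+ */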
+
+/*
+ * __wt_hex2byte --
+ * Convert a pair of hex characters into a byte.
+ */
+int
+__wt_hex2byte(const u_char *from, u_char *to)
+{
+ uint8_t byte;
+
+ switch (from[0]) {
+ case '0': byte = 0; break;
+ case '1': byte = 1 << 4; break;
+ case '2': byte = 2 << 4; break;
+ case '3': byte = 3 << 4; break;
+ case '4': byte = 4 << 4; break;
+ case '5': byte = 5 << 4; break;
+ case '6': byte = 6 << 4; break;
+ case '7': byte = 7 << 4; break;
+ case '8': byte = 8 << 4; break;
+ case '9': byte = 9 << 4; break;
+ case 'a': byte = 10 << 4; break;
+ case 'b': byte = 11 << 4; break;
+ case 'c': byte = 12 << 4; break;
+ case 'd': byte = 13 << 4; break;
+ case 'e': byte = 14 << 4; break;
+ case 'f': byte = 15 << 4; break;
+ default:
+ return (1);
+ }
+
+ switch (from[1]) {
+ case '0': break;
+ case '1': byte |= 1; break;
+ case '2': byte |= 2; break;
+ case '3': byte |= 3; break;
+ case '4': byte |= 4; break;
+ case '5': byte |= 5; break;
+ case '6': byte |= 6; break;
+ case '7': byte |= 7; break;
+ case '8': byte |= 8; break;
+ case '9': byte |= 9; break;
+ case 'a': byte |= 10; break;
+ case 'b': byte |= 11; break;
+ case 'c': byte |= 12; break;
+ case 'd': byte |= 13; break;
+ case 'e': byte |= 14; break;
+ case 'f': byte |= 15; break;
+ default:
+ return (1);
+ }
+ *to = byte;
+ return (0);
+}
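+
+/*
+ * Illustrative sketch, not part of the original source: the two characters
+ * "3f" convert to the byte 0x3f ('3' supplies the high nibble, 'f' the
+ * low); only lower-case hex digits are accepted:
+ *
+ *	u_char byte;
+ *
+ *	if (__wt_hex2byte((const u_char *)"3f", &byte) != 0)
+ *		handle the bad digit;
+ */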
+
+/*
+ * __hex_fmterr --
+ * Hex format error message.
+ */
+static int
+__hex_fmterr(WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, EINVAL, "Invalid format in hexadecimal string");
+}
+
+/*
+ * __wt_hex_to_raw --
+ * Convert a nul-terminated printable hex string to a chunk of data.
+ */
+int
+__wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+ return (__wt_nhex_to_raw(session, from, strlen(from), to));
+}
+
+/*
+ * __wt_nhex_to_raw --
+ * Convert a printable hex string to a chunk of data.
+ */
+int
+__wt_nhex_to_raw(
+ WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to)
+{
+ const u_char *p;
+ u_char *t;
+
+ if (size % 2 != 0)
+ return (__hex_fmterr(session));
+
+ WT_RET(__wt_buf_init(session, to, size / 2));
+
+ for (p = (u_char *)from, t = to->mem; size > 0; p += 2, size -= 2, ++t)
+ if (__wt_hex2byte(p, t))
+ return (__hex_fmterr(session));
+
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
+
+/*
+ * __wt_esc_hex_to_raw --
+ * Convert a printable string, encoded in escaped hex, to a chunk of data.
+ */
+int
+__wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+ const u_char *p;
+ u_char *t;
+
+ WT_RET(__wt_buf_init(session, to, strlen(from)));
+
+ for (p = (u_char *)from, t = to->mem; *p != '\0'; ++p, ++t) {
+ if ((*t = *p) != '\\')
+ continue;
+ ++p;
+ if (p[0] != '\\') {
+ if (p[0] == '\0' || p[1] == '\0' || __wt_hex2byte(p, t))
+ return (__hex_fmterr(session));
+ ++p;
+ }
+ }
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
new file mode 100644
index 00000000000..5a06b72d33e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -0,0 +1,899 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */
+
+/* Length of header in compressed message, in bits. */
+#define WT_HUFFMAN_HEADER 3
+
+/*
+ * Maximum allowed length of Huffman code words, which otherwise can range up
+ * to (#symbols - 1) bits long. Use a lower value for less table memory,
+ * a higher value for better compression. Max value = 16 (or 32-7=25 or
+ * 64-7=57 if the data types are adjusted). FYI, JPEG uses 16. A side
+ * effect of limiting the max code length is that the worst-case
+ * compression (a message of the least frequent symbols) is shorter.
+ */
+#define MAX_CODE_LENGTH 16
+
+typedef struct __wt_freqtree_node {
+ /*
+ * Data structure representing a node of the huffman tree. It holds a
+ * 64-bit weight and pointers to the left and right child nodes. The
+ * node either has two child nodes or none.
+ */
+ uint8_t symbol; /* only used in leaf nodes */
+ uint64_t weight;
+ struct __wt_freqtree_node *left; /* bit 0 */
+ struct __wt_freqtree_node *right; /* bit 1 */
+} WT_FREQTREE_NODE;
+
+typedef struct __wt_huffman_code {
+ uint16_t pattern; /* requirement: length of field's type
+ * in bits >= MAX_CODE_LENGTH.
+ */
+ uint8_t length;
+} WT_HUFFMAN_CODE;
+
+typedef struct __wt_huffman_obj {
+ /*
+ * Data structure here defines specific instance of the encoder/decoder.
+ */
+ u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */
+
+ uint16_t max_depth, min_depth; /* Tree max/min depths */
+
+ /*
+ * use: codes[symbol] = struct with pattern and length.
+ * Used in encoding and decoding.
+ * memory: codes[0-to-(number of symbols - 1)]
+ */
+ WT_HUFFMAN_CODE *codes;
+
+ /*
+ * use: code2symbol[Huffman_code] = symbol.
+ * Used in decoding.
+ * memory: code2symbol[1 << max_code_length]
+ */
+ uint8_t *code2symbol;
+} WT_HUFFMAN_OBJ;
+
+/*
+ * Queue element data structure.
+ *
+ * Consists of a pointer to a huffman tree node, and a pointer to the next
+ * element in the queue.
+ */
+typedef struct node_queue_elem {
+ WT_FREQTREE_NODE *node;
+ struct node_queue_elem *next;
+} NODE_QUEUE_ELEM;
+
+/*
+ * Queue of huffman tree nodes.
+ *
+ * Contains a pointer to the beginning and the end of the queue, which is
+ * implemented as a linked list.
+ */
+typedef struct node_queue {
+ NODE_QUEUE_ELEM *first;
+ NODE_QUEUE_ELEM *last;
+} NODE_QUEUE;
+
+/*
+ * Internal data structure used to preserve the symbol when rearranging the
+ * frequency array.
+ */
+typedef struct __indexed_byte {
+ uint32_t symbol; /* not uint8_t: match external data structure */
+ uint32_t frequency;
+} INDEXED_SYMBOL;
+
+static int indexed_freq_compare(const void *, const void *);
+static int indexed_symbol_compare(const void *, const void *);
+static void make_table(
+ WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
+static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *);
+static void node_queue_dequeue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **);
+static int node_queue_enqueue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static uint32_t profile_tree(
+ WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
+static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *);
+static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
+
+#define node_queue_is_empty(queue) \
+ ((queue) == NULL || (queue)->first == NULL)
+
+/*
+ * indexed_symbol_compare --
+ * Qsort comparator to order the table by symbol, lowest to highest.
+ */
+static int
+indexed_symbol_compare(const void *a, const void *b)
+{
+ return (((INDEXED_SYMBOL *)a)->symbol >
+ ((INDEXED_SYMBOL *)b)->symbol ? 1 :
+ (((INDEXED_SYMBOL *)a)->symbol <
+ ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0));
+}
+
+/*
+ * indexed_freq_compare --
+ * Qsort comparator to order the table by frequency (the most frequent
+ * symbols will be at the end of the array).
+ */
+static int
+indexed_freq_compare(const void *a, const void *b)
+{
+ return (((INDEXED_SYMBOL *)a)->frequency >
+ ((INDEXED_SYMBOL *)b)->frequency ? 1 :
+ (((INDEXED_SYMBOL *)a)->frequency <
+ ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0));
+}
+
+/*
+ * profile_tree --
+ * Traverses tree to determine #leaves under each node, max depth, min
+ * depth of leaf.
+ */
+static uint32_t
+profile_tree(WT_FREQTREE_NODE *node,
+ uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
+{
+ uint32_t leaf_cnt;
+
+ if (node->left == NULL && node->right == NULL) { /* leaf */
+ leaf_cnt = 1;
+ if (*max_depth < len)
+ *max_depth = len;
+ if (*min_depth > len)
+ *min_depth = len;
+ } else {
+ /*
+ * internal node -- the way the tree is constructed, an
+ * internal node always has both left and right children
+ */
+ leaf_cnt =
+ profile_tree(node->left, len + 1, max_depth, min_depth) +
+ profile_tree(node->right, len + 1, max_depth, min_depth);
+ }
+ node->weight = leaf_cnt; /* abuse weight field */
+ return (leaf_cnt);
+}
+
+/*
+ * set_codes --
+ * Computes Huffman code for each symbol in tree.
+ *
+ * Method is standard way in the literature, except that limits maximum code
+ * length. A known max code length is important for limiting memory use by
+ * the tables and for knowing how large data types need to be such as the field
+ * that holds the code pattern.
+ */
+static void
+set_codes(WT_FREQTREE_NODE *node,
+ WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
+{
+ WT_HUFFMAN_CODE *code;
+ uint16_t patternleft, patternright, half;
+ uint8_t remaining;
+
+ if (node->left == NULL && node->right == NULL) {
+ code = &codes[node->symbol];
+ code->pattern = pattern;
+ code->length = len;
+#if __HUFFMAN_DETAIL
+ printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n",
+ node->symbol, pattern, len);
+#endif
+ } else {
+ /*
+ * Check each subtree individually to see if we can afford to
+ * split up bits into possibly shorter codes, or if we need to
+ * employ all remaining bits up to MAX_CODE_LENGTH to
+ * consecutively number the leaves.
+ */
+ remaining = MAX_CODE_LENGTH - len;
+ /*
+ * If not already in "low-bit mode", but need to be, open up
+ * lower-order bits for consecutive numbering.
+ */
+ if (len < MAX_CODE_LENGTH &&
+ ((half = 1 << (remaining - 1)) < node->left->weight ||
+ half < node->right->weight)) {
+ pattern = pattern << remaining;
+ len = MAX_CODE_LENGTH;
+ }
+
+ if (len < MAX_CODE_LENGTH) {
+ patternleft = (pattern << 1) | 0;
+ patternright = (pattern << 1) | 1;
+ len++;
+ } else { /* "low bit mode" */
+ patternleft = pattern;
+ patternright = pattern + node->left->weight;
+ /* len unchanged */
+ }
+
+ set_codes(node->left, codes, patternleft, len);
+ set_codes(node->right, codes, patternright, len);
+ }
+}
+
+/*
+ * make_table --
+ * Computes Huffman table used for subsequent lookups in encoding and
+ * decoding. With the table, encoding from a symbol to Huffman code and
+ * decoding from a code to a symbol are simple array lookups.
+ */
+static void
+make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol,
+ uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
+{
+ uint32_t j, c1, c2; /* Exceeds uint16_t bounds at loop boundary. */
+ uint16_t c, i;
+ uint8_t len, shift;
+
+ /* Zero out, for assertion below. */
+ for (j = 0, c2 = (1U << max_depth); j < c2; j++)
+ code2symbol[j] = 0;
+
+ /*
+ * Here's the magic: flood all bit patterns for lower-order bits to
+ * point to the same symbol.
+ */
+ for (i = 0; i < symcnt; i++) {
+ if ((len = codes[i].length) == 0)
+ continue;
+
+ /*
+ * The size of the array index should be enough to hold the
+ * largest index into the symbol table. Pre-existing symbols
+ * were packed 0-255, so 8 bits is enough. We don't want to
+ * make it larger than necessary; we allocate
+ * (2 ^ max-code-length) of them.
+ */
+ c = codes[i].pattern;
+ shift = max_depth - len;
+ c1 = (uint32_t)c << shift;
+ c2 = (uint32_t)(c + 1) << shift;
+ for (j = c1; j < c2; j++) {
+ WT_ASSERT(session, code2symbol[j] == 0);
+ code2symbol[j] = i;
+ }
+ }
+}
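+
+/*
+ * A worked flooding example, not part of the original source: with a
+ * max_depth of 4, a symbol whose code is binary 10 (length 2) has
+ * shift = 2, c1 = 2 << 2 = 8 and c2 = 3 << 2 = 12, so it fills slots 8
+ * through 11 (binary 1000 through 1011) -- any 4-bit lookup whose prefix
+ * is 10 decodes to that symbol.
+ */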
+
+/*
+ * recursive_free_node --
+ * Recursively free the huffman frequency tree's nodes.
+ */
+static void
+recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
+{
+ if (node != NULL) {
+ recursive_free_node(session, node->left);
+ recursive_free_node(session, node->right);
+ __wt_free(session, node);
+ }
+}
+
+/*
+ * __wt_huffman_open --
+ * Take a frequency table and return a pointer to a descriptor object.
+ */
+int
+__wt_huffman_open(WT_SESSION_IMPL *session,
+ void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
+{
+ INDEXED_SYMBOL *indexed_freqs, *sym;
+ NODE_QUEUE *combined_nodes, *leaves;
+ WT_DECL_RET;
+ WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+ WT_HUFFMAN_OBJ *huffman;
+ uint64_t w1, w2;
+ uint16_t i;
+
+ indexed_freqs = symbol_frequency_array;
+
+ combined_nodes = leaves = NULL;
+ node = node2 = tempnode = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &huffman));
+
+ /*
+ * The frequency table is 4B pairs of symbol and frequency. The symbol
+ * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
+ * (a frequency of 0 means the value is never expected to appear in the
+ * input). Validate the symbols are within range.
+ */
+ if (numbytes != 1 && numbytes != 2)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal number of symbol bytes specified for a huffman "
+ "table");
+
+ if (symcnt == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal number of symbols specified for a huffman table");
+
+ huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
+
+ /*
+ * Order the array by symbol and check for invalid symbols and
+ * duplicates.
+ */
+ qsort((void *)indexed_freqs,
+ symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
+ for (i = 0; i < symcnt; ++i) {
+ if (i > 0 &&
+ indexed_freqs[i].symbol == indexed_freqs[i - 1].symbol)
+ WT_ERR_MSG(session, EINVAL,
+ "duplicate symbol %" PRIx32
+ " specified in a huffman table",
+ indexed_freqs[i].symbol);
+ if (indexed_freqs[i].symbol > huffman->numSymbols)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal symbol %" PRIx32
+ " specified in a huffman table",
+ indexed_freqs[i].symbol);
+ }
+
+ /*
+ * Massage frequencies.
+ */
+ indexed_freqs = NULL;
+ WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
+
+ /*
+ * Minimum of frequency==1 so everybody gets a Huffman code, in case
+ * data evolves and we need to represent this value.
+ */
+ for (i = 0; i < 256; i++) {
+ sym = &indexed_freqs[i];
+ sym->symbol = i;
+ sym->frequency = 1;
+ }
+ /*
+ * Avoid large tables by splitting UTF-16 frequencies into high byte
+ * and low byte.
+ */
+ for (i = 0; i < symcnt; i++) {
+ sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i];
+ indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
+ if (numbytes == 2)
+ indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
+ sym->frequency;
+ }
+ huffman->numSymbols = symcnt = 256;
+
+ /*
+ * The array must be sorted by frequency to be able to use a linear time
+ * construction algorithm.
+ */
+ qsort((void *)indexed_freqs,
+ symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
+
+ /* We need two node queues to build the tree. */
+ WT_ERR(__wt_calloc_def(session, 1, &leaves));
+ WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
+
+ /*
+ * Adding the leaves to the queue.
+ *
+ * Discard symbols with a frequency of 0; this assumes these symbols
+ * never occur in the source stream, and the purpose is to reduce the
+ * huffman tree's size.
+ */
+ for (i = 0; i < symcnt; ++i)
+ if (indexed_freqs[i].frequency > 0) {
+ WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+ tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
+ tempnode->weight = indexed_freqs[i].frequency;
+ WT_ERR(node_queue_enqueue(session, leaves, tempnode));
+ tempnode = NULL;
+ }
+
+ while (!node_queue_is_empty(leaves) ||
+ !node_queue_is_empty(combined_nodes)) {
+ /*
+ * We have to get the node with the smaller weight, examining
+ * both queues' first element. We are collecting pairs of these
+ * items by alternating between node and node2:
+ */
+ refnode = !node ? &node : &node2;
+
+ /*
+ * To decide which queue must be used, we get the weights of
+ * the first items from both:
+ */
+ w1 = node_queue_is_empty(leaves) ?
+ UINT64_MAX : leaves->first->node->weight;
+ w2 = node_queue_is_empty(combined_nodes) ?
+ UINT64_MAX : combined_nodes->first->node->weight;
+
+ /*
+ * Based on the two weights we finally can dequeue the smaller
+ * element and place it to the alternating target node pointer:
+ */
+ if (w1 < w2)
+ node_queue_dequeue(session, leaves, refnode);
+ else
+ node_queue_dequeue(session, combined_nodes, refnode);
+
+ /*
+ * In every second run, we have both node and node2 initialized.
+ */
+ if (node != NULL && node2 != NULL) {
+ WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+
+ /* The new weight is the sum of the two weights. */
+ tempnode->weight = node->weight + node2->weight;
+ tempnode->left = node;
+ tempnode->right = node2;
+
+ /* Enqueue it to the combined nodes queue */
+ WT_ERR(node_queue_enqueue(
+ session, combined_nodes, tempnode));
+ tempnode = NULL;
+
+ /* Reset the state pointers */
+ node = node2 = NULL;
+ }
+ }
+
+ /*
+ * The remaining node, in the node variable, is the root of the tree.
+ * Walk the tree to compute the maximum and minimum leaf depths, that
+ * is, the longest and shortest code lengths.
+ */
+ huffman->max_depth = 0;
+ huffman->min_depth = MAX_CODE_LENGTH;
+ (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+ if (huffman->max_depth > MAX_CODE_LENGTH)
+ huffman->max_depth = MAX_CODE_LENGTH;
+
+ WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
+ set_codes(node, huffman->codes, 0, 0);
+
+ WT_ERR(__wt_calloc_def(
+ session, 1U << huffman->max_depth, &huffman->code2symbol));
+ make_table(session, huffman->code2symbol,
+ huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+ {
+ uint8_t symbol;
+ uint32_t weighted_length;
+
+ printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: "
+ "codes %u# * %uB + code2symbol %u# * %uB\n",
+ huffman->min_depth, huffman->max_depth,
+ huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE),
+ 1U << huffman->max_depth, (u_int)sizeof(uint8_t));
+
+ /*
+ * measure quality of computed Huffman codes, for different max bit
+ * lengths (say, 16 vs 24 vs 32)
+ */
+ weighted_length = 0;
+ for (i = 0; i < symcnt; i++) {
+ symbol = indexed_freqs[i].symbol;
+ weighted_length +=
+ indexed_freqs[i].frequency * huffman->codes[symbol].length;
+ printf(
+ "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n",
+ i, symbol,
+ indexed_freqs[i].frequency, huffman->codes[symbol].length);
+ }
+ printf("weighted length of all codes (the smaller the better): "
+ "%" PRIu32 "\n", weighted_length);
+ }
+#endif
+
+ *(void **)retp = huffman;
+
+ if (0) {
+err: if (ret == 0)
+ ret = WT_ERROR;
+ }
+ __wt_free(session, indexed_freqs);
+ if (leaves != NULL)
+ node_queue_close(session, leaves);
+ if (combined_nodes != NULL)
+ node_queue_close(session, combined_nodes);
+ if (node != NULL)
+ recursive_free_node(session, node);
+ if (node2 != NULL)
+ recursive_free_node(session, node2);
+ __wt_free(session, tempnode);
+ if (ret != 0)
+ __wt_huffman_close(session, huffman);
+ return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ * Discard a Huffman descriptor object.
+ */
+void
+__wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
+{
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ __wt_free(session, huffman->code2symbol);
+ __wt_free(session, huffman->codes);
+ __wt_free(session, huffman);
+}
+
+#if __HUFFMAN_DETAIL
+/*
+ * __wt_print_huffman_code --
+ * Prints a symbol's Huffman code.
+ */
+int
+__wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
+{
+ WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ if (symbol >= huffman->numSymbols)
+ printf("symbol %" PRIu16 " out of range\n", symbol);
+ else {
+ code = huffman->codes[symbol];
+ if (code.length == 0)
+ printf(
+ "symbol %" PRIu16 " not defined -- 0 frequency\n",
+ symbol);
+ else
+ /* should print code as binary */
+ printf(
+ "%" PRIu16 " -> code pattern "
+ "%" PRIx16 ", length %" PRIu8 "\n",
+ symbol, code.pattern, code.length);
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ * Take a byte string, encode it into the target.
+ *
+ * Translation from symbol to Huffman code is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol. Then, given a symbol:
+ * pattern = codes[symbol].pattern;
+ * length = codes[symbol].length;
+ *
+ * To encode a byte string, we iterate over the input symbols. For each
+ * symbol, look it up via the table, shift its bits onto a shift register (an
+ * int long enough to hold the longest code word plus up to 7 bits remaining
+ * from the previous symbol), then drain out full bytes. Finally, at the end,
+ * flush the remaining bits and write the header bits.
+ */
+int
+__wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
+ const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+ WT_DECL_RET;
+ WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_ITEM *tmp;
+ size_t max_len, outlen, bytes;
+ uint64_t bitpos;
+ const uint8_t *from;
+ uint8_t len, *out, padding_info, symbol;
+
+ /*
+ * Shift register to accumulate bits from input.
+ * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+ * and preferably in a machine register.
+ */
+ uint32_t bits;
+
+ /* Count of bits in shift register ('bits' above). */
+ uint8_t valid;
+
+ huffman = huffman_arg;
+ from = from_arg;
+ tmp = NULL;
+
+ /*
+ * We don't want to find all of our callers and ensure they don't pass
+ * 0-length byte strings, but there's no reason to do any work.
+ */
+ if (from_len == 0) {
+ to_buf->size = 0;
+ return (0);
+ }
+
+ /*
+ * Compute the largest compressed output size, which is if all symbols
+ * are least frequent and so have largest Huffman codes, and compressed
+ * output may be larger than the input size. This way we don't have to
+ * worry about resizing the buffer during compression. Use the shared
+ * system buffer while compressing, then allocate a new buffer of the
+ * right size and copy the result into it.
+ */
+ max_len = (WT_HUFFMAN_HEADER +
+ from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+
+ /*
+ * Leave the first 3 bits of the encoded value empty, it holds the
+ * number of bits actually used in the last byte of the encoded value.
+ */
+ bits = 0;
+ bitpos = WT_HUFFMAN_HEADER;
+ valid = WT_HUFFMAN_HEADER;
+ out = tmp->mem;
+ for (bytes = 0; bytes < from_len; bytes++) {
+ WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
+
+ symbol = *from++;
+
+ /* Translate symbol into Huffman code and stuff into buffer. */
+ code = huffman->codes[symbol];
+ len = code.length;
+ bits = (bits << len) | code.pattern;
+ valid += len;
+ bitpos += len;
+ while (valid >= 8) {
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ *out++ = (uint8_t)(bits >> (valid - 8));
+ valid -= 8;
+ }
+ }
+ if (valid > 0) { /* Flush shift register. */
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ *out = (uint8_t)(bits << (8 - valid));
+ }
+
+ /*
+ * At this point, bitpos is the total number of used bits (including
+ * the 3 bits at the beginning of the buffer, which we'll set now to
+ * the number of bits used in the last byte). Note if the number of
+ * bits used in the last byte is 8, we set the 3 bits to 0, in other
+ * words, the first 3 bits of the encoded value are the number of bits
+ * used in the last byte, unless they're 0, in which case there are 8
+ * bits used in the last byte.
+ */
+ padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
+ ((uint8_t *)tmp->mem)[0] |= padding_info;
+
+ /* Copy result of exact known size into caller's buffer. */
+ outlen = (uint32_t)((bitpos + 7) / 8);
+ WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+ max_len, outlen);
+#endif
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_huffman_decode --
+ * Take a byte string, decode it into the target.
+ *
+ * Translation from Huffman code to symbol is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
+ * and whose value is the corresponding symbol.
+ * From the symbol, we index into the 'codes' array to get the code length.
+ *
+ * When decoding a message, we don't know where the boundaries are between
+ * codes. The trick is that we collect enough bits for the longest code word,
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits. This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
+ *
+ * For example, we have a table of length 2^max_code_length
+ * (1 << max_code_length). For a code of length max_code_length, the
+ * position code2symbol[code] = symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
+ * And so on, so in general we fill:
+ * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ *
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
+ * We look up the code in the code2symbol table to obtain the symbol.
+ * We look up the symbol in 'codes' to obtain the code length.
+ * Finally, we subtract these bits off the shift register.
+ */
+int
+__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
+ const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+ WT_DECL_RET;
+ WT_ITEM *tmp;
+ WT_HUFFMAN_OBJ *huffman;
+ size_t from_bytes, len, max_len, outlen;
+ uint64_t from_len_bits;
+ uint32_t bits, mask, max;
+ uint16_t pattern;
+ const uint8_t *from;
+ uint8_t padding_info, symbol, *to, valid;
+
+ huffman = huffman_arg;
+ from = from_arg;
+ tmp = NULL;
+
+ /*
+ * We don't want to find all of our callers and ensure they don't pass
+ * 0-length byte strings, but there's no reason to do any work.
+ */
+ if (from_len == 0) {
+ to_buf->size = 0;
+ return (0);
+ }
+
+ /*
+ * The first 3 bits are the number of used bits in the last byte, unless
+ * they're 0, in which case there are 8 bits used in the last byte.
+ */
+ padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
+ from_len_bits = from_len * 8;
+ if (padding_info != 0)
+ from_len_bits -= 8U - padding_info;
+
+ /* Number of bits that have codes. */
+ from_len_bits -= WT_HUFFMAN_HEADER;
+
+ /*
+ * Compute largest uncompressed output size, which is if all symbols are
+ * most frequent and so have smallest Huffman codes and therefore
+ * largest expansion. Use the shared system buffer while uncompressing,
+ * then allocate a new buffer of exactly the right size and copy the
+ * result into it.
+ */
+ max_len = (uint32_t)(from_len_bits / huffman->min_depth);
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+ to = tmp->mem;
+
+ /* The first byte of input is a special case because of header bits. */
+ bits = *from++;
+ valid = 8 - WT_HUFFMAN_HEADER;
+ from_bytes = from_len - 1;
+
+ max = huffman->max_depth;
+ mask = (1U << max) - 1;
+ for (outlen = 0; from_len_bits > 0; outlen++) {
+ while (valid < max && from_bytes > 0) {
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(from, from_arg, from_len));
+ bits = (bits << 8) | *from++;
+ valid += 8;
+ from_bytes--;
+ }
+ pattern = valid >= max ? /* short patterns near end */
+ (bits >> (valid - max)) : (bits << (max - valid));
+ symbol = huffman->code2symbol[pattern & mask];
+ len = huffman->codes[symbol].length;
+ valid -= len;
+ WT_ASSERT(session, from_len_bits >= len);
+ from_len_bits -= len;
+
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
+ *to++ = symbol;
+ }
+
+ /* Return the number of bytes used. */
+ WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+ max_len, outlen);
+#endif
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
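+
+/*
+ * Illustrative round trip, not part of the original source (huffman is a
+ * descriptor previously returned by __wt_huffman_open):
+ *
+ *	WT_ITEM enc, dec;
+ *
+ *	WT_CLEAR(enc);
+ *	WT_CLEAR(dec);
+ *	WT_RET(__wt_huffman_encode(session, huffman, src, src_len, &enc));
+ *	WT_RET(__wt_huffman_decode(session, huffman, enc.mem, enc.size, &dec));
+ *
+ * dec now holds src_len bytes equal to src; both buffers are eventually
+ * released with __wt_buf_free.
+ */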
+
+/*
+ * node_queue_close --
+ * Delete a queue from memory.
+ *
+ * It does not free the huffman tree nodes the elements point to!
+ */
+static void
+node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
+{
+ NODE_QUEUE_ELEM *elem, *next_elem;
+
+ /* Freeing each element of the queue's linked list. */
+ for (elem = queue->first; elem != NULL; elem = next_elem) {
+ next_elem = elem->next;
+ __wt_free(session, elem);
+ }
+
+ /* Freeing the queue record itself. */
+ __wt_free(session, queue);
+}
+
+/*
+ * node_queue_enqueue --
+ * Push a tree node to the end of the queue.
+ */
+static int
+node_queue_enqueue(
+ WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+ NODE_QUEUE_ELEM *elem;
+
+ /* Allocating a new linked list element */
+ WT_RET(__wt_calloc_def(session, 1, &elem));
+
+ /* It holds the tree node, and has no next element yet */
+ elem->node = node;
+ elem->next = NULL;
+
+ /* If the queue is empty, the first element will be the new one. */
+ if (queue->first == NULL)
+ queue->first = elem;
+
+ /*
+ * If the queue is not empty, the last element's next pointer must be
+ * updated.
+ */
+ if (queue->last != NULL)
+ queue->last->next = elem;
+
+ /* The last element is the new one */
+ queue->last = elem;
+
+ return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ * Removes a node from the beginning of the queue and copies the node's
+ * pointer to the location referred by the retp parameter.
+ */
+static void
+node_queue_dequeue(
+ WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+ NODE_QUEUE_ELEM *first_elem;
+
+ /*
+ * Getting the first element of the queue and updating it to point to
+ * the next element as first.
+ */
+ first_elem = queue->first;
+ *retp = first_elem->node;
+ queue->first = first_elem->next;
+
+ /*
+	 * If the dequeued element was also the last one, clear the queue's
+	 * last pointer.
+ */
+ if (queue->last == first_elem)
+ queue->last = NULL;
+
+	/* Free the linked-list element that has been dequeued. */
+ __wt_free(session, first_elem);
+}
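+
+/*
+ * A minimal usage sketch of the queue helpers above (hypothetical caller;
+ * error handling and tree construction omitted):
+ *
+ *	NODE_QUEUE *queue;
+ *	WT_FREQTREE_NODE *node;
+ *
+ *	WT_RET(__wt_calloc_def(session, 1, &queue));
+ *	WT_RET(node_queue_enqueue(session, queue, root));
+ *	while (queue->first != NULL) {
+ *		node_queue_dequeue(session, queue, &node);
+ *		(visit node, enqueue its children)
+ *	}
+ *	node_queue_close(session, queue);
+ */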
diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c
new file mode 100644
index 00000000000..ffe52cf28fd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/mutex.c
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_lock_register_lock --
+ * Add a lock to the connection's list.
+ */
+int
+__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_CONNECTION_IMPL *conn;
+ u_int i;
+
+ /*
+ * There is a spinlock we initialize before we have a connection, the
+ * global library lock. In that case, the session will be NULL and
+ * we can't track the lock.
+ */
+ if (session == NULL)
+ return (0);
+
+ conn = S2C(session);
+
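+	/*
+	 * Claim a free slot with an atomic compare-and-swap, so two threads
+	 * registering locks concurrently can't both take the same slot.
+	 */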
+ for (i = 0; i < WT_SPINLOCK_MAX; i++)
+ if (conn->spinlock_list[i] == NULL &&
+ WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t))
+ return (0);
+
+ WT_RET_MSG(session, ENOMEM,
+ "spinlock connection registry failed, increase the connection's "
+ "spinlock list size");
+}
+
+/*
+ * __wt_spin_lock_unregister_lock --
+ * Remove a lock from the connection's list.
+ */
+void
+__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_CONNECTION_IMPL *conn;
+ u_int i;
+
+ conn = S2C(session);
+
+ for (i = 0; i < WT_SPINLOCK_MAX; i++)
+ if (conn->spinlock_list[i] == t)
+ conn->spinlock_list[i] = NULL;
+
+ /*
+ * XXX
+ * The statistics thread reads through this array, there's a possible
+ * race: if that thread reads the pointer then goes to sleep, then we
+ * free the spinlock, then the statistics thread wakes up, it can read
+ * free'd memory.
+ *
+	 * This is performance debugging code, so we're not fixing the race
+	 * for now; just minimize the window.
+ */
+ WT_FULL_BARRIER();
+}
+
+/*
+ * __spin_lock_next_id --
+ * Return the next spinlock caller ID.
+ */
+static int
+__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
+{
+ static int lock_id = 0, next_id = 0;
+ WT_DECL_RET;
+
+ /* If we've ever registered this location, we already have an ID. */
+ if (*idp != WT_SPINLOCK_REGISTER)
+ return (0);
+
+ /*
+	 * We can't use the global spinlock to lock the ID allocation (duh!);
+	 * use a CAS instruction to serialize access to a static variable.
+	 * This work only gets done once per library instantiation, so there
+	 * isn't a performance concern.
+ */
+ while (!WT_ATOMIC_CAS(lock_id, 0, 1))
+ __wt_yield();
+
+	/*
+	 * Allocate a blocking ID for this location; re-check under the
+	 * latch, since another thread may have raced us here and already
+	 * set it.
+	 */
+ if (*idp == WT_SPINLOCK_REGISTER) {
+ if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
+ *idp = next_id++;
+ else
+ WT_ERR_MSG(session, ENOMEM,
+ "spinlock caller location registry failed, "
+ "increase the connection's blocking matrix size");
+ }
+
+err: WT_PUBLISH(lock_id, 0);
+ return (ret);
+}
+
+/*
+ * __wt_spin_lock_register_caller --
+ * Register a spin-lock caller's location information in the blocking
+ * matrix.
+ */
+int
+__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
+ const char *name, const char *file, int line, int *idp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS_SPINLOCK *p;
+
+ conn = S2C(session);
+
+ /*
+ * The caller's location ID is a static offset into a per-connection
+ * structure, and that has problems: first, if there are multiple
+ * connections, we'll need to hold some kind of lock to avoid racing
+ * when setting that value, and second, if/when there are multiple
+ * connections and/or a single connection is closed and re-opened, the
+ * variable may be initialized and underlying connection information
+ * may not.
+ *
+ * First, allocate a location ID if needed.
+ */
+ WT_RET(__spin_lock_next_id(session, idp));
+
+ /*
+ * Add the caller's information to the blocking matrix. We could race
+ * here (if two threads of control register the same lock at the same
+ * time), but we don't care as both threads are setting the identical
+ * information.
+ */
+ p = &conn->spinlock_block[*idp];
+ p->name = name;
+ if ((p->file = strrchr(file, '/')) == NULL)
+ p->file = file;
+ else
+ ++p->file;
+ p->line = line;
+ return (0);
+}
+
+/*
+ * __wt_statlog_dump_spinlock --
+ * Log the spin-lock statistics.
+ */
+int
+__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
+{
+ WT_SPINLOCK *spin;
+ WT_CONNECTION_STATS_SPINLOCK *p, *t;
+ uint64_t block_manager, btree_page, ignore;
+ u_int i, j;
+
+ /*
+	 * Ignore rare acquisitions of a spinlock (using a base rate of 10
+	 * per second), so we don't create graphs we don't care about.
+ */
+ ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;
+
+ /* Output the number of times each spinlock was acquired. */
+ block_manager = btree_page = 0;
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
+ if ((spin = conn->spinlock_list[i]) == NULL)
+ continue;
+
+ /*
+		 * There are two sets of spinlocks we aggregate: the btree
+		 * page locks and the block manager per-file locks.  The
+		 * reason is that the block manager locks grow with the
+		 * number of files open (and LSM and bloom filters can open
+		 * a lot of files), while there are 16 btree page locks and
+		 * splitting them out has not historically been informative.
+ */
+ if (strcmp(spin->name, "block manager") == 0) {
+ block_manager += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+ if (strcmp(spin->name, "btree page") == 0) {
+ btree_page += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ spin->counter <= ignore ? 0 : spin->counter,
+ tag, spin->name) < 0),
+ __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ }
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ block_manager <= ignore ? 0 : block_manager,
+ tag, "block manager") < 0),
+ __wt_errno());
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ btree_page <= ignore ? 0 : btree_page,
+ tag, "btree page") < 0),
+ __wt_errno());
+
+ /*
+ * Output the number of times each location acquires its spinlock and
+ * the blocking matrix.
+ */
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
+ p = &conn->spinlock_block[i];
+ if (p->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s acquired by %s(%d)\n",
+ conn->stat_stamp,
+ p->total <= ignore ? 0 : p->total,
+ tag,
+ p->name, p->file, p->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->total = 0;
+
+ for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
+ t = &conn->spinlock_block[j];
+ if (t->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
+ conn->stat_stamp,
+ p->blocked[j] <= ignore ? 0 : p->blocked[j],
+ tag,
+ p->name, p->file, p->line,
+ t->file, t->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->blocked[j] = 0;
+ }
+ }
+
+ WT_FULL_BARRIER(); /* Minimize the window. */
+ return (0);
+}
+
+#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
new file mode 100644
index 00000000000..a6bf6c7227f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -0,0 +1,130 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __WIREDTIGER_UNUSED__
+
+/*
+ * __wt_nlpo2_round --
+ * Round up to the next-largest power-of-two for a 32-bit unsigned value.
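+ *
+ * For example, __wt_nlpo2_round(1000) returns 1024 and, because of the
+ * initial decrement, __wt_nlpo2_round(1024) returns 1024 itself.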
+ *
+ * In 12 operations, this code computes the next highest power of 2 for a 32-bit
+ * integer. The result may be expressed by the formula 1U << (lg(v - 1) + 1).
+ * Note that in the edge case where v is 0, it returns 0, which isn't a power of
+ * 2; you might append the expression v += (v == 0) to remedy this if it
+ * matters. It would be faster by 2 operations to use the formula and the
+ * log base 2 method that uses a lookup table, but in some situations, lookup
+ * tables are not suitable, so the above code may be best. (On an Athlon XP 2100+
+ * I've found the above shift-left and then OR code is as fast as using a single
+ * BSR assembly language instruction, which scans in reverse to find the highest
+ * set bit.) It works by copying the highest set bit to all of the lower bits,
+ * and then adding one, which results in carries that set all of the lower bits
+ * to 0 and one bit beyond the highest set bit to 1. If the original number was
+ * a power of 2, then the decrement will reduce it to one less, so that we round
+ * up to the same original value. Devised by Sean Anderson, September 14, 2001.
+ * Pete Hart pointed me to a couple of newsgroup posts by him and William Lewis in
+ * February of 1997, where they arrive at the same algorithm.
+ * http://graphics.stanford.edu/~seander/bithacks.html
+ * Sean Eron Anderson, seander@cs.stanford.edu
+ */
+uint32_t
+__wt_nlpo2_round(uint32_t v)
+{
+ v--; /* If v is a power-of-two, return it. */
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return (v + 1);
+}
+
+/*
+ * __wt_nlpo2 --
+ * Return the next largest power-of-two.
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return (v + 1);
+}
+#endif /* __WIREDTIGER_UNUSED__ */
+
+/*
+ * __wt_log2_int --
+ * Find the log base 2 of an integer in O(N) operations;
+ * http://graphics.stanford.edu/~seander/bithacks.html
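+ *
+ * For example, __wt_log2_int(1000) returns 9 (2^9 = 512 <= 1000 < 1024),
+ * and __wt_log2_int(0) returns 0.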
+ */
+uint32_t
+__wt_log2_int(uint32_t n)
+{
+ uint32_t l = 0;
+
+ while (n >>= 1)
+ l++;
+ return (l);
+}
+
+/*
+ * __wt_ispo2 --
+ * Return if a number is a power-of-two.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+	/*
+	 * Only numbers that are powers of two satisfy the relationship
+	 * ((v & (v - 1)) == 0).
+	 *
+	 * However, 0 also satisfies it, so this incorrectly reports 0 as a
+	 * power of 2; if that matters, use: (v != 0 && !(v & (v - 1)))
+	 */
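+	/* For example: v = 64 is a power of 2, v = 48 is not, v = 0 reports true. */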
+ return ((v & (v - 1)) == 0);
+}
+
+/*
+ * __wt_rduppo2 --
+ *	Round the given int up to the next multiple of N, where N is a power of 2.
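+ *
+ * For example, __wt_rduppo2(1000, 512) returns 1024; if po2 is not a power
+ * of 2, the function returns 0.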
+ */
+uint32_t
+__wt_rduppo2(uint32_t n, uint32_t po2)
+{
+ uint32_t bits, res;
+
+ if (__wt_ispo2(po2)) {
+ bits = __wt_log2_int(po2);
+ res = (((n - 1) >> bits) + 1) << bits;
+ } else
+ res = 0;
+ return (res);
+}
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
new file mode 100644
index 00000000000..b716eb8c58b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -0,0 +1,69 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#undef M_W
+#define M_W (rnd)[0]
+#undef M_Z
+#define M_Z (rnd)[1]
+
+/*
+ * __wt_random_init --
+ *	Initialize the state for 32-bit pseudo-random number generation.
+ */
+void
+__wt_random_init(uint32_t *rnd)
+{
+ M_W = 521288629;
+ M_Z = 362436069;
+}
+
+/*
+ * __wt_random --
+ * Return a 32-bit pseudo-random number.
+ *
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
+ * random number generator. Computationally fast, with reasonable randomness
+ * properties.
+ *
+ * We have to be very careful about races here. Multiple threads can call
+ * __wt_random concurrently, and it is okay if those concurrent calls get the
+ * same return value. What is *not* okay is if reading the shared state races
+ * with an update and uses two different values for m_w or m_z. That could
+ * result in a state value of zero, in which case the generator would be
+ * stuck on zero forever. Take local copies of the shared values to avoid
+ * this.
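+ *
+ * Typical usage (sketch):
+ *
+ *	uint32_t rnd[2], r;
+ *	__wt_random_init(rnd);
+ *	r = __wt_random(rnd);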
+ */
+uint32_t
+__wt_random(uint32_t *rnd)
+{
+ uint32_t w = M_W, z = M_Z;
+
+ M_Z = z = 36969 * (z & 65535) + (z >> 16);
+ M_W = w = 18000 * (w & 65535) + (w >> 16);
+ return (z << 16) + (w & 65535);
+}
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
new file mode 100644
index 00000000000..ca2cdac8377
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -0,0 +1,319 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_buf_grow_worker --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer.
+ */
+int
+__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ size_t offset;
+ int copy_data;
+
+ /*
+ * Maintain the existing data: there are 3 cases:
+ * No existing data: allocate the required memory, and initialize
+ * the data to reference it.
+ * Existing data local to the buffer: set the data to the same
+ * offset in the re-allocated memory.
+ * Existing data not-local to the buffer: copy the data into the
+ * buffer and set the data to reference it.
+ */
+ if (WT_DATA_IN_ITEM(buf)) {
+ offset = WT_PTRDIFF(buf->data, buf->mem);
+ copy_data = 0;
+ } else {
+ offset = 0;
+ copy_data = buf->size ? 1 : 0;
+ }
+
+ /*
+ * This function is also used to ensure data is local to the buffer,
+ * check to see if we actually need to grow anything.
+ */
+ if (size > buf->memsize) {
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ WT_RET(__wt_realloc_aligned(
+ session, &buf->memsize, size, &buf->mem));
+ else
+ WT_RET(__wt_realloc(
+ session, &buf->memsize, size, &buf->mem));
+ }
+
+ if (buf->data == NULL) {
+ buf->data = buf->mem;
+ buf->size = 0;
+ } else {
+ if (copy_data)
+ memcpy(buf->mem, buf->data, buf->size);
+ buf->data = (uint8_t *)buf->mem + offset;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_buf_fmt --
+ * Grow a buffer to accommodate a formatted string.
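+ *
+ * For example (sketch), __wt_buf_fmt(session, buf, "%s.%d", name, id)
+ * leaves the formatted, NUL-terminated string in buf->data with buf->size
+ * set to its length, not counting the terminating NUL byte.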
+ */
+int
+__wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+ size_t len;
+
+ for (;;) {
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(buf->mem, buf->memsize, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < buf->memsize) {
+ buf->data = buf->mem;
+ buf->size = len;
+ return (0);
+ }
+
+ /*
+ * If not, double the size of the buffer: we're dealing with
+ * strings, and we don't expect these numbers to get huge.
+ */
+ WT_RET(__wt_buf_extend(session, buf, len + 1));
+ }
+}
+
+/*
+ * __wt_buf_catfmt --
+ * Grow a buffer to append a formatted string.
+ */
+int
+__wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+ size_t len, space;
+ char *p;
+
+ /*
+ * If we're appending data to an existing buffer, any data field should
+ * point into the allocated memory.  (It wouldn't be insane to copy any
+ * previously existing data at this point if the data wasn't in the local
+ * buffer, but we don't, and silently missing that case would be a bug.)
+ */
+ WT_ASSERT(session, buf->data == NULL || WT_DATA_IN_ITEM(buf));
+
+ for (;;) {
+ va_start(ap, fmt);
+ p = (char *)((uint8_t *)buf->mem + buf->size);
+ WT_ASSERT(session, buf->memsize >= buf->size);
+ space = buf->memsize - buf->size;
+ len = (size_t)vsnprintf(p, (size_t)space, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < space) {
+ buf->size += len;
+ return (0);
+ }
+
+ /*
+ * If not, double the size of the buffer: we're dealing with
+ * strings, and we don't expect these numbers to get huge.
+ */
+ WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1));
+ }
+}
+
+/*
+ * __wt_scr_alloc_func --
+ * Scratch buffer allocation function.
+ */
+int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, **p, **best, **slot;
+ size_t allocated;
+ u_int i;
+
+ /* Don't risk the caller not catching the error. */
+ *scratchp = NULL;
+
+ /*
+ * Each WT_SESSION_IMPL has an array of scratch buffers available for
+ * use by any function. We use WT_ITEM structures for scratch memory
+ * because we already have functions that do variable-length allocation
+ * on a WT_ITEM. Scratch buffers are allocated only by a single thread
+ * of control, so no locking is necessary.
+ *
+ * Walk the array, looking for a buffer we can use.
+ */
+ for (i = 0, best = slot = NULL,
+ p = session->scratch; i < session->scratch_alloc; ++i, ++p) {
+ /* If we find an empty slot, remember it. */
+ if ((buf = *p) == NULL) {
+ if (slot == NULL)
+ slot = p;
+ continue;
+ }
+
+ if (F_ISSET(buf, WT_ITEM_INUSE))
+ continue;
+
+ /*
+ * If we find a buffer that's not in-use, check its size: we
+ * want the smallest buffer larger than the requested size,
+ * or the largest buffer if none are large enough.
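+		 *
+		 * For example (illustrative sizes): given free buffers of
+		 * 512B and 4KB and a 1KB request, the 4KB buffer is chosen;
+		 * given only 512B and 256B buffers, the 512B one is chosen
+		 * and grown below.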
+ */
+ if (best == NULL ||
+ ((*best)->memsize < size &&
+ buf->memsize > (*best)->memsize) ||
+ (buf->memsize >= size && buf->memsize < (*best)->memsize))
+ best = p;
+
+ /* If we find a perfect match, use it. */
+ if ((*best)->memsize == size)
+ break;
+ }
+
+ /*
+ * If we didn't find a free buffer, extend the array and use the first
+ * slot we allocated.
+ */
+ if (best == NULL && slot == NULL) {
+ allocated = session->scratch_alloc * sizeof(WT_ITEM *);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_ITEM *),
+ &session->scratch));
+#ifdef HAVE_DIAGNOSTIC
+ allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK),
+ &session->scratch_track));
+#endif
+ slot = session->scratch + session->scratch_alloc;
+ session->scratch_alloc += 10;
+ }
+
+ /*
+	 * If slot is non-NULL, we found an empty slot: try to allocate a
+	 * new buffer for it.
+ */
+ if (best == NULL) {
+ WT_ASSERT(session, slot != NULL);
+ best = slot;
+
+ WT_ERR(__wt_calloc_def(session, 1, best));
+
+ /* Scratch buffers must be aligned. */
+ F_SET(*best, WT_ITEM_ALIGNED);
+ }
+
+ /* Grow the buffer as necessary and return. */
+ WT_ERR(__wt_buf_init(session, *best, size));
+ F_SET(*best, WT_ITEM_INUSE);
+
+#ifdef HAVE_DIAGNOSTIC
+ session->scratch_track[best - session->scratch].file = file;
+ session->scratch_track[best - session->scratch].line = line;
+#endif
+
+ *scratchp = *best;
+ return (0);
+
+err: WT_RET_MSG(session, ret,
+ "session unable to allocate a scratch buffer");
+}
+
+/*
+ * __wt_scr_discard --
+ * Free all memory associated with the scratch buffers.
+ */
+void
+__wt_scr_discard(WT_SESSION_IMPL *session)
+{
+ WT_ITEM **bufp;
+ u_int i;
+
+ for (i = 0,
+ bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) {
+ if (*bufp == NULL)
+ continue;
+ if (F_ISSET(*bufp, WT_ITEM_INUSE))
+ __wt_errx(session,
+ "scratch buffer allocated and never discarded"
+#ifdef HAVE_DIAGNOSTIC
+ ": %s: %d",
+ session->
+ scratch_track[bufp - session->scratch].file,
+ session->
+ scratch_track[bufp - session->scratch].line
+#endif
+ );
+
+ __wt_buf_free(session, *bufp);
+ __wt_free(session, *bufp);
+ }
+
+ __wt_free(session, session->scratch);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_free(session, session->scratch_track);
+#endif
+}
+
+/*
+ * __wt_ext_scr_alloc --
+ * Allocate a scratch buffer, and return the memory reference.
+ */
+void *
+__wt_ext_scr_alloc(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size)
+{
+ WT_ITEM *buf;
+ WT_SESSION_IMPL *session;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ return (__wt_scr_alloc(session, size, &buf) == 0 ? buf->mem : NULL);
+}
+
+/*
+ * __wt_ext_scr_free --
+ * Free a scratch buffer based on the memory reference.
+ */
+void
+__wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p)
+{
+ WT_ITEM **bufp;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ for (i = 0,
+ bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp)
+ if (*bufp != NULL && (*bufp)->mem == p) {
+ /*
+ * Do NOT call __wt_scr_free() here, it clears the
+ * caller's pointer, which would truncate the list.
+ */
+ F_CLR(*bufp, WT_ITEM_INUSE);
+ return;
+ }
+ __wt_errx(session, "extension free'd non-existent scratch buffer");
+}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
new file mode 100644
index 00000000000..bc468fbe938
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -0,0 +1,567 @@
+/* DO NOT EDIT: automatically built by dist/stat.py. */
+
+#include "wt_internal.h"
+
+void
+__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->allocation_size.desc =
+ "block manager: file allocation unit size";
+ stats->block_alloc.desc = "block manager: blocks allocated";
+ stats->block_checkpoint_size.desc = "block manager: checkpoint size";
+ stats->block_extension.desc =
+ "block manager: allocations requiring file extension";
+ stats->block_free.desc = "block manager: blocks freed";
+ stats->block_magic.desc = "block manager: file magic number";
+ stats->block_major.desc = "block manager: file major version number";
+	stats->block_minor.desc = "block manager: file minor version number";
+ stats->block_reuse_bytes.desc =
+ "block manager: file bytes available for reuse";
+ stats->block_size.desc = "block manager: file size in bytes";
+ stats->bloom_count.desc = "LSM: bloom filters in the LSM tree";
+ stats->bloom_false_positive.desc = "LSM: bloom filter false positives";
+ stats->bloom_hit.desc = "LSM: bloom filter hits";
+ stats->bloom_miss.desc = "LSM: bloom filter misses";
+ stats->bloom_page_evict.desc =
+ "LSM: bloom filter pages evicted from cache";
+ stats->bloom_page_read.desc =
+ "LSM: bloom filter pages read into cache";
+ stats->bloom_size.desc = "LSM: total size of bloom filters";
+ stats->btree_column_deleted.desc =
+ "btree: column-store variable-size deleted values";
+ stats->btree_column_fix.desc =
+ "btree: column-store fixed-size leaf pages";
+ stats->btree_column_internal.desc =
+ "btree: column-store internal pages";
+ stats->btree_column_variable.desc =
+ "btree: column-store variable-size leaf pages";
+ stats->btree_compact_rewrite.desc =
+ "btree: pages rewritten by compaction";
+ stats->btree_entries.desc = "btree: number of key/value pairs";
+ stats->btree_fixed_len.desc = "btree: fixed-record size";
+ stats->btree_maximum_depth.desc = "btree: maximum tree depth";
+ stats->btree_maxintlitem.desc =
+ "btree: maximum internal page item size";
+ stats->btree_maxintlpage.desc = "btree: maximum internal page size";
+ stats->btree_maxleafitem.desc = "btree: maximum leaf page item size";
+ stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
+ stats->btree_overflow.desc = "btree: overflow pages";
+ stats->btree_row_internal.desc = "btree: row-store internal pages";
+ stats->btree_row_leaf.desc = "btree: row-store leaf pages";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: data source pages selected for eviction unable to be evicted";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_overflow_value.desc =
+ "cache: overflow values cached in memory";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_read_overflow.desc =
+ "cache: overflow pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->compress_raw_fail.desc =
+ "compression: raw compression call failed, no additional data available";
+ stats->compress_raw_fail_temporary.desc =
+ "compression: raw compression call failed, additional data available";
+ stats->compress_raw_ok.desc =
+ "compression: raw compression call succeeded";
+ stats->compress_read.desc = "compression: compressed pages read";
+ stats->compress_write.desc = "compression: compressed pages written";
+ stats->compress_write_fail.desc =
+ "compression: page written failed to compress";
+ stats->compress_write_too_small.desc =
+ "compression: page written was too small to compress";
+ stats->cursor_create.desc = "cursor: create calls";
+ stats->cursor_insert.desc = "cursor: insert calls";
+ stats->cursor_insert_bulk.desc =
+ "cursor: bulk-loaded cursor-insert calls";
+ stats->cursor_insert_bytes.desc =
+ "cursor: cursor-insert key and value bytes inserted";
+ stats->cursor_next.desc = "cursor: next calls";
+ stats->cursor_prev.desc = "cursor: prev calls";
+ stats->cursor_remove.desc = "cursor: remove calls";
+ stats->cursor_remove_bytes.desc =
+ "cursor: cursor-remove key bytes removed";
+ stats->cursor_reset.desc = "cursor: reset calls";
+ stats->cursor_search.desc = "cursor: search calls";
+ stats->cursor_search_near.desc = "cursor: search near calls";
+ stats->cursor_update.desc = "cursor: update calls";
+ stats->cursor_update_bytes.desc =
+ "cursor: cursor-update value bytes updated";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree";
+ stats->lsm_generation_max.desc =
+ "LSM: highest merge generation in the LSM tree";
+ stats->lsm_lookup_no_bloom.desc =
+ "LSM: queries that could have benefited from a Bloom filter that did not exist";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->rec_dictionary.desc = "reconciliation: dictionary matches";
+ stats->rec_multiblock_internal.desc =
+ "reconciliation: internal page multi-block writes";
+ stats->rec_multiblock_leaf.desc =
+ "reconciliation: leaf page multi-block writes";
+ stats->rec_multiblock_max.desc =
+ "reconciliation: maximum blocks required for a page";
+ stats->rec_overflow_key_internal.desc =
+ "reconciliation: internal-page overflow keys";
+ stats->rec_overflow_key_leaf.desc =
+ "reconciliation: leaf-page overflow keys";
+ stats->rec_overflow_value.desc =
+ "reconciliation: overflow values written";
+ stats->rec_page_delete.desc = "reconciliation: pages deleted";
+ stats->rec_page_match.desc = "reconciliation: page checksum matches";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_prefix_compression.desc =
+ "reconciliation: leaf page key bytes discarded using prefix compression";
+ stats->rec_suffix_compression.desc =
+ "reconciliation: internal page key bytes discarded using suffix compression";
+ stats->session_compact.desc = "session: object compaction";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->txn_update_conflict.desc = "txn: update conflicts";
+}
+
+void
+__wt_stat_refresh_dsrc_stats(void *stats_arg)
+{
+ WT_DSRC_STATS *stats;
+
+ stats = (WT_DSRC_STATS *)stats_arg;
+ stats->allocation_size.v = 0;
+ stats->block_alloc.v = 0;
+ stats->block_checkpoint_size.v = 0;
+ stats->block_extension.v = 0;
+ stats->block_free.v = 0;
+ stats->block_magic.v = 0;
+ stats->block_major.v = 0;
+ stats->block_minor.v = 0;
+ stats->block_reuse_bytes.v = 0;
+ stats->block_size.v = 0;
+ stats->bloom_count.v = 0;
+ stats->bloom_false_positive.v = 0;
+ stats->bloom_hit.v = 0;
+ stats->bloom_miss.v = 0;
+ stats->bloom_page_evict.v = 0;
+ stats->bloom_page_read.v = 0;
+ stats->bloom_size.v = 0;
+ stats->btree_column_deleted.v = 0;
+ stats->btree_column_fix.v = 0;
+ stats->btree_column_internal.v = 0;
+ stats->btree_column_variable.v = 0;
+ stats->btree_compact_rewrite.v = 0;
+ stats->btree_entries.v = 0;
+ stats->btree_fixed_len.v = 0;
+ stats->btree_maximum_depth.v = 0;
+ stats->btree_maxintlitem.v = 0;
+ stats->btree_maxintlpage.v = 0;
+ stats->btree_maxleafitem.v = 0;
+ stats->btree_maxleafpage.v = 0;
+ stats->btree_overflow.v = 0;
+ stats->btree_row_internal.v = 0;
+ stats->btree_row_leaf.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_overflow_value.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_read_overflow.v = 0;
+ stats->cache_write.v = 0;
+ stats->compress_raw_fail.v = 0;
+ stats->compress_raw_fail_temporary.v = 0;
+ stats->compress_raw_ok.v = 0;
+ stats->compress_read.v = 0;
+ stats->compress_write.v = 0;
+ stats->compress_write_fail.v = 0;
+ stats->compress_write_too_small.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_insert_bulk.v = 0;
+ stats->cursor_insert_bytes.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_remove_bytes.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->cursor_update_bytes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_chunk_count.v = 0;
+ stats->lsm_generation_max.v = 0;
+ stats->lsm_lookup_no_bloom.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->rec_dictionary.v = 0;
+ stats->rec_multiblock_internal.v = 0;
+ stats->rec_multiblock_leaf.v = 0;
+ stats->rec_multiblock_max.v = 0;
+ stats->rec_overflow_key_internal.v = 0;
+ stats->rec_overflow_key_leaf.v = 0;
+ stats->rec_overflow_value.v = 0;
+ stats->rec_page_delete.v = 0;
+ stats->rec_page_match.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rec_prefix_compression.v = 0;
+ stats->rec_suffix_compression.v = 0;
+ stats->session_compact.v = 0;
+ stats->txn_update_conflict.v = 0;
+}
+
+void
+__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
+{
+ WT_DSRC_STATS *c, *p;
+
+ c = (WT_DSRC_STATS *)child;
+ p = (WT_DSRC_STATS *)parent;
+ p->block_alloc.v += c->block_alloc.v;
+ p->block_checkpoint_size.v += c->block_checkpoint_size.v;
+ p->block_extension.v += c->block_extension.v;
+ p->block_free.v += c->block_free.v;
+ p->block_reuse_bytes.v += c->block_reuse_bytes.v;
+ p->block_size.v += c->block_size.v;
+ p->bloom_count.v += c->bloom_count.v;
+ p->bloom_false_positive.v += c->bloom_false_positive.v;
+ p->bloom_hit.v += c->bloom_hit.v;
+ p->bloom_miss.v += c->bloom_miss.v;
+ p->bloom_page_evict.v += c->bloom_page_evict.v;
+ p->bloom_page_read.v += c->bloom_page_read.v;
+ p->bloom_size.v += c->bloom_size.v;
+ p->btree_column_deleted.v += c->btree_column_deleted.v;
+ p->btree_column_fix.v += c->btree_column_fix.v;
+ p->btree_column_internal.v += c->btree_column_internal.v;
+ p->btree_column_variable.v += c->btree_column_variable.v;
+ p->btree_compact_rewrite.v += c->btree_compact_rewrite.v;
+ p->btree_entries.v += c->btree_entries.v;
+ if (c->btree_maximum_depth.v > p->btree_maximum_depth.v)
+ p->btree_maximum_depth.v = c->btree_maximum_depth.v;
+ p->btree_overflow.v += c->btree_overflow.v;
+ p->btree_row_internal.v += c->btree_row_internal.v;
+ p->btree_row_leaf.v += c->btree_row_leaf.v;
+ p->cache_bytes_read.v += c->cache_bytes_read.v;
+ p->cache_bytes_write.v += c->cache_bytes_write.v;
+ p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v;
+ p->cache_eviction_clean.v += c->cache_eviction_clean.v;
+ p->cache_eviction_dirty.v += c->cache_eviction_dirty.v;
+ p->cache_eviction_fail.v += c->cache_eviction_fail.v;
+ p->cache_eviction_hazard.v += c->cache_eviction_hazard.v;
+ p->cache_eviction_internal.v += c->cache_eviction_internal.v;
+ p->cache_overflow_value.v += c->cache_overflow_value.v;
+ p->cache_read.v += c->cache_read.v;
+ p->cache_read_overflow.v += c->cache_read_overflow.v;
+ p->cache_write.v += c->cache_write.v;
+ p->compress_raw_fail.v += c->compress_raw_fail.v;
+ p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v;
+ p->compress_raw_ok.v += c->compress_raw_ok.v;
+ p->compress_read.v += c->compress_read.v;
+ p->compress_write.v += c->compress_write.v;
+ p->compress_write_fail.v += c->compress_write_fail.v;
+ p->compress_write_too_small.v += c->compress_write_too_small.v;
+ p->cursor_create.v += c->cursor_create.v;
+ p->cursor_insert.v += c->cursor_insert.v;
+ p->cursor_insert_bulk.v += c->cursor_insert_bulk.v;
+ p->cursor_insert_bytes.v += c->cursor_insert_bytes.v;
+ p->cursor_next.v += c->cursor_next.v;
+ p->cursor_prev.v += c->cursor_prev.v;
+ p->cursor_remove.v += c->cursor_remove.v;
+ p->cursor_remove_bytes.v += c->cursor_remove_bytes.v;
+ p->cursor_reset.v += c->cursor_reset.v;
+ p->cursor_search.v += c->cursor_search.v;
+ p->cursor_search_near.v += c->cursor_search_near.v;
+ p->cursor_update.v += c->cursor_update.v;
+ p->cursor_update_bytes.v += c->cursor_update_bytes.v;
+ p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
+ if (c->lsm_generation_max.v > p->lsm_generation_max.v)
+ p->lsm_generation_max.v = c->lsm_generation_max.v;
+ p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
+ p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
+ p->rec_dictionary.v += c->rec_dictionary.v;
+ p->rec_multiblock_internal.v += c->rec_multiblock_internal.v;
+ p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v;
+ if (c->rec_multiblock_max.v > p->rec_multiblock_max.v)
+ p->rec_multiblock_max.v = c->rec_multiblock_max.v;
+ p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
+ p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
+ p->rec_overflow_value.v += c->rec_overflow_value.v;
+ p->rec_page_delete.v += c->rec_page_delete.v;
+ p->rec_page_match.v += c->rec_page_match.v;
+ p->rec_pages.v += c->rec_pages.v;
+ p->rec_pages_eviction.v += c->rec_pages_eviction.v;
+ p->rec_prefix_compression.v += c->rec_prefix_compression.v;
+ p->rec_suffix_compression.v += c->rec_suffix_compression.v;
+ p->session_compact.v += c->session_compact.v;
+ p->session_cursor_open.v += c->session_cursor_open.v;
+ p->txn_update_conflict.v += c->txn_update_conflict.v;
+}
+
+void
+__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->async_alloc_race.desc =
+ "async: number of allocation state races";
+ stats->async_alloc_view.desc =
+ "async: number of op slots viewed for alloc";
+ stats->async_cur_queue.desc = "async: current work queue length";
+ stats->async_flush.desc = "async: number of async flush calls";
+ stats->async_full.desc = "async: number of times op allocation failed";
+ stats->async_max_queue.desc = "async: maximum work queue length";
+ stats->async_nowork.desc =
+ "async: number of times worker found no work";
+ stats->async_op_alloc.desc = "async: op allocations";
+ stats->async_op_compact.desc = "async: op compact calls";
+ stats->async_op_insert.desc = "async: op insert calls";
+ stats->async_op_remove.desc = "async: op remove calls";
+ stats->async_op_search.desc = "async: op search calls";
+ stats->async_op_update.desc = "async: op update calls";
+ stats->block_byte_map_read.desc = "block manager: mapped bytes read";
+ stats->block_byte_read.desc = "block manager: bytes read";
+ stats->block_byte_write.desc = "block manager: bytes written";
+ stats->block_map_read.desc = "block manager: mapped blocks read";
+ stats->block_preload.desc = "block manager: blocks pre-loaded";
+ stats->block_read.desc = "block manager: blocks read";
+ stats->block_write.desc = "block manager: blocks written";
+ stats->cache_bytes_dirty.desc =
+ "cache: tracked dirty bytes in the cache";
+ stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache";
+ stats->cache_bytes_max.desc = "cache: maximum bytes configured";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_deepen.desc =
+ "cache: page split during eviction deepened the tree";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: pages selected for eviction unable to be evicted";
+ stats->cache_eviction_force.desc =
+ "cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_fail.desc =
+ "cache: failed eviction of pages that exceeded the in-memory maximum";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_eviction_queue_empty.desc =
+ "cache: eviction server candidate queue empty when topping up";
+ stats->cache_eviction_queue_not_empty.desc =
+ "cache: eviction server candidate queue not empty when topping up";
+ stats->cache_eviction_server_evicting.desc =
+ "cache: eviction server evicting pages";
+ stats->cache_eviction_server_not_evicting.desc =
+ "cache: eviction server populating queue, but not evicting pages";
+ stats->cache_eviction_slow.desc =
+ "cache: eviction server unable to reach eviction goal";
+ stats->cache_eviction_split.desc =
+ "cache: pages split during eviction";
+ stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
+ stats->cache_pages_dirty.desc =
+ "cache: tracked dirty pages in the cache";
+ stats->cache_pages_inuse.desc =
+ "cache: pages currently held in the cache";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->cond_wait.desc = "conn: pthread mutex condition wait calls";
+ stats->cursor_create.desc = "Btree: cursor create calls";
+ stats->cursor_insert.desc = "Btree: cursor insert calls";
+ stats->cursor_next.desc = "Btree: cursor next calls";
+ stats->cursor_prev.desc = "Btree: cursor prev calls";
+ stats->cursor_remove.desc = "Btree: cursor remove calls";
+ stats->cursor_reset.desc = "Btree: cursor reset calls";
+ stats->cursor_search.desc = "Btree: cursor search calls";
+ stats->cursor_search_near.desc = "Btree: cursor search near calls";
+ stats->cursor_update.desc = "Btree: cursor update calls";
+ stats->dh_session_handles.desc = "dhandle: session dhandles swept";
+ stats->dh_session_sweeps.desc = "dhandle: session sweep attempts";
+ stats->file_open.desc = "conn: files currently open";
+ stats->log_buffer_grow.desc = "log: log buffer size increases";
+ stats->log_buffer_size.desc = "log: total log buffer size";
+ stats->log_bytes_user.desc = "log: user provided log bytes written";
+ stats->log_bytes_written.desc = "log: log bytes written";
+ stats->log_close_yields.desc =
+ "log: yields waiting for previous log file close";
+ stats->log_max_filesize.desc = "log: maximum log file size";
+ stats->log_reads.desc = "log: log read operations";
+ stats->log_scan_records.desc = "log: records processed by log scan";
+ stats->log_scan_rereads.desc =
+ "log: log scan records requiring two reads";
+ stats->log_scans.desc = "log: log scan operations";
+ stats->log_slot_closes.desc = "log: consolidated slot closures";
+ stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
+ stats->log_slot_joins.desc = "log: consolidated slot joins";
+ stats->log_slot_races.desc = "log: consolidated slot join races";
+ stats->log_slot_switch_fails.desc =
+ "log: slots selected for switching that were unavailable";
+ stats->log_slot_toobig.desc = "log: record size exceeded maximum";
+ stats->log_slot_toosmall.desc =
+ "log: failed to find a slot large enough for record";
+ stats->log_slot_transitions.desc =
+ "log: consolidated slot join transitions";
+ stats->log_sync.desc = "log: log sync operations";
+ stats->log_writes.desc = "log: log write operations";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree";
+ stats->lsm_work_queue_app.desc =
+ "LSM: App work units currently queued";
+ stats->lsm_work_queue_manager.desc =
+ "LSM: Merge work units currently queued";
+ stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum";
+ stats->lsm_work_queue_switch.desc =
+ "LSM: Switch work units currently queued";
+ stats->lsm_work_units_created.desc =
+ "LSM: tree maintenance operations scheduled";
+ stats->lsm_work_units_discarded.desc =
+ "LSM: tree maintenance operations discarded";
+ stats->lsm_work_units_done.desc =
+ "LSM: tree maintenance operations executed";
+ stats->memory_allocation.desc = "conn: memory allocations";
+ stats->memory_free.desc = "conn: memory frees";
+ stats->memory_grow.desc = "conn: memory re-allocations";
+ stats->read_io.desc = "conn: total read I/Os";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_split_stashed_bytes.desc =
+ "reconciliation: split bytes currently awaiting free";
+ stats->rec_split_stashed_objects.desc =
+ "reconciliation: split objects currently awaiting free";
+ stats->rwlock_read.desc =
+ "conn: pthread mutex shared lock read-lock calls";
+ stats->rwlock_write.desc =
+ "conn: pthread mutex shared lock write-lock calls";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->session_open.desc = "session: open session count";
+ stats->txn_begin.desc = "txn: transaction begins";
+ stats->txn_checkpoint.desc = "txn: transaction checkpoints";
+ stats->txn_checkpoint_running.desc =
+ "txn: transaction checkpoint currently running";
+ stats->txn_commit.desc = "txn: transactions committed";
+ stats->txn_fail_cache.desc =
+ "txn: transaction failures due to cache overflow";
+ stats->txn_pinned_range.desc =
+ "txn: transaction range of IDs currently pinned";
+ stats->txn_rollback.desc = "txn: transactions rolled back";
+ stats->write_io.desc = "conn: total write I/Os";
+}
+
+void
+__wt_stat_refresh_connection_stats(void *stats_arg)
+{
+ WT_CONNECTION_STATS *stats;
+
+ stats = (WT_CONNECTION_STATS *)stats_arg;
+ stats->async_alloc_race.v = 0;
+ stats->async_alloc_view.v = 0;
+ stats->async_cur_queue.v = 0;
+ stats->async_flush.v = 0;
+ stats->async_full.v = 0;
+ stats->async_max_queue.v = 0;
+ stats->async_nowork.v = 0;
+ stats->async_op_alloc.v = 0;
+ stats->async_op_compact.v = 0;
+ stats->async_op_insert.v = 0;
+ stats->async_op_remove.v = 0;
+ stats->async_op_search.v = 0;
+ stats->async_op_update.v = 0;
+ stats->block_byte_map_read.v = 0;
+ stats->block_byte_read.v = 0;
+ stats->block_byte_write.v = 0;
+ stats->block_map_read.v = 0;
+ stats->block_preload.v = 0;
+ stats->block_read.v = 0;
+ stats->block_write.v = 0;
+ stats->cache_bytes_dirty.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_deepen.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_eviction_queue_empty.v = 0;
+ stats->cache_eviction_queue_not_empty.v = 0;
+ stats->cache_eviction_server_evicting.v = 0;
+ stats->cache_eviction_server_not_evicting.v = 0;
+ stats->cache_eviction_slow.v = 0;
+ stats->cache_eviction_split.v = 0;
+ stats->cache_eviction_walk.v = 0;
+ stats->cache_pages_dirty.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_write.v = 0;
+ stats->cond_wait.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->dh_session_handles.v = 0;
+ stats->dh_session_sweeps.v = 0;
+ stats->log_buffer_grow.v = 0;
+ stats->log_bytes_user.v = 0;
+ stats->log_bytes_written.v = 0;
+ stats->log_close_yields.v = 0;
+ stats->log_reads.v = 0;
+ stats->log_scan_records.v = 0;
+ stats->log_scan_rereads.v = 0;
+ stats->log_scans.v = 0;
+ stats->log_slot_closes.v = 0;
+ stats->log_slot_consolidated.v = 0;
+ stats->log_slot_joins.v = 0;
+ stats->log_slot_races.v = 0;
+ stats->log_slot_switch_fails.v = 0;
+ stats->log_slot_toobig.v = 0;
+ stats->log_slot_toosmall.v = 0;
+ stats->log_slot_transitions.v = 0;
+ stats->log_sync.v = 0;
+ stats->log_writes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->lsm_rows_merged.v = 0;
+ stats->lsm_work_queue_max.v = 0;
+ stats->lsm_work_units_created.v = 0;
+ stats->lsm_work_units_discarded.v = 0;
+ stats->lsm_work_units_done.v = 0;
+ stats->memory_allocation.v = 0;
+ stats->memory_free.v = 0;
+ stats->memory_grow.v = 0;
+ stats->read_io.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rwlock_read.v = 0;
+ stats->rwlock_write.v = 0;
+ stats->txn_begin.v = 0;
+ stats->txn_checkpoint.v = 0;
+ stats->txn_commit.v = 0;
+ stats->txn_fail_cache.v = 0;
+ stats->txn_rollback.v = 0;
+ stats->write_io.v = 0;
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
new file mode 100644
index 00000000000..292d1a37ceb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_txnid_cmp --
+ * Compare transaction IDs for sorting / searching.
+ */
+int
+__wt_txnid_cmp(const void *v1, const void *v2)
+{
+ uint64_t id1, id2;
+
+ id1 = *(uint64_t *)v1;
+ id2 = *(uint64_t *)v2;
+
+ return ((id1 == id2) ? 0 : TXNID_LT(id1, id2) ? -1 : 1);
+}
+
+/*
+ * __txn_sort_snapshot --
+ * Sort a snapshot for faster searching and set the min/max bounds.
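+ *
+ * Roughly, visibility checks treat IDs below snap_min as committed before
+ * the snapshot, IDs at or above snap_max as after it, and IDs found in the
+ * sorted snapshot array as concurrent and therefore not visible.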
+ */
+static void
+__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ if (n > 1)
+ qsort(txn->snapshot, n, sizeof(uint64_t), __wt_txnid_cmp);
+ txn->snapshot_count = n;
+ txn->snap_max = snap_max;
+ txn->snap_min = (n > 0 && TXNID_LE(txn->snapshot[0], snap_max)) ?
+ txn->snapshot[0] : snap_max;
+ F_SET(txn, TXN_HAS_SNAPSHOT);
+ WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
+}
+
+/*
+ * __wt_txn_release_snapshot --
+ * Release the snapshot in the current transaction.
+ */
+void
+__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_state = &S2C(session)->txn_global.states[session->id];
+
+ if (txn_state->snap_min != WT_TXN_NONE) {
+ WT_ASSERT(session,
+ session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_state->snap_min));
+ txn_state->snap_min = WT_TXN_NONE;
+ }
+ F_CLR(txn, TXN_HAS_SNAPSHOT);
+}
+
+/*
+ * __wt_txn_update_oldest --
+ *	Sweep the running transactions to update the oldest required
+ *	transaction ID.
+ */
+void
+__wt_txn_update_oldest(WT_SESSION_IMPL *session)
+{
+ /*
+ * !!!
+ * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
+ * method (for the oldest transaction ID not yet visible to a running
+ * transaction), and then comparing that oldest ID against committed
+ * transactions to see if updates for a committed transaction are still
+ * visible to running transactions, the oldest transaction ID may be
+ * the same as the last committed transaction ID, if the transaction
+ * state wasn't refreshed after the last transaction committed. Push
+ * past the last committed transaction.
+ */
+ __wt_txn_refresh(session, 0);
+}
+
+/*
+ * __wt_txn_refresh --
+ * Allocate a transaction ID and/or a snapshot.
+ */
+void
+__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s, *txn_state;
+ uint64_t current_id, id, oldest_id;
+ uint64_t prev_oldest_id, snap_min;
+ uint32_t i, n, oldest_session, session_cnt;
+ int32_t count;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ current_id = snap_min = txn_global->current;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /* For pure read-only workloads, avoid scanning. */
+ if (prev_oldest_id == current_id) {
+ if (get_snapshot) {
+ txn_state->snap_min = current_id;
+ __txn_sort_snapshot(session, 0, current_id);
+ }
+ /* Check that the oldest ID has not moved in the meantime. */
+ if (prev_oldest_id == txn_global->oldest_id &&
+ txn_global->scan_count == 0)
+ return;
+ }
+
+ /*
+ * We're going to scan. Increment the count of scanners to prevent the
+ * oldest ID from moving forwards. Spin if the count is negative,
+ * which indicates that some thread is moving the oldest ID forwards.
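+	 *
+	 * (A positive scan_count means readers are scanning; -1 means a
+	 * single thread has exclusive access to advance the oldest ID, see
+	 * the CAS to -1 below.)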
+ */
+ do {
+ if ((count = txn_global->scan_count) < 0)
+ WT_PAUSE();
+ } while (count < 0 ||
+ !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+
+ /* The oldest ID cannot change until the scan count goes to zero. */
+ prev_oldest_id = txn_global->oldest_id;
+ current_id = oldest_id = snap_min = txn_global->current;
+ oldest_session = 0;
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /*
+ * Build our snapshot of any concurrent transaction IDs.
+ *
+ * Ignore our own ID: we always read our own updates.
+ *
+ * Also ignore the ID if it is older than the oldest ID we saw.
+ * This can happen if we race with a thread that is allocating
+ * an ID -- the ID will not be used because the thread will
+ * keep spinning until it gets a valid one.
+ */
+ if (s != txn_state &&
+ (id = s->id) != WT_TXN_NONE &&
+ TXNID_LE(prev_oldest_id, id)) {
+ if (get_snapshot)
+ txn->snapshot[n++] = id;
+ if (TXNID_LT(id, snap_min))
+ snap_min = id;
+ }
+
+ /*
+ * Ignore the session's own snap_min: we are about to update
+ * it.
+ */
+ if (get_snapshot && s == txn_state)
+ continue;
+
+ /*
+ * !!!
+ * Note: Don't ignore snap_min values older than the previous
+ * oldest ID. Read-uncommitted operations publish snap_min
+ * values without incrementing scan_count to protect the global
+ * table. See the comment in __wt_txn_cursor_op for
+ * more details.
+ */
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id)) {
+ oldest_id = id;
+ oldest_session = i;
+ }
+ }
+
+ if (TXNID_LT(snap_min, oldest_id))
+ oldest_id = snap_min;
+ if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
+ oldest_id = txn->id;
+
+ /*
+ * If we got a new snapshot, update the published snap_min for this
+ * session.
+ */
+ if (get_snapshot) {
+ WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ txn_state->snap_min = snap_min;
+ }
+
+ /*
+ * Update the last running ID if we have a much newer value or we are
+ * forcing an update.
+ */
+ if (!get_snapshot || snap_min > txn_global->last_running + 100)
+ txn_global->last_running = snap_min;
+
+ /*
+ * Update the oldest ID if we have a newer ID and we can get exclusive
+ * access. During normal snapshot refresh, only do this if we have a
+ * much newer value. Once we get exclusive access, do another pass to
+ * make sure nobody else is using an earlier ID.
+ */
+ if (TXNID_LT(prev_oldest_id, oldest_id) &&
+ (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
+ WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ }
+ if (TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ txn_global->scan_count = 0;
+ } else {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
+ current_id - oldest_id > 10000 &&
+ txn_global->oldest_session != oldest_session) {
+ (void)__wt_verbose(session, WT_VERB_TRANSACTION,
+ "old snapshot %" PRIu64
+ " pinned in session %d [%s]"
+ " with snap_min %" PRIu64 "\n",
+ oldest_id, oldest_session,
+ conn->sessions[oldest_session].lastop,
+ conn->sessions[oldest_session].txn.snap_min);
+ txn_global->oldest_session = oldest_session;
+ }
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ }
+
+ if (get_snapshot)
+ __txn_sort_snapshot(session, n, current_id);
+}
+
+/*
+ * __wt_txn_begin --
+ * Begin a transaction.
+ */
+int
+__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ if (cval.len == 0)
+ txn->isolation = session->isolation;
+ else
+ txn->isolation =
+ WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ TXN_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
+ TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;
+
+ /*
+ * The default sync setting is inherited from the connection, but can
+ * be overridden by an explicit "sync" setting for this transaction.
+ */
+ txn->txn_logsync = S2C(session)->txn_logsync;
+ WT_RET(__wt_config_gets_def(session, cfg, "sync",
+ FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH), &cval));
+ if (!cval.val)
+ txn->txn_logsync = 0;
+
+ F_SET(txn, TXN_RUNNING);
+ if (txn->isolation == TXN_ISO_SNAPSHOT) {
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+ __wt_txn_refresh(session, 1);
+ }
+ return (0);
+}
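
Note: the isolation values parsed above come straight from the public begin_transaction configuration. A minimal sketch of the application-level calls that end up in __wt_txn_begin (error handling elided; nothing here is WiredTiger-internal):

#include <wiredtiger.h>

/* Sketch: drive __wt_txn_begin through the public API. */
static void
example_begin(WT_CONNECTION *conn)
{
	WT_SESSION *session;

	(void)conn->open_session(conn, NULL, NULL, &session);

	/*
	 * Parsed above into TXN_ISO_SNAPSHOT; the other accepted values
	 * are "read-committed" and "read-uncommitted".
	 */
	(void)session->begin_transaction(session, "isolation=snapshot");

	(void)session->rollback_transaction(session, NULL);
	(void)session->close(session, NULL);
}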
+
+/*
+ * __wt_txn_release --
+ * Release the resources associated with the current transaction.
+ */
+void
+__wt_txn_release(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ WT_ASSERT(session, txn->mod_count == 0);
+ txn->notify = NULL;
+
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ /* Clear the transaction's ID from the global table. */
+ if (F_ISSET(txn, TXN_HAS_ID)) {
+ WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
+ txn->id != WT_TXN_NONE);
+ WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ txn->id = WT_TXN_NONE;
+ }
+
+ /* Free the scratch buffer allocated for logging. */
+ __wt_logrec_free(session, &txn->logrec);
+
+ /* Discard any memory from the session's split stash that we can. */
+ if (session->split_stash_cnt > 0)
+ __wt_split_stash_discard(session);
+
+ /*
+ * Reset the transaction state to not running and release the snapshot.
+ */
+ __wt_txn_release_snapshot(session);
+ txn->isolation = session->isolation;
+ F_CLR(txn, TXN_ERROR | TXN_HAS_ID | TXN_RUNNING);
+}
+
+/*
+ * __wt_txn_commit --
+ * Commit the current transaction.
+ */
+int
+__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ txn = &session->txn;
+ WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));
+
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Commit notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify,
+ (WT_SESSION *)session, txn->id, 1));
+
+ /* If we are logging, write a commit log record. */
+ if (ret == 0 &&
+ txn->mod_count > 0 && S2C(session)->logging &&
+ !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ ret = __wt_txn_log_commit(session, cfg);
+
+ /*
+ * If anything went wrong, roll back.
+ *
+ * !!!
+ * Nothing can fail after this point.
+ */
+ if (ret != 0) {
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ return (ret);
+ }
+
+ /* Free memory associated with updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+ __wt_txn_op_free(session, op);
+ txn->mod_count = 0;
+
+ /*
+ * We are about to release the snapshot: copy values into any
+ * positioned cursors so they don't point to updates that could be
+ * freed once we don't have a transaction ID pinned.
+ */
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+
+ __wt_txn_release(session);
+ return (0);
+}
+
+/*
+ * __wt_txn_rollback --
+ * Roll back the current transaction.
+ */
+int
+__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ WT_UNUSED(cfg);
+
+ txn = &session->txn;
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Rollback notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
+ txn->id, 0));
+
+ /* Rollback updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ /* Metadata updates are never rolled back. */
+ if (op->fileid == WT_METAFILE_ID)
+ continue;
+
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ op->u.upd->txnid = WT_TXN_ABORTED;
+ break;
+ case TXN_OP_REF:
+ __wt_delete_page_rollback(session, op->u.ref);
+ break;
+ case TXN_OP_TRUNCATE_COL:
+ case TXN_OP_TRUNCATE_ROW:
+ /*
+ * Nothing to do: these operations are only logged for
+ * recovery. The in-memory changes will be rolled back
+ * with a combination of TXN_OP_REF and TXN_OP_INMEM
+ * operations.
+ */
+ break;
+ }
+
+ /* Free any memory allocated for the operation. */
+ __wt_txn_op_free(session, op);
+ }
+ txn->mod_count = 0;
+
+ __wt_txn_release(session);
+ return (ret);
+}
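
Note: these two functions back the public commit/rollback calls. A hedged sketch of the usual caller pattern, committing on success and rolling back on error (the cursor, key and value are hypothetical):

#include <wiredtiger.h>

/* Sketch: commit on success, roll back on failure. */
static int
example_update(WT_SESSION *session, WT_CURSOR *c)
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);
	c->set_key(c, "key");
	c->set_value(c, "value");
	if ((ret = c->insert(c)) == 0)
		ret = session->commit_transaction(session, NULL);
	else
		(void)session->rollback_transaction(session, NULL);
	return (ret);
}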
+
+/*
+ * __wt_txn_init --
+ * Initialize a session's transaction data.
+ */
+int
+__wt_txn_init(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ txn->id = WT_TXN_NONE;
+
+ WT_RET(__wt_calloc_def(session,
+ S2C(session)->session_size, &txn->snapshot));
+
+#ifdef HAVE_DIAGNOSTIC
+ if (S2C(session)->txn_global.states != NULL) {
+ WT_TXN_STATE *txn_state;
+ txn_state = &S2C(session)->txn_global.states[session->id];
+ WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE);
+ }
+#endif
+
+ /*
+ * Take care to clean these out in case we are reusing the transaction
+ * for eviction.
+ */
+ txn->mod = NULL;
+
+ txn->isolation = session->isolation;
+ return (0);
+}
+
+/*
+ * __wt_txn_stats_update --
+ * Update the transaction statistics for return to the application.
+ */
+void
+__wt_txn_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_TXN_GLOBAL *txn_global;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ stats = &conn->stats;
+
+ WT_STAT_SET(stats, txn_pinned_range,
+ txn_global->current - txn_global->oldest_id);
+}
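
Note: the pinned range set here is visible to applications through an ordinary statistics cursor. A sketch, assuming the generated key constant for txn_pinned_range is WT_STAT_CONN_TXN_PINNED_RANGE:

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Sketch: read the pinned-range statistic back out. */
static void
example_pinned_range(WT_SESSION *session)
{
	WT_CURSOR *stat;
	const char *desc, *pvalue;
	uint64_t value;

	(void)session->open_cursor(
	    session, "statistics:", NULL, NULL, &stat);
	/* WT_STAT_CONN_TXN_PINNED_RANGE: assumed generated constant. */
	stat->set_key(stat, WT_STAT_CONN_TXN_PINNED_RANGE);
	if (stat->search(stat) == 0 &&
	    stat->get_value(stat, &desc, &pvalue, &value) == 0)
		printf("%s: %" PRIu64 "\n", desc, value);
	(void)stat->close(stat);
}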
+
+/*
+ * __wt_txn_destroy --
+ * Destroy a session's transaction data.
+ */
+void
+__wt_txn_destroy(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ __wt_free(session, txn->mod);
+ __wt_free(session, txn->snapshot);
+}
+
+/*
+ * __wt_txn_global_init --
+ * Initialize the global transaction state.
+ */
+int
+__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ u_int i;
+
+ WT_UNUSED(cfg);
+ conn = S2C(session);
+
+ txn_global = &conn->txn_global;
+ txn_global->current = 1;
+ txn_global->oldest_id = 1;
+ txn_global->last_running = 1;
+
+ WT_RET(__wt_calloc_def(
+ session, conn->session_size, &txn_global->states));
+ for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
+ s->id = s->snap_min = WT_TXN_NONE;
+
+ return (0);
+}
+
+/*
+ * __wt_txn_global_destroy --
+ * Destroy the global transaction state.
+ */
+void
+__wt_txn_global_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ if (txn_global != NULL)
+ __wt_free(session, txn_global->states);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
new file mode 100644
index 00000000000..555eec649c6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_checkpoint_name_ok --
+ * Complain if the checkpoint name isn't acceptable.
+ */
+int
+__wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len)
+{
+ /* Check for characters we don't want to see in a metadata file. */
+ WT_RET(__wt_name_check(session, name, len));
+
+ /*
+ * The internal checkpoint name is special, applications aren't allowed
+ * to use it. Be aggressive and disallow any matching prefix, it makes
+ * things easier when checking in other places.
+ */
+ if (len < strlen(WT_CHECKPOINT))
+ return (0);
+ if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
+ return (0);
+
+ WT_RET_MSG(session, EINVAL,
+ "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
+}
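
Note: from an application's point of view, the check above means any checkpoint name sharing the internal prefix fails with EINVAL. A brief sketch (the acceptable name is hypothetical):

#include <wiredtiger.h>

/* Sketch: reserved names are rejected, ordinary names pass. */
static void
example_names(WT_SESSION *session)
{
	int ret;

	/* WT_CHECKPOINT is "WiredTigerCheckpoint": reserved prefix. */
	ret = session->checkpoint(session, "name=WiredTigerCheckpoint.2");
	/* ret == EINVAL */

	ret = session->checkpoint(session, "name=midnight");
	/* A name without the reserved prefix is accepted. */
	(void)ret;
}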
+
+/*
+ * __checkpoint_name_check --
+ * Check for an attempt to name a checkpoint that includes anything
+ * other than a file object.
+ */
+static int
+__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *fail;
+
+ cursor = NULL;
+ fail = NULL;
+
+ /*
+ * This function exists as a place for this comment: named checkpoints
+ * are only supported on file objects, and not on LSM trees or Helium
+ * devices. If a target list is configured for the checkpoint, this
+ * function is called with each target list entry; check the entry to
+ * make sure it's backed by a file. If no target list is configured,
+ * confirm the metadata file contains no non-file objects.
+ */
+ if (uri == NULL) {
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ fail = uri;
+ break;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ } else
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ fail = uri;
+
+ if (fail != NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "%s object does not support named checkpoints", fail);
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __checkpoint_apply --
+ * Apply an operation to all files involved in a checkpoint.
+ */
+static int
+__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
+ int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp)
+{
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ int ckpt_closed, named, target_list;
+
+ target_list = 0;
+
+ /* Flag if this is a named checkpoint, and check if the name is OK. */
+ WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+ named = cval.len != 0;
+ if (named)
+ WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+ /* Step through the targets and optionally operate on each one. */
+ WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
+ WT_ERR(__wt_config_subinit(session, &targetconf, &cval));
+ while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
+ if (!target_list) {
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ target_list = 1;
+ }
+
+ if (v.len != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "invalid checkpoint target %.*s: URIs may require "
+ "quoting",
+ (int)cval.len, (char *)cval.str);
+
+ /* Some objects don't support named checkpoints. */
+ if (named)
+ WT_ERR(__checkpoint_name_check(session, k.str));
+
+ if (op == NULL)
+ continue;
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ if ((ret = __wt_schema_worker(
+ session, tmp->data, op, NULL, cfg, 0)) != 0)
+ WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (!target_list && named)
+ /* Some objects don't support named checkpoints. */
+ WT_ERR(__checkpoint_name_check(session, NULL));
+
+ if (!target_list && op != NULL) {
+ /*
+ * If the checkpoint is named or we're dropping checkpoints, we
+ * checkpoint both open and closed files; else, only checkpoint
+ * open files.
+ *
+ * XXX
+ * We don't optimize unnamed checkpoints of a list of targets,
+ * we open the targets and checkpoint them even if they are
+ * quiescent and don't need a checkpoint, believing applications
+ * unlikely to checkpoint a list of closed targets.
+ */
+ ckpt_closed = named;
+ if (!ckpt_closed) {
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ ckpt_closed = cval.len != 0;
+ }
+ WT_ERR(ckpt_closed ?
+ __wt_meta_btree_apply(session, op, cfg) :
+ __wt_conn_btree_apply(session, 0, op, cfg));
+ }
+
+ if (fullp != NULL)
+ *fullp = !target_list;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
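
Note: the target list walked above comes from the checkpoint configuration and, per the error message, each URI must be quoted. A sketch with hypothetical object names:

#include <wiredtiger.h>

/* Sketch: checkpoint only the listed objects; URIs are quoted. */
static int
example_target(WT_SESSION *session)
{
	return (session->checkpoint(session,
	    "target=(\"table:accounts\",\"file:orders.wt\")"));
}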
+
+/*
+ * __checkpoint_data_source --
+ * Checkpoint all data sources.
+ */
+static int
+__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_NAMED_DATA_SOURCE *ndsrc;
+ WT_DATA_SOURCE *dsrc;
+
+ /*
+ * A place-holder, to support Helium devices: we assume calling the
+ * underlying data-source session checkpoint function is sufficient to
+ * checkpoint all objects in the data source, open or closed, and we
+ * don't attempt to optimize the checkpoint of individual targets.
+	 * Those assumptions are correct for the Helium device, but they're
+	 * not necessarily going to be true for other data sources.
+ *
+ * It's not difficult to support data-source checkpoints of individual
+ * targets (__wt_schema_worker is the underlying function that will do
+ * the work, and it's already written to support data-sources, although
+ * we'd probably need to pass the URI of the object to the data source
+ * checkpoint function which we don't currently do). However, doing a
+ * full data checkpoint is trickier: currently, the connection code is
+ * written to ignore all objects other than "file:", and that code will
+ * require significant changes to work with data sources.
+ */
+ TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
+ dsrc = ndsrc->dsrc;
+ if (dsrc->checkpoint != NULL)
+ WT_RET(dsrc->checkpoint(dsrc,
+ (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
+ }
+ return (0);
+}
+
+/*
+ * __wt_checkpoint_list --
+ * Get a list of handles to flush.
+ */
+int
+__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *name;
+
+ WT_UNUSED(cfg);
+
+ /* Should not be called with anything other than a file object. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session,
+ memcmp(session->dhandle->name, "file:", strlen("file:")) == 0);
+
+ /* Make sure there is space for the next entry. */
+ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
+ session->ckpt_handle_next + 1, &session->ckpt_handle));
+
+ /* Not strictly necessary, but cleaner to clear the current handle. */
+ name = session->dhandle->name;
+ saved_dhandle = session->dhandle;
+ session->dhandle = NULL;
+
+ /* Ignore busy files, we'll deal with them in the checkpoint. */
+ switch (ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) {
+ case 0:
+ session->ckpt_handle[
+ session->ckpt_handle_next++] = session->dhandle;
+ break;
+ case EBUSY:
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __checkpoint_write_leaves --
+ * Write any dirty leaf pages for all checkpoint handles.
+ */
+static int
+__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int i;
+
+ i = 0;
+
+ /* Should not be called with any handle reference. */
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * Get a list of handles we want to flush; this may pull closed objects
+ * into the session cache, but we're going to do that eventually anyway.
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __checkpoint_apply(session, cfg, __wt_checkpoint_list, NULL));
+ WT_ERR(ret);
+
+ /*
+ * Walk the list, flushing the leaf pages from each file, then releasing
+ * the file. Note that we increment inside the loop to simplify error
+ * handling.
+ */
+ while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ WT_ERR(ret);
+ }
+
+err: while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ }
+ __wt_free(session, session->ckpt_handle);
+ session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint --
+ * Checkpoint a database or a list of objects in the database.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_isolation;
+ int full, logging, tracking;
+ const char *txn_cfg[] =
+ { WT_CONFIG_BASE(session, session_begin_transaction),
+ "isolation=snapshot", NULL };
+ void *saved_meta_next;
+
+ conn = S2C(session);
+ saved_isolation = session->isolation;
+ txn = &session->txn;
+ full = logging = tracking = 0;
+
+ /*
+	 * Do a pass over the configuration arguments and figure out what
+	 * kind of checkpoint this is.
+ */
+ WT_RET(__checkpoint_apply(session, cfg, NULL, &full));
+
+ /*
+ * Update the global oldest ID so we do all possible cleanup.
+ *
+ * This is particularly important for compact, so that all dirty pages
+ * can be fully written.
+ */
+ __wt_txn_update_oldest(session);
+
+ /* Flush data-sources before we start the checkpoint. */
+ WT_ERR(__checkpoint_data_source(session, cfg));
+
+ /* Flush dirty leaf pages before we start the checkpoint. */
+ session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__checkpoint_write_leaves(session, cfg));
+
+ /* Acquire the schema lock. */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ WT_ERR(__wt_meta_track_on(session));
+ tracking = 1;
+
+ /* Tell logging that we are about to start a database checkpoint. */
+ if (conn->logging && full)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
+
+ /*
+ * Start a snapshot transaction for the checkpoint.
+ *
+ * Note: we don't go through the public API calls because they have
+ * side effects on cursors, which applications can hold open across
+ * calls to checkpoint.
+ */
+ WT_ERR(__wt_txn_begin(session, txn_cfg));
+
+ /* Tell logging that we have started a database checkpoint. */
+ if (conn->logging && full) {
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_START, NULL));
+ logging = 1;
+ }
+
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL));
+
+ /* Commit the transaction before syncing the file(s). */
+ WT_ERR(__wt_txn_commit(session, NULL));
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet).
+ */
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__checkpoint_apply(
+ session, cfg, __wt_checkpoint_sync, NULL));
+
+ /* Checkpoint the metadata file. */
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle) ||
+ !WT_PREFIX_MATCH(dhandle->name, "file:"))
+ break;
+ }
+ if (dhandle == NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoint unable to find open meta-data handle");
+
+ /*
+ * Disable metadata tracking during the metadata checkpoint.
+ *
+ * We don't lock old checkpoints in the metadata file: there is no way
+ * to open one. We are holding other handle locks, it is not safe to
+ * lock conn->spinlock.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ saved_meta_next = session->meta_track_next;
+ session->meta_track_next = NULL;
+ WT_WITH_DHANDLE(session, dhandle, ret = __wt_checkpoint(session, cfg));
+ session->meta_track_next = saved_meta_next;
+
+err: /*
+ * XXX
+ * Rolling back the changes here is problematic.
+ *
+ * If we unroll here, we need a way to roll back changes to the avail
+ * list for each tree that was successfully synced before the error
+ * occurred. Otherwise, the next time we try this operation, we will
+ * try to free an old checkpoint again.
+ *
+ * OTOH, if we commit the changes after a failure, we have partially
+ * overwritten the checkpoint, so what ends up on disk is not
+ * consistent.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ if (tracking)
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ if (F_ISSET(txn, TXN_RUNNING))
+ WT_TRET(__wt_txn_rollback(session, NULL));
+
+ /* Tell logging that we have finished a database checkpoint. */
+ if (logging)
+ WT_TRET(__wt_txn_checkpoint_log(session, full,
+ (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL,
+ NULL));
+
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_unlock(session, &conn->schema_lock);
+ }
+
+ session->isolation = txn->isolation = saved_isolation;
+
+ return (ret);
+}
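
Note: putting the pieces together, a full named checkpoint taken here can later be read back with a checkpoint cursor. A hedged sketch (table name hypothetical):

#include <wiredtiger.h>

/* Sketch: take a named checkpoint, then read its frozen view. */
static int
example_named(WT_SESSION *session)
{
	WT_CURSOR *c;
	int ret;

	if ((ret = session->checkpoint(session, "name=midnight")) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "table:accounts", NULL, "checkpoint=midnight", &c)) != 0)
		return (ret);
	/* ... iterate the checkpoint's contents ... */
	return (c->close(c));
}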
+
+/*
+ * __drop --
+ * Drop all checkpoints with a specific name.
+ */
+static void
+__drop(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt;
+
+ /*
+ * If we're dropping internal checkpoints, match to the '.' separating
+ * the checkpoint name from the generational number, and take all that
+ * we can find. Applications aren't allowed to use any variant of this
+ * name, so the test is still pretty simple, if the leading bytes match,
+ * it's one we want to drop.
+ */
+ if (strncmp(WT_CHECKPOINT, name, len) == 0) {
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ F_SET(ckpt, WT_CKPT_DELETE);
+ } else
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ F_SET(ckpt, WT_CKPT_DELETE);
+}
+
+/*
+ * __drop_from --
+ * Drop all checkpoints after, and including, the named checkpoint.
+ */
+static void
+__drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt;
+ int matched;
+
+ /*
+ * There's a special case -- if the name is "all", then we delete all
+ * of the checkpoints.
+ */
+ if (WT_STRING_MATCH("all", name, len)) {
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ F_SET(ckpt, WT_CKPT_DELETE);
+ return;
+ }
+
+ /*
+ * We use the first checkpoint we can find, that is, if there are two
+ * checkpoints with the same name in the list, we'll delete from the
+ * first match to the end.
+ */
+ matched = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!matched && !WT_STRING_MATCH(ckpt->name, name, len))
+ continue;
+
+ matched = 1;
+ F_SET(ckpt, WT_CKPT_DELETE);
+ }
+}
+
+/*
+ * __drop_to --
+ * Drop all checkpoints before, and including, the named checkpoint.
+ */
+static void
+__drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt, *mark;
+
+ /*
+ * We use the last checkpoint we can find, that is, if there are two
+ * checkpoints with the same name in the list, we'll delete from the
+ * beginning to the second match, not the first.
+ */
+ mark = NULL;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ mark = ckpt;
+
+ if (mark == NULL)
+ return;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ F_SET(ckpt, WT_CKPT_DELETE);
+
+ if (ckpt == mark)
+ break;
+ }
+}
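
Note: the three helpers map one-to-one onto the drop forms of the checkpoint configuration. A sketch of each ("midnight" is a hypothetical checkpoint name):

#include <wiredtiger.h>

/* Sketch: the drop forms handled by the helpers above. */
static void
example_drop(WT_SESSION *session)
{
	(void)session->checkpoint(session,
	    "drop=(midnight)");		/* __drop */
	(void)session->checkpoint(session,
	    "drop=(from=midnight)");	/* __drop_from */
	(void)session->checkpoint(session,
	    "drop=(to=midnight)");	/* __drop_to */
	(void)session->checkpoint(session,
	    "drop=(from=all)");		/* delete all checkpoints */
}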
+
+/*
+ * __checkpoint_worker --
+ * Checkpoint a tree.
+ */
+static int
+__checkpoint_worker(
+ WT_SESSION_IMPL *session, const char *cfg[], int is_checkpoint)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG dropconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_LSN ckptlsn;
+ const char *name;
+ int deleted, force, hot_backup_locked, track_ckpt, was_modified;
+ char *name_alloc;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ conn = S2C(session);
+ ckpt = ckptbase = NULL;
+ INIT_LSN(&ckptlsn);
+ dhandle = session->dhandle;
+	hot_backup_locked = 0;
+	name_alloc = NULL;
+ track_ckpt = 1;
+ was_modified = btree->modified;
+
+ /* Get the list of checkpoints for this file. */
+ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
+
+ /* This may be a named checkpoint, check the configuration. */
+ cval.len = 0;
+ if (cfg != NULL)
+ WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len == 0)
+ name = WT_CHECKPOINT;
+ else {
+ WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
+ name = name_alloc;
+ }
+
+ /* We may be dropping specific checkpoints, check the configuration. */
+ if (cfg != NULL) {
+ cval.len = 0;
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ if (cval.len != 0) {
+ WT_ERR(__wt_config_subinit(session, &dropconf, &cval));
+ while ((ret =
+ __wt_config_next(&dropconf, &k, &v)) == 0) {
+ /* Disallow unsafe checkpoint names. */
+ if (v.len == 0)
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, k.str, k.len));
+ else
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, v.str, v.len));
+
+ if (v.len == 0)
+ __drop(ckptbase, k.str, k.len);
+ else if (WT_STRING_MATCH("from", k.str, k.len))
+ __drop_from(ckptbase, v.str, v.len);
+ else if (WT_STRING_MATCH("to", k.str, k.len))
+ __drop_to(ckptbase, v.str, v.len);
+ else
+ WT_ERR_MSG(session, EINVAL,
+ "unexpected value for checkpoint "
+ "key: %.*s",
+ (int)k.len, k.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ }
+
+ /* Drop checkpoints with the same name as the one we're taking. */
+ __drop(ckptbase, name, strlen(name));
+
+ /*
+ * Check for clean objects not requiring a checkpoint.
+ *
+ * If we're closing a handle, and the object is clean, we can skip the
+ * checkpoint, whatever checkpoints we have are sufficient. (We might
+ * not have any checkpoints if the object was never modified, and that's
+ * OK: the object creation code doesn't mark the tree modified so we can
+ * skip newly created trees here.)
+ *
+ * If the application repeatedly checkpoints an object (imagine hourly
+ * checkpoints using the same explicit or internal name), there's no
+ * reason to repeat the checkpoint for clean objects. The test is if
+ * the only checkpoint we're deleting is the last one in the list and
+ * it has the same name as the checkpoint we're about to take, skip the
+ * work. (We can't skip checkpoints that delete more than the last
+ * checkpoint because deleting those checkpoints might free up space in
+ * the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's
+ * not likely enough to make detection worthwhile.
+ *
+ * Checkpoint read-only objects otherwise: the application must be able
+ * to open the checkpoint in a cursor after taking any checkpoint, which
+ * means it must exist.
+ */
+ force = 0;
+ if (!btree->modified && cfg != NULL) {
+ ret = __wt_config_gets(session, cfg, "force", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0 && cval.val != 0)
+ force = 1;
+ }
+ if (!btree->modified && !force) {
+ if (!is_checkpoint)
+ goto done;
+
+ deleted = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ ++deleted;
+ /*
+ * Complicated test: if we only deleted a single checkpoint, and
+ * it was the last checkpoint in the object, and it has the same
+ * name as the checkpoint we're taking (correcting for internal
+ * checkpoint names with their generational suffix numbers), we
+ * can skip the checkpoint, there's nothing to do.
+ */
+ if (deleted == 1 &&
+ F_ISSET(ckpt - 1, WT_CKPT_DELETE) &&
+ (strcmp(name, (ckpt - 1)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))))
+ goto done;
+ }
+
+ /* Add a new checkpoint entry at the end of the list. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ WT_ERR(__wt_strdup(session, name, &ckpt->name));
+ F_SET(ckpt, WT_CKPT_ADD);
+
+ /*
+ * We can't delete checkpoints if a backup cursor is open. WiredTiger
+ * checkpoints are uniquely named and it's OK to have multiple of them
+ * in the system: clear the delete flag for them, and otherwise fail.
+ * Hold the lock until we're done (blocking hot backups from starting),
+ * we don't want to race with a future hot backup.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ hot_backup_locked = 1;
+ if (conn->hot_backup)
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, EBUSY,
+ "checkpoint %s blocked by hot backup: it would "
+ "delete an existing checkpoint, and checkpoints "
+ "cannot be deleted during a hot backup",
+ ckpt->name);
+ }
+
+ /*
+ * Lock the checkpoints that will be deleted.
+ *
+ * Checkpoints are only locked when tracking is enabled, which covers
+ * checkpoint and drop operations, but not close. The reasoning is
+ * there should be no access to a checkpoint during close, because any
+ * thread accessing a checkpoint will also have the current file handle
+ * open.
+ */
+ if (WT_META_TRACKING(session))
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * We can't delete checkpoints referenced by a cursor.
+ * WiredTiger checkpoints are uniquely named and it's
+ * OK to have multiple in the system: clear the delete
+ * flag for them, and otherwise fail.
+ */
+ ret = __wt_session_lock_checkpoint(session, ckpt->name);
+ if (ret == 0)
+ continue;
+ if (ret == EBUSY &&
+ WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, ret,
+ "checkpoints cannot be dropped when in-use");
+ }
+
+ /*
+ * There are special files: those being bulk-loaded, salvaged, upgraded
+ * or verified during the checkpoint. We have to do something for those
+ * objects because a checkpoint is an external name the application can
+ * reference and the name must exist no matter what's happening during
+ * the checkpoint. For bulk-loaded files, we could block until the load
+ * completes, checkpoint the partial load, or magic up an empty-file
+ * checkpoint. The first is too slow, the second is insane, so do the
+ * third.
+ * Salvage, upgrade and verify don't currently require any work, all
+ * three hold the schema lock, blocking checkpoints. If we ever want to
+ * fix that (and I bet we eventually will, at least for verify), we can
+ * copy the last checkpoint the file has. That works if we guarantee
+ * salvage, upgrade and verify act on objects with previous checkpoints
+ * (true if handles are closed/re-opened between object creation and a
+ * subsequent salvage, upgrade or verify operation). Presumably,
+ * salvage and upgrade will discard all previous checkpoints when they
+ * complete, which is fine with us. This change will require reference
+ * counting checkpoints, and once that's done, we should use checkpoint
+ * copy instead of forcing checkpoints on clean objects to associate
+ * names with checkpoints.
+ */
+ if (is_checkpoint)
+ switch (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ case 0:
+ break;
+ case WT_BTREE_BULK:
+ /*
+ * The only checkpoints a bulk-loaded file should have
+ * are fake ones we created without the underlying block
+ * manager. I'm leaving this code here because it's a
+ * cheap test and a nasty race.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
+				WT_ERR_MSG(session, EINVAL,
+ "block-manager checkpoint found "
+ "for a bulk-loaded file");
+ track_ckpt = 0;
+ goto fake;
+ case WT_BTREE_SALVAGE:
+ case WT_BTREE_UPGRADE:
+ case WT_BTREE_VERIFY:
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoints are blocked during salvage, upgrade "
+ "or verify operations");
+ }
+
+ /*
+ * If an object has never been used (in other words, if it could become
+ * a bulk-loaded file), then we must fake the checkpoint. This is good
+ * because we don't write physical checkpoint blocks for just-created
+ * files, but it's not just a good idea. The reason is because deleting
+ * a physical checkpoint requires writing the file, and fake checkpoints
+ * can't write the file. If you (1) create a physical checkpoint for an
+ * empty file which writes blocks, (2) start bulk-loading records into
+ * the file, (3) during the bulk-load perform another checkpoint with
+ * the same name; in order to keep from having two checkpoints with the
+ * same name you would have to use the bulk-load's fake checkpoint to
+ * delete a physical checkpoint, and that will end in tears.
+ */
+ if (is_checkpoint)
+ if (btree->bulk_load_ok) {
+ track_ckpt = 0;
+ goto fake;
+ }
+
+ /*
+ * Mark the root page dirty to ensure something gets written. (If the
+ * tree is modified, we must write the root page anyway, this doesn't
+ * add additional writes to the process. If the tree is not modified,
+ * we have to dirty the root page to ensure something gets written.)
+ * This is really about paranoia: if the tree modification value gets
+ * out of sync with the set of dirty pages (modify is set, but there
+ * are no dirty pages), we perform a checkpoint without any writes, no
+ * checkpoint is created, and then things get bad.
+ */
+ WT_ERR(__wt_page_modify_init(session, btree->root.page));
+ __wt_page_modify_set(session, btree->root.page);
+
+ /*
+ * Clear the tree's modified flag; any changes before we clear the flag
+ * are guaranteed to be part of this checkpoint (unless reconciliation
+ * skips updates for transactional reasons), and changes subsequent to
+ * the checkpoint start, which might not be included, will re-set the
+ * modified flag. The "unless reconciliation skips updates" problem is
+ * handled in the reconciliation code: if reconciliation skips updates,
+ * it sets the modified flag itself. Use a full barrier so we get the
+ * store done quickly, this isn't a performance path.
+ */
+ btree->modified = 0;
+ WT_FULL_BARRIER();
+
+ /* Tell logging that a file checkpoint is starting. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn));
+
+ /* Flush the file from the cache, creating the checkpoint. */
+ if (is_checkpoint)
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+ else
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+
+ /*
+ * All blocks being written have been written; set the object's write
+ * generation.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->write_gen = btree->write_gen;
+
+fake: /* Update the object's metadata. */
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, &ckptlsn));
+
+ /*
+ * If we wrote a checkpoint (rather than faking one), pages may be
+ * available for re-use. If tracking enabled, defer making pages
+ * available until transaction end. The exception is if the handle
+ * is being discarded, in which case the handle will be gone by the
+ * time we try to apply or unroll the meta tracking event.
+ */
+ if (track_ckpt) {
+ if (WT_META_TRACKING(session) && is_checkpoint)
+ WT_ERR(__wt_meta_track_checkpoint(session));
+ else
+ WT_ERR(bm->checkpoint_resolve(bm, session));
+ }
+
+ /* Tell logging that the checkpoint is complete. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
+
+done: err:
+ /*
+ * If the checkpoint didn't complete successfully, make sure the
+ * tree is marked dirty.
+ */
+ if (ret != 0 && !btree->modified && was_modified)
+ btree->modified = 1;
+
+ if (hot_backup_locked)
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ __wt_meta_ckptlist_free(session, ckptbase);
+ __wt_free(session, name_alloc);
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint --
+ * Checkpoint a file.
+ */
+int
+__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ /* Should be holding the schema lock. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ return (__checkpoint_worker(session, cfg, 1));
+}
+
+/*
+ * __wt_checkpoint_sync --
+ * Sync a file that has been checkpointed, and wait for the result.
+ */
+int
+__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+
+ WT_UNUSED(cfg);
+
+ bm = S2BT(session)->bm;
+
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ /* Should have an underlying block manager reference. */
+ WT_ASSERT(session, bm != NULL);
+
+ return (bm->sync(bm, session, 0));
+}
+
+/*
+ * __wt_checkpoint_close --
+ * Checkpoint a single file as part of closing the handle.
+ */
+int
+__wt_checkpoint_close(WT_SESSION_IMPL *session, int force)
+{
+	/* If closing an unmodified file, or if forced, discard its blocks. */
+ if (!S2BT(session)->modified || force)
+ return (__wt_cache_op(session, NULL,
+ force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD));
+
+ /*
+ * Else, checkpoint the file and optionally flush the writes (the
+ * checkpoint call will discard the blocks, there's no additional
+ * step needed).
+ */
+ WT_RET(__checkpoint_worker(session, NULL, 0));
+ if (F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
+ WT_RET(__wt_checkpoint_sync(session, NULL));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c
new file mode 100644
index 00000000000..31d5506be5b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ext.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_transaction_id --
+ * Return the session's transaction ID.
+ */
+uint64_t
+__wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+ WT_SESSION_IMPL *session;
+
+ (void)wt_api; /* Unused parameters */
+ session = (WT_SESSION_IMPL *)wt_session;
+ /* Ignore failures: the only case is running out of transaction IDs. */
+ (void)__wt_txn_id_check(session);
+ return (session->txn.id);
+}
+
+/*
+ * __wt_ext_transaction_isolation_level --
+ *	Return the current transaction's isolation level.
+ */
+int
+__wt_ext_transaction_isolation_level(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ (void)wt_api; /* Unused parameters */
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
+
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ return (WT_TXN_ISO_READ_COMMITTED);
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED)
+ return (WT_TXN_ISO_READ_UNCOMMITTED);
+ return (WT_TXN_ISO_SNAPSHOT);
+}
+
+/*
+ * __wt_ext_transaction_notify --
+ * Request notification of transaction resolution.
+ */
+int
+__wt_ext_transaction_notify(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify)
+{
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ (void)wt_api; /* Unused parameters */
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
+
+ /*
+ * XXX
+ * For now, a single slot for notifications: I'm not bothering with
+ * more than one because more than one data-source in a transaction
+ * doesn't work anyway.
+ */
+ if (txn->notify == notify)
+ return (0);
+ if (txn->notify != NULL)
+ return (ENOMEM);
+
+ txn->notify = notify;
+
+ return (0);
+}
+
+/*
+ * __wt_ext_transaction_oldest --
+ * Return the oldest transaction ID not yet visible to a running
+ * transaction.
+ */
+uint64_t
+__wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api)
+{
+ return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id);
+}
+
+/*
+ * __wt_ext_transaction_visible --
+ * Return if the current transaction can see the given transaction ID.
+ */
+int
+__wt_ext_transaction_visible(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id)
+{
+ (void)wt_api; /* Unused parameters */
+
+ return (__wt_txn_visible(
+ (WT_SESSION_IMPL *)wt_session, transaction_id));
+}
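
Note: these functions are surfaced to loadable data sources through WT_EXTENSION_API; the method names below mirror the functions in this file and are assumed to be wired up by the extension-API initialization code. A hedged sketch of how a data source might use them:

#include <wiredtiger_ext.h>

/* Sketch: a data source deciding whether an update is visible. */
static int
my_source_visible(WT_EXTENSION_API *wt_api,
    WT_SESSION *session, uint64_t id)
{
	/*
	 * Anything older than the oldest running ID is visible to every
	 * transaction (ignoring ID wraparound for this sketch).
	 */
	if (id < wt_api->transaction_oldest(wt_api))
		return (1);
	return (wt_api->transaction_visible(wt_api, session, id));
}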
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
new file mode 100644
index 00000000000..03a71056a9a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __txn_op_log --
+ * Log an operation for the current transaction.
+ */
+static int
+__txn_op_log(WT_SESSION_IMPL *session,
+ WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_UPDATE *upd;
+ uint64_t recno;
+
+ WT_CLEAR(key);
+ upd = op->u.upd;
+ value.data = WT_UPDATE_DATA(upd);
+ value.size = upd->size;
+
+ /*
+ * Log the operation. It must be one of the following:
+ * 1) column store remove;
+ * 2) column store insert/update;
+ * 3) row store remove; or
+ * 4) row store insert/update.
+ */
+ if (cbt->btree->type != BTREE_ROW) {
+ WT_ASSERT(session, cbt->ins != NULL);
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != 0);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_ERR(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
+ } else {
+ WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_row_remove_pack(session, logrec,
+ op->fileid, &key));
+ else
+ WT_ERR(__wt_logop_row_put_pack(session, logrec,
+ op->fileid, &key, &value));
+ }
+
+err: __wt_buf_free(session, &key);
+ return (ret);
+}
+
+/*
+ * __txn_commit_printlog --
+ * Print a commit log record.
+ */
+static int
+__txn_commit_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ return (0);
+}
+
+/*
+ * __wt_txn_op_free --
+ * Free memory associated with a transactional operation.
+ */
+void
+__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
+{
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ case TXN_OP_REF:
+ case TXN_OP_TRUNCATE_COL:
+ break;
+
+ case TXN_OP_TRUNCATE_ROW:
+ __wt_buf_free(session, &op->u.truncate_row.start);
+ __wt_buf_free(session, &op->u.truncate_row.stop);
+ break;
+ }
+}
+
+/*
+ * __txn_logrec_init --
+ * Allocate and initialize a buffer for a transaction's log records.
+ */
+static int
+__txn_logrec_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_TXN *txn;
+ const char *fmt = WT_UNCHECKED_STRING(Iq);
+ uint32_t rectype = WT_LOGREC_COMMIT;
+ size_t header_size;
+
+ txn = &session->txn;
+ if (txn->logrec != NULL)
+ return (0);
+
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, txn->id));
+ logrec->size += (uint32_t)header_size;
+ txn->logrec = logrec;
+
+ if (0) {
+err: __wt_logrec_free(session, &logrec);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_txn_log_op --
+ * Write the last logged operation into the in-memory buffer.
+ */
+int
+__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_ITEM *logrec;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+
+ if (!S2C(session)->logging || F_ISSET(session, WT_SESSION_NO_LOGGING))
+ return (0);
+
+ txn = &session->txn;
+
+ /* We'd better have a transaction. */
+ WT_ASSERT(session,
+ F_ISSET(txn, TXN_RUNNING) && F_ISSET(txn, TXN_HAS_ID));
+
+ WT_ASSERT(session, txn->mod_count > 0);
+ op = txn->mod + txn->mod_count - 1;
+
+ WT_RET(__txn_logrec_init(session));
+ logrec = txn->logrec;
+
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ return (__txn_op_log(session, logrec, op, cbt));
+ case TXN_OP_INMEM:
+ case TXN_OP_REF:
+ /* Nothing to log, we're done. */
+ return (0);
+ case TXN_OP_TRUNCATE_COL:
+ return (__wt_logop_col_truncate_pack(session, logrec,
+ op->fileid,
+ op->u.truncate_col.start, op->u.truncate_col.stop));
+ case TXN_OP_TRUNCATE_ROW:
+ return (__wt_logop_row_truncate_pack(session, txn->logrec,
+ op->fileid,
+ &op->u.truncate_row.start, &op->u.truncate_row.stop,
+ (uint32_t)op->u.truncate_row.mode));
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_txn_log_commit --
+ * Write the operations of a transaction to the log at commit time.
+ */
+int
+__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_TXN *txn;
+
+ WT_UNUSED(cfg);
+ txn = &session->txn;
+
+ /* Write updates to the log. */
+ return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync));
+}
+
+/*
+ * __txn_log_file_sync --
+ * Write a log record for a file sync.
+ */
+static int
+__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ size_t header_size;
+ uint32_t rectype = WT_LOGREC_FILE_SYNC;
+ int start;
+ const char *fmt = WT_UNCHECKED_STRING(III);
+
+ btree = S2BT(session);
+ start = LF_ISSET(WT_TXN_LOG_CKPT_START);
+
+ WT_RET(__wt_struct_size(
+ session, &header_size, fmt, rectype, btree->id, start));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, btree->id, start));
+ logrec->size += (uint32_t)header_size;
+
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint_logread --
+ * Read a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_logread(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ WT_LSN *ckpt_lsn)
+{
+ WT_ITEM ckpt_snapshot;
+ u_int ckpt_nsnapshot;
+ const char *fmt = WT_UNCHECKED_STRING(IQIU);
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &ckpt_lsn->file, &ckpt_lsn->offset,
+ &ckpt_nsnapshot, &ckpt_snapshot));
+ WT_UNUSED(ckpt_nsnapshot);
+ WT_UNUSED(ckpt_snapshot);
+ *pp = end;
+ return (0);
+}
+
+/*
+ * __wt_txn_checkpoint_log --
+ * Write a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_log(
+ WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_LSN *ckpt_lsn;
+ WT_TXN *txn;
+ uint8_t *end, *p;
+ size_t recsize;
+ uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
+ const char *fmt = WT_UNCHECKED_STRING(IIQIU);
+
+ txn = &session->txn;
+ ckpt_lsn = &txn->ckpt_lsn;
+
+ /*
+ * If this is a file sync, log it unless there is a full checkpoint in
+ * progress.
+ */
+ if (!full) {
+ if (txn->full_ckpt) {
+ if (lsnp != NULL)
+ *lsnp = *ckpt_lsn;
+ return (0);
+ } else
+ return (__txn_log_file_sync(session, flags, lsnp));
+ }
+
+ switch (flags) {
+ case WT_TXN_LOG_CKPT_PREPARE:
+ txn->full_ckpt = 1;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ break;
+
+ case WT_TXN_LOG_CKPT_START:
+ /* Take a copy of the transaction snapshot. */
+ txn->ckpt_nsnapshot = txn->snapshot_count;
+ recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
+ WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
+ p = txn->ckpt_snapshot->mem;
+ end = p + recsize;
+ for (i = 0; i < txn->snapshot_count; i++)
+ WT_ERR(__wt_vpack_uint(
+ &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
+ break;
+
+ case WT_TXN_LOG_CKPT_STOP:
+ /*
+ * During a clean connection close, we get here without the
+ * prepare or start steps. In that case, log the current LSN
+ * as the checkpoint LSN.
+ */
+ if (!txn->full_ckpt) {
+ txn->ckpt_nsnapshot = 0;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ }
+
+ /* Write the checkpoint log record. */
+ WT_ERR(__wt_struct_size(session, &recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+
+ /*
+ * If this full checkpoint completed successfully and there is
+ * no hot backup in progress, tell the logging subsystem the
+ * checkpoint LSN so that it can archive.
+ */
+ if (!S2C(session)->hot_backup)
+ WT_ERR(__wt_log_ckpt(session, ckpt_lsn));
+
+ /* FALLTHROUGH */
+ case WT_TXN_LOG_CKPT_FAIL:
+		/* Clean up any allocated resources. */
+ INIT_LSN(ckpt_lsn);
+ txn->ckpt_nsnapshot = 0;
+ __wt_scr_free(&txn->ckpt_snapshot);
+ txn->full_ckpt = 0;
+ break;
+ }
+
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_truncate_log --
+ * Begin truncating a range of a file.
+ */
+int
+__wt_txn_truncate_log(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_ITEM *item;
+ WT_TXN_OP *op;
+
+ btree = S2BT(session);
+
+ WT_RET(__txn_next_op(session, &op));
+
+ if (btree->type == BTREE_ROW) {
+ op->type = TXN_OP_TRUNCATE_ROW;
+ op->u.truncate_row.mode = TXN_TRUNC_ALL;
+ WT_CLEAR(op->u.truncate_row.start);
+ WT_CLEAR(op->u.truncate_row.stop);
+ if (start != NULL) {
+ op->u.truncate_row.mode = TXN_TRUNC_START;
+ item = &op->u.truncate_row.start;
+ WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ if (stop != NULL) {
+ op->u.truncate_row.mode =
+ (op->u.truncate_row.mode == TXN_TRUNC_ALL) ?
+ TXN_TRUNC_STOP : TXN_TRUNC_BOTH;
+ item = &op->u.truncate_row.stop;
+ WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ } else {
+ op->type = TXN_OP_TRUNCATE_COL;
+ op->u.truncate_col.start =
+ (start == NULL) ? 0 : start->recno;
+ op->u.truncate_col.stop =
+ (stop == NULL) ? 0 : stop->recno;
+ }
+
+ /* Write that operation into the in-memory log. */
+ WT_RET(__wt_txn_log_op(session, NULL));
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
+ F_SET(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
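
Note: the four truncate modes recorded above correspond to which cursors the application passes to the public truncate call. A sketch (keys and URI hypothetical):

#include <wiredtiger.h>

/* Sketch: cursor combinations and the mode each one logs. */
static void
example_truncate(WT_SESSION *session, WT_CURSOR *start, WT_CURSOR *stop)
{
	start->set_key(start, "k100");
	stop->set_key(stop, "k200");

	/* Both cursors: TXN_TRUNC_BOTH. */
	(void)session->truncate(session, NULL, start, stop, NULL);
	/* Start cursor only: TXN_TRUNC_START. */
	(void)session->truncate(session, NULL, start, NULL, NULL);
	/* No cursors, a URI instead: TXN_TRUNC_ALL. */
	(void)session->truncate(session, "table:accounts",
	    NULL, NULL, NULL);
}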
+
+/*
+ * __wt_txn_truncate_end --
+ * Finish truncating a range of a file.
+ */
+int
+__wt_txn_truncate_end(WT_SESSION_IMPL *session)
+{
+ F_CLR(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
+
+/*
+ * __txn_printlog --
+ * Print a log record in a human-readable format.
+ */
+static int
+__txn_printlog(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ FILE *out;
+ WT_LSN ckpt_lsn;
+ uint64_t txnid;
+ uint32_t fileid, rectype;
+ int32_t start;
+ const uint8_t *end, *p;
+ const char *msg;
+
+ out = cookie;
+
+ p = LOG_SKIP_HEADER(logrec->data);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ lsnp->file, lsnp->offset) < 0)
+ return (errno);
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset));
+ if (fprintf(out, " \"type\" : \"checkpoint\"\n") < 0 ||
+ fprintf(
+ out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ ckpt_lsn.file, ckpt_lsn.offset) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ if (fprintf(out, " \"type\" : \"commit\"\n") < 0 ||
+ fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0)
+ return (errno);
+ WT_RET(__txn_commit_printlog(session, &p, end, out));
+ break;
+
+ case WT_LOGREC_FILE_SYNC:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(Ii), &fileid, &start));
+ if (fprintf(out, " \"type\" : \"file_sync\"\n") < 0 ||
+ fprintf(out, " \"fileid\" : %" PRIu32 "\n",
+ fileid) < 0 ||
+ fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_MESSAGE:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(S), &msg));
+ if (fprintf(out, " \"type\" : \"message\"\n") < 0 ||
+ fprintf(out, " \"message\" : \"%s\"\n", msg) < 0)
+ return (errno);
+ break;
+ }
+
+ if (fprintf(out, " },\n") < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * __wt_txn_printlog --
+ * Print the log in a human-readable format.
+ */
+int
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (fprintf(out, "[\n") < 0)
+ return (errno);
+ WT_RET(__wt_log_scan(
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ if (fprintf(out, "]\n") < 0)
+ return (errno);
+
+ return (0);
+}
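
Note: this function backs the printlog command of the wt utility. Assembling the fprintf formats above, a commit record prints roughly as follows (LSN, transaction ID and operation contents are hypothetical):

[
 { "lsn" : [1,128],
 "type" : "commit"
 "txnid" : 42,
 },
]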
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
new file mode 100644
index 00000000000..38c606320ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -0,0 +1,491 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* State maintained during recovery. */
+typedef struct {
+ WT_SESSION_IMPL *session;
+
+ /* Files from the metadata, indexed by file ID. */
+ struct WT_RECOVERY_FILE {
+ const char *uri; /* File URI. */
+ WT_CURSOR *c; /* Cursor used for recovery. */
+ WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
+ } *files;
+ size_t file_alloc; /* Allocated size of files array. */
+ u_int max_fileid; /* Maximum file ID seen. */
+ u_int nfiles; /* Number of files in the metadata. */
+
+ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
+
+ int missing; /* Were there missing files? */
+ int modified; /* Did recovery make any changes? */
+ int metadata_only; /*
+ * Set during the first recovery pass,
+ * when only the metadata is recovered.
+ */
+} WT_RECOVERY;
+
+/*
+ * __recovery_cursor --
+ * Get a cursor for a recovery operation.
+ */
+static int
+__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
+ WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp)
+{
+ WT_CURSOR *c;
+ const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor),
+ "overwrite", NULL };
+ int metadata_op;
+
+ c = NULL;
+
+ /* Track the largest file ID we have seen. */
+ if (id > r->max_fileid)
+ r->max_fileid = id;
+
+ /*
+ * Metadata operations have an id of 0. Match operations based
+ * on the id and the current pass of recovery for metadata.
+ *
+ * Only apply operations in the correct metadata phase, and if the LSN
+ * is more recent than the last checkpoint. If there is no entry for a
+ * file, assume it was dropped or missing after a hot backup.
+ */
+ metadata_op = (id == WT_METAFILE_ID);
+ if (r->metadata_only != metadata_op)
+ ;
+ else if (id >= r->nfiles || r->files[id].uri == NULL) {
+ /* If a file is missing, output a verbose message once. */
+ if (!r->missing)
+ WT_RET(__wt_verbose(session, WT_VERB_RECOVERY,
+ "No file found with ID %u (max %u)",
+ id, r->nfiles));
+ r->missing = 1;
+ } else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ /*
+ * We're going to apply the operation. Get the cursor, opening
+ * one if none is cached.
+ */
+ if ((c = r->files[id].c) == NULL) {
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+ r->files[id].c = c;
+ }
+ }
+
+ if (duplicate && c != NULL)
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+
+ *cp = c;
+ return (0);
+}
+
+/*
+ * Helper to get a cursor if this operation is to be applied during recovery.
+ */
+#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
+ WT_ERR(__recovery_cursor( \
+ (session), (r), (lsnp), (fileid), 0, (cp))); \
+ WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY, \
+ "%s op %d to file %d at LSN %u/%" PRIuMAX, \
+ (cursor == NULL) ? "Skipping" : "Applying", \
+ optype, fileid, lsnp->file, (uintmax_t)lsnp->offset)); \
+ if (cursor == NULL) \
+ break
+
+/*
+ * __txn_op_apply --
+ * Apply a transactional operation during recovery.
+ */
+static int
+__txn_op_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ WT_CURSOR *cursor, *start, *stop;
+ WT_DECL_RET;
+ WT_ITEM key, start_key, stop_key, value;
+ WT_SESSION_IMPL *session;
+ uint64_t recno, start_recno, stop_recno;
+ uint32_t fileid, mode, optype, opsize;
+
+ session = r->session;
+ cursor = NULL;
+
+ /* Peek at the size and the type. */
+ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
+ &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
+ &fileid, &recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
+ &fileid, &start_recno, &stop_recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+
+ /* Set up the cursors. */
+ if (start_recno == 0) {
+ start = NULL;
+ stop = cursor;
+ } else if (stop_recno == 0) {
+ start = cursor;
+ stop = NULL;
+ } else {
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ start->set_key(start, start_recno);
+ if (stop != NULL)
+ stop->set_key(stop, stop_recno);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
+ &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
+ &fileid, &key));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
+ &fileid, &start_key, &stop_key, &mode));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ /* Set up the cursors. */
+ start = stop = NULL;
+ switch (mode) {
+ case TXN_TRUNC_ALL:
+ /* Both cursors stay NULL. */
+ break;
+ case TXN_TRUNC_BOTH:
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ break;
+ case TXN_TRUNC_START:
+ start = cursor;
+ break;
+ case TXN_TRUNC_STOP:
+ stop = cursor;
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ __wt_cursor_set_raw_key(start, &start_key);
+ if (stop != NULL)
+ __wt_cursor_set_raw_key(stop, &stop_key);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Reset the cursor so it doesn't block eviction. */
+ if (cursor != NULL)
+ WT_ERR(cursor->reset(cursor));
+
+ r->modified = 1;
+
+err: if (ret != 0)
+ __wt_err(session, ret, "Operation failed during recovery");
+ return (ret);
+}
+
+/*
+ * __txn_commit_apply --
+ * Apply a commit record during recovery.
+ */
+static int
+__txn_commit_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__txn_op_apply(r, lsnp, pp, end));
+
+ return (0);
+}
+
+/*
+ * __txn_log_recover --
+ * Roll the log forward to recover committed changes.
+ */
+static int
+__txn_log_recover(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ WT_RECOVERY *r;
+ const uint8_t *end, *p;
+ uint64_t txnid;
+ uint32_t rectype;
+
+ r = cookie;
+ p = LOG_SKIP_HEADER(logrec->data);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ if (r->metadata_only)
+ WT_RET(__wt_txn_checkpoint_logread(
+ session, &p, end, &r->ckpt_lsn));
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ WT_UNUSED(txnid);
+ WT_RET(__txn_commit_apply(r, lsnp, &p, end));
+ break;
+ }
+
+ return (0);
+}
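+
+/*
+ * For reference, a sketch (not the authoritative layout) of what this
+ * function consumes: every log record starts with a record type; a commit
+ * record is followed by a packed transaction ID and then a sequence of
+ * operations, each with its own type/size header that __txn_op_apply reads
+ * before unpacking the payload.
+ */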
+
+/*
+ * __recovery_setup_file --
+ * Set up the recovery slot for a file.
+ */
+static int
+__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_LSN lsn;
+ uint32_t fileid;
+
+ WT_RET(__wt_config_getones(r->session, config, "id", &cval));
+ fileid = (uint32_t)cval.val;
+
+ if (r->nfiles <= fileid) {
+ WT_RET(__wt_realloc_def(
+ r->session, &r->file_alloc, fileid + 1, &r->files));
+ r->nfiles = fileid + 1;
+ }
+
+ WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
+ WT_RET(
+ __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
+	/* If there is no checkpoint logged for the file, apply everything. */
+ if (cval.type != WT_CONFIG_ITEM_STRUCT)
+ INIT_LSN(&lsn);
+ else if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")",
+ &lsn.file, (intmax_t*)&lsn.offset) != 2)
+ WT_RET_MSG(r->session, EINVAL,
+ "Failed to parse checkpoint LSN '%.*s'",
+ (int)cval.len, cval.str);
+ r->files[fileid].ckpt_lsn = lsn;
+
+ WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
+	    "Recovering %s with id %u @ (%" PRIu32 ", %" PRIuMAX ")",
+	    uri, fileid, lsn.file, (uintmax_t)lsn.offset));
+
+ return (0);
+}
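+
+/*
+ * As an illustration (the values here are hypothetical), a file's metadata
+ * entry might include:
+ *
+ *	id=6,checkpoint_lsn=(2,1024)
+ *
+ * which the sscanf call above parses as log file 2, offset 1024; an entry
+ * without a checkpoint_lsn structure gets the initial LSN, so every logged
+ * operation for the file is applied.
+ */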
+
+/*
+ * __recovery_free --
+ * Free the recovery state.
+ */
+static int
+__recovery_free(WT_RECOVERY *r)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = r->session;
+ for (i = 0; i < r->nfiles; i++) {
+ __wt_free(session, r->files[i].uri);
+ if ((c = r->files[i].c) != NULL)
+ WT_TRET(c->close(c));
+ }
+
+ __wt_free(session, r->files);
+ return (ret);
+}
+
+/*
+ * __recovery_file_scan --
+ * Scan the files referenced from the metadata and gather information
+ * about them for recovery.
+ */
+static int
+__recovery_file_scan(WT_RECOVERY *r)
+{
+ WT_DECL_RET;
+ WT_CURSOR *c;
+ const char *uri, *config;
+ int cmp;
+
+ /* Scan through all files in the metadata. */
+ c = r->files[0].c;
+ c->set_key(c, "file:");
+ if ((ret = c->search_near(c, &cmp)) != 0) {
+ /* Is the metadata empty? */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ if (cmp < 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ for (; ret == 0; ret = c->next(c)) {
+ WT_ERR(c->get_key(c, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ WT_ERR(c->get_value(c, &config));
+ WT_ERR(__recovery_setup_file(r, uri, config));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: if (r->nfiles > r->max_fileid)
+ r->max_fileid = r->nfiles;
+ return (ret);
+}
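+
+/*
+ * The metadata is keyed by URI, so the scan above positions on the first
+ * key at or after "file:" and walks forward until the prefix no longer
+ * matches: a (hypothetical) "file:example.wt" entry gets a recovery slot,
+ * while the first "index:", "table:" or other entry ends the loop.
+ */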
+
+/*
+ * __wt_txn_recover --
+ * Run recovery.
+ */
+int
+__wt_txn_recover(WT_CONNECTION_IMPL *conn)
+{
+ WT_CURSOR *metac;
+ WT_DECL_RET;
+ WT_RECOVERY r;
+ WT_SESSION_IMPL *session;
+ struct WT_RECOVERY_FILE *metafile;
+ const char *config;
+ int was_backup;
+
+	WT_CLEAR(r);
+	INIT_LSN(&r.ckpt_lsn);
+	config = NULL;
+ was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;
+
+ /* We need a real session for recovery. */
+ WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ F_SET(session, WT_SESSION_NO_LOGGING);
+ r.session = session;
+
+ WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
+ WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
+ WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
+ metafile = &r.files[WT_METAFILE_ID];
+ metafile->c = metac;
+
+ /*
+ * First, do a pass through the log to recover the metadata, and
+ * establish the last checkpoint LSN. Skip this when opening a hot
+ * backup: we already have the correct metadata in that case.
+ */
+ if (!was_backup) {
+ r.metadata_only = 1;
+ if (IS_INIT_LSN(&metafile->ckpt_lsn))
+ WT_ERR(__wt_log_scan(session,
+ NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
+ else
+ WT_ERR(__wt_log_scan(session,
+ &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
+
+ WT_ASSERT(session,
+ LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0);
+ }
+
+ /* Scan the metadata to find the live files and their IDs. */
+ WT_ERR(__recovery_file_scan(&r));
+
+ /*
+ * We no longer need the metadata cursor: close it to avoid pinning any
+ * resources that could block eviction during recovery.
+ */
+ r.files[0].c = NULL;
+ WT_ERR(metac->close(metac));
+
+ /*
+ * Now, recover all the files apart from the metadata.
+ * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
+ */
+ r.metadata_only = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
+ "Main recovery loop: starting at %u/%" PRIuMAX,
+ r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
+ if (IS_INIT_LSN(&r.ckpt_lsn))
+ WT_ERR(__wt_log_scan(session, NULL,
+ WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+ else
+ WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
+ WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+
+ conn->next_file_id = r.max_fileid;
+
+ /*
+	 * If recovery ran successfully, forcibly log a checkpoint so the
+	 * next open is fast, and so the metadata stays up to date with the
+	 * checkpoint LSN and log archiving.
+ */
+ WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+
+err: WT_TRET(__recovery_free(&r));
+ __wt_free(session, config);
+ WT_TRET(session->iface.close(&session->iface, NULL));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h
new file mode 100644
index 00000000000..1f2f0b7211a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include <wt_internal.h>
+
+typedef struct {
+ void *mem; /* Managed memory chunk */
+ size_t memsize; /* Managed memory size */
+} ULINE;
+
+extern const char *home; /* Home directory */
+extern const char *progname; /* Program name */
+extern const char *usage_prefix; /* Global arguments */
+extern int verbose; /* Verbose flag */
+
+extern WT_EVENT_HANDLER *verbose_handler;
+
+extern int __wt_opterr; /* if error message should be printed */
+extern int __wt_optind; /* index into parent argv vector */
+extern int __wt_optopt; /* character checked for validity */
+extern int __wt_optreset; /* reset getopt */
+extern char *__wt_optarg; /* argument associated with option */
+
+int util_backup(WT_SESSION *, int, char *[]);
+int util_cerr(const char *, const char *, int);
+int util_compact(WT_SESSION *, int, char *[]);
+void util_copyright(void);
+int util_create(WT_SESSION *, int, char *[]);
+int util_drop(WT_SESSION *, int, char *[]);
+int util_dump(WT_SESSION *, int, char *[]);
+int util_err(int, const char *, ...);
+int util_flush(WT_SESSION *, const char *);
+int util_list(WT_SESSION *, int, char *[]);
+int util_load(WT_SESSION *, int, char *[]);
+int util_loadtext(WT_SESSION *, int, char *[]);
+char *util_name(const char *, const char *);
+int util_printlog(WT_SESSION *, int, char *[]);
+int util_read(WT_SESSION *, int, char *[]);
+int util_read_line(ULINE *, int, int *);
+int util_rename(WT_SESSION *, int, char *[]);
+int util_salvage(WT_SESSION *, int, char *[]);
+int util_stat(WT_SESSION *, int, char *[]);
+int util_str2recno(const char *p, uint64_t *recnop);
+int util_upgrade(WT_SESSION *, int, char *[]);
+int util_verify(WT_SESSION *, int, char *[]);
+int util_write(WT_SESSION *, int, char *[]);
diff --git a/src/third_party/wiredtiger/src/utilities/util_backup.c b/src/third_party/wiredtiger/src/utilities/util_backup.c
new file mode 100644
index 00000000000..aa61cc338f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_backup.c
@@ -0,0 +1,205 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int copy(const char *, const char *);
+static int usage(void);
+
+#define CBUF_LEN (128 * 1024) /* Copy buffer and size. */
+static char *cbuf;
+
+/*
+ * append_target --
+ * Build a list of comma-separated targets.
+ */
+static int
+append_target(const char *target, char **bufp)
+{
+ static int first = 1;
+ static size_t len = 0, remain = 0;
+ static char *buf = NULL;
+
+ /* 20 bytes of slop */
+ if (remain < strlen(target) + 20) {
+ len += strlen(target) + 512;
+ remain += strlen(target) + 512;
+ if ((buf = realloc(buf, len)) == NULL)
+ return (util_err(errno, NULL));
+ *bufp = buf;
+ }
+ if (first) {
+ first = 0;
+ strcpy(buf, "target=(");
+ } else
+ buf[strlen(buf) - 1] = ','; /* overwrite previous ")" */
+ strcat(buf, "\"");
+ strcat(buf, target);
+ strcat(buf, "\")");
+ remain -= strlen(target) + 1;
+
+ return (0);
+}
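+
+/*
+ * For example, two -t options with the (hypothetical) arguments "table:a"
+ * and "table:b" build the configuration string:
+ *
+ *	target=("table:a","table:b")
+ *
+ * where each append overwrites the previous closing parenthesis with a
+ * comma before adding the new quoted target.
+ */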
+
+int
+util_backup(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int ch;
+ char *config;
+ const char *directory, *name;
+
+ config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "t:")) != EOF)
+ switch (ch) {
+ case 't':
+ if (append_target(__wt_optarg, &config))
+ return (1);
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ if (argc != 1) {
+		ret = usage();
+ goto err;
+ }
+ directory = *argv;
+
+ if ((ret = session->open_cursor(
+ session, "backup:", NULL, config, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(backup:) failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Copy the files. */
+ while (
+ (ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_key(cursor, &name)) == 0)
+ if ((ret = copy(name, directory)) != 0)
+ goto err;
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: cursor next(backup:) failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+err: if (config != NULL)
+ free(config);
+ if (cbuf != NULL)
+ free(cbuf);
+
+ return (ret);
+}
+
+static int
+copy(const char *name, const char *directory)
+{
+ WT_DECL_RET;
+ ssize_t n;
+ int ifd, ofd;
+
+ ret = 1;
+ ifd = ofd = -1;
+
+ if (verbose &&
+ printf("Backing up %s/%s to %s\n", home, name, directory) < 0) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ return (1);
+ }
+
+	/* Allocate a large copy buffer (use it to build pathnames as well). */
+ if (cbuf == NULL && (cbuf = malloc(CBUF_LEN)) == NULL)
+ goto memerr;
+
+ /* Open the read file. */
+ if (snprintf(cbuf, CBUF_LEN, "%s/%s", home, name) >= CBUF_LEN)
+ goto memerr;
+ if ((ifd = open(cbuf, O_BINARY | O_RDONLY, 0)) < 0)
+ goto readerr;
+
+ /* Open the write file. */
+ if (snprintf(cbuf, CBUF_LEN, "%s/%s", directory, name) >= CBUF_LEN)
+ goto memerr;
+ if ((ofd = open(
+ cbuf, O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, 0666)) < 0)
+ goto writerr;
+
+ /* Copy the file. */
+ while ((n = read(ifd, cbuf, CBUF_LEN)) > 0)
+ if (write(ofd, cbuf, (size_t)n) != n)
+ goto writerr;
+ if (n != 0)
+ goto readerr;
+
+ /*
+ * Close file descriptors (forcing a flush on the write side), and
+ * check for any errors.
+ */
+ ret = close(ifd);
+ ifd = -1;
+ if (ret != 0)
+ goto readerr;
+
+ /*
+	 * We need to know this file was successfully written: it's a backup.
+ */
+#ifdef _WIN32
+ if (FlushFileBuffers((HANDLE)_get_osfhandle(ofd)) == 0) {
+ DWORD err = GetLastError();
+ ret = err;
+ goto writerr;
+ }
+#else
+ if (fsync(ofd))
+ goto writerr;
+#endif
+ ret = close(ofd);
+ ofd = -1;
+ if (ret != 0)
+ goto writerr;
+
+ /* Success. */
+ ret = 0;
+
+ if (0) {
+readerr: fprintf(stderr,
+ "%s: %s/%s: %s\n", progname, home, name, strerror(errno));
+ }
+ if (0) {
+writerr: fprintf(stderr, "%s: %s/%s: %s\n",
+ progname, directory, name, strerror(errno));
+ }
+ if (0) {
+memerr: fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ }
+
+ if (ifd >= 0)
+ (void)close(ifd);
+ if (ofd >= 0)
+ (void)close(ofd);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "backup [-t uri] directory\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_compact.c b/src/third_party/wiredtiger/src/utilities/util_compact.c
new file mode 100644
index 00000000000..51d5461e43c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_compact.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_compact(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri;
+
+ uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->compact(session, uri, NULL)) != 0) {
+ fprintf(stderr, "%s: compact(%s): %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "compact uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_cpyright.c b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
new file mode 100644
index 00000000000..21d82828863
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+void
+util_copyright(void)
+{
+ printf("%s\n", "Copyright (c) 2008-2014 WiredTiger, Inc.");
+ printf("%s\n\n", "All rights reserved.");
+
+ printf("%s\n\n",
+ "This program is free software: you can redistribute it and/or\n"
+ "modify it under the terms of version 3 of the GNU General\n"
+ "Public License as published by the Free Software Foundation.");
+
+ printf("%s\n\n",
+ "This program is distributed in the hope that it will be useful,\n"
+ "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+ "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
+ "GNU General Public License for more details:");
+
+ printf("\t%s\n\n",
+ "http://www.gnu.org/licenses/gpl-3.0-standalone.html");
+
+ printf("%s\n",
+ "For a license to use the WiredTiger software under conditions\n"
+ "other than those described by the GNU General Public License,\n"
+ "or for technical support for this software, contact WiredTiger,\n"
+ "Inc. at info@wiredtiger.com.");
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_create.c b/src/third_party/wiredtiger/src/utilities/util_create.c
new file mode 100644
index 00000000000..ebff3a8ad05
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_create.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_create(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ const char *config, *uri;
+
+ config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF)
+ switch (ch) {
+ case 'c': /* command-line configuration */
+ config = __wt_optarg;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->create(session, uri, config)) != 0)
+ return (util_err(ret, "%s: session.create", uri));
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "create [-c configuration] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_drop.c b/src/third_party/wiredtiger/src/utilities/util_drop.c
new file mode 100644
index 00000000000..6fe416882a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_drop.c
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_drop(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *name;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ ret = session->drop(session, name, "force");
+
+ if (name != NULL)
+ free(name);
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "drop uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c
new file mode 100644
index 00000000000..bd0590948b4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_dump.c
@@ -0,0 +1,701 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int dump_config(WT_SESSION *, const char *, int);
+static int dump_json_begin(void);
+static int dump_json_end(void);
+static int dump_json_separator(void);
+static int dump_json_table_begin(WT_CURSOR *, const char *, const char *);
+static int dump_json_table_cg(WT_CURSOR *, const char *, const char *,
+ const char *, const char *);
+static int dump_json_table_config(WT_SESSION *, const char *);
+static int dump_json_table_end(void);
+static int dump_prefix(int);
+static int dump_record(WT_CURSOR *, const char *, int, int);
+static int dump_suffix(void);
+static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
+static int dump_table_config_type(WT_SESSION *,
+ WT_CURSOR *, WT_CURSOR *, const char *, const char *, const char *);
+static int dup_json_string(const char *, char **);
+static int print_config(WT_SESSION *, const char *, const char *, const char *);
+static int usage(void);
+
+int
+util_dump(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ size_t len;
+ int ch, hex, i, json, reverse;
+ char *checkpoint, *config, *name;
+
+ hex = json = reverse = 0;
+ checkpoint = config = name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF)
+ switch (ch) {
+ case 'c':
+ checkpoint = __wt_optarg;
+ break;
+ case 'f': /* output file */
+ if (freopen(__wt_optarg, "w", stdout) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ break;
+ case 'j':
+ json = 1;
+ break;
+ case 'r':
+ reverse = 1;
+ break;
+ case 'x':
+ hex = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* -j and -x are incompatible. */
+ if (hex && json) {
+ fprintf(stderr,
+ "%s: the -j and -x dump options are incompatible\n",
+ progname);
+ goto err;
+ }
+
+ /* The remaining argument is the uri. */
+ if (argc < 1 || (argc != 1 && !json))
+ return (usage());
+
+ if (json && (ret = dump_json_begin()) != 0)
+ goto err;
+
+ for (i = 0; i < argc; i++) {
+ if (json && i > 0)
+ if ((ret = dump_json_separator()) != 0)
+ goto err;
+ if (name != NULL) {
+ free(name);
+ name = NULL;
+ }
+ if ((name = util_name(argv[i], "table")) == NULL)
+ goto err;
+
+ if (json && dump_json_table_config(session, name) != 0)
+ goto err;
+ if (!json && dump_config(session, name, hex) != 0)
+ goto err;
+
+ len =
+ checkpoint == NULL ? 0 : strlen("checkpoint=") +
+ strlen(checkpoint) + 1;
+ len += strlen(json ? "dump=json" :
+ (hex ? "dump=hex" : "dump=print"));
+ if ((config = malloc(len + 10)) == NULL)
+ goto err;
+ if (checkpoint == NULL)
+ config[0] = '\0';
+ else {
+ (void)strcpy(config, "checkpoint=");
+ (void)strcat(config, checkpoint);
+ (void)strcat(config, ",");
+ }
+ (void)strcat(config, json ? "dump=json" :
+ (hex ? "dump=hex" : "dump=print"));
+ if ((ret = session->open_cursor(
+ session, name, NULL, config, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if ((ret = dump_record(cursor, name, reverse, json)) != 0)
+ goto err;
+ if (json && (ret = dump_json_table_end()) != 0)
+ goto err;
+ }
+ if (json && ((ret = dump_json_end()) != 0))
+ goto err;
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (config != NULL)
+ free(config);
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+/*
+ * dump_config --
+ * Dump the config for the uri.
+ */
+static int
+dump_config(WT_SESSION *session, const char *uri, int hex)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int tret;
+
+ /* Open a metadata cursor. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+ return (1);
+ }
+ /*
+	 * Search for the object itself, just to make sure it exists: we don't
+	 * want to output a header if the user entered the wrong name. This is
+	 * also where we find out a table doesn't exist, so use a simple error
+	 * message in that case.
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) == 0) {
+ if (dump_prefix(hex) != 0 ||
+ dump_table_config(session, cursor, uri) != 0 ||
+ dump_suffix() != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_json_begin --
+ * Output the dump file header prefix.
+ */
+static int
+dump_json_begin(void)
+{
+ if (printf("{\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_end --
+ * Output the dump file header suffix.
+ */
+static int
+dump_json_end(void)
+{
+ if (printf("\n}\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_separator --
+ *	Output a separator between the dumps of multiple tables.
+ */
+static int
+dump_json_separator(void)
+{
+ if (printf(",\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_table_begin --
+ * Output the JSON syntax that starts a table, along with its config.
+ */
+static int
+dump_json_table_begin(WT_CURSOR *cursor, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ const char *name;
+ char *jsonconfig;
+
+ jsonconfig = NULL;
+
+ /* Get the table name. */
+ if ((name = strchr(uri, ':')) == NULL) {
+ fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+ return (1);
+ }
+ ++name;
+
+ if ((ret = dup_json_string(config, &jsonconfig)) != 0)
+ return (util_cerr(uri, "config dup", ret));
+ if (printf(" \"%s\" : [\n {\n", uri) < 0)
+ goto eio;
+ if (printf(" \"config\" : \"%s\",\n", jsonconfig) < 0)
+ goto eio;
+
+ if ((ret = dump_json_table_cg(
+ cursor, uri, name, "colgroup:", "colgroups")) == 0) {
+ if (printf(",\n") < 0)
+ goto eio;
+ ret =
+ dump_json_table_cg(cursor, uri, name, "index:", "indices");
+ }
+
+ if (printf("\n },\n {\n \"data\" : [") < 0)
+ goto eio;
+
+ if (0) {
+eio: ret = util_err(EIO, NULL);
+ }
+
+ free(jsonconfig);
+ return (ret);
+}
+
+/*
+ * dump_json_table_cg --
+ * Dump the column groups or indices for a table.
+ */
+static int
+dump_json_table_cg(WT_CURSOR *cursor,
+ const char *uri, const char *name, const char *entry, const char *header)
+{
+ WT_DECL_RET;
+ const char *key, *skip, *value;
+ int exact, once;
+ char *jsonconfig;
+ static const char * const indent = " ";
+
+ once = 0;
+ if (printf(" \"%s\" : [", header) < 0)
+ return (util_err(EIO, NULL));
+
+ /*
+ * For table dumps, we're done.
+ */
+ if (cursor == NULL) {
+ if (printf("]") < 0)
+ return (util_err(EIO, NULL));
+ else
+ return (0);
+ }
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, look up the related source information and append it
+ * to the base record.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ break;
+
+ /* Check for a table name match. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ if ((ret = dup_json_string(value, &jsonconfig)) != 0)
+ return (util_cerr(uri, "config dup", ret));
+ ret = printf("%s\n"
+ "%s{\n"
+ "%s \"uri\" : \"%s\",\n"
+ "%s \"config\" : \"%s\"\n"
+ "%s}",
+ (once == 0 ? "" : ","),
+ indent, indent, key, indent, jsonconfig, indent);
+ free(jsonconfig);
+ if (ret < 0)
+ return (util_err(EIO, NULL));
+
+ once = 1;
+ }
+ if (printf("%s]", (once == 0 ? "" : "\n ")) < 0)
+ return (util_err(EIO, NULL));
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_json_table_config --
+ * Dump the config for the uri.
+ */
+static int
+dump_json_table_config(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_EXTENSION_API *wtext;
+ int tret;
+ const char *value;
+
+ /* Dump the config. */
+ if (WT_PREFIX_MATCH(uri, "table:")) {
+ /* Open a metadata cursor. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI,
+ wiredtiger_strerror(ret));
+ return (1);
+ }
+
+ /*
+		 * Search for the object itself, to make sure it
+		 * exists, and get its config string. This is where we
+		 * find out a table object doesn't exist, so use a
+		 * simple error message in that case.
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) == 0) {
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ ret = util_cerr(uri, "get_value", ret);
+ else if (dump_json_table_begin(cursor, uri,
+ value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+ } else {
+ /*
+ * We want to be able to dump the metadata file itself, but the
+ * configuration for that file lives in the turtle file. Reach
+		 * down into the library and ask for the file's configuration;
+		 * that works in all cases.
+		 *
+		 * This is where we find out a file object doesn't exist, so
+		 * use a simple error message in that case.
+ */
+ wtext = session->
+ connection->get_extension_api(session->connection);
+ if ((ret =
+ wtext->metadata_search(wtext, session, uri, &value)) == 0) {
+ if (dump_json_table_begin(NULL, uri, value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_json_table_end --
+ * Output the JSON syntax that ends a table.
+ */
+static int
+dump_json_table_end(void)
+{
+ if (printf(" ]\n }\n ]") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
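+
+/*
+ * Taken together, the begin/cg/end functions above emit JSON shaped roughly
+ * like this (a sketch, with hypothetical names and configs):
+ *
+ *	"table:example" : [
+ *	    {
+ *	        "config" : "...",
+ *	        "colgroups" : [...],
+ *	        "indices" : [...]
+ *	    },
+ *	    {
+ *	        "data" : [ { ... }, { ... } ]
+ *	    }
+ *	]
+ */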
+
+/*
+ * dump_table_config --
+ * Dump the config for a table.
+ */
+static int
+dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+{
+ WT_CURSOR *srch;
+ WT_DECL_RET;
+ int tret;
+ const char *key, *name, *value;
+
+ /* Get the table name. */
+ if ((name = strchr(uri, ':')) == NULL) {
+ fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+ return (1);
+ }
+ ++name;
+
+ /*
+ * Dump out the config information: first, dump the uri entry itself
+ * (requires a lookup).
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) != 0)
+ return (util_cerr(uri, "search", ret));
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+ if (print_config(session, key, value, NULL) != 0)
+ return (1);
+
+ /*
+ * The underlying table configuration function needs a second cursor:
+ * open one before calling it, it makes error handling hugely simpler.
+ */
+ if ((ret =
+ session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0)
+ return (util_cerr(uri, "open_cursor", ret));
+
+ if ((ret = dump_table_config_type(
+ session, cursor, srch, uri, name, "colgroup:")) == 0)
+ ret = dump_table_config_type(
+ session, cursor, srch, uri, name, "index:");
+
+ if ((tret = srch->close(srch)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_table_config_type --
+ * Dump the column groups or indices for a table.
+ */
+static int
+dump_table_config_type(WT_SESSION *session,
+ WT_CURSOR *cursor, WT_CURSOR *srch,
+ const char *uri, const char *name, const char *entry)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ const char *key, *skip, *value, *value_source;
+ int exact;
+ char *p;
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, look up the related source information and append it
+ * to the base record.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ return (0);
+
+ /* Check for a table name match. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /* Crack it and get the underlying source. */
+ if ((ret = __wt_config_getones(
+ (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0)
+ return (util_err(ret, "%s: source entry", key));
+
+ /* Nul-terminate the source entry. */
+ if ((p = malloc(cval.len + 10)) == NULL)
+ return (util_err(errno, NULL));
+ (void)strncpy(p, cval.str, cval.len);
+ p[cval.len] = '\0';
+ srch->set_key(srch, p);
+ if ((ret = srch->search(srch)) != 0)
+ ret = util_err(ret, "%s: %s", key, p);
+ free(p);
+ if (ret != 0)
+ return (1);
+
+ /* Get the source's value. */
+ if ((ret = srch->get_value(srch, &value_source)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /*
+ * The dumped configuration string is the original key plus the
+ * source's configuration.
+ */
+ if (print_config(session, key, value, value_source) != 0)
+ return (util_err(EIO, NULL));
+ }
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_prefix --
+ * Output the dump file header prefix.
+ */
+static int
+dump_prefix(int hex)
+{
+ int vmajor, vminor, vpatch;
+
+ (void)wiredtiger_version(&vmajor, &vminor, &vpatch);
+
+ if (printf(
+ "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n",
+ vmajor, vminor, vpatch) < 0 ||
+ printf("Format=%s\n", hex ? "hex" : "print") < 0 ||
+ printf("Header\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_record --
+ *	Dump the records, advancing the cursor as it goes, with JSON
+ *	formatting if requested.
+ */
+static int
+dump_record(WT_CURSOR *cursor, const char *name, int reverse, int json)
+{
+ WT_DECL_RET;
+ const char *infix, *key, *prefix, *suffix, *value;
+ int once;
+
+ once = 0;
+ if (json) {
+ prefix = "\n{\n";
+ infix = ",\n";
+ suffix = "\n}";
+ } else {
+ prefix = "";
+ infix = "\n";
+ suffix = "\n";
+ }
+ while ((ret =
+ (reverse ? cursor->prev(cursor) : cursor->next(cursor))) == 0) {
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(name, "get_key", ret));
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(name, "get_value", ret));
+ if (printf("%s%s%s%s%s%s", (json && once) ? "," : "",
+ prefix, key, infix, value, suffix) < 0)
+ return (util_err(EIO, NULL));
+ once = 1;
+ }
+ if (json && once && printf("\n") < 0)
+ return (util_err(EIO, NULL));
+ return (ret == WT_NOTFOUND ? 0 :
+ util_cerr(name, (reverse ? "prev" : "next"), ret));
+}
+
+/*
+ * dump_suffix --
+ * Output the dump file header suffix.
+ */
+static int
+dump_suffix(void)
+{
+ if (printf("Data\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
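+
+/*
+ * Together, dump_prefix, the config output and dump_suffix produce a header
+ * like the following (the version and names are hypothetical):
+ *
+ *	WiredTiger Dump (WiredTiger Version 2.0.0)
+ *	Format=print
+ *	Header
+ *	table:example
+ *	key_format=S,value_format=S,...
+ *	Data
+ *
+ * after which key and value lines alternate until the end of the dump.
+ */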
+
+/*
+ * dup_json_string --
+ * Like strdup, but escape any characters that are special for JSON.
+ * The result will be embedded in a JSON string.
+ */
+static int
+dup_json_string(const char *str, char **result)
+{
+ size_t left, nchars;
+ const char *p;
+ char *q;
+
+ nchars = 0;
+ for (p = str; *p; p++, nchars++)
+ nchars += __wt_json_unpack_char(*p, NULL, 0, 0);
+ q = malloc(nchars + 1);
+ if (q == NULL)
+ return (1);
+ *result = q;
+ left = nchars;
+ for (p = str; *p; p++, nchars++) {
+ nchars = __wt_json_unpack_char(*p, (u_char *)q, left, 0);
+ left -= nchars;
+ q += nchars;
+ }
+ *q = '\0';
+ return (0);
+}
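+
+/*
+ * For example, assuming __wt_json_unpack_char escapes quotes and control
+ * characters, an input string such as:
+ *
+ *	say "hi"
+ *
+ * duplicates as:
+ *
+ *	say \"hi\"
+ *
+ * which can then be embedded in a JSON string without further quoting.
+ */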
+
+/*
+ * print_config --
+ * Output a key/value URI pair by combining v1 and v2.
+ */
+static int
+print_config(WT_SESSION *session,
+ const char *key, const char *v1, const char *v2)
+{
+ WT_DECL_RET;
+ const char *value_ret;
+
+ /*
+ * The underlying call will ignore v2 if v1 is NULL -- check here and
+ * swap in that case.
+ */
+ if (v1 == NULL) {
+ v1 = v2;
+ v2 = NULL;
+ }
+
+ if ((ret = __wt_session_create_strip(session, v1, v2, &value_ret)) != 0)
+ return (util_err(ret, NULL));
+ ret = printf("%s\n%s\n", key, value_ret);
+ free((char *)value_ret);
+ if (ret < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "dump [-jrx] [-c checkpoint] [-f output-file] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
new file mode 100644
index 00000000000..4a1489628d1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int list_print(WT_SESSION *, const char *, int, int);
+static int list_print_checkpoint(WT_SESSION *, const char *);
+static int usage(void);
+
+int
+util_list(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int cflag, ch, vflag;
+ char *name;
+
+ cflag = vflag = 0;
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF)
+ switch (ch) {
+ case 'c':
+ cflag = 1;
+ break;
+ case 'v':
+ vflag = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ switch (argc) {
+ case 0:
+ break;
+ case 1:
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+ break;
+ default:
+ return (usage());
+ }
+
+ ret = list_print(session, name, cflag, vflag);
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+/*
+ * list_print --
+ * List the high-level objects in the database.
+ */
+static int
+list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int found;
+ const char *key, *value;
+
+ /* Open the metadata file. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ /*
+ * If there is no metadata (yet), this will return ENOENT.
+ * Treat that the same as an empty metadata.
+ */
+ if (ret == ENOENT)
+ return (0);
+
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+ return (1);
+ }
+
+ found = name == NULL;
+ while ((ret = cursor->next(cursor)) == 0) {
+ /* Get the key. */
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr("metadata", "get_key", ret));
+
+ /*
+ * If a name is specified, only show objects that match.
+ */
+ if (name != NULL) {
+ if (!WT_PREFIX_MATCH(key, name))
+ continue;
+ found = 1;
+ }
+
+ /*
+ * XXX
+ * We don't normally say anything about the WiredTiger
+ * metadata, it's not a normal "object" in the database. I'm
+ * making an exception for the checkpoint and verbose options.
+ */
+ if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+ printf("%s\n", key);
+
+ if (!cflag && !vflag)
+ continue;
+
+ if (cflag && (ret = list_print_checkpoint(session, key)) != 0)
+ return (ret);
+ if (vflag) {
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (
+ util_cerr("metadata", "get_value", ret));
+ printf("%s\n", value);
+ }
+ }
+ if (ret != WT_NOTFOUND)
+ return (util_cerr("metadata", "next", ret));
+ if (!found) {
+ fprintf(stderr, "%s: %s: not found\n", progname, name);
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * list_print_checkpoint --
+ * List the checkpoint information.
+ */
+static int
+list_print_checkpoint(WT_SESSION *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_CKPT *ckpt, *ckptbase;
+ size_t len;
+ time_t t;
+ uint64_t v;
+
+ /*
+ * We may not find any checkpoints for this file, in which case we don't
+ * report an error, and continue our caller's loop. Otherwise, read the
+ * list of checkpoints and print each checkpoint's name and time.
+ */
+ if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ /* Find the longest name, so we can pretty-print. */
+ len = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (strlen(ckpt->name) > len)
+ len = strlen(ckpt->name);
+ ++len;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Call ctime, not ctime_r; ctime_r has portability problems,
+ * the Solaris version is different from the POSIX standard.
+ */
+ t = (time_t)ckpt->sec;
+ printf("\t%*s: %.24s", (int)len, ckpt->name, ctime(&t));
+
+ v = ckpt->ckpt_size;
+ if (v >= WT_PETABYTE)
+ printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE);
+ else if (v >= WT_TERABYTE)
+ printf(" (%" PRIu64 " TB)\n", v / WT_TERABYTE);
+ else if (v >= WT_GIGABYTE)
+ printf(" (%" PRIu64 " GB)\n", v / WT_GIGABYTE);
+ else if (v >= WT_MEGABYTE)
+ printf(" (%" PRIu64 " MB)\n", v / WT_MEGABYTE);
+ else if (v >= WT_KILOBYTE)
+ printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
+ else
+ printf(" (%" PRIu64 " B)\n", v);
+ }
+
+ __wt_metadata_free_ckptlist(session, ckptbase);
+ return (0);
+}
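+
+/*
+ * Sample output for a (hypothetical) checkpoint of a 12MB file:
+ *
+ *	      snapshot-1: Thu Apr  3 09:00:00 2014 (12 MB)
+ */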
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "list [-cv] [uri]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
new file mode 100644
index 00000000000..7d9dfa445dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+static int config_read(char ***, int *);
+static int config_rename(char **, const char *);
+static void config_remove(char *, const char *);
+static int format(void);
+static int insert(WT_CURSOR *, const char *);
+static int load_dump(WT_SESSION *);
+static int usage(void);
+
+static int append; /* -a append (ignore record number keys) */
+static char *cmdname; /* -r rename */
+static char **cmdconfig; /* configuration pairs */
+static int json; /* -j input is JSON format */
+static int no_overwrite; /* -n don't overwrite existing data */
+
+int
+util_load(WT_SESSION *session, int argc, char *argv[])
+{
+ int ch;
+ const char *filename;
+ uint32_t flags;
+
+ flags = 0;
+
+ filename = "<stdin>";
+ while ((ch = __wt_getopt(progname, argc, argv, "af:jnr:")) != EOF)
+ switch (ch) {
+ case 'a': /* append (ignore record number keys) */
+ append = 1;
+ break;
+ case 'f': /* input file */
+ if (freopen(__wt_optarg, "r", stdin) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ else
+ filename = __wt_optarg;
+ break;
+ case 'j': /* input is JSON */
+ json = 1;
+ break;
+ case 'n': /* don't overwrite existing data */
+ no_overwrite = 1;
+ break;
+ case 'r': /* rename */
+ cmdname = __wt_optarg;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+	/* -a and -n are mutually exclusive. */
+ if (append == 1 && no_overwrite == 1)
+ return (util_err(EINVAL,
+ "the -a (append) and -n (no-overwrite) flags are mutually "
+ "exclusive"));
+
+ /* The remaining arguments are configuration uri/string pairs. */
+ if (argc != 0) {
+ if (argc % 2 != 0)
+ return (usage());
+ cmdconfig = argv;
+ }
+
+ if (json) {
+ if (append)
+ flags |= LOAD_JSON_APPEND;
+ if (no_overwrite)
+ flags |= LOAD_JSON_NO_OVERWRITE;
+ return (util_load_json(session, filename, flags));
+ } else
+ return (load_dump(session));
+}
+
+/*
+ * load_dump --
+ * Load from the WiredTiger dump format.
+ */
+static int
+load_dump(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int hex, tret;
+ char **list, **tlist, *uri, config[64];
+
+ cursor = NULL;
+ list = NULL; /* -Wuninitialized */
+ hex = 0; /* -Wuninitialized */
+ uri = NULL;
+
+ /* Read the metadata file. */
+ if ((ret = config_read(&list, &hex)) != 0)
+ return (ret);
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(list)) != 0)
+ goto err;
+
+ /* Update the config based on any command-line configuration. */
+ if ((ret = config_update(session, list)) != 0)
+ goto err;
+
+ uri = list[0];
+ /* Create the items in the list. */
+ if ((ret = config_exec(session, list)) != 0)
+ goto err;
+
+ /* Open the insert cursor. */
+ (void)snprintf(config, sizeof(config),
+ "dump=%s%s%s",
+ hex ? "hex" : "print",
+ append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+
+ /*
+ * Check the append flag (it only applies to objects where the primary
+ * key is a record number).
+ */
+ if (append && strcmp(cursor->key_format, "r") != 0) {
+ fprintf(stderr,
+ "%s: %s: -a option illegal unless the primary key is a "
+ "record number\n",
+ progname, uri);
+ ret = 1;
+ } else
+ ret = insert(cursor, uri);
+
+err: /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but I'd like to see the flush to disk and
+ * the close succeed, it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+
+ for (tlist = list; *tlist != NULL; ++tlist)
+ free(*tlist);
+ free(list);
+
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * config_exec --
+ * Create the tables/indices/colgroups implied by the list.
+ */
+int
+config_exec(WT_SESSION *session, char **list)
+{
+ WT_DECL_RET;
+
+ for (; *list != NULL; list += 2)
+ if ((ret = session->create(session, list[0], list[1])) != 0)
+ return (util_err(ret, "%s: session.create", list[0]));
+ return (0);
+}
+
+/*
+ * config_list_add --
+ * Add a value to the config list.
+ */
+int
+config_list_add(CONFIG_LIST *clp, char *val)
+{
+ if (clp->entry + 1 >= clp->max_entry)
+ if ((clp->list = realloc(clp->list, (size_t)
+ (clp->max_entry += 100) * sizeof(char *))) == NULL)
+			/*
+			 * The old list is leaked if realloc fails; the
+			 * error is fatal to the load anyway.
+			 */
+ return (util_err(errno, NULL));
+
+ clp->list[clp->entry++] = val;
+ clp->list[clp->entry] = NULL;
+ return (0);
+}
+
+/*
+ * config_list_free --
+ * Free the list and any of its entries.
+ */
+void
+config_list_free(CONFIG_LIST *clp)
+{
+ char **entry;
+
+ if (clp->list != NULL)
+ for (entry = &clp->list[0]; *entry != NULL; entry++)
+ free(*entry);
+ free(clp->list);
+ clp->list = NULL;
+}
+
+/*
+ * config_read --
+ * Read the config lines and do some basic validation.
+ */
+static int
+config_read(char ***listp, int *hexp)
+{
+ ULINE l;
+ WT_DECL_RET;
+ int entry, eof, max_entry;
+ const char *s;
+ char **list, **tlist;
+
+ list = NULL;
+ memset(&l, 0, sizeof(l));
+
+ /* Header line #1: "WiredTiger Dump" and a WiredTiger version. */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ s = "WiredTiger Dump ";
+ if (strncmp(l.mem, s, strlen(s)) != 0)
+ return (format());
+
+ /* Header line #2: "Format={hex,print}". */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ if (strcmp(l.mem, "Format=print") == 0)
+ *hexp = 0;
+ else if (strcmp(l.mem, "Format=hex") == 0)
+ *hexp = 1;
+ else
+ return (format());
+
+ /* Header line #3: "Header". */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ if (strcmp(l.mem, "Header") != 0)
+ return (format());
+
+ /* Now, read in lines until we get to the end of the headers. */
+ for (entry = max_entry = 0, list = NULL;; ++entry) {
+ if ((ret = util_read_line(&l, 0, &eof)) != 0)
+ goto err;
+ if (strcmp(l.mem, "Data") == 0)
+ break;
+
+ /*
+ * Grow the array of header lines as necessary -- we need an
+ * extra slot for NULL termination.
+ */
+ if (entry + 1 >= max_entry) {
+ if ((tlist = realloc(list, (size_t)
+ (max_entry += 100) * sizeof(char *))) == NULL) {
+ ret = util_err(errno, NULL);
+
+				/*
+				 * Realloc doesn't free its argument on
+				 * failure; the err label frees whatever
+				 * entries the original list holds.
+				 */
+ goto err;
+ }
+ list = tlist;
+ }
+ if ((list[entry] = strdup(l.mem)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ list[entry + 1] = NULL;
+ }
+
+ /* Headers are required, and they're supposed to be in pairs. */
+ if (list == NULL || entry % 2 != 0) {
+ ret = format();
+ goto err;
+ }
+ *listp = list;
+ return (0);
+
+err: if (list != NULL) {
+ for (tlist = list; *tlist != NULL; ++tlist)
+ free(*tlist);
+ free(list);
+ }
+ return (ret);
+}
+
+/*
+ * config_reorder --
+ * For table dumps, reorder the list so tables are first.
+ * For other dumps, make any needed checks.
+ */
+int
+config_reorder(char **list)
+{
+ char **entry, *p;
+
+ /*
+ * Search for a table name -- if we find one, then it's table dump,
+ * otherwise, it's a single file dump.
+ */
+ for (entry = list; *entry != NULL; ++entry)
+ if (WT_PREFIX_MATCH(*entry, "table:"))
+ break;
+ if (*entry == NULL) {
+ /*
+ * Single file dumps can only have two lines, the file name and
+ * the configuration information.
+ */
+ if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) ||
+		    (!WT_PREFIX_MATCH(list[0], "file:") &&
+		    !WT_PREFIX_MATCH(list[0], "lsm:")))
+ return (format());
+
+ entry = list;
+ }
+
+ /*
+ * Make sure the table key/value pair comes first, then we can just
+ * run through the array in order. (We already checked that we had
+ * a multiple of 2 entries, so this is safe.)
+ */
+ if (entry != list) {
+ p = list[0]; list[0] = entry[0]; entry[0] = p;
+ p = list[1]; list[1] = entry[1]; entry[1] = p;
+ }
+ return (0);
+}
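+
+/*
+ * For example, a dump of a (hypothetical) table with one column group lists
+ * its pairs in metadata order:
+ *
+ *	colgroup:example:main <config> table:example <config>
+ *
+ * and the swap above moves the table pair to the front, so config_exec can
+ * create the table before its column group.
+ */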
+
+/*
+ * config_update --
+ * Reconcile and update the command line configuration against the
+ * config we found.
+ */
+int
+config_update(WT_SESSION *session, char **list)
+{
+ int found;
+ const char *cfg[] = { NULL, NULL, NULL };
+ char **configp, **listp;
+ const char **rm;
+ static const char *rmnames[] = {
+ "filename", "id", "checkpoint", "checkpoint_lsn",
+ "version", "source", NULL };
+
+ /*
+ * If the object has been renamed, replace all of the column group,
+ * index, file and table names with the new name.
+ */
+ if (cmdname != NULL) {
+ for (listp = list; *listp != NULL; listp += 2)
+ if (WT_PREFIX_MATCH(*listp, "colgroup:") ||
+ WT_PREFIX_MATCH(*listp, "file:") ||
+ WT_PREFIX_MATCH(*listp, "index:") ||
+ WT_PREFIX_MATCH(*listp, "table:"))
+ if (config_rename(listp, cmdname))
+ return (1);
+
+ /*
+ * If the object was renamed, and there are configuration pairs,
+ * rename the configuration pairs as well, because we don't know
+ * if the user used the old or new names for the pair's URI.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (config_rename(configp, cmdname))
+ return (1);
+ }
+
+ /*
+ * Remove all "filename=", "source=" and other configurations
+ * that foil loading from the values. New filenames are chosen
+ * as part of table load.
+ */
+ for (listp = list; *listp != NULL; listp += 2)
+ for (rm = rmnames; *rm != NULL; rm++)
+ if (strstr(listp[1], *rm) != NULL)
+ config_remove(listp[1], *rm);
+
+ /*
+ * It's possible to update everything except the key/value formats.
+ * If there were command-line configuration pairs, walk the list of
+ * command-line configuration strings, and check.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (strstr(configp[1], "key_format=") ||
+ strstr(configp[1], "value_format="))
+ return (util_err(0,
+ "the command line configuration string may not "
+ "modify the object's key or value format"));
+
+ /*
+ * If there were command-line configuration pairs, walk the list of
+ * command-line URIs and find a matching dump URI. For each match,
+ * rewrite the dump configuration as described by the command-line
+ * configuration. It is an error if a command-line URI doesn't find
+ * a single, exact match, that's likely a mistake.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2) {
+ found = 0;
+ for (listp = list; *listp != NULL; listp += 2) {
+ if (strncmp(*configp, listp[0], strlen(*configp)) != 0)
+ continue;
+ /*
+ * !!!
+ * We support JSON configuration strings, which leads to
+ * configuration strings with brackets. Unfortunately,
+ * that implies we can't simply append new configuration
+ * strings to existing ones. We call an unpublished
+ * WiredTiger API to do the concatenation: if anyone
+ * else ever needs it we can make it public, but I think
+ * that's unlikely. We're also playing fast and loose
+ * with types, but it should work.
+ */
+ cfg[0] = listp[1];
+ cfg[1] = configp[1];
+ if (__wt_config_concat(
+ (WT_SESSION_IMPL *)session, cfg,
+ (const char **)&listp[1]) != 0)
+ return (1);
+ ++found;
+ }
+ switch (found) {
+ case 0:
+ return (util_err(0,
+ "the command line object name %s was not matched "
+ "by any loaded object name", *configp));
+ case 1:
+ break;
+ default:
+ return (util_err(0,
+ "the command line object name %s was not unique, "
+ "matching more than a single loaded object name",
+ *configp));
+ }
+ }
+
+ /* Leak the memory, I don't care. */
+ return (0);
+}
+
+/*
+ * config_rename --
+ * Update the URI name.
+ */
+static int
+config_rename(char **urip, const char *name)
+{
+ size_t len;
+ char *buf, *p;
+
+ /* Allocate room. */
+ len = strlen(*urip) + strlen(name) + 10;
+ if ((buf = malloc(len)) == NULL)
+ return (util_err(errno, NULL));
+
+ /*
+	 * Find the separating colon characters, but note the trailing one
+	 * may not be there.
+ */
+ if ((p = strchr(*urip, ':')) == NULL) {
+ free(buf);
+ return (format());
+ }
+ *p = '\0';
+ p = strchr(p + 1, ':');
+ snprintf(buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p);
+ *urip = buf;
+
+ return (0);
+}
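+
+/*
+ * The rename keeps everything from the second colon on, so with the
+ * (hypothetical) new name "newtab":
+ *
+ *	table:oldtab          becomes  table:newtab
+ *	colgroup:oldtab:main  becomes  colgroup:newtab:main
+ */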
+
+/*
+ * config_remove --
+ * Remove a single config key and its value.
+ */
+static void
+config_remove(char *config, const char *ckey)
+{
+ int parens, quoted;
+ char *begin, match[100], *next, *p;
+
+ snprintf(match, sizeof(match), "%s=", ckey);
+ if ((begin = strstr(config, match)) != NULL) {
+ parens = 0;
+ quoted = 0;
+ next = NULL;
+ for (p = begin + strlen(match); !next && *p; p++)
+ switch (*p) {
+ case '(':
+ if (!quoted)
+ parens++;
+ break;
+ case ')':
+ if (!quoted)
+ parens--;
+ break;
+ case '"':
+ quoted = !quoted;
+ break;
+ case ',':
+ if (!quoted && parens == 0)
+ next = p + 1;
+ break;
+ }
+ if (next)
+ memmove(begin, next, strlen(next) + 1);
+ else
+ *begin = '\0';
+ }
+}
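+
+/*
+ * The scan above honors parentheses and quoting when looking for the comma
+ * that ends a value: removing (hypothetically) "filename" from:
+ *
+ *	filename=ex.wt,key_format=S
+ *
+ * leaves "key_format=S", while a parenthesized value such as
+ * "checkpoint=(name=(...))" is consumed as a unit.
+ */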
+
+/*
+ * format --
+ * The input doesn't match the dump format.
+ */
+static int
+format(void)
+{
+ return (util_err(0, "input does not match WiredTiger dump format"));
+}
+
+/*
+ * insert --
+ * Read and insert data.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name)
+{
+ ULINE key, value;
+ WT_DECL_RET;
+ uint64_t insert_count;
+ int eof;
+
+ memset(&key, 0, sizeof(key));
+ memset(&value, 0, sizeof(value));
+
+ /* Read key/value pairs and insert them into the file. */
+ for (insert_count = 0;;) {
+ /*
+ * Three modes: in row-store, we always read a key and use it,
+ * in column-store, we might read it (a dump), we might read
+ * and ignore it (a dump with "append" set), or not read it at
+ * all (flat-text load).
+ */
+ if (util_read_line(&key, 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ if (!append)
+ cursor->set_key(cursor, key.mem);
+
+ if (util_read_line(&value, 0, &eof))
+ return (1);
+ cursor->set_value(cursor, value.mem);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_err(ret, "%s: cursor.insert", name));
+
+ /* Report on progress every 100 inserts. */
+ if (verbose && ++insert_count % 100 == 0) {
+ printf("\r\t%s: %" PRIu64, name, insert_count);
+ fflush(stdout);
+ }
+ }
+
+ if (verbose)
+ printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+	    "load [-ajn] [-f input-file] [-r name] [object configuration ...]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.h b/src/third_party/wiredtiger/src/utilities/util_load.h
new file mode 100644
index 00000000000..7bca677e178
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * A list of configuration strings.
+ */
+typedef struct {
+ char **list; /* array of alternating (uri, config) values */
+ int entry; /* next entry available in list */
+ int max_entry; /* how many allocated in list */
+} CONFIG_LIST;
+
+int config_exec(WT_SESSION *, char **);
+int config_list_add(CONFIG_LIST *, char *);
+void config_list_free(CONFIG_LIST *);
+int config_reorder(char **);
+int config_update(WT_SESSION *, char **);
+
+/* Flags for util_load_json */
+#define LOAD_JSON_APPEND 0x0001 /* append (ignore record number keys) */
+#define LOAD_JSON_NO_OVERWRITE 0x0002 /* don't overwrite existing data */
+
+int util_load_json(WT_SESSION *, const char *, uint32_t);
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
new file mode 100644
index 00000000000..fb61df9ab16
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -0,0 +1,573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+/*
+ * Encapsulates the input state for parsing JSON.
+ *
+ * At any time, we may be peeking at an unconsumed token; this is
+ * indicated by 'peeking' as true. toktype, tokstart, toklen will be
+ * set in this case.
+ *
+ * Generally we are collecting and processing tokens one by one.
+ * In JSON, tokens never span lines so this makes processing easy.
+ * The exception is that a JSON dump cursor takes the complete
+ * set of keys or values during cursor->set_key/set_value calls,
+ * which may contain many tokens and span lines. E.g.
+ * cursor->set_value("\"name\" : \"John\", \"phone\" : 2348765");
+ * The raw key/value string is collected in the kvraw field.
+ */
+typedef struct {
+ WT_SESSION *session; /* associated session */
+ ULINE line; /* current line */
+ const char *p; /* points to cur position in line.mem */
+ int ateof; /* current token is EOF */
+ int peeking; /* peeking at next token */
+ int toktype; /* next token, defined by __wt_json_token() */
+ const char *tokstart; /* next token start (points into line.mem) */
+ size_t toklen; /* next token length */
+ char *kvraw; /* multiple line raw content collected so far */
+ size_t kvrawstart; /* pos on cur line that JSON key/value starts */
+ const char *filename; /* filename for error reporting */
+ int linenum; /* line number for error reporting */
+} JSON_INPUT_STATE;
+
+static int json_column_group_index(WT_SESSION *, JSON_INPUT_STATE *,
+ CONFIG_LIST *, int);
+static int json_data(WT_SESSION *, JSON_INPUT_STATE *, CONFIG_LIST *, uint32_t);
+static int json_expect(WT_SESSION *, JSON_INPUT_STATE *, int);
+static int json_peek(WT_SESSION *, JSON_INPUT_STATE *);
+static int json_skip(WT_SESSION *, JSON_INPUT_STATE *, const char **);
+static int json_kvraw_append(JSON_INPUT_STATE *, const char *, size_t);
+static int json_strdup(JSON_INPUT_STATE *, char **);
+static int json_top_level(WT_SESSION *, JSON_INPUT_STATE *, uint32_t);
+
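+/* Match the current quoted string token against a literal, quotes excluded. */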
+#define JSON_STRING_MATCH(ins, match) \
+ ((ins)->toklen - 2 == strlen(match) && \
+ strncmp((ins)->tokstart + 1, (match), (ins)->toklen - 2) == 0)
+
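+/* Byte offset of the current position within the current input line. */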
+#define JSON_INPUT_POS(ins) \
+ ((size_t)((ins)->p - (const char *)(ins)->line.mem))
+
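+/* Consume a token of the wanted type, branching to the local err label. */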
+#define JSON_EXPECT(session, ins, tok) do { \
+ if (json_expect(session, ins, tok)) \
+ goto err; \
+} while (0)
+
+/*
+ * json_column_group_index --
+ * Parse a column group or index entry from JSON input.
+ */
+static int
+json_column_group_index(WT_SESSION *session, JSON_INPUT_STATE *ins,
+ CONFIG_LIST *clp, int idx)
+{
+ WT_DECL_RET;
+ char *config, *p, *uri;
+ int isconfig;
+
+ uri = NULL;
+ config = NULL;
+
+ while (json_peek(session, ins) == '{') {
+ JSON_EXPECT(session, ins, '{');
+ JSON_EXPECT(session, ins, 's');
+ isconfig = JSON_STRING_MATCH(ins, "config");
+ if (!isconfig && !JSON_STRING_MATCH(ins, "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+
+ isconfig = !isconfig;
+ JSON_EXPECT(session, ins, ',');
+ JSON_EXPECT(session, ins, 's');
+ if (!JSON_STRING_MATCH(ins, isconfig ? "config" : "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+ JSON_EXPECT(session, ins, '}');
+ if ((idx && strncmp(uri, "index:", 6) != 0) ||
+ (!idx && strncmp(uri, "colgroup:", 9) != 0)) {
+ ret = util_err(EINVAL,
+ "%s: misplaced colgroup or index", uri);
+ goto err;
+ }
+ if ((ret = config_list_add(clp, uri)) != 0 ||
+ (ret = config_list_add(clp, config)) != 0)
+ goto err;
+
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ return (ret);
+}
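+
+/*
+ * Each entry parsed above has the shape (illustrative values):
+ *	{ "uri" : "colgroup:main:cg1", "config" : "columns=(v1,v2)" }
+ * with the "uri" and "config" members accepted in either order.
+ */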
+
+/*
+ * json_kvraw_append --
+ * Append to the kvraw buffer, which is used to collect all the
+ * raw key/value pairs from JSON input.
+ */
+static int
+json_kvraw_append(JSON_INPUT_STATE *ins, const char *str, size_t len)
+{
+ char *tmp;
+ size_t needsize;
+
+ if (len > 0) {
+ needsize = strlen(ins->kvraw) + len + 2;
+ if ((tmp = malloc(needsize)) == NULL)
+ return (util_err(errno, NULL));
+ snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str);
+ free(ins->kvraw);
+ ins->kvraw = tmp;
+ }
+ return (0);
+}
+
+/*
+ * json_strdup --
+ * Return a string, with no escapes or other JSON-isms, from the
+ * JSON string at the current input position.
+ */
+static int
+json_strdup(JSON_INPUT_STATE *ins, char **resultp)
+{
+ WT_DECL_RET;
+ char *result, *resultcpy;
+ const char *src;
+ ssize_t resultlen;
+ size_t srclen;
+
+ result = NULL;
+ src = ins->tokstart + 1; /* strip the quotes from the token */
+ srclen = ins->toklen - 2;
+ if ((resultlen = __wt_json_strlen(src, srclen)) < 0) {
+ ret = util_err(EINVAL, "Invalid config string");
+ goto err;
+ }
+ resultlen += 1;
+ if ((result = (char *)malloc((size_t)resultlen)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ *resultp = result;
+ resultcpy = result;
+ if ((ret = __wt_json_strncpy(
+     &resultcpy, (size_t)resultlen, src, srclen)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ if (result != NULL)
+ free(result);
+ *resultp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * json_data --
+ * Parse the data portion of the JSON input, and insert all
+ * values.
+ */
+static int
+json_data(WT_SESSION *session, JSON_INPUT_STATE *ins, CONFIG_LIST *clp,
+ uint32_t flags)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ char config[64], *endp, *uri;
+ const char *keyformat;
+ int isrec, nfield, nkeys, toktype, tret;
+ size_t keystrlen;
+ ssize_t gotnolen;
+ uint64_t gotno, recno;
+
+ cursor = NULL;
+ uri = NULL;
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(clp->list)) != 0)
+ goto err;
+
+ /* Update config based on command-line configuration. */
+ if ((ret = config_update(session, clp->list)) != 0)
+ goto err;
+
+ /* Create the items collected. */
+ if ((ret = config_exec(session, clp->list)) != 0)
+ goto err;
+
+ uri = clp->list[0];
+ (void)snprintf(config, sizeof(config),
+ "dump=json%s%s",
+ LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "",
+ LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+ keyformat = cursor->key_format;
+ isrec = (strcmp(keyformat, "r") == 0);
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
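+ /*
+ * E.g., a key format of "Si" yields nkeys == 2; digits only
+ * qualify the following type and add no columns (illustrative).
+ */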
+
+ recno = 0;
+ while (json_peek(session, ins) == '{') {
+ nfield = 0;
+ JSON_EXPECT(session, ins, '{');
+ if (ins->kvraw == NULL) {
+ if ((ins->kvraw = (char *)malloc(1)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ }
+ ins->kvraw[0] = '\0';
+ ins->kvrawstart = JSON_INPUT_POS(ins);
+ keystrlen = 0;
+ while (json_peek(session, ins) == 's') {
+ JSON_EXPECT(session, ins, 's');
+ JSON_EXPECT(session, ins, ':');
+ toktype = json_peek(session, ins);
+ JSON_EXPECT(session, ins, toktype);
+ if (isrec && nfield == 0) {
+ /* Verify the dump has recnos in order. */
+ recno++;
+ gotno = __wt_strtouq(ins->tokstart, &endp, 0);
+ gotnolen = (endp - ins->tokstart);
+ if (recno != gotno ||
+ ins->toklen != (size_t)gotnolen) {
+ ret = util_err(0,
+ "%s: recno out of order", uri);
+ goto err;
+ }
+ }
+ if (++nfield == nkeys) {
+ size_t curpos = JSON_INPUT_POS(ins);
+ if ((ret = json_kvraw_append(ins,
+ (char *)ins->line.mem + ins->kvrawstart,
+ curpos - ins->kvrawstart)) != 0)
+ goto err;
+ ins->kvrawstart = curpos;
+ keystrlen = strlen(ins->kvraw);
+ }
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != 's')
+ goto err;
+ }
+ if (json_kvraw_append(ins, ins->line.mem, JSON_INPUT_POS(ins)))
+ goto err;
+
+ ins->kvraw[keystrlen] = '\0';
+ if (!LF_ISSET(LOAD_JSON_APPEND))
+ cursor->set_key(cursor, ins->kvraw);
+ /* Skip over the inserted space and comma. */
+ cursor->set_value(cursor, &ins->kvraw[keystrlen + 2]);
+ if ((ret = cursor->insert(cursor)) != 0) {
+ ret = util_err(ret, "%s: cursor.insert", uri);
+ goto err;
+ }
+
+ JSON_EXPECT(session, ins, '}');
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but we'd like to see the flush to disk and
+ * the close succeed; it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+ return (ret);
+}
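+
+/*
+ * The "data" array consumed above is shaped like (illustrative,
+ * abbreviated):
+ *	"data" : [
+ *		{
+ *			"key0" : "abc",
+ *			"value0" : "xyz"
+ *		},
+ *		...
+ *	]
+ */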
+
+/*
+ * json_top_level --
+ * Parse the top level JSON input.
+ */
+static int
+json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
+{
+ CONFIG_LIST cl;
+ WT_DECL_RET;
+ char *config, *tableuri;
+ int toktype;
+ static const char *json_markers[] = {
+ "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL };
+
+ memset(&cl, 0, sizeof(cl));
+ tableuri = NULL;
+ JSON_EXPECT(session, ins, '{');
+ while (json_peek(session, ins) == 's') {
+ JSON_EXPECT(session, ins, 's');
+ /* Check the allocation; we exit via err on failure. */
+ if ((tableuri = realloc(tableuri, ins->toklen)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ snprintf(tableuri, ins->toklen, "%.*s",
+ (int)(ins->toklen - 2), ins->tokstart + 1);
+ JSON_EXPECT(session, ins, ':');
+
+ /*
+ * Allow any ordering of 'config', 'colgroups' and
+ * 'indices' before 'data', which must appear last.
+ * The non-'data' items build up a list of entries
+ * that are created in our session before the data
+ * is inserted.
+ */
+ for (;;) {
+ if (json_skip(session, ins, json_markers) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, 's');
+ if (JSON_STRING_MATCH(ins, "config")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+ if ((ret = json_strdup(ins, &config)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if ((ret = config_list_add(&cl, tableuri)) != 0)
+ goto err;
+ if ((ret = config_list_add(&cl, config)) != 0)
+ goto err;
+ tableuri = NULL;
+ } else if (JSON_STRING_MATCH(ins, "colgroups")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_column_group_index(
+ session, ins, &cl, 0)) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, ']');
+ } else if (JSON_STRING_MATCH(ins, "indices")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_column_group_index(
+ session, ins, &cl, 1)) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, ']');
+ } else if (JSON_STRING_MATCH(ins, "data")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_data(session, ins, &cl,
+ flags)) != 0)
+ goto err;
+ config_list_free(&cl);
+ break;
+ } else
+ goto err;
+ }
+
+ while ((toktype = json_peek(session, ins)) == '}' ||
+ toktype == ']')
+ JSON_EXPECT(session, ins, toktype);
+ if (toktype == 0) /* Check EOF. */
+ break;
+ if (toktype == ',') {
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != 's')
+ goto err;
+ continue;
+ }
+ }
+ JSON_EXPECT(session, ins, 0);
+
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ config_list_free(&cl);
+ if (tableuri != NULL)
+ free(tableuri);
+ return (ret);
+}
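+
+/*
+ * The top-level input parsed above is shaped like (illustrative,
+ * abbreviated; see the output of 'wt dump -j'):
+ *	{ "table:mytable" : {
+ *		"config" : "key_format=S,...",
+ *		"colgroups" : [ ... ],
+ *		"indices" : [ ... ],
+ *		"data" : [ ... ]
+ *	} }
+ */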
+
+/*
+ * json_peek --
+ * Set the input state to the next available token in the input
+ * and return its token type, a code defined by __wt_json_token().
+ */
+static int
+json_peek(WT_SESSION *session, JSON_INPUT_STATE *ins)
+{
+ WT_DECL_RET;
+
+ if (!ins->peeking) {
+ while (!ins->ateof) {
+ while (isspace(*ins->p))
+ ins->p++;
+ if (*ins->p)
+ break;
+ if (ins->kvraw != NULL) {
+ if (json_kvraw_append(ins,
+ (char *)ins->line.mem + ins->kvrawstart,
+ strlen(ins->line.mem) - ins->kvrawstart)) {
+ ret = -1;
+ goto err;
+ }
+ ins->kvrawstart = 0;
+ }
+ if (util_read_line(&ins->line, 1,
+ &ins->ateof)) {
+ ins->toktype = -1;
+ ret = -1;
+ goto err;
+ }
+ ins->linenum++;
+ ins->p = (const char *)ins->line.mem;
+ }
+ if (ins->ateof)
+ ins->toktype = 0;
+ else if (__wt_json_token(session, ins->p,
+ &ins->toktype, &ins->tokstart,
+ &ins->toklen) != 0)
+ ins->toktype = -1;
+ ins->peeking = 1;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = -1;
+ }
+ return (ret == 0 ? ins->toktype : -1);
+}
+
+/*
+ * json_expect --
+ * Ensure that the type of the next token in the input matches
+ * the wanted value, and advance past it. The values of the
+ * input state will be set so specific string or integer values
+ * can be pulled out after this call.
+ */
+static int
+json_expect(WT_SESSION *session, JSON_INPUT_STATE *ins, int wanttok)
+{
+ if (json_peek(session, ins) < 0)
+ return (1);
+ ins->p += ins->toklen;
+ ins->peeking = 0;
+ if (ins->toktype != wanttok) {
+ fprintf(stderr,
+ "%s: %d: %" WT_SIZET_FMT ": expected %s, got %s\n",
+ ins->filename,
+ ins->linenum,
+ JSON_INPUT_POS(ins) + 1,
+ __wt_json_tokname(wanttok),
+ __wt_json_tokname(ins->toktype));
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * json_skip --
+ * Skip over JSON input until one of the specified strings appears.
+ * The tokenizer will be set to point to the beginning of
+ * that string.
+ */
+static int
+json_skip(WT_SESSION *session, JSON_INPUT_STATE *ins, const char **matches)
+{
+ const char *hit;
+ const char **match;
+
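+ /* Skipping input is incompatible with collecting raw key/value text. */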
+ if (ins->kvraw != NULL)
+ return (1);
+
+ hit = NULL;
+ while (!ins->ateof) {
+ for (match = matches; *match != NULL; match++)
+ if ((hit = strstr(ins->p, *match)) != NULL)
+ goto out;
+ if (util_read_line(&ins->line, 1, &ins->ateof)) {
+ ins->toktype = -1;
+ return (1);
+ }
+ ins->linenum++;
+ ins->p = (const char *)ins->line.mem;
+ }
+out:
+ if (hit == NULL)
+ return (1);
+
+ /* Set to this token. */
+ ins->p = hit;
+ ins->peeking = 0;
+ ins->toktype = 0;
+ (void)json_peek(session, ins);
+ return (0);
+}
+
+/*
+ * util_load_json --
+ * Load from the JSON format produced by 'wt dump -j'.
+ */
+int
+util_load_json(WT_SESSION *session, const char *filename, uint32_t flags)
+{
+ JSON_INPUT_STATE instate;
+ WT_DECL_RET;
+
+ memset(&instate, 0, sizeof(instate));
+ instate.session = session;
+ if (util_read_line(&instate.line, 0, &instate.ateof))
+ return (1);
+ instate.p = (const char *)instate.line.mem;
+ instate.linenum = 1;
+ instate.filename = filename;
+
+ if ((ret = json_top_level(session, &instate, flags)) != 0)
+ goto err;
+
+err: if (instate.line.mem != NULL)
+ free(instate.line.mem);
+ free(instate.kvraw);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
new file mode 100644
index 00000000000..27c4c23b50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int insert(WT_CURSOR *, const char *, int);
+static int text(WT_SESSION *, const char *);
+static int usage(void);
+
+int
+util_loadtext(WT_SESSION *session, int argc, char *argv[])
+{
+ int ch;
+ const char *uri;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF)
+ switch (ch) {
+ case 'f': /* input file */
+ if (freopen(__wt_optarg, "r", stdin) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ return (text(session, uri));
+}
+
+/*
+ * text --
+ * Load flat-text into a file/table.
+ */
+static int
+text(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int readkey, tret;
+
+ /*
+ * Open the cursor, configured to append new records (in the case of
+ * column-store objects), or to overwrite existing strings (in the
+ * case of row-store objects). The two flags are mutually exclusive,
+ * but the library doesn't currently care that we set both of them.
+ */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, "append,overwrite", &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * We're about to load strings, make sure the formats match.
+ *
+ * Row-store tables have key/value pairs, column-store tables only have
+ * values.
+ */
+ if (strcmp(cursor->value_format, "S") != 0 ||
+ (strcmp(cursor->key_format, "S") != 0 &&
+ strcmp(cursor->key_format, "r") != 0))
+ return (util_err(EINVAL,
+ "the loadtext command can only load objects configured "
+ "for record number or string keys, and string values"));
+ readkey = strcmp(cursor->key_format, "r") == 0 ? 0 : 1;
+
+ /* Insert the records */
+ ret = insert(cursor, uri, readkey);
+
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but we'd like to see the flush to disk and
+ * the close succeed; it's better to fail early when loading files.
+ */
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * insert --
+ * Read and insert data.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name, int readkey)
+{
+ ULINE key, value;
+ WT_DECL_RET;
+ uint64_t insert_count;
+ int eof;
+
+ memset(&key, 0, sizeof(key));
+ memset(&value, 0, sizeof(value));
+
+ /* Read key/value pairs and insert them into the file. */
+ for (insert_count = 0;;) {
+ /*
+ * Three modes: in row-store, we always read a key and use it,
+ * in column-store, we might read it (a dump), we might read
+ * and ignore it (a dump with "append" set), or not read it at
+ * all (flat-text load).
+ */
+ if (readkey) {
+ if (util_read_line(&key, 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_key(cursor, key.mem);
+ }
+ if (util_read_line(&value, readkey ? 0 : 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_value(cursor, value.mem);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_err(ret, "%s: cursor.insert", name));
+
+ /* Report on progress every 100 inserts. */
+ if (verbose && ++insert_count % 100 == 0) {
+ printf("\r\t%s: %" PRIu64, name, insert_count);
+ fflush(stdout);
+ }
+ }
+
+ if (verbose)
+ printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "loadtext [-f input-file] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
new file mode 100644
index 00000000000..04ab59f1ca9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -0,0 +1,262 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+const char *home = "."; /* Home directory */
+const char *progname; /* Program name */
+ /* Global arguments */
+const char *usage_prefix = "[-Vv] [-C config] [-h home]";
+int verbose; /* Verbose flag */
+
+static const char *command; /* Command name */
+
+static int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ WT_CONNECTION *conn;
+ WT_DECL_RET;
+ WT_SESSION *session;
+ size_t len;
+ int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]);
+ char *p;
+ const char *cmd_config, *config;
+
+ conn = NULL;
+ p = NULL;
+
+ /* Get the program name. */
+ if ((progname = strrchr(argv[0], '/')) == NULL)
+ progname = argv[0];
+ else
+ ++progname;
+ command = "";
+
+ /* Check the version against the library build. */
+ (void)wiredtiger_version(&major_v, &minor_v, NULL);
+ if (major_v != WIREDTIGER_VERSION_MAJOR ||
+ minor_v != WIREDTIGER_VERSION_MINOR) {
+ fprintf(stderr,
+ "%s: program build version %d.%d does not match "
+ "library build version %d.%d\n",
+ progname,
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR,
+ major_v, minor_v);
+ return (EXIT_FAILURE);
+ }
+
+ /* Check for standard options. */
+ cmd_config = config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "C:h:Vv")) != EOF)
+ switch (ch) {
+ case 'C': /* wiredtiger_open config */
+ cmd_config = __wt_optarg;
+ break;
+ case 'h': /* home directory */
+ home = __wt_optarg;
+ break;
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case 'v': /* verbose */
+ verbose = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The next argument is the command name. */
+ if (argc < 1)
+ return (usage());
+ command = argv[0];
+
+ /* Reset getopt. */
+ __wt_optreset = __wt_optind = 1;
+
+ func = NULL;
+ switch (command[0]) {
+ case 'b':
+ if (strcmp(command, "backup") == 0)
+ func = util_backup;
+ break;
+ case 'c':
+ if (strcmp(command, "compact") == 0)
+ func = util_compact;
+ else if (strcmp(command, "copyright") == 0) {
+ util_copyright();
+ return (EXIT_SUCCESS);
+ } else if (strcmp(command, "create") == 0) {
+ func = util_create;
+ config = "create";
+ }
+ break;
+ case 'd':
+ if (strcmp(command, "drop") == 0)
+ func = util_drop;
+ else if (strcmp(command, "dump") == 0)
+ func = util_dump;
+ break;
+ case 'l':
+ if (strcmp(command, "list") == 0)
+ func = util_list;
+ else if (strcmp(command, "load") == 0) {
+ func = util_load;
+ config = "create";
+ } else if (strcmp(command, "loadtext") == 0) {
+ func = util_loadtext;
+ config = "create";
+ }
+ break;
+ case 'p':
+ if (strcmp(command, "printlog") == 0)
+ func = util_printlog;
+ break;
+ case 'r':
+ if (strcmp(command, "read") == 0)
+ func = util_read;
+ else if (strcmp(command, "rename") == 0)
+ func = util_rename;
+ break;
+ case 's':
+ if (strcmp(command, "salvage") == 0)
+ func = util_salvage;
+ else if (strcmp(command, "stat") == 0) {
+ func = util_stat;
+ config = "statistics=(all)";
+ }
+ break;
+ case 'u':
+ if (strcmp(command, "upgrade") == 0)
+ func = util_upgrade;
+ break;
+ case 'v':
+ if (strcmp(command, "verify") == 0)
+ func = util_verify;
+ break;
+ case 'w':
+ if (strcmp(command, "write") == 0)
+ func = util_write;
+ break;
+ default:
+ break;
+ }
+ if (func == NULL)
+ return (usage());
+
+ /* Build the configuration string, as necessary. */
+ if (config == NULL)
+ config = cmd_config;
+ else if (cmd_config != NULL) {
+ len = strlen(cmd_config) + strlen(config) + 10;
+ if ((p = malloc(len)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ (void)snprintf(p, len, "%s,%s", config, cmd_config);
+ config = p;
+ }
+
+ /* Open the database and a session. */
+ if ((ret = wiredtiger_open(home,
+ verbose ? verbose_handler : NULL, config, &conn)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+
+ /* Call the function. */
+ ret = func(session, argc, argv);
+
+ /* Close the database. */
+
+err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
+ ret = tret;
+
+ if (p != NULL)
+ free(p);
+
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+static int
+usage(void)
+{
+ fprintf(stderr,
+ "WiredTiger Data Engine (version %d.%d)\n",
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+ fprintf(stderr,
+ "global options:\n"
+ "\t" "-C\twiredtiger_open configuration\n"
+ "\t" "-h\tdatabase directory\n"
+ "\t" "-V\tdisplay library version and exit\n"
+ "\t" "-v\tverbose\n");
+ fprintf(stderr,
+ "commands:\n"
+ "\t" "backup\t database backup\n"
+ "\t" "compact\t compact an object\n"
+ "\t" "copyright copyright information\n"
+ "\t" "create\t create an object\n"
+ "\t" "drop\t drop an object\n"
+ "\t" "dump\t dump an object\n"
+ "\t" "list\t list database objects\n"
+ "\t" "load\t load an object\n"
+ "\t" "loadtext\t load an object from a text file\n"
+ "\t" "printlog display the database log\n"
+ "\t" "read\t read values from an object\n"
+ "\t" "rename\t rename an object\n"
+ "\t" "salvage\t salvage a file\n"
+ "\t" "stat\t display statistics for an object\n"
+ "\t" "upgrade\t upgrade an object\n"
+ "\t" "verify\t verify an object\n"
+ "\t" "write\t write values to an object\n");
+
+ return (EXIT_FAILURE);
+}
+
+/*
+ * util_name --
+ * Build a name.
+ */
+char *
+util_name(const char *s, const char *type)
+{
+ size_t len;
+ char *name;
+
+ if (WT_PREFIX_MATCH(s, "backup:") ||
+ WT_PREFIX_MATCH(s, "config:") ||
+ WT_PREFIX_MATCH(s, "statistics:")) {
+ fprintf(stderr,
+ "%s: %s: unsupported object type: %s\n",
+ progname, command, s);
+ return (NULL);
+ }
+
+ len = strlen(type) + strlen(s) + 2;
+ if ((name = calloc(len, 1)) == NULL) {
+ (void)util_err(errno, NULL);
+ return (NULL);
+ }
+
+ /*
+ * If the string has a URI prefix, use it verbatim, otherwise prepend
+ * the default type for the operation.
+ */
+ if (strchr(s, ':') != NULL)
+ strcpy(name, s);
+ else
+ snprintf(name, len, "%s:%s", type, s);
+ return (name);
+}
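+
+/*
+ * For example, util_name("foo", "table") returns "table:foo", while an
+ * argument that already includes a prefix, e.g. "lsm:foo", is copied
+ * verbatim.
+ */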
diff --git a/src/third_party/wiredtiger/src/utilities/util_misc.c b/src/third_party/wiredtiger/src/utilities/util_misc.c
new file mode 100644
index 00000000000..71e307a2e0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_misc.c
@@ -0,0 +1,146 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+int
+util_cerr(const char *uri, const char *op, int ret)
+{
+ return (util_err(ret, "%s: cursor.%s", uri, op));
+}
+
+/*
+ * util_err --
+ * Report an error.
+ */
+int
+util_err(int e, const char *fmt, ...)
+{
+ va_list ap;
+
+ (void)fprintf(stderr, "%s: ", progname);
+ if (fmt != NULL) {
+ va_start(ap, fmt);
+ (void)vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ if (e != 0)
+ (void)fprintf(stderr, ": ");
+ }
+ if (e != 0)
+ (void)fprintf(stderr, "%s", wiredtiger_strerror(e));
+ (void)fprintf(stderr, "\n");
+ return (1);
+}
+
+/*
+ * util_read_line --
+ * Read a line from stdin into a ULINE.
+ */
+int
+util_read_line(ULINE *l, int eof_expected, int *eofp)
+{
+ static uint64_t line = 0;
+ size_t len;
+ int ch;
+
+ ++line;
+ *eofp = 0;
+
+ if (l->memsize == 0) {
+ if ((l->mem = realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize = 1024;
+ }
+ for (len = 0;; ++len) {
+ if ((ch = getchar()) == EOF) {
+ if (len == 0) {
+ if (eof_expected) {
+ *eofp = 1;
+ return (0);
+ }
+ return (util_err(0,
+ "line %" PRIu64 ": unexpected end-of-file",
+ line));
+ }
+ return (util_err(0,
+ "line %" PRIu64 ": no newline terminator", line));
+ }
+ if (ch == '\n')
+ break;
+ /*
+ * We nul-terminate the string so it's easier to convert the
+ * line into a record number; that means we always need one
+ * extra byte at the end.
+ */
+ if (len >= l->memsize - 1) {
+ if ((l->mem =
+ realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize += 1024;
+ }
+ ((uint8_t *)l->mem)[len] = (uint8_t)ch;
+ }
+
+ ((uint8_t *)l->mem)[len] = '\0'; /* nul-terminate */
+
+ return (0);
+}
+
+/*
+ * util_str2recno --
+ * Convert a string to a record number.
+ */
+int
+util_str2recno(const char *p, uint64_t *recnop)
+{
+ uint64_t recno;
+ char *endptr;
+
+ /*
+ * strtouq accepts lots of things like hex values, signs and so on,
+ * none of which are OK with us. Checking that the string starts
+ * with a digit turns off most of that special processing.
+ */
+ if (!isdigit(p[0]))
+ goto format;
+
+ errno = 0;
+ recno = __wt_strtouq(p, &endptr, 0);
+ if (recno == ULLONG_MAX && errno == ERANGE)
+ return (util_err(ERANGE, "%s: invalid record number", p));
+
+ if (endptr[0] != '\0')
+format: return (util_err(EINVAL, "%s: invalid record number", p));
+
+ *recnop = recno;
+ return (0);
+}
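+
+/*
+ * For example, "42" converts to record number 42, while "-1" or " 42"
+ * fail the leading-digit check (examples illustrative).
+ */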
+
+/*
+ * util_flush --
+ * Flush the file successfully, or drop it.
+ */
+int
+util_flush(WT_SESSION *session, const char *uri)
+{
+ WT_DECL_RET;
+ size_t len;
+ char *buf;
+
+ len = strlen(uri) + 100;
+ if ((buf = malloc(len)) == NULL)
+ return (util_err(errno, NULL));
+
+ (void)snprintf(buf, len, "target=(\"%s\")", uri);
+ if ((ret = session->checkpoint(session, buf)) != 0) {
+ ret = util_err(ret, "%s: session.checkpoint", uri);
+ (void)session->drop(session, uri, NULL);
+ }
+
+ free(buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c
new file mode 100644
index 00000000000..7fc9bfa39b0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_printlog(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch, printable;
+
+ printable = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ switch (ch) {
+ case 'f': /* output file */
+ if (freopen(__wt_optarg, "w", stdout) == NULL) {
+ fprintf(stderr, "%s: %s: reopen: %s\n",
+ progname, __wt_optarg, strerror(errno));
+ return (1);
+ }
+ break;
+ case 'p':
+ printable = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* There should not be any more arguments. */
+ if (argc != 0)
+ return (usage());
+
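+ /* The -p (printable) option is parsed but not yet acted on. */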
+ WT_UNUSED(printable);
+ ret = __wt_txn_printlog(session, stdout);
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: printlog failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "printlog [-p] [-f output-file]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c
new file mode 100644
index 00000000000..d9a629e40e2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_read.c
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_read(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int ch, rkey, rval;
+ const char *uri, *value;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are a uri followed by a list of keys. */
+ if (argc < 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Open the object. */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, NULL, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple search only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0;
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /*
+ * Run through the keys, returning non-zero on error or if any requested
+ * key isn't found.
+ */
+ for (rval = 0; *++argv != NULL;) {
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+
+ switch (ret = cursor->search(cursor)) {
+ case 0:
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+ if (printf("%s\n", value) < 0)
+ return (util_err(EIO, NULL));
+ break;
+ case WT_NOTFOUND:
+ (void)util_err(0, "%s: not found", *argv);
+ rval = 1;
+ break;
+ default:
+ return (util_cerr(uri, "search", ret));
+ }
+ }
+
+ return (rval);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "read uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_rename.c b/src/third_party/wiredtiger/src/utilities/util_rename.c
new file mode 100644
index 00000000000..8c2aeb30c59
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_rename.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_rename(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri, *newuri;
+
+ uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are the object uri and new name. */
+ if (argc != 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+ newuri = argv[1];
+
+ if ((ret = session->rename(session, uri, newuri, NULL)) != 0) {
+ fprintf(stderr, "%s: rename %s to %s: %s\n",
+ progname, uri, newuri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "rename uri newuri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_salvage.c b/src/third_party/wiredtiger/src/utilities/util_salvage.c
new file mode 100644
index 00000000000..386365d8875
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_salvage.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_salvage(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ const char *force;
+ char *name;
+
+ force = NULL;
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF)
+ switch (ch) {
+ case 'F':
+ force = "force";
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the file name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "file")) == NULL)
+ return (1);
+
+ if ((ret = session->salvage(session, name, force)) != 0) {
+ fprintf(stderr, "%s: salvage(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "salvage [-F] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_stat.c b/src/third_party/wiredtiger/src/utilities/util_stat.c
new file mode 100644
index 00000000000..caac560e839
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_stat.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_stat(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ size_t urilen;
+ int all, ch, objname_free;
+ const char *pval, *desc;
+ char *objname, *uri;
+
+ all = objname_free = 0;
+ objname = uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "a")) != EOF)
+ switch (ch) {
+ case 'a':
+ all = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * If there are no arguments, the statistics cursor operates on the
+ * connection, otherwise, the optional remaining argument is a file
+ * or LSM name.
+ */
+ switch (argc) {
+ case 0:
+ objname = (char *)"";
+ break;
+ case 1:
+ if ((objname = util_name(*argv, "table")) == NULL)
+ return (1);
+ objname_free = 1;
+ break;
+ default:
+ return (usage());
+ }
+
+ urilen = strlen("statistics:") + strlen(objname) + 1;
+ if ((uri = calloc(urilen, 1)) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ goto err;
+ }
+ snprintf(uri, urilen, "statistics:%s", objname);
+
+ if ((ret = session->open_cursor(session, uri, NULL,
+ all ? "statistics=(all)" : NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* List the statistics. */
+ while (
+ (ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0)
+ if (printf("%s=%s\n", desc, pval) < 0) {
+ ret = errno;
+ break;
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: cursor get(%s) failed: %s\n",
+ progname, objname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if (objname_free)
+ free(objname);
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "stat -a [uri]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_upgrade.c b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
new file mode 100644
index 00000000000..b56caca2ccd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_upgrade(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *name;
+
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->upgrade(session, name, NULL)) != 0) {
+ fprintf(stderr, "%s: upgrade(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "upgrade uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_verbose.c b/src/third_party/wiredtiger/src/utilities/util_verbose.c
new file mode 100644
index 00000000000..12ff1c5463c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verbose.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+/*
+ * __handle_error_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_error implementation: send to stderr.
+ */
+static int
+__handle_error_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *errmsg)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(error);
+
+ return (fprintf(stderr, "%s\n", errmsg) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_message_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_message implementation: send to stdout.
+ */
+static int
+__handle_message_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (printf("%s\n", message) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_progress_verbose --
+ * Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
+ */
+static int
+__handle_progress_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (
+ printf("\r\t%s %-20" PRIu64, operation, progress) < 0 ? EIO : 0);
+}
+
+static WT_EVENT_HANDLER __event_handler_verbose = {
+ __handle_error_verbose,
+ __handle_message_verbose,
+ __handle_progress_verbose,
+ NULL /* Close handler. */
+};
+
+WT_EVENT_HANDLER *verbose_handler = &__event_handler_verbose;
diff --git a/src/third_party/wiredtiger/src/utilities/util_verify.c b/src/third_party/wiredtiger/src/utilities/util_verify.c
new file mode 100644
index 00000000000..6ae5fdeec26
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verify.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+#undef OPT_ARGS
+#undef USAGE_ARGS
+#ifdef HAVE_DIAGNOSTIC
+#define OPT_ARGS "d:"
+#define USAGE_ARGS \
+ "[-d dump_address | dump_blocks | dump_offsets=#,# | dump_pages] uri"
+#else
+#define OPT_ARGS ""
+#define USAGE_ARGS "uri"
+#endif
+
+int
+util_verify(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ size_t size;
+ int ch, dump_address, dump_blocks, dump_pages;
+ char *config, *dump_offsets, *name;
+
+ dump_address = dump_blocks = dump_pages = 0;
+ config = dump_offsets = name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, OPT_ARGS)) != EOF)
+ switch (ch) {
+ case 'd':
+ if (strcmp(__wt_optarg, "dump_address") == 0)
+ dump_address = 1;
+ else if (strcmp(__wt_optarg, "dump_blocks") == 0)
+ dump_blocks = 1;
+ else if (
+ WT_PREFIX_MATCH(__wt_optarg, "dump_offsets=")) {
+ if (dump_offsets != NULL) {
+ fprintf(stderr,
+ "%s: only a single 'dump_offsets' "
+ "argument supported\n", progname);
+ return (usage());
+ }
+ dump_offsets =
+ __wt_optarg + strlen("dump_offsets=");
+ } else if (strcmp(__wt_optarg, "dump_pages") == 0)
+ dump_pages = 1;
+ else
+ return (usage());
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Build the configuration string as necessary. */
+ if (dump_address || dump_blocks || dump_offsets != NULL || dump_pages) {
+ size =
+ strlen("dump_address,") +
+ strlen("dump_blocks,") +
+ strlen("dump_pages,") +
+ strlen("dump_offsets[],") +
+ (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20;
+ if ((config = malloc(size)) == NULL) {
+ (void)util_err(errno, NULL);
+ goto err;
+ }
+ snprintf(config, size,
+ "%s%s%s%s%s%s",
+ dump_address ? "dump_address," : "",
+ dump_blocks ? "dump_blocks," : "",
+ dump_offsets != NULL ? "dump_offsets=[" : "",
+ dump_offsets != NULL ? dump_offsets : "",
+ dump_offsets != NULL ? "]," : "",
+ dump_pages ? "dump_pages" : "");
+ }
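+ /*
+ * E.g., "-d dump_address -d dump_offsets=1024,2048" builds the
+ * string "dump_address,dump_offsets=[1024,2048]," (illustrative).
+ */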
+ if ((ret = session->verify(session, name, config)) != 0) {
+ fprintf(stderr, "%s: verify(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (config != NULL)
+ free(config);
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "verify %s\n",
+ progname, usage_prefix, USAGE_ARGS);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c
new file mode 100644
index 00000000000..067b951c0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_write.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_write(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int append, ch, overwrite, rkey;
+ const char *uri;
+ char config[100];
+
+ append = overwrite = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF)
+ switch (ch) {
+ case 'a':
+ append = 1;
+ break;
+ case 'o':
+ overwrite = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * The remaining arguments are a uri followed by a list of values (if
+ * append is set), or key/value pairs (if append is not set).
+ */
+ if (append) {
+ if (argc < 2)
+ return (usage());
+ } else
+ if (argc < 3 || ((argc - 1) % 2 != 0))
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Open the object. */
+ (void)snprintf(config, sizeof(config), "%s,%s",
+ append ? "append=true" : "", overwrite ? "overwrite=true" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple search only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0;
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /* Run through the values or key/value pairs. */
+ while (*++argv != NULL) {
+ if (!append) {
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+ ++argv;
+ }
+ cursor->set_value(cursor, *argv);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_cerr(uri, "insert", ret));
+ }
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "write [-ao] uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}