Merge branch 'develop'

author: Michael Cahill <michael.cahill@wiredtiger.com> 2012-06-04 17:05:36 +1000
committer: Michael Cahill <michael.cahill@wiredtiger.com> 2012-06-04 17:05:36 +1000
commit: 953b622700125746202c116452638e0181db9165 (patch)
tree: bc03d820966af3cc87a2181137df6374ab7faacc
parent: d20711f22bad7fcd401367977d883cd1bba9c017 (diff)
parent: 97cb94c0ccdf8e4554e4b233271f9ba219fee811 (diff)
download: mongo-953b622700125746202c116452638e0181db9165.tar.gz
275 files changed, 12351 insertions, 6413 deletions
diff --git a/.hgtags b/.hgtags
index 11e91331de9..4f2ad7b0a46 100644
--- a/.hgtags
+++ b/.hgtags
@@ -7,3 +7,4 @@
 a792d468bedd7b37be9cfff545582ae8ff54ff6f 1.1.3
 8054de4cb42988cd54b395cc834a6f8ab25298f7 1.1.4
 ef844093bec2ac38945fd04487dc3a051f4b9136 1.1.5
+12cf1d5546df25ac323f0400d4764e67ad5802e2 1.2.0
diff --git a/NEWS b/NEWS
index ba9e055a1a0..dddee97e890 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,31 @@
+WiredTiger release 1.2.0, 2012-06-04
+------------------------------------
+
+This release contains many bugfixes and improvements.  The major changes are:
+
+[#138]	Add support for transactions with coarse-grained durability.
+	Transactions provide atomicity guarantees and rollback, and uncommitted
+	changes are never written to disk.  There is no on-disk log, so
+	committed changes only become durable when the next checkpoint
+	completes.  Checkpoints are implemented by creating
+	transactionally-consistent snapshots within data files.
+
+[#156]	Fully support operations that make schema changes with multiple
+	sessions open concurrently.
+
+[#159]	Disable internal page key suffix compression if a custom collator is
+	configured.  This avoids issues with collators that require complete
+	keys.
+
+[#167]	Add support for durable snapshots within files.  While a snapshot is
+	active, the pages used by the snapshot will not be overwritten.  If a
+	file is accessed after a crash or application exit without calling
+	WT_CONNECTION::close, any changes made after the last snapshot will be
+	silently ignored.
+
+[#214, #216]
+	Fixes for forcing eviction with small caches.
+
 WiredTiger release 1.1.5, 2012-04-26
 ------------------------------------
 
diff --git a/README b/README
index 1c71d43b2d6..16153cf0ef6 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-WiredTiger 1.1.5: (April 26, 2012)
+WiredTiger 1.2.0: (June  4, 2012)
 
-This is version 1.1.5 of WiredTiger.
+This is version 1.2.0 of WiredTiger.
 
 WiredTiger documentation can be found at:
 
diff --git a/bench/tcbench/Makefile.am b/bench/tcbench/Makefile.am
index a48dafbcf7d..30c551d2564 100644
--- a/bench/tcbench/Makefile.am
+++ b/bench/tcbench/Makefile.am
@@ -3,3 +3,7 @@ LDADD = $(top_builddir)/libwiredtiger.la
 
 noinst_PROGRAMS = wttest
 wttest_SOURCES = wttest.c
+
+clean-local:
+	rm -rf WiredTiger* *.core casket.wt
+
diff --git a/bench/tcbench/README b/bench/tcbench/README
index 5e4b2307f3a..855135f590a 100644
--- a/bench/tcbench/README
+++ b/bench/tcbench/README
@@ -2,34 +2,65 @@
  Comparison of key/value stores
 ================================================================
 
-In this directory is a simple test of WiredTiger that inserts, then
-reads, some simple records in a single file.
+In the wiredtiger/bench/tcbench directory is a simple test of WiredTiger
+that inserts, then reads, some records in a single file.  It's designed
+to match the functionality of the Tokyo Cabinet "bros" tests, so results
+can be compared across various key/value stores.
 
-This code is designed to match the functionality of the Tokyo Cabinet
-"bros" tests, so that the results can be compared across various
-key/value stores.
+To run the test, first build WiredTiger in the top-level "build_posix"
+directory.  Building WiredTiger also builds the application "wttest" in
+the wiredtiger/build_posix/bench/tcbench directory.
 
-To run the test standalone, first build WiredTiger in the top-level
-"build_posix" directory, then do the following in this directory:
+To run wttest as a standalone read test:
 
-	$ make
-	$ ./wttest write file:casket.wt 1000000
+	$ cd wiredtiger/build_posix/bench/tcbench
 	$ ./wttest read file:casket.wt 1000000
 
+To run wttest as a standalone write test:
+
+	$ cd wiredtiger/build_posix/bench/tcbench
+	$ ./wttest write file:casket.wt 1000000
+
+================================================================
 To compare the results from WiredTiger with various other stores:
 
-	1. Configure and build Tokyo Cabinet
+1. Download, configure and build Tokyo Cabinet, using GNU make:
+
+	$ cd tokyocabinet
+	$ ./configure
+	$ gmake
+
+2. Apply the tokyocabinet-test.patch in the bros subdirectory of the
+   TokyoCabinet distribution; this patch updates the bros Makefile to
+   build wttest and to build a local version of Berkeley DB, updates the
+   "reporter" script to run wttest, as well as fixing bugs in the
+   reporter script.
+
+   NOTE: This patch was created for the TokyoCabinet 1.4.47 release
+   distribution; if you are building with a different distribution, the
+   patch may not work.
+
+	$ cd tokyocabinet/bros
+	$ patch < path-to-WiredTiger/bench/tcbench/tokyocabinet-test.patch
+
+3. Set the environment variable WT_HOME to the path of the top-level
+   WiredTiger directory and the BDB_HOME environment variable to the path
+   of the top-level Berkeley DB directory, and then build the test
+   programs:
+
+	$ env \
+	WT_HOME=path-to-WiredTiger \
+	BDB_HOME=path-to-BerkeleyDB \
+	gmake bdbtest tctest wttest
+
+4. Run "reporter":
 
-	2. Apply the tokyocabinet-test.patch in the bros subdirectory;
-	this patch updates the Makefile to build wttest and to build a
-	local version of Berkeley DB, updates the "reporter" script to
-	run wttest, as well as fixing bugs in the reporter script.
+	$ ./reporter
 
-	3. Build the test programs, using a command something like:
-		$ env \
-		WT_HOME=path-to-WiredTiger \
-		BDB_HOME=path-to-BerkeleyDB \
-		make bdbtest tctest wttest
+5. The output concludes with CSV output for the various runs; the columns
+    are as follows:
 
-	4. Run "reporter":
-		$ ./reporter
+    column 1: key/value store identifier,
+    column 2: elapsed time for writing 1M records,
+    column 3: elapsed time for reading 1M records,
+    column 4: file size.
diff --git a/bench/tcbench/tokyocabinet-test.patch b/bench/tcbench/tokyocabinet-test.patch
index cd293c6bbcb..fe1a260fd33 100644
--- a/bench/tcbench/tokyocabinet-test.patch
+++ b/bench/tcbench/tokyocabinet-test.patch
@@ -77,7 +77,7 @@
 +                './wttest vlcswrite -rnd file:casket.vlwt_r ' . RECNUM,
 +                './wttest vlcsread -rnd file:casket.vlwt_r ' . RECNUM,
 +
-+                './wttest flcswrite file:casket.flwt_b ' . RECNUM,
++                './wttest flcswrite -bulk file:casket.flwt_b ' . RECNUM,
 +                './wttest flcsread file:casket.flwt_b ' . RECNUM,
 +                './wttest flcswrite file:casket.flwt_a ' . RECNUM,
 +                './wttest flcsread file:casket.flwt_a ' . RECNUM,
diff --git a/build_posix/Make.base b/build_posix/Make.base
index 4202f0fc19c..74f194f5ff6 100644
--- a/build_posix/Make.base
+++ b/build_posix/Make.base
@@ -42,7 +42,7 @@ libtool: $(LIBTOOL_DEPS)
 	$(SHELL) ./config.status libtool
 
 $(top_srcdir)/src/include/extern.h: auto-includes.chk
-$(top_srcdir)/src/include/wt_internal.in: auto-includes.chk
+$(top_srcdir)/src/include/wt_internal.h: auto-includes.chk
 
 auto-includes.chk: $(libwiredtiger_la_SOURCES)
 	@(cd $(srcdir)/dist && sh s_prototypes && sh s_typedef -b) && touch $@
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index 409b10ed5d1..80716035795 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -9,9 +9,11 @@ bench/tcbench
 examples/c
 ext/collators/reverse
 ext/compressors/bzip2_compress BZIP2
-ext/compressors/snappy_compress SNAPPY
 ext/compressors/nop_compress
+ext/compressors/snappy_compress SNAPPY
 lang/python PYTHON
+test/fops
 test/format HAVE_BDB
 test/salvage
+test/snapshot
 test/thread
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4
index 244bffeae24..9750cf1547a 100644
--- a/build_posix/aclocal/version-set.m4
+++ b/build_posix/aclocal/version-set.m4
@@ -1,14 +1,14 @@
 dnl build by dist/s_version
 
 VERSION_MAJOR=1
-VERSION_MINOR=1
-VERSION_PATCH=5
-VERSION_STRING='"WiredTiger 1.1.5: (April 26, 2012)"'
+VERSION_MINOR=2
+VERSION_PATCH=0
+VERSION_STRING='"WiredTiger 1.2.0: (June  4, 2012)"'
 
 AC_SUBST(VERSION_MAJOR)
 AC_SUBST(VERSION_MINOR)
 AC_SUBST(VERSION_PATCH)
 AC_SUBST(VERSION_STRING)
 
-VERSION_NOPATCH=1.1
+VERSION_NOPATCH=1.2
 AC_SUBST(VERSION_NOPATCH)
diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4
index 8d740efe7df..f81807a73e2 100644
--- a/build_posix/aclocal/version.m4
+++ b/build_posix/aclocal/version.m4
@@ -1,2 +1,2 @@
 dnl WiredTiger product version for AC_INIT.  Maintained by dist/s_version
-1.1.5
+1.2.0
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 27de77efbf2..2255e8f2329 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -101,7 +101,6 @@ AC_CONFIG_HEADERS([wiredtiger_config.h:build_posix/config.hin])
 AC_CONFIG_FILES([
 	Makefile
 	wiredtiger.h:src/include/wiredtiger.in
-	wt_internal.h:src/include/wt_internal.in
 	wiredtiger.pc:build_posix/wiredtiger.pc.in
 ])
 AC_OUTPUT
diff --git a/dist/RELEASE b/dist/RELEASE
index 1e790a96ccc..724eb3f7763 100644
--- a/dist/RELEASE
+++ b/dist/RELEASE
@@ -1,6 +1,6 @@
 WIREDTIGER_VERSION_MAJOR=1
-WIREDTIGER_VERSION_MINOR=1
-WIREDTIGER_VERSION_PATCH=5
+WIREDTIGER_VERSION_MINOR=2
+WIREDTIGER_VERSION_PATCH=0
 WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
 
 WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/dist/api_data.py b/dist/api_data.py
index bcd3102e709..30195b7c890 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -112,7 +112,8 @@ file_config = format_meta + [
 		"utf16<file>".  See @ref huffman for more information'''),
 	Config('internal_key_truncate', 'true', r'''
 		configure internal key truncation, discarding unnecessary
-		trailing bytes on internal keys''',
+		trailing bytes on internal keys (ignored for custom
+		collators)''',
 		type='boolean'),
 	Config('internal_page_max', '2KB', r'''
 		the maximum page size for internal nodes, in bytes; the size
@@ -157,8 +158,8 @@ file_config = format_meta + [
 
 # File metadata, including both configurable and non-configurable (internal)
 file_meta = file_config + [
-	Config('root', '', r'''
-		the root page address'''),
+	Config('snapshot', '', r'''
+		the file snapshot entries'''),
 	Config('version', '(major=0,minor=0)', r'''
 		the file version'''),
 ]
@@ -206,8 +207,19 @@ methods = {
 	Config('force', 'false', r'''
 		return success if the object does not exist''',
 		type='boolean'),
+	Config('snapshot', '', r'''
+		specify one or more snapshots to drop.
+
+		The value must be either the name of a single snapshot to drop
+		(a string), or a list containing one of the following keys:
+		\c "all" to drop all snapshots,
+		\c "from=<snapshot>" to drop all snapshots after and including
+		the named snapshot, or
+		\c "to=<snapshot>" to drop all snapshots before and including
+		the named snapshot'''),
 	]),
 
+'session.dumpfile' : Method([]),
 'session.log_printf' : Method([]),
 
 'session.open_cursor' : Method([
@@ -220,10 +232,6 @@ methods = {
 		load path for empty objects, only empty objects may be
 		bulk-loaded''',
 		type='boolean'),
-	Config('clear_on_close', 'false', r'''
-		for statistics cursors, reset statistics counters when the
-		cursor is closed''',
-		type='boolean'),
 	Config('dump', '', r'''
 		configure the cursor for dump format inputs and outputs:
 		"hex" selects a simple hexadecimal format, "print"
@@ -243,9 +251,15 @@ methods = {
 		ignore the encodings for the key and value, manage data as if
 		the formats were \c "u".  See @ref cursor_raw for details''',
 		type='boolean'),
+	Config('snapshot', '', r'''
+		the name of a snapshot to open'''),
 	Config('statistics', 'false', r'''
 		configure the cursor for statistics''',
 		type='boolean'),
+	Config('statistics_clear', 'false', r'''
+		statistics cursors only; reset statistics counters when the
+		cursor is closed''',
+		type='boolean'),
 ]),
 
 'session.rename' : Method([]),
@@ -255,17 +269,18 @@ methods = {
 		files''',
 		type='boolean'),
 ]),
-'session.sync' : Method([]),
+'session.sync' : Method([
+	Config('snapshot', '', r'''
+		if non-empty, create a named snapshot'''),
+]),
 'session.truncate' : Method([]),
 'session.upgrade' : Method([]),
 'session.verify' : Method([]),
-'session.dumpfile' : Method([]),
 
 'session.begin_transaction' : Method([
-	Config('isolation', 'read-committed', r'''
+	Config('isolation', 'snapshot', r'''
 		the isolation level for this transaction''',
-		choices=['serializable', 'snapshot', 'read-committed',
-		    'read-uncommitted']),
+		choices=['read-uncommitted', 'snapshot']),
 	Config('name', '', r'''
 		name of the transaction for tracing and debugging'''),
 	Config('sync', 'full', r'''
@@ -281,33 +296,13 @@ methods = {
 'session.rollback_transaction' : Method([]),
 
 'session.checkpoint' : Method([
-	Config('archive', 'false', r'''
-		remove log files no longer required for transactional
-		durability''',
-		type='boolean'),
-	Config('flush_cache', 'true', r'''
-		flush the cache''',
-		type='boolean'),
-	Config('flush_log', 'true', r'''
-		flush the log to disk''',
-		type='boolean'),
-	Config('log_size', '0', r'''
-		only proceed if more than the specified number of bytes of log
-		records have been written since the last checkpoint''',
-		min='0'),
-	Config('force', 'false', r'''
-		write a new checkpoint even if nothing has changed since the
-		last one''',
-		type='boolean'),
-	Config('timeout', '0', r'''
-		only proceed if more than the specified number of milliseconds
-		have elapsed since the last checkpoint''',
-		min='0'),
+	Config('snapshot', '', r'''
+		if non-empty, create named snapshots in files'''),
 ]),
 
-'connection.add_cursor_type' : Method([]),
 'connection.add_collator' : Method([]),
 'connection.add_compressor' : Method([]),
+'connection.add_data_source' : Method([]),
 'connection.add_extractor' : Method([]),
 'connection.close' : Method([]),
 
@@ -378,7 +373,10 @@ methods = {
 		maximum expected number of sessions (including server
 		threads)''',
 		min='1'),
-	Config('transactional', 'false', r'''
+	Config('sync', 'true', r'''
+		sync files when closing or writing snapshots''',
+		type='boolean'),
+	Config('transactional', 'true', r'''
 		support transactional semantics''',
 		type='boolean'),
 	Config('verbose', '', r'''
@@ -395,6 +393,7 @@ methods = {
 		    'readserver',
 		    'reconcile',
 		    'salvage',
+		    'snapshot',
 		    'verify',
 		    'write']),
 ]),
@@ -418,6 +417,7 @@ flags = {
 		'VERB_readserver',
 		'VERB_reconcile',
 		'VERB_salvage',
+		'VERB_snapshot',
 		'VERB_verify',
 		'VERB_write'
 	],
@@ -425,6 +425,6 @@ flags = {
 ###################################################
 # Structure flag declarations
 ###################################################
-	'conn' : [ 'SERVER_RUN' ],
+	'conn' : [ 'CONN_NOSYNC', 'CONN_TRANSACTIONAL', 'SERVER_RUN' ],
 	'session' : [ 'SESSION_INTERNAL', 'SESSION_SALVAGE_QUIET_ERR' ],
 }
diff --git a/dist/filelist b/dist/filelist
index 264af59c511..50506078dc6 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -1,7 +1,6 @@
 # filelist --
 #	List of source files for WiredTiger library.
 
-src/api/api_event.c
 src/api/api_strerror.c
 src/api/api_version.c
 src/block/block_addr.c
@@ -11,6 +10,7 @@ src/block/block_mgr.c
 src/block/block_open.c
 src/block/block_read.c
 src/block/block_slvg.c
+src/block/block_snap.c
 src/block/block_vrfy.c
 src/block/block_write.c
 src/btree/bt_bulk.c
@@ -64,6 +64,12 @@ src/cursor/cur_std.c
 src/cursor/cur_table.c
 src/log/log.c
 src/log/log_desc.c
+src/meta/meta_api.c
+src/meta/meta_apply.c
+src/meta/meta_snapshot.c
+src/meta/meta_table.c
+src/meta/meta_track.c
+src/meta/meta_turtle.c
 src/os_posix/os_abort.c
 src/os_posix/os_alloc.c
 src/os_posix/os_dlopen.c
@@ -92,14 +98,13 @@ src/schema/schema_open.c
 src/schema/schema_plan.c
 src/schema/schema_project.c
 src/schema/schema_rename.c
-src/schema/schema_table.c
-src/schema/schema_track.c
 src/schema/schema_truncate.c
 src/schema/schema_util.c
 src/schema/schema_worker.c
 src/session/session_api.c
 src/session/session_btree.c
-src/session/session_root.c
+src/session/session_salvage.c
+src/session/session_snapshot.c
 src/support/err.c
 src/support/filename.c
 src/support/global.c
@@ -111,3 +116,4 @@ src/support/rand.c
 src/support/scratch.c
 src/support/sess_dump.c
 src/support/stat.c
+src/txn/txn.c
diff --git a/dist/s_all b/dist/s_all
index 1c98f4283e0..dad129612d5 100644
--- a/dist/s_all
+++ b/dist/s_all
@@ -65,12 +65,10 @@ run "sh ./s_funcs" "checking for unused functions"
 run "sh ./s_stat" "checking for unused statistics fields"
 run "sh ./s_getopt" "checking for incorrect getopt usage"
 run "sh ./s_longlines" "checking for long lines"
-run "sh ./s_printf" "checking for non-portable printf/scanf"
 run "sh ./s_string" "checking string spelling"
 run "sh ./s_style" "checking style"
 run "sh ./s_symbols" "checking external symbol names"
 run "sh ./s_typedef -c" "checking for unused typedefs"
-run "sh ./s_types" "checking for old-style types"
 run "sh ./s_whitespace" "checking whitespace"
 
 run "sh ./s_docs" "generating documentation"
diff --git a/dist/s_copyright b/dist/s_copyright
index afca1458c46..2e5c07e91c7 100644
--- a/dist/s_copyright
+++ b/dist/s_copyright
@@ -1,9 +1,6 @@
 #! /bin/sh
 
-# It was a pain updating all the copyrights in the Berkeley DB tree: I'm not
-# doing that again, the only files carrying copyrights in the WiredTiger tree
-# are the source files, that is, *.[ch] and *.in.  Check automatically to be
-# sure the copyright is up-to-date.
+# Check the copyrights.
 
 c1=__wt.1$$
 c2=__wt.2$$
@@ -64,6 +61,9 @@ check()
 	if `sed -e 3,4p -e 5q -e d ../$1 | diff - $c4 > /dev/null` ; then
 		return;
 	fi
+	if `sed -e 1,2p -e 3q -e d ../$1 | diff - $c4 > /dev/null` ; then
+		return;
+	fi
 
 	echo "$1: copyright information is incorrect"
 }
@@ -80,7 +80,7 @@ done
 
 # The documentation copyright appears in two files.
 s="Copyright (c) 2008-$year WiredTiger, Inc."
-f="docs/build-javadoc.sh docs/style/footer.html"
+f="src/docs/build-javadoc.sh src/docs/style/footer.html"
 for i in $f; do
 	if `grep "$s" ../$i > /dev/null`; then
 		continue;
diff --git a/dist/s_copyright.list b/dist/s_copyright.list
index baa242346c0..b80344dd45e 100644
--- a/dist/s_copyright.list
+++ b/dist/s_copyright.list
@@ -21,6 +21,7 @@ skip	lang/python/wiredtiger.py
 skip	lang/python/wiredtiger_wrap.c
 skip	src/api/api_strerror.c
 skip	src/config/config_def.c
+skip	src/docs/tools/doxypy.py
 skip	src/include/extern.h
 skip	src/include/log.i
 skip	src/include/queue.h
diff --git a/dist/s_define.list b/dist/s_define.list
index a70ed526852..378a6c2dcf7 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -4,24 +4,15 @@ API_CALL
 API_CALL_NOCONF
 API_SESSION_INIT
 FLD_CLR
-FLD_ISSET
 HAVE_ATOMICS
 LF_CLR
-LF_SET
 LLONG_MAX
 LLONG_MIN
 SIZE_CHECK
-SPINLOCK_GCC
-SPINLOCK_PTHREAD_MUTEX
 WT_BARRIER
 WT_BLOCK_DESC_SIZE
-WT_INTPACK32_MAXSIZE
-WT_MAX
-WT_MIN
-WT_PAUSE
+WT_DEBUG_BYTE
 WT_READ_BARRIER
-WT_SKIP_PROBABILITY
-WT_STAT
 WT_STAT
 WT_STAT_DECRV
 WT_STAT_INCRV
diff --git a/dist/s_docs b/dist/s_docs
index 82b2b75b08f..528fc3ada6d 100755
--- a/dist/s_docs
+++ b/dist/s_docs
@@ -20,30 +20,39 @@ spellchk()
 	# If aspell has been installed, run a spell check.
 	type aspell > /dev/null 2>&1 || return
 
-	(cd ../docs/src &&
+	(cd ../src/docs &&
 	cat *.dox | aspell --lang=en --personal=./spell.ok list) |
 	sort -u > $t
 	test -s $t && {
 		echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
 		echo 'Documentation spelling notes'
-		echo 'Update docs/src/spell.ok to remove warnings.'
+		echo 'Update src/docs/spell.ok to remove warnings.'
 		sed -e 's/^/	/' < $t
 		echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
 		e=1
 	}
 }
 
+valid_build()
+{
+	# Complain if there are pages we don't reference directly.
+	egrep 'Related Pages' ../docs/navtree.js > /dev/null && {
+		echo 'Unreferenced page: see "Related Pages" for the list.'
+		e=1
+	}
+}
+
 build()
 {
 	# Build from scratch on demand.
-	[ "$1" -eq 0 ] || (cd ../docs && make clean)
+	[ "$1" -eq 0 ] || (cd .. && rm -rf docs && mkdir docs)
 
 	# Run doxygen to generate warnings for the base HTML documentation.
 	#
 	# We omit Python because warnings are expected there (the code generated
 	# by swig does not have named arguments, but we want to document them
 	# as if they do.
-	(cd ../docs &&
+	(cd ../src/docs &&
 	    (eval cat Doxyfile $filter ; cat <<EOF
 QUIET=YES
 EOF
@@ -55,10 +64,11 @@ EOF
 	}
 
 	# Run again to generate the full documentation set (with Python).
-	[ -f ../lang/python/wiredtiger.py ] && (cd ../docs &&
+	[ "$python" -eq 1 ] && [ -f ../lang/python/wiredtiger.py ] && (
+	cd ../src/docs &&
 		(eval cat Doxyfile $filter ; cat <<EOF
 QUIET=YES
-INPUT+=../lang/python/wiredtiger.py
+INPUT+=../../lang/python/wiredtiger.py
 EOF
 ) | doxygen -)
 
@@ -69,12 +79,17 @@ EOF
 }
 
 clean=0
+python=1
 filter="|sed '/PROJECT_NUMBER/s,=.*,=\"Version $WIREDTIGER_VERSION\",'"
 while :
 	do case "$1" in
 	-a)	# Build from scratch
 		clean=1
 		shift;;
+	-l)     # Generate the top-level landing page in ../docs/top
+		filter="$filter; cat top/Doxyfile"
+		python=0
+		shift;;
 	-p)     # Generate PDFs
 		filter="$filter| sed '/GENERATE_LATEX/s,=.*,=YES,'"
 		shift;;
@@ -92,4 +107,7 @@ spellchk
 # Build the documentation.
 build $clean
 
+# Any post-build validity checks we want to make.
+valid_build
+
 exit $e
diff --git a/dist/s_funcs.list b/dist/s_funcs.list
index 64cdcddd88d..405c34cce5d 100644
--- a/dist/s_funcs.list
+++ b/dist/s_funcs.list
@@ -5,6 +5,7 @@ __bit_nclr
 __wt_bm_addr_stderr
 __wt_btree_lex_compare
 __wt_debug_addr
+__wt_debug_off
 __wt_debug_tree
 __wt_debug_tree_all
 __wt_fsync
@@ -13,4 +14,5 @@ __wt_nlpo2
 __wt_nlpo2_round
 __wt_print_huffman_code
 wiredtiger_struct_pack
+wiredtiger_struct_size
 wiredtiger_struct_unpack
diff --git a/dist/s_printf b/dist/s_printf
deleted file mode 100644
index e039b6cffd2..00000000000
--- a/dist/s_printf
+++ /dev/null
@@ -1,21 +0,0 @@
-#! /bin/sh
-
-t=__wt.$$
-trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-
-l="`sed -e '/^[a-z]/! d' -e 's,^,../,' filelist`"
-l="$l `ls ../src/include/*.[hi] ../src/include/*.in`"
-l="$l `find ../test -name '*.c' -print | egrep -v '/packing/|/insert/'`"
-
-# Look for '%l': that suggests we're trying to print a long value, and that's
-# almost always wrong: we should use a portable PRI* macro to construct the
-# format string instead.
-(for f in $l ; do
-    sed -n -E '/WT_VERBOSE|printf|scanf|__wt_errx?/,/\);/{=
-p
-}' $f |
-    sed -e 'N' -e 's/\n/:/' -e "s,^,$f:,"
- done) | grep '%l' > $t
-test -s $t && cat $t && exit 1
-
-exit 0
diff --git a/dist/s_release.list b/dist/s_release.list
index feeb32a8ae0..749aee885e9 100644
--- a/dist/s_release.list
+++ b/dist/s_release.list
@@ -3,9 +3,8 @@
 lang/java
 lang/python/src
 src/server
-src/txn
 test/format
-test/insert
 test/packing
 test/salvage
+test/snapshot
 test/thread
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 1b388b3c935..c36064a69e9 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -67,6 +67,7 @@ INDX
 INIT
 INITIALIZER
 INSERT's
+INUSE
 ISSET
 JPEG
 JSON
@@ -86,16 +87,19 @@ MEM
 MERCHANTABILITY
 MVCC
 Marsaglia's
+Metadata
 Mewhort
 Mutex
 Mutexes
 NONINFRINGEMENT
 NOTFOUND
 NOTREACHED
+NOWRITE
 NRECS
 NUL
 NULLs
 NoAddr
+ONPAGE
 OVFL
 PADDR
 PAGE's
@@ -129,6 +133,9 @@ Subtrees
 TAILQ
 TODO
 TOOSMALL
+TRK
+TXN
+Timestamp
 UINT
 URI
 URIs
@@ -141,6 +148,7 @@ WIREDTIGER
 WinNT
 WiredTiger
 WiredTiger's
+WiredTigerInternalSnapshot
 Wuninitialized
 XP
 __wt_epoch
@@ -253,10 +261,13 @@ evictserver
 exactp
 extern
 extlist
+extlists
+fblocks
 fcntl
 ffc
 ffs
 fh
+filefrag
 filename
 fileops
 filesize
@@ -317,6 +328,7 @@ keycmp
 keyname
 keyv
 kv
+ld
 len
 lenp
 lex
@@ -344,10 +356,12 @@ memcpy
 memfree
 memsize
 metadata
+metafile
 mfence
 minorp
 minprefix
 msg
+msgv
 mtx
 multiprocess
 multithread
@@ -359,12 +373,15 @@ nbits
 nclr
 negint
 newname
+nhex
 nl
 nlpo
 nocase
 nop
+notset
 notsup
 notyet
+nowrite
 np
 nset
 nul
@@ -372,6 +389,7 @@ numSymbols
 numbare
 offpage
 oldname
+onpage
 os
 ovfl
 packv
@@ -421,10 +439,16 @@ sessionp
 setv
 sfence
 sizeof
+sizep
 sizev
 skiplist
 skiplists
 slvg
+snapall
+snapfrag
+snapfrom
+snaplist
+snapto
 snprintf
 sp
 spinlock
@@ -438,6 +462,7 @@ stdarg
 stderr
 stdout
 str
+strcmp
 strdup
 strerror
 stringin
@@ -466,6 +491,7 @@ toffpage
 toverflow
 tparent
 transactional
+transactionally
 trecno
 treeconfig
 trepeat
@@ -481,6 +507,7 @@ tupdate
 tvalue
 twrite
 txn
+txnid
 typedef
 uB
 uint
@@ -492,6 +519,7 @@ unescaped
 uninstantiated
 unjams
 unlinked
+unmodify
 unpackv
 unreferenced
 unsized
@@ -502,7 +530,9 @@ utf
 va
 vanishingly
 vcell
+verrx
 versa
+vmsg
 vpack
 vsize
 vslot
diff --git a/dist/s_style b/dist/s_style
index dba35944fd3..15447bbec92 100644
--- a/dist/s_style
+++ b/dist/s_style
@@ -4,19 +4,54 @@
 t=__wt.$$
 trap 'rm -f $t; exit 0' 0 1 2 3 13 15
 
-extra=`cd .. && echo dist/*.py src/include/*.[hi] src/include/*.in && \
-	find test -name '*.[ch]' -print`
+cd ..
 
-for f in `sed -e '/^[a-z]/! d' filelist` $extra; do
-	f="../$f"
-	if grep "^[^}]*while (0);" $f > /dev/null; then
+# Returns in functions after a jump to the error label.
+for f in `find examples ext src test -name '*.[ci]'`; do
+	sed -n \
+	    -e '/^{$/,/^}$/{=;p;}' $f |
+	sed 'N;s/\n/:/' |
+	sed -e '/./{H;/^[0-9][0-9]*:}$/!d;}' \
+	    -e x \
+	    -e 's/\n/ /g' \
+	    -e p \
+	    -e '{s/.*//;x;}' |
+	egrep '(WT_ERR|WT_ERR_MSG|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR)\(.*(WT_ASSERT_RET|WT_ILLEGAL_VALUE|WT_RET|WT_RET_MSG|WT_RET_TEST|WT_VERBOSE_RET|WT_VERBOSE_RETVAL)\(.*err:' |
+	    sed 's/:.*//' > $t
+	test -s $t && {
+		echo "$f: function with return after a jump to an error label"
+		sed 's/^/function @ line:/' < $t
+	}
+done
+
+for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' |
+    sed '/Makefile.in/d'`; do
+	if grep "^[^}]*while (0);" $f > $t; then
 		echo "$f: while (0) has trailing semi-colon"
+		cat $t
 	fi
-	if grep "%dl|%ul|%xl" $f > /dev/null; then
-		echo "$f: bad printf format: %[dux]l"
+	if egrep '%l[diouxXn]|%[diouxXn]l' $f > $t; then
+		echo "$f: incorrect or dangerous printf format: %l[diouxXn]"
+		cat $t
 	fi
-	if grep "(unsigned)" $f > /dev/null; then
+	if grep "(unsigned)" $f > $t; then
 		echo "$f: (unsigned) cast is wrong"
+		cat $t
+	fi
+	egrep 'u_quad' $f | sed '/@u_quad_decl@/d' > $t
+	test -s $t && {
+		echo "$f: old-style type declaration: u_XXX_t or u_quad"
+		cat $t
+	}
+
+	if ! expr "$f" : 'examples/.*' > /dev/null &&
+	   ! expr "$f" : 'test/.*' > /dev/null &&
+	   ! expr "$f" : 'ext/.*' > /dev/null; then
+		egrep -w ret $f | egrep 'int.*[, ]ret[,;]' > $t
+		test -s $t && {
+			echo "$f: explicit declaration of \"ret\""
+			cat $t
+		}
 	fi
 
 	# Early exits from critical loops
@@ -37,7 +72,7 @@ for f in `sed -e '/^[a-z]/! d' filelist` $extra; do
 	    -e 's/\([	 ]\)if(/\1if (/' \
 	    -e 's/\([	 ]\)index(/\1strchr(/' \
 	    -e 's/\([	 ]\)return(/\1return (/' \
-	    -e 's/^\([	 ]+\)return \([^()]*\);/\1return (\2);/' \
+	    -e 's/\([	 ]\)return \([^()]*\);/\1return (\2);/' \
 	    -e 's/\([	 ]\)rindex(/\1strrchr(/' \
 	    -e 's/\([	 ]\)sizeof (/\1sizeof(/g' \
 	    -e 's/\([	 ]\)switch(/\1switch (/' \
@@ -47,7 +82,6 @@ for f in `sed -e '/^[a-z]/! d' filelist` $extra; do
 	    -e 's/\([	 ,]\)u_int16_t\([	 ,]\)/\1uint16_t\2/g' \
 	    -e 's/\([	 ,]\)u_int32_t\([	 ,]\)/\1uint32_t\2/g' \
 	    -e 's/\([	 ,]\)u_int64_t\([	 ,]\)/\1uint64_t\2/g' \
-	    -e 's/%\([dux]\)l/%l\1/' \
 	    -e 's/\([|&=+-]\)  *\([^*]\)/\1 \2/' \
 	    -e 's/(void) /(void)/' \
 	    -e '/for /!s/;;$/;/' \
@@ -55,5 +89,5 @@ for f in `sed -e '/^[a-z]/! d' filelist` $extra; do
 	    -e 's/^#define /#define	/' \
 	    -e 's/sizeof(WT_PAGE_DISK)/WT_PAGE_DISK_SIZE/g' >$t
 
-	cmp $t $f > /dev/null 2>&1 || (echo "$f" && cp $t $f)
+	cmp $t $f > /dev/null 2>&1 || (echo "modifying $f" && cp $t $f)
 done
diff --git a/dist/s_typedef b/dist/s_typedef
index 42033dff4bc..ff6fa47c5bc 100644
--- a/dist/s_typedef
+++ b/dist/s_typedef
@@ -5,7 +5,7 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15
 
 build() {
 	# Build the standard typedefs.
-	f=../src/include/wt_internal.in
+	f=../src/include/wt_internal.h
 	(sed -e '/Forward structure declarations .*: BEGIN/{' \
 	    -e 'n' \
 	    -e 'q' \
diff --git a/dist/s_types b/dist/s_types
deleted file mode 100644
index 654cfcde7f5..00000000000
--- a/dist/s_types
+++ /dev/null
@@ -1,12 +0,0 @@
-#! /bin/sh
-
-t=__wt.$$
-trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-
-(cd .. &&
-    l="`find examples ext src test -name '*.[chi]' -o -name '*.in'`"
-    egrep 'u_quad|u_int8_t|u_int16_t|u_int32_t|u_int64_t' $l |
-    sed '/@u_quad_decl@/d') > $t
-test -s $t && cat $t && exit 1
-
-exit 0
diff --git a/dist/s_whitespace b/dist/s_whitespace
index 2e2c2863b4c..3a51b251bfe 100644
--- a/dist/s_whitespace
+++ b/dist/s_whitespace
@@ -4,13 +4,27 @@
 t=__wt.$$
 trap 'rm -f $t; exit 0' 0 1 2 3 13 15
 
-inc=`cd .. && echo dist/*.py dist/s_* src/include/*.[hi] src/include/*.in &&
-	find test -name '*.[ch]' -print`
-
-for f in `sed -e '/^[a-z]/! d' filelist` $inc; do
-	f="../$f"
+ws()
+{
 	sed -e 's/[	 ][	 ]*$//' \
 	    -e '/^$/N' \
-	    -e '/\n$/D' < $f > $t
-	cmp $t $f > /dev/null 2>&1 || (echo "$f" && cp $t $f)
+	    -e '/\n$/D' < $1 > $t
+	cmp $t $1 > /dev/null 2>&1 || (echo "$1" && cp $t $1)
+}
+
+cd ..
+
+for f in `find dist -name '*.py' -name 's_*'`; do
+	ws $f
+done
+
+for f in `find examples ext src test \
+    -name '*.[chi]' -o \
+    -name '*.dox' -o \
+    -name '*.in' -o \
+    -name 'Makefile.am'`; do
+	if expr "$f" : ".*/Makefile.in" > /dev/null; then
+		continue
+	fi
+	ws $f
 done
diff --git a/dist/serial.py b/dist/serial.py
index b5262d73fd7..356985afe08 100644
--- a/dist/serial.py
+++ b/dist/serial.py
@@ -18,7 +18,8 @@ class Serial:
 msgtypes = [
 Serial('col_append', 'WT_SERIAL_FUNC', [
 		SerialArg('WT_PAGE *', 'page'),
-		SerialArg('WT_INSERT_HEAD **', 'inshead'),
+		SerialArg('uint32_t', 'write_gen'),
+		SerialArg('WT_INSERT_HEAD **', 'insheadp'),
 		SerialArg('WT_INSERT ***', 'ins_stack'),
 		SerialArg('WT_INSERT_HEAD **', 'new_inslist', 1),
 		SerialArg('WT_INSERT_HEAD *', 'new_inshead', 1),
@@ -26,10 +27,6 @@ Serial('col_append', 'WT_SERIAL_FUNC', [
 		SerialArg('u_int', 'skipdepth'),
 	]),
 
-Serial('evict_file', 'WT_SERIAL_EVICT', [
-		SerialArg('int', 'discard'),
-	]),
-
 Serial('insert', 'WT_SERIAL_FUNC', [
 		SerialArg('WT_PAGE *', 'page'),
 		SerialArg('uint32_t', 'write_gen'),
@@ -47,6 +44,10 @@ Serial('row_key', 'WT_SERIAL_FUNC', [
 		SerialArg('WT_IKEY *', 'ikey'),
 	]),
 
+Serial('sync_file', 'WT_SERIAL_EVICT', [
+		SerialArg('int', 'syncop'),
+	]),
+
 Serial('update', 'WT_SERIAL_FUNC', [
 		SerialArg('WT_PAGE *', 'page'),
 		SerialArg('uint32_t', 'write_gen'),
@@ -100,7 +101,7 @@ typedef struct {
 	f.write('''
 {
 \t__wt_''' + entry.name + '''_args _args, *args = &_args;
-\tint ret;
+\tWT_DECL_RET;
 
 ''')
 	for l in entry.args:
diff --git a/dist/stat_data.py b/dist/stat_data.py
index a95c72c09a6..61ef650aece 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -48,8 +48,8 @@ btree_stats = [
 	Stat('cursor_read_near', 'cursor-read-near'),
 	Stat('cursor_read_next', 'cursor-read-next'),
 	Stat('cursor_read_prev', 'cursor-read-prev'),
-	Stat('cursor_resets', 'cursor-resets'),
 	Stat('cursor_removes', 'cursor-removes'),
+	Stat('cursor_resets', 'cursor-resets'),
 	Stat('cursor_updates', 'cursor-updates'),
 	Stat('extend', 'file: block allocations required file extension'),
 	Stat('file_allocsize', 'page size allocation unit'),
@@ -60,8 +60,6 @@ btree_stats = [
 	Stat('file_col_var_pages', 'column-store variable-size leaf pages'),
 	Stat('file_entries', 'total entries'),
 	Stat('file_fixed_len', 'fixed-record size'),
-	Stat('file_freelist_bytes', 'number of bytes in the freelist'),
-	Stat('file_freelist_entries', 'number of entries in the freelist'),
 	Stat('file_magic', 'magic number'),
 	Stat('file_major', 'major version number'),
 	Stat('file_maxintlitem', 'maximum internal page item size'),
@@ -73,6 +71,7 @@ btree_stats = [
 	Stat('file_row_int_pages', 'row-store internal pages'),
 	Stat('file_row_leaf_pages', 'row-store leaf pages'),
 	Stat('file_size', 'file: size'),
+	Stat('file_write_conflicts', 'write generation conflicts'),
 	Stat('free', 'file: block frees'),
 	Stat('overflow_read', 'file: overflow pages read from the file'),
 	Stat('page_read', 'file: pages read from the file'),
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 35a775c1259..00000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-all:
-	@(cd ../dist && sh s_docs -t)
-
-clean:
-	rm -f doxygen.log *.css *.html *.js *.png *.ps
-
-.PHONY: all
diff --git a/docs/src/threads.dox b/docs/src/threads.dox
deleted file mode 100644
index 36c7a2c7f0c..00000000000
--- a/docs/src/threads.dox
+++ /dev/null
@@ -1,37 +0,0 @@
-/*! @page threads Multithreading
-
-WT_CONNECTION handles can be shared between threads, and applications
-generally only open one connection to a given database per process. All
-methods on WT_CONNECTION are thread safe.
-
-WT_SESSION and WT_CURSOR handles cannot be shared between threads concurrently:
-the usual approach is for applications to open one WT_SESSION for each thread
-that accesses a database.  Applications may open multiple WT_CURSOR handles
-within a session.
-
-Multiple threads must not access a session handle concurrently (including
-accessing two or more cursor handles in the same session).  However,
-WT_SESSION handles may be accessed by different threads serially (for
-example, from a pool of threads managed by the application with a set of
-shared session handles).  There is no thread-local state in WiredTiger, but
-no built-in synchronization of session state either, so if multiple threads
-access a session handle or dependencies such as cursors, the access must be
-serialized by the application.
-
-@section threads_example Code samples
-
-The code below is taken from the complete example program
-@ex_ref{ex_thread.c}.
-
-This is an example of a thread entry point.  A new session is opened for
-the thread and used for all operations within that thread.
-
-@snippet ex_thread.c thread scan
-
-Here is the main function that starts the threads.  It opens a single
-connection, shared between the threads, and closes the connection after
-waiting for all of the threads to exit.
-
-@snippet ex_thread.c thread main
-
- */
diff --git a/docs/src/transactions.dox b/docs/src/transactions.dox
deleted file mode 100644
index cee780a8fab..00000000000
--- a/docs/src/transactions.dox
+++ /dev/null
@@ -1,82 +0,0 @@
-/*! @page transactions Transactions
-
-@notyet{transactions}
-<b>This page describes the expected behavior of the 2.X releases.</b>
-
-@section transactions_acid ACID properties
-
-Transactions provide a powerful abstraction for multiple threads to operate
-on data concurrently because they have the following properties:
-
-- Atomicity: all or none of a transaction is completed.
-- Consistency: if each transaction maintains some property when considered
-  separately, then the combined effect of executing the transactions
-  concurrently will maintain the same property.
-- Isolation: developers can reason about transactions independently.
-- Durability: once a transaction commits, its updates are saved.
-
-@section transactions_api Transactional API
-
-To configure for transactions, the database must be created with
-transaction support enabled.  This is done by passing the configuration
-string <code>"transactional"</code> to ::wiredtiger_open when creating the
-database.
-
-In WiredTiger, the transactional context is managed by the WT_SESSION
-class.  Applications call WT_SESSION::begin_transaction to start a new
-transaction, which is only permitted when no cursors are open.  Operations
-performed with that WT_SESSION handle are then part of the transaction, and
-their effects can be committed by calling WT_SESSION::commit_transaction or
-WT_SESSION::rollback_transaction, both of which implicitly close any open
-cursors.
-
-When transactions are used, operations may fail with additional errors such
-as ::WT_DEADLOCK.
-
-@todo describe transaction error cases more fully
-
-@section transactions_cc Concurrency control
-
-WiredTiger uses an optimistic concurrency control algorithm.  This avoids
-the bottleneck of a centralized lock manager and expensive graph searching
-to identify deadlock cycles.
-
-@section transaction_isolation Isolation levels
-
-The default isolation level is <code>serializable</code>, which means that
-the concurrent execution of committed transactions is equivalent to
-executing the transactions in some serial order.
-
-Weaker isolation levels are also provided, including <code>repeatable
-read</code>, which permits phantoms, <code>snapshot isolation</code>, which
-permits write skew, <code>read committed</code>, which permits lost updates
-and always returns the most recently committed changes, and <code>read
-uncommitted</code>, which always reads the most recent version of data,
-regardless of whether it is committed.
-
-@section transaction_recovery Recovery
-
-Recovery is run automatically during ::wiredtiger_open when required.
-
-Recovery works by using a database log that contains a record of the
-actions of all transactions.  Recovery first finds the last complete
-checkpoint, and then scans forward through the log from that point to
-determine which transactions committed after the checkpoint.  All actions
-are rolled forward from the checkpoint so that the in-memory tree matches
-its state when the crash occurred.  Then the "losing" transactions (those
-that did not commit) are rolled back to return the database to a consistent
-state.
-
-This suggests the importance of regular checkpoints: they limit the amount
-of work required during recovery, which speeds up the ::wiredtiger_open
-call.  See WT_SESSION::checkpoint for information about triggering
-checkpoints.
-
-@section transaction_example Code samples
-
-The code below is taken from the complete example program
-@ex_ref{ex_transaction.c}.
-
-@snippet ex_transaction.c transaction
-
- */
diff --git a/docs/src/using.dox b/docs/src/using.dox
deleted file mode 100644
index 3c1358c4a4b..00000000000
--- a/docs/src/using.dox
+++ /dev/null
@@ -1,24 +0,0 @@
-/*! @page using Writing WiredTiger applications
-
-This section explains how to use WiredTiger by developing a sequence of
-example programs:
-
-- @subpage basic_api
-- @subpage config_strings
-- @subpage schema
-- @subpage cursors
-- @subpage threads
-- @subpage transactions
-
-Additional Notes:
-
-- @subpage license\n
-- @subpage home
-- @subpage security
-- @subpage file_formats
-- @subpage compression
-- @subpage name_space
-- @subpage signals
-- @subpage tuning
-
- */
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 88d01cb467c..7c016980efb 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -34,18 +34,21 @@
 
 #include <assert.h>
 #include <errno.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 #include <wiredtiger.h>
 
 int add_collator(WT_CONNECTION *conn);
 int add_compressor(WT_CONNECTION *conn);
-int add_cursor_type(WT_CONNECTION *conn);
+int add_data_source(WT_CONNECTION *conn);
 int add_extractor(WT_CONNECTION *conn);
 int connection_ops(WT_CONNECTION *conn);
 int cursor_ops(WT_SESSION *session);
 int cursor_search_near(WT_CURSOR *cursor);
+int pack_ops(WT_SESSION *session);
 int session_ops(WT_SESSION *session);
 
 int
@@ -282,7 +285,6 @@ cursor_search_near(WT_CURSOR *cursor)
 int
 session_ops(WT_SESSION *session)
 {
-	unsigned long mypid = 0;
 	int ret;
 
 	cursor_ops(session);
@@ -297,7 +299,20 @@ session_ops(WT_SESSION *session)
 	/*! [session checkpoint] */
 
 	/*! [session drop] */
+	/* Discard a table. */
 	ret = session->drop(session, "table:mytable", NULL);
+
+	/* Drop the "midnight" snapshot. */
+	ret = session->drop(session, "table:mytable", "snapshot=midnight");
+
+	/* Drop all snapshots from a table. */
+	ret = session->drop(session, "table:mytable", "snapshot=(all)");
+
+	/* Drop all snapshots after and including "noon". */
+	ret = session->drop(session, "table:mytable", "snapshot=(from=noon)");
+
+	/* Drop all snapshots before and including "midnight". */
+	ret = session->drop(session, "table:mytable", "snapshot=(to=midnight)");
 	/*! [session drop] */
 
 	/*! [session dumpfile] */
@@ -305,7 +320,8 @@ session_ops(WT_SESSION *session)
 	/*! [session dumpfile] */
 
 	/*! [session msg_printf] */
-	ret = session->msg_printf(session, "process pid %lu", mypid);
+	ret = session->msg_printf(
+	    session, "process ID %" PRIuMAX, (uintmax_t)getpid());
 	/*! [session msg_printf] */
 
 	/*! [session rename] */
@@ -369,26 +385,45 @@ session_ops(WT_SESSION *session)
 	return (ret);
 }
 
-/*! [WT_CURSOR_TYPE size] */
+/*! [WT_DATA_SOURCE create] */
 static int
-my_cursor_size(WT_CURSOR_TYPE *ctype, const char *obj, size_t *sizep)
+my_create(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *name, const char *config)
 {
-	(void)ctype;
-	(void)obj;
+	/* Unused parameters */
+	(void)dsrc;
+	(void)session;
+	(void)name;
+	(void)config;
 
-	*sizep = sizeof (WT_CURSOR);
 	return (0);
 }
-/*! [WT_CURSOR_TYPE size] */
+/*! [WT_DATA_SOURCE create] */
 
-/*! [WT_CURSOR_TYPE init] */
+/*! [WT_DATA_SOURCE drop] */
 static int
-my_init_cursor(WT_CURSOR_TYPE *ctype, WT_SESSION *session,
+my_drop(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *name, const char *config)
+{
+	/* Unused parameters */
+	(void)dsrc;
+	(void)session;
+	(void)name;
+	(void)config;
+
+	return (0);
+}
+/*! [WT_DATA_SOURCE drop] */
+
+/*! [WT_DATA_SOURCE open_cursor] */
+static int
+my_open_cursor(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
     const char *obj, WT_CURSOR *old_cursor, const char *config,
-    WT_CURSOR *new_cursor)
+    WT_CURSOR **new_cursor)
 {
 	/* Unused parameters */
-	(void)ctype;
+	(void)dsrc;
+
 	(void)session;
 	(void)obj;
 	(void)old_cursor;
@@ -397,17 +432,70 @@ my_init_cursor(WT_CURSOR_TYPE *ctype, WT_SESSION *session,
 
 	return (0);
 }
-/*! [WT_CURSOR_TYPE init] */
+/*! [WT_DATA_SOURCE open_cursor] */
+
+/*! [WT_DATA_SOURCE rename] */
+static int
+my_rename(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *oldname, const char *newname, const char *config)
+{
+	/* Unused parameters */
+	(void)dsrc;
+	(void)session;
+	(void)oldname;
+	(void)newname;
+	(void)config;
+
+	return (0);
+}
+/*! [WT_DATA_SOURCE rename] */
+
+/*! [WT_DATA_SOURCE sync] */
+static int
+my_sync(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *name, const char *config)
+{
+	/* Unused parameters */
+	(void)dsrc;
+	(void)session;
+	(void)name;
+	(void)config;
+
+	return (0);
+}
+/*! [WT_DATA_SOURCE sync] */
+
+/*! [WT_DATA_SOURCE truncate] */
+static int
+my_truncate(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+    const char *name, const char *config)
+{
+	/* Unused parameters */
+	(void)dsrc;
+	(void)session;
+	(void)name;
+	(void)config;
+
+	return (0);
+}
+/*! [WT_DATA_SOURCE truncate] */
 
 int
-add_cursor_type(WT_CONNECTION *conn)
+add_data_source(WT_CONNECTION *conn)
 {
 	int ret;
 
-	/*! [WT_CURSOR_TYPE register] */
-	static WT_CURSOR_TYPE my_ctype = { my_cursor_size, my_init_cursor };
-	ret = conn->add_cursor_type(conn, NULL, &my_ctype, NULL);
-	/*! [WT_CURSOR_TYPE register] */
+	/*! [WT_DATA_SOURCE register] */
+	static WT_DATA_SOURCE my_dsrc = {
+		my_create,
+		my_drop,
+		my_open_cursor,
+		my_rename,
+		my_sync,
+		my_truncate
+	};
+	ret = conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL);
+	/*! [WT_DATA_SOURCE register] */
 
 	return (ret);
 }
@@ -520,7 +608,7 @@ int
 add_compressor(WT_CONNECTION *conn)
 {
 	int ret;
-	
+
 	/*! [WT_COMPRESSOR register] */
 	static WT_COMPRESSOR my_compressor = {
 	    my_compress, my_decompress, my_pre_size };
@@ -569,8 +657,8 @@ connection_ops(WT_CONNECTION *conn)
 	ret = conn->load_extension(conn, "my_extension.dll", NULL);
 	/*! [conn load extension] */
 
-	add_cursor_type(conn);
 	add_collator(conn);
+	add_data_source(conn);
 	add_extractor(conn);
 
 	/*! [conn close] */
@@ -599,42 +687,52 @@ connection_ops(WT_CONNECTION *conn)
 	return (ret);
 }
 
-int main(void)
+int
+pack_ops(WT_SESSION *session)
 {
 	int ret;
 
 	{
-	/*! [Open a connection] */
-	WT_CONNECTION *conn;
-	const char *home = "WT_TEST";
-	ret = wiredtiger_open(home, NULL, "create,transactional", &conn);
-	/*! [Open a connection] */
-	}
-
-	{
 	/*! [Get the packed size] */
 	size_t size;
-	size = wiredtiger_struct_size("iSh", 42, "hello", -3);
-	assert(size < 100);
+	ret = wiredtiger_struct_size(session, &size, "iSh", 42, "hello", -3);
 	/*! [Get the packed size] */
+	assert(size < 100);
 	}
 
 	{
 	/*! [Pack fields into a buffer] */
 	char buf[100];
-	ret = wiredtiger_struct_pack(buf, sizeof (buf), "iSh", 42, "hello", -3);
+	ret = wiredtiger_struct_pack(
+	    session, buf, sizeof(buf), "iSh", 42, "hello", -3);
 	/*! [Pack fields into a buffer] */
- 
+
 	{
 	/*! [Unpack fields from a buffer] */
 	int i;
 	char *s;
 	short h;
-	ret = wiredtiger_struct_unpack(buf, sizeof (buf), "iSh", &i, &s, &h);
+	ret = wiredtiger_struct_unpack(
+	    session, buf, sizeof(buf), "iSh", &i, &s, &h);
 	/*! [Unpack fields from a buffer] */
 	}
 	}
 
+	return (ret);
+}
+
+int main(void)
+{
+	int ret;
+
+	{
+	/*! [Open a connection] */
+	WT_CONNECTION *conn;
+	const char *home = "WT_TEST";
+	ret = wiredtiger_open(home, NULL, "create,transactional", &conn);
+	/*! [Open a connection] */
+	}
+
 	/*! [Get the WiredTiger library version #1] */
 	printf("WiredTiger version %s\n", wiredtiger_version(NULL, NULL, NULL));
 	/*! [Get the WiredTiger library version #1] */
diff --git a/examples/c/ex_call_center.c b/examples/c/ex_call_center.c
index 84e1ee955c8..8d03d7eab8b 100644
--- a/examples/c/ex_call_center.c
+++ b/examples/c/ex_call_center.c
@@ -57,9 +57,9 @@ const char *home = "WT_TEST";
 /* Customer records. */
 typedef struct {
 	uint64_t id;
-	char *name;
-	char *address;
-	char *phone;
+	const char *name;
+	const char *address;
+	const char *phone;
 } CUSTOMER;
 
 /* Call records. */
@@ -68,8 +68,8 @@ typedef struct {
 	uint64_t call_date;
 	uint64_t cust_id;
 	uint64_t emp_id;
-	char *call_type;
-	char *notes;
+	const char *call_type;
+	const char *notes;
 } CALL;
 /*! [call-center decl] */
 
@@ -79,8 +79,18 @@ int main(void)
 	WT_CONNECTION *conn;
 	WT_SESSION *session;
 	WT_CURSOR *cursor;
-	CUSTOMER cust;
-	CALL call;
+	CUSTOMER cust, *custp, cust_sample[] = {
+		{ 0, "Professor Oak", "LeafGreen Avenue", "123-456-7890" },
+		{ 0, "Lorelei", "Sevii Islands", "098-765-4321" },
+		{ 0, NULL, NULL, NULL }
+	};
+	CALL call, *callp, call_sample[] = {
+		{ 0, 32, 1, 2, "billing", "unavailable" },
+		{ 0, 33, 1, 2, "billing", "available" },
+		{ 0, 34, 1, 2, "reminder", "unavailable" },
+		{ 0, 35, 1, 2, "reminder", "available" },
+		{ 0, 0, 0, 0, NULL, NULL }
+	};
 
 	ret = wiredtiger_open(home, NULL, "create", &conn);
 	if (ret != 0) {
@@ -99,7 +109,7 @@ int main(void)
 	 * created below.
 	 */
 	ret = session->create(session, "table:customers",
-	    "key_format=S,"
+	    "key_format=r,"
 	    "value_format=SSS,"
 	    "columns=(id,name,address,phone),"
 	    "colgroups=(main,address)");
@@ -116,10 +126,19 @@ int main(void)
 	ret = session->create(session,
 	    "index:customers:phone", "columns=(phone)");
 
+	/* Populate the customers table with some data. */
+	ret = session->open_cursor(
+	    session, "table:customers", NULL, "append", &cursor);
+	for (custp = cust_sample; custp->name != NULL; custp++) {
+		cursor->set_value(cursor,
+		    custp->name, custp->address, custp->phone);
+		ret = cursor->insert(cursor);
+	}
+	ret = cursor->close(cursor);
+
 	/*
-	 * Create the calls table, give names and types to the columns.
-	 * All of the columns will be stored together, so no column groups are
-	 * declared.
+	 * Create the calls table, give names and types to the columns.  All the
+	 * columns will be stored together, so no column groups are declared.
 	 */
 	ret = session->create(session, "table:calls",
 	    "key_format=r,"
@@ -133,19 +152,14 @@ int main(void)
 	ret = session->create(session, "index:calls:cust_date",
 	    "columns=(cust_id,call_date)");
 
-	/* Populate the customers table with some data. */
+	/* Populate the calls table with some data. */
 	ret = session->open_cursor(
-	    session, "table:customers", NULL, NULL, &cursor);
-
-	cursor->set_key(cursor, "customer #1");
-	cursor->set_value(cursor,
-	    "Professor Oak", "LeafGreen Avenue", "123-456-7890");
-	ret = cursor->insert(cursor);
-
-	cursor->set_key(cursor, "customer #2");
-	cursor->set_value(cursor, "Lorelei", "Sevii Islands", "098-765-4321");
-	ret = cursor->insert(cursor);
-
+	    session, "table:calls", NULL, "append", &cursor);
+	for (callp = call_sample; callp->call_type != NULL; callp++) {
+		cursor->set_value(cursor, callp->call_date, callp->cust_id,
+		    callp->emp_id, callp->call_type, callp->notes);
+		ret = cursor->insert(cursor);
+	}
 	ret = cursor->close(cursor);
 
 	/*
@@ -162,13 +176,13 @@ int main(void)
 	 * means the cursor's value format will be "rS".
 	 */
 	ret = session->open_cursor(session,
-	    "index:customers:phone(id,name)",
-	    NULL, NULL, &cursor);
-	cursor->set_key(cursor, "212-555-1000");
+	    "index:customers:phone(id,name)", NULL, NULL, &cursor);
+	cursor->set_key(cursor, "123-456-7890");
 	ret = cursor->search(cursor);
 	if (ret == 0) {
 		ret = cursor->get_value(cursor, &cust.id, &cust.name);
-		printf("Got customer record for %s\n", cust.name);
+		printf("Read customer record for %s (ID %" PRIu64 ")\n",
+		    cust.name, cust.id);
 	}
 	ret = cursor->close(cursor);
 
@@ -181,9 +195,9 @@ int main(void)
 	 * is in increasing order by date for a given customer, we want to start
 	 * with the last record for the customer and work backwards.
 	 *
-	 * Specify a subset of columns to be returned.  If these were all
-	 * covered by the index, the primary would not be accessed.  Stop after
-	 * getting 3 records.
+	 * Specify a subset of columns to be returned.  (Note that if these were
+	 * all covered by the index, the primary would not have to be accessed.)
+	 * Stop after getting 3 records.
 	 */
 	ret = session->open_cursor(session,
 	    "index:calls:cust_date(cust_id,call_type,notes)",
@@ -194,31 +208,26 @@ int main(void)
 	 * call date for a given cust_id.  Search for (cust_id+1,0), then work
 	 * backwards.
 	 */
+	cust.id = 1;
 	cursor->set_key(cursor, cust.id + 1, 0);
 	ret = cursor->search_near(cursor, &exact);
 
 	/*
-	 * If the table is empty, search_near will return WT_NOTFOUND.
-	 * Otherwise the cursor will on a matching key if one exists, or on an
-	 * adjacent key.  If the key we find is equal or larger than the search
-	 * key, go back one.
+	 * If the table is empty, search_near will return WT_NOTFOUND, else the
+	 * cursor will be positioned on a matching key if one exists, or an
+	 * adjacent key if one does not.  If the positioned key is equal to or
+	 * larger than the search key, go back one.
 	 */
 	if (ret == 0 && exact >= 0)
 		ret = cursor->prev(cursor);
-	if (ret == 0)
+	for (count = 0; ret == 0 && count < 3; ++count) {
 		ret = cursor->get_value(cursor,
 		    &call.cust_id, &call.call_type, &call.notes);
-
-	count = 0;
-	while (ret == 0 && call.cust_id == cust.id) {
-		printf("Got call record on date %lu: type %s: %s\n",
-		    (unsigned long)call.call_date, call.call_type, call.notes);
-		if (++count == 3)
+		if (call.cust_id != cust.id)
 			break;
-
+		printf("Call record: customer %" PRIu64 " (%s: %s)\n",
+		    call.cust_id, call.call_type, call.notes);
 		ret = cursor->prev(cursor);
-		ret = cursor->get_value(cursor,
-		    &call.cust_id, &call.call_type, &call.notes);
 	}
 	/*! [call-center work] */
 
diff --git a/examples/c/ex_config.c b/examples/c/ex_config.c
index 4f3d109abde..21572a98f8f 100644
--- a/examples/c/ex_config.c
+++ b/examples/c/ex_config.c
@@ -76,4 +76,3 @@ int main(void)
 
 	return (ret);
 }
-
diff --git a/examples/c/ex_pack.c b/examples/c/ex_pack.c
index 7fe1c123181..272c8b745de 100644
--- a/examples/c/ex_pack.c
+++ b/examples/c/ex_pack.c
@@ -54,12 +54,14 @@ int main(void)
 		    home, wiredtiger_strerror(ret));
 
 	/*! [packing] */
-	size = wiredtiger_struct_size("iii", 42, 1000, -9);
-	if (size > sizeof (buf)) {
+	ret = wiredtiger_struct_size(session, &size, "iii", 42, 1000, -9);
+	if (size > sizeof(buf)) {
 		/* Allocate a bigger buffer. */
 	}
-	wiredtiger_struct_pack(buf, size, "iii", 42, 1000, -9);
-	wiredtiger_struct_unpack(buf, size, "iii", &i, &j, &k);
+
+	ret = wiredtiger_struct_pack(session, buf, size, "iii", 42, 1000, -9);
+
+	ret = wiredtiger_struct_unpack(session, buf, size, "iii", &i, &j, &k);
 	/*! [packing] */
 
 	/* Note: closing the connection implicitly closes open session(s). */
diff --git a/examples/c/ex_schema.c b/examples/c/ex_schema.c
index 6e89ebd0e4d..e3a5db0a168 100644
--- a/examples/c/ex_schema.c
+++ b/examples/c/ex_schema.c
@@ -105,7 +105,7 @@ int main(void)
 	ret = session->open_cursor(session, "table:population",
 	    NULL, "append", &cursor);
 
-	endp = pop_data + (sizeof (pop_data) / sizeof (pop_data[0]));
+	endp = pop_data + (sizeof (pop_data) / sizeof(pop_data[0]));
 	for (p = pop_data; p < endp; p++) {
 		cursor->set_value(cursor, p->country, p->year, p->population);
 		ret = cursor->insert(cursor);
diff --git a/examples/python/ex_access.py b/examples/python/ex_access.py
index 5ae7f672651..32c0b7337f8 100755
--- a/examples/python/ex_access.py
+++ b/examples/python/ex_access.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env PYTHONPATH=../../lang/python:../../lang/python/src python
-#
 # Copyright (c) 2008-2012 WiredTiger, Inc.
 #
 # This is free and unencumbered software released into the public domain.
@@ -24,41 +22,26 @@
 # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 # OTHER DEALINGS IN THE SOFTWARE.
-#
-# ex_access.py
-# 	demonstrates how to create and access a simple table.
-#
-
-import wiredtiger
-import sys
 
-home = 'WT_TEST'
+from wiredtiger import wiredtiger_open
 
-try:
-    conn = wiredtiger.wiredtiger_open(home, None, 'create')
-    print('connected: ' + `conn`);
-    session = conn.open_session(None, None)
-except BaseException as e:
-    print('Error connecting to', (home + ':'), e);
-    sys.exit(1)
+# Connect to the database and open a session
+conn = wiredtiger_open('WT_TEST', 'create')
+session = conn.open_session()
 
-# Note: further error checking omitted for clarity.
+# Create a simple table
+session.create('table:T', 'key_format=S,value_format=S')
 
-session.create_table('access', 'key_format=S,value_format=S')
-cursor = session.open_cursor('table:access', None, None)
+# Open a cursor and insert a record
+cursor = session.open_cursor('table:T', None)
 
-# Insert a record.
 cursor.set_key('key1')
 cursor.set_value('value1')
+cursor.insert()
 
-# TODO: remove try block when cursor.insert works
-try:
-    cursor.insert()
-except BaseException as tuple:
-    print('Error cursor insert: ', tuple);
-  
+# Iterate through the records
+cursor.reset()
 for key, value in cursor:
     print('Got record: ' + key + ' : ' + value)
 
-conn.close(None)
-sys.exit(0)
+conn.close()
diff --git a/examples/python/run-ex_access b/examples/python/run-ex_access
new file mode 100755
index 00000000000..a6f8348e9fd
--- /dev/null
+++ b/examples/python/run-ex_access
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+rm -rf WT_TEST ; mkdir WT_TEST
+
+exec env LD_LIBRARY_PATH=../../.libs DYLD_LIBRARY_PATH=../../.libs PYTHONPATH=.:${srcdir} python ${srcdir}/../../examples/python/ex_access.py 
diff --git a/ext/collators/reverse/reverse_collator.c b/ext/collators/reverse/reverse_collator.c
index d33a4fa31ba..9798d2caaee 100644
--- a/ext/collators/reverse/reverse_collator.c
+++ b/ext/collators/reverse/reverse_collator.c
@@ -32,21 +32,21 @@
 
 WT_EXTENSION_API *wt_api;
 
-#define __UNUSED(v)     ((void)(v))
+#define	__UNUSED(v)     ((void)(v))
 
 static int
 collate_reverse(WT_COLLATOR *collator, WT_SESSION *session,
     const WT_ITEM *k1, const WT_ITEM *k2, int *cmp)
 {
-        size_t len;
+	size_t len;
 
-        __UNUSED(collator);
-        __UNUSED(session);
+	__UNUSED(collator);
+	__UNUSED(session);
 
-        len = (k1->size < k2->size) ? k1->size : k2->size;
-        if ((*cmp = memcmp(k2->data, k1->data, len)) == 0)
-                *cmp = ((int)k1->size - (int)k2->size);
-        return (0);
+	len = (k1->size < k2->size) ? k1->size : k2->size;
+	if ((*cmp = memcmp(k2->data, k1->data, len)) == 0)
+		*cmp = ((int)k1->size - (int)k2->size);
+	return (0);
 }
 
 static WT_COLLATOR reverse_collator = { collate_reverse };
@@ -55,12 +55,12 @@ int
 wiredtiger_extension_init(
     WT_SESSION *session, WT_EXTENSION_API *api, const char *config)
 {
-        WT_CONNECTION *conn;
+	WT_CONNECTION *conn;
 
-        __UNUSED(config);
+	__UNUSED(config);
 
-        wt_api = api;
-        conn = session->connection;
+	wt_api = api;
+	conn = session->connection;
 
-        return (conn->add_collator(conn, "reverse", &reverse_collator, NULL));
+	return (conn->add_collator(conn, "reverse", &reverse_collator, NULL));
 }
diff --git a/ext/compressors/bzip2_compress/bzip2_compress.c b/ext/compressors/bzip2_compress/bzip2_compress.c
index 7d31e97ec09..bc67e0f23ee 100644
--- a/ext/compressors/bzip2_compress/bzip2_compress.c
+++ b/ext/compressors/bzip2_compress/bzip2_compress.c
@@ -120,7 +120,7 @@ bzip2_error(WT_SESSION *session, const char *call, int bzret)
 		break;
 	}
 
-	wiredtiger_err_printf(
+	(void)wiredtiger_err_printf(
 	    session, "bzip2 error: %s: %s: %d", call, msg, bzret);
 	return (WT_ERROR);
 }
diff --git a/ext/compressors/snappy_compress/snappy_compress.c b/ext/compressors/snappy_compress/snappy_compress.c
index 77ac9d9c10e..00619b675c4 100644
--- a/ext/compressors/snappy_compress/snappy_compress.c
+++ b/ext/compressors/snappy_compress/snappy_compress.c
@@ -43,7 +43,7 @@ wt_snappy_decompress(WT_COMPRESSOR *, WT_SESSION *,
     uint8_t *, size_t, uint8_t *, size_t, size_t *);
 static int
 wt_snappy_pre_size(WT_COMPRESSOR *, WT_SESSION *, uint8_t *, size_t, size_t *);
-	
+
 static WT_COMPRESSOR wt_snappy_compressor = {
     wt_snappy_compress, wt_snappy_decompress, wt_snappy_pre_size };
 
@@ -86,7 +86,7 @@ wt_snappy_error(WT_SESSION *session, const char *call, snappy_status snret)
 		break;
 	}
 
-	wiredtiger_err_printf(
+	(void)wiredtiger_err_printf(
 	    session, "snappy error: %s: %s: %d", call, msg, snret);
 	return (WT_ERROR);
 }
@@ -147,7 +147,7 @@ wt_snappy_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
 	/* retrieve the saved length */
 	snaplen = *(size_t *)src;
 	if (snaplen + sizeof(size_t) > src_len) {
-		wiredtiger_err_printf(
+		(void)wiredtiger_err_printf(
 		    session,
 		    "wt_snappy_decompress: stored size exceeds buffer size");
 		return (WT_ERROR);
diff --git a/lang/python/Makefile.am b/lang/python/Makefile.am
index 3a38be8a02d..fbc7d3b5835 100644
--- a/lang/python/Makefile.am
+++ b/lang/python/Makefile.am
@@ -11,3 +11,5 @@ $(PYSRC)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(PYSRC)/wir
 
 _wiredtiger.so: $(top_builddir)/libwiredtiger.la $(PYSRC)/wiredtiger_wrap.c
 	$(PYTHON) $(PYSRC)/setup.py build_ext -b . -t . -f $(PY_SETUP_DEBUG)
+
+TESTS = $(top_srcdir)/examples/python/run-ex_access
diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i
index a0df51a74f9..50b81399c27 100644
--- a/lang/python/wiredtiger.i
+++ b/lang/python/wiredtiger.i
@@ -5,7 +5,7 @@
  * See the file LICENSE for redistribution information.
  *
  * wiredtiger.i
- * 	The SWIG interface file defining the wiredtiger python API.
+ *	The SWIG interface file defining the wiredtiger python API.
  */
 
 %define DOCSTRING
@@ -26,13 +26,13 @@ from packing import pack, unpack
 
 /* Set the input argument to point to a temporary variable */ 
 %typemap(in, numinputs=0) WT_CONNECTION ** (WT_CONNECTION *temp = NULL) {
-        $1 = &temp;
+	$1 = &temp;
 }
 %typemap(in, numinputs=0) WT_SESSION ** (WT_SESSION *temp = NULL) {
-        $1 = &temp;
+	$1 = &temp;
 }
 %typemap(in, numinputs=0) WT_CURSOR ** (WT_CURSOR *temp = NULL) {
-        $1 = &temp;
+	$1 = &temp;
 }
 
 /* Convert 'int *' to an output arg in search_near, wiredtiger_version */
@@ -43,43 +43,43 @@ from packing import pack, unpack
 
 /* Set the return value to the returned connection, session, or cursor */
 %typemap(argout) WT_CONNECTION ** {
-        $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
-             SWIGTYPE_p___wt_connection, 0);
+	$result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+	    SWIGTYPE_p___wt_connection, 0);
 }
 %typemap(argout) WT_SESSION ** {
-        $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
-             SWIGTYPE_p___wt_session, 0);
+	$result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+	    SWIGTYPE_p___wt_session, 0);
 }
 
 %typemap(argout) WT_CURSOR ** {
-        $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
-             SWIGTYPE_p___wt_cursor, 0);
-        if (*$1 != NULL) {
-                (*$1)->flags |= WT_CURSTD_RAW;
-                PyObject_SetAttrString($result, "is_column",
-                    PyBool_FromLong(strcmp((*$1)->key_format, "r") == 0));
-        }
+	$result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1),
+	    SWIGTYPE_p___wt_cursor, 0);
+	if (*$1 != NULL) {
+		(*$1)->flags |= WT_CURSTD_RAW;
+		PyObject_SetAttrString($result, "is_column",
+		    PyBool_FromLong(strcmp((*$1)->key_format, "r") == 0));
+	}
 }
 
 /* 64 bit typemaps. */
 %typemap(in) uint64_t {
-    $1 = PyLong_AsUnsignedLongLong($input);
+	$1 = PyLong_AsUnsignedLongLong($input);
 }
 %typemap(out) uint64_t {
-    $result = PyLong_FromUnsignedLongLong($1);
+	$result = PyLong_FromUnsignedLongLong($1);
 }
 
 /* Throw away references after close. */
 %define DESTRUCTOR(class, method)
 %feature("shadow") class::method %{
-    def method(self, *args):
-        '''close(self, config) -> int
-        
-        @copydoc class::method'''
-        try:
-            return $action(self, *args)
-        finally:
-            self.this = None
+	def method(self, *args):
+		'''close(self, config) -> int
+		
+		@copydoc class::method'''
+		try:
+			return $action(self, *args)
+		finally:
+			self.this = None
 %}
 %enddef
 DESTRUCTOR(__wt_connection, close)
@@ -99,17 +99,16 @@ static PyObject *wtError;
 %}
 
 %init %{
-        /*
-         * Create an exception type and put it into the _wiredtiger module.
-         * First increment the reference count because PyModule_AddObject
-         * decrements it.  Then note that "m" is the local variable for the
-         * module in the SWIG generated code.  If there is a SWIG variable for
-         * this, I haven't found it.
-         */
-        wtError =
-            PyErr_NewException("_wiredtiger.WiredTigerError", NULL, NULL);
-        Py_INCREF(wtError);
-        PyModule_AddObject(m, "WiredTigerError", wtError);
+	/*
+	 * Create an exception type and put it into the _wiredtiger module.
+	 * First take an extra reference because PyModule_AddObject steals
+	 * one on success and wtError is still used afterwards.  Note that
+	 * "m" is the local variable for the module in the SWIG generated
+	 * code; if there is a SWIG variable for this, I haven't found it.
+	 */
+	wtError = PyErr_NewException("_wiredtiger.WiredTigerError", NULL, NULL);
+	Py_INCREF(wtError);
+	PyModule_AddObject(m, "WiredTigerError", wtError);
 %}
 
 %pythoncode %{
@@ -118,55 +117,54 @@ WiredTigerError = _wiredtiger.WiredTigerError
 ## @cond DISABLE
 # Implements the iterable contract
 class IterableCursor:
-        def __init__(self, cursor):
-                self.cursor = cursor
+	def __init__(self, cursor):
+		self.cursor = cursor
 
-        def __iter__(self):
-                return self
+	def __iter__(self):
+		return self
 
-        def next(self):
-                if self.cursor.next() == WT_NOTFOUND:
-                        raise StopIteration
-                return self.cursor.get_keys() + self.cursor.get_values()
+	def next(self):
+		if self.cursor.next() == WT_NOTFOUND:
+			raise StopIteration
+		return self.cursor.get_keys() + self.cursor.get_values()
 ## @endcond
 %}
 
 %typemap(out) int {
-        if ($1 != 0 && $1 != WT_NOTFOUND) {
-                /* We could use PyErr_SetObject for more complex reporting. */
-                SWIG_Python_SetErrorMsg(wtError, wiredtiger_strerror($1));
-                SWIG_fail;
-        }
-        $result = SWIG_From_int((int)($1));
+	if ($1 != 0 && $1 != WT_NOTFOUND) {
+		/* We could use PyErr_SetObject for more complex reporting. */
+		SWIG_Python_SetErrorMsg(wtError, wiredtiger_strerror($1));
+		SWIG_fail;
+	}
+	$result = SWIG_From_int((int)($1));
 }
 
 /*
  * Extra 'self' elimination.
  * The methods we're wrapping look like this:
  * struct __wt_xxx {
- *    int method(WT_XXX *, ...otherargs...);
+ *	int method(WT_XXX *, ...otherargs...);
  * };
  * To SWIG, that is equivalent to:
- *    int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...);
+ *	int method(struct __wt_xxx *self, WT_XXX *, ...otherargs...);
  * and we use consecutive argument matching of typemaps to convert two args to
  * one.
  */
 %define SELFHELPER(type)
 %typemap(in) (type *self, type *) (void *argp = 0, int res = 0) %{
-        res = SWIG_ConvertPtr($input, &argp, $descriptor, $disown | 0);
-        if (!SWIG_IsOK(res)) { 
-                SWIG_exception_fail(SWIG_ArgError(res),
-                    "in method '" "$symname" "', argument " "$argnum"
-                    " of type '" "$type" "'");
-        }
-        $2 = $1 = ($ltype)(argp);
+	res = SWIG_ConvertPtr($input, &argp, $descriptor, $disown | 0);
+	if (!SWIG_IsOK(res)) { 
+		SWIG_exception_fail(SWIG_ArgError(res), "in method '$symname', "
+		    "argument $argnum of type '$type'");
+	}
+	$2 = $1 = ($ltype)(argp);
 %}
 %enddef
 
 SELFHELPER(struct __wt_connection)
 SELFHELPER(struct __wt_session)
 SELFHELPER(struct __wt_cursor)
-     
+
 /* WT_CURSOR customization. */
 /* First, replace the varargs get / set methods with Python equivalents. */
 %ignore __wt_cursor::get_key;
@@ -178,123 +176,132 @@ SELFHELPER(struct __wt_cursor)
 %apply (char *STRING, int LENGTH) { (char *data, int size) };
 
 %extend __wt_cursor {
-        /* Get / set keys and values */
-        void _set_key(char *data, int size) {
-                WT_ITEM k;
-                k.data = data;
-                k.size = (uint32_t)size;
-                $self->set_key($self, &k);
-        }
-
-        void _set_recno(uint64_t recno) {
-                WT_ITEM k;
-                uint8_t recno_buf[20];
-                if (wiredtiger_struct_pack(recno_buf, sizeof (recno_buf),
-                    "r", recno) == 0) {
-                        k.data = recno_buf;
-                        k.size = (uint32_t)wiredtiger_struct_size("q", recno);
-                        $self->set_key($self, &k);
-                }
-        }
-
-        void _set_value(char *data, int size) {
-                WT_ITEM v;
-                v.data = data;
-                v.size = (uint32_t)size;
-                $self->set_value($self, &v);
-        }
-
-        PyObject *_get_key() {
-                WT_ITEM k;
-                int ret = $self->get_key($self, &k);
-                if (ret != 0) {
-                        SWIG_Python_SetErrorMsg(wtError,
-                            wiredtiger_strerror(ret));
-                        return (NULL);
-                }
-                return SWIG_FromCharPtrAndSize(k.data, k.size);
-        }
-
-        PyObject *_get_recno() {
-                WT_ITEM k;
-                uint64_t r;
-                int ret = $self->get_key($self, &k);
-                if (ret == 0)
-                        ret = wiredtiger_struct_unpack(k.data, k.size, "q", &r);
-                if (ret != 0) {
-                        SWIG_Python_SetErrorMsg(wtError,
-                            wiredtiger_strerror(ret));
-                        return (NULL);
-                }
-                return PyLong_FromUnsignedLongLong(r);
-        }
-
-        PyObject *_get_value() {
-                WT_ITEM v;
-                int ret = $self->get_value($self, &v);
-                if (ret != 0) {
-                        SWIG_Python_SetErrorMsg(wtError,
-                            wiredtiger_strerror(ret));
-                        return (NULL);
-                }
-                return SWIG_FromCharPtrAndSize(v.data, v.size);
-        }
+	/* Get / set keys and values */
+	void _set_key(char *data, int size) {
+		WT_ITEM k;
+		k.data = data;
+		k.size = (uint32_t)size;
+		$self->set_key($self, &k);
+	}
+
+	void _set_recno(uint64_t recno) {
+		WT_ITEM k;
+		uint8_t recno_buf[20];
+		size_t size;
+		int ret = wiredtiger_struct_pack($self->session,
+		    recno_buf, sizeof (recno_buf), "r", recno);
+		if (ret == 0)
+			ret = wiredtiger_struct_size($self->session,
+			    &size, "q", recno);
+		if (ret != 0) {
+			SWIG_Python_SetErrorMsg(wtError,
+			    wiredtiger_strerror(ret));
+			return;
+		}
+		k.data = recno_buf;
+		k.size = (uint32_t)size;
+		$self->set_key($self, &k);
+	}
+
+	void _set_value(char *data, int size) {
+		WT_ITEM v;
+		v.data = data;
+		v.size = (uint32_t)size;
+		$self->set_value($self, &v);
+	}
+
+	PyObject *_get_key() {
+		WT_ITEM k;
+		int ret = $self->get_key($self, &k);
+		if (ret != 0) {
+			SWIG_Python_SetErrorMsg(wtError,
+			    wiredtiger_strerror(ret));
+			return (NULL);
+		}
+		return SWIG_FromCharPtrAndSize(k.data, k.size);
+	}
+
+	PyObject *_get_recno() {
+		WT_ITEM k;
+		uint64_t r;
+		int ret = $self->get_key($self, &k);
+		if (ret == 0)
+			ret = wiredtiger_struct_unpack($self->session,
+			    k.data, k.size, "q", &r);
+		if (ret != 0) {
+			SWIG_Python_SetErrorMsg(wtError,
+			    wiredtiger_strerror(ret));
+			return (NULL);
+		}
+		return PyLong_FromUnsignedLongLong(r);
+	}
+
+	PyObject *_get_value() {
+		WT_ITEM v;
+		int ret = $self->get_value($self, &v);
+		if (ret != 0) {
+			SWIG_Python_SetErrorMsg(wtError,
+			    wiredtiger_strerror(ret));
+			return (NULL);
+		}
+		return SWIG_FromCharPtrAndSize(v.data, v.size);
+	}
 
 %pythoncode %{
-        def get_key(self):
-            '''get_key(self) -> object
-            
-            @copydoc WT_CURSOR::get_key
-            Returns only the first column.'''
-            return self.get_keys()[0]
-
-        def get_keys(self):
-            '''get_keys(self) -> (object, ...)
-            
-            @copydoc WT_CURSOR::get_key'''
-            if self.is_column:
-                return [self._get_recno(),]
-            else:
-                return unpack(self.key_format, self._get_key())
-
-        def get_value(self):
-            '''get_value(self) -> object
-            
-            @copydoc WT_CURSOR::get_value
-            Returns only the first column.'''
-            return self.get_values()[0]
-
-        def get_values(self):
-            '''get_values(self) -> (object, ...)
-            
-            @copydoc WT_CURSOR::get_value'''
-            return unpack(self.value_format, self._get_value())
-
-        def set_key(self, *args):
-            '''set_key(self) -> None
-            
-            @copydoc WT_CURSOR::set_key'''
-            if self.is_column:
-                self._set_recno(args[0])
-            else:
-                # Keep the Python string pinned
-                self._key = pack(self.key_format, *args)
-                self._set_key(self._key)
-
-        def set_value(self, *args):
-            '''set_value(self) -> None
-            
-            @copydoc WT_CURSOR::set_value'''
-            # Keep the Python string pinned
-            self._value = pack(self.value_format, *args)
-            self._set_value(self._value)
-
-        def __iter__(self):
-            '''Cursor objects support iteration, equivalent to calling
-            WT_CURSOR::next until it returns ::WT_NOTFOUND.'''
-            if not hasattr(self, '_iterable'):
-                self._iterable = IterableCursor(self)
-            return self._iterable
+	def get_key(self):
+		'''get_key(self) -> object
+		
+		@copydoc WT_CURSOR::get_key
+		Returns only the first column.'''
+		return self.get_keys()[0]
+
+	def get_keys(self):
+		'''get_keys(self) -> (object, ...)
+		
+		@copydoc WT_CURSOR::get_key'''
+		if self.is_column:
+			return [self._get_recno(),]
+		else:
+			return unpack(self.key_format, self._get_key())
+
+	def get_value(self):
+		'''get_value(self) -> object
+		
+		@copydoc WT_CURSOR::get_value
+		Returns only the first column.'''
+		return self.get_values()[0]
+
+	def get_values(self):
+		'''get_values(self) -> (object, ...)
+		
+		@copydoc WT_CURSOR::get_value'''
+		return unpack(self.value_format, self._get_value())
+
+	def set_key(self, *args):
+		'''set_key(self) -> None
+		
+		@copydoc WT_CURSOR::set_key'''
+		if self.is_column:
+			self._set_recno(long(args[0]))
+		else:
+			# Keep the Python string pinned
+			self._key = pack(self.key_format, *args)
+			self._set_key(self._key)
+
+	def set_value(self, *args):
+		'''set_value(self) -> None
+		
+		@copydoc WT_CURSOR::set_value'''
+		# Keep the Python string pinned
+		self._value = pack(self.value_format, *args)
+		self._set_value(self._value)
+
+	def __iter__(self):
+		'''Cursor objects support iteration, equivalent to calling
+		WT_CURSOR::next until it returns ::WT_NOTFOUND.'''
+		if not hasattr(self, '_iterable'):
+			self._iterable = IterableCursor(self)
+		return self._iterable
 %}
 };
 
@@ -310,8 +317,8 @@ SELFHELPER(struct __wt_cursor)
 %ignore __wt_connection::add_collator;
 %ignore __wt_compressor;
 %ignore __wt_connection::add_compressor;
-%ignore __wt_cursor_type;
-%ignore __wt_connection::add_cursor_type;
+%ignore __wt_data_source;
+%ignore __wt_connection::add_data_source;
 %ignore __wt_event_handler;
 %ignore __wt_extractor;
 %ignore __wt_connection::add_extractor;
diff --git a/src/api/api_event.c b/src/api/api_event.c
deleted file mode 100644
index 5298767a0c9..00000000000
--- a/src/api/api_event.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- *	All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __handle_error_default --
- *	Default WT_EVENT_HANDLER->handle_error implementation: send to stderr.
- */
-static void
-__handle_error_default(WT_EVENT_HANDLER *handler, int error, const char *errmsg)
-{
-	size_t len_err, len_errmsg;
-	const char *err;
-
-	WT_UNUSED(handler);
-
-	if (error != 0) {
-		err = wiredtiger_strerror(error);
-		len_err = strlen(err);
-		len_errmsg = strlen(errmsg);
-		if (len_err >= len_errmsg &&
-		    strcmp(errmsg + (len_errmsg - len_err), err) != 0) {
-			fprintf(stderr,
-			    "%s: %s\n", errmsg, wiredtiger_strerror(error));
-			return;
-		}
-	}
-	fprintf(stderr, "%s\n", errmsg);
-}
-
-/*
- * __handle_message_default --
- *	Default WT_EVENT_HANDLER->handle_message implementation: send to stdout.
- */
-static int
-__handle_message_default(WT_EVENT_HANDLER *handler, const char *message)
-{
-	WT_UNUSED(handler);
-
-	printf("%s\n", message);
-
-	return (0);
-}
-
-/*
- * __handle_progress_default --
- *	Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
- */
-static int
-__handle_progress_default(WT_EVENT_HANDLER *handler,
-     const char *operation, uint64_t progress)
-{
-	WT_UNUSED(handler);
-	WT_UNUSED(operation);
-	WT_UNUSED(progress);
-
-	return (0);
-}
-
-static WT_EVENT_HANDLER __event_handler_default = {
-	__handle_error_default,
-	__handle_message_default,
-	__handle_progress_default
-};
-
-WT_EVENT_HANDLER *__wt_event_handler_default = &__event_handler_default;
diff --git a/src/block/block_addr.c b/src/block/block_addr.c
index f292c8afca0..2b5d10ef7b6 100644
--- a/src/block/block_addr.c
+++ b/src/block/block_addr.c
@@ -8,27 +8,40 @@
 #include "wt_internal.h"
 
 /*
- * __wt_block_buffer_to_addr --
- *	Convert a filesystem address cookie into its components.
+ * __block_buffer_to_addr --
+ *	Convert a filesystem address cookie into its components, UPDATING the
+ * caller's buffer reference so it can be called repeatedly to load a buffer.
  */
-int
-__wt_block_buffer_to_addr(WT_BLOCK *block,
-    const uint8_t *p, off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+static int
+__block_buffer_to_addr(WT_BLOCK *block,
+    const uint8_t **pp, off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
 {
-	uint64_t a;
-
-	WT_RET(__wt_vunpack_uint(&p, 0, &a));
-	if (offsetp != NULL)
-		*offsetp = (off_t)a * block->allocsize + WT_BLOCK_DESC_SECTOR;
-	WT_RET(__wt_vunpack_uint(&p, 0, &a));
-	if (sizep != NULL)
-		*sizep = (uint32_t)a * block->allocsize;
-
-	if (cksump != NULL) {
-		WT_RET(__wt_vunpack_uint(&p, 0, &a));
-		*cksump = (uint32_t)a;
+	uint64_t o, s, c;
+
+	WT_RET(__wt_vunpack_uint(pp, 0, &o));
+	WT_RET(__wt_vunpack_uint(pp, 0, &s));
+	WT_RET(__wt_vunpack_uint(pp, 0, &c));
+
+	/*
+	 * To avoid storing large offsets, we minimize the value by subtracting
+	 * 512B (the size of the description sector), and then storing a count
+	 * of block allocation units.  That implies there is no such thing as
+	 * an "invalid" offset though: they could all be valid (other than very
+	 * large numbers), which is what we didn't want to store in the first
+	 * place.  Use the size: writing a block of size 0 makes no sense, so
+	 * that's the out-of-band value.  Once we're out of this function and
+	 * are working with a real file offset, size and checksum triplet, there
+	 * are invalid offsets; that's simpler than testing sizes of 0 all over
+	 * the place.
+	 */
+	if (s == 0) {
+		*offsetp = 0;
+		*sizep = *cksump = 0;
+	} else {
+		*offsetp = (off_t)o * block->allocsize + WT_BLOCK_DESC_SECTOR;
+		*sizep = (uint32_t)s * block->allocsize;
+		*cksump = (uint32_t)c;
 	}
-
 	return (0);
 }
 
@@ -38,20 +51,39 @@ __wt_block_buffer_to_addr(WT_BLOCK *block,
  */
 int
 __wt_block_addr_to_buffer(WT_BLOCK *block,
-    uint8_t **p, off_t offset, uint32_t size, uint32_t cksum)
+    uint8_t **pp, off_t offset, uint32_t size, uint32_t cksum)
 {
-	uint64_t a;
-
-	a = (uint64_t)(offset - WT_BLOCK_DESC_SECTOR) / block->allocsize;
-	WT_RET(__wt_vpack_uint(p, 0, a));
-	a = size / block->allocsize;
-	WT_RET(__wt_vpack_uint(p, 0, a));
-	a = cksum;
-	WT_RET(__wt_vpack_uint(p, 0, a));
+	uint64_t o, s, c;
+
+	/* See the comment above: this is the reverse operation. */
+	if (size == 0) {
+		o = WT_BLOCK_INVALID_OFFSET;
+		s = c = 0;
+	} else {
+		o = (uint64_t)
+		    (offset - WT_BLOCK_DESC_SECTOR) / block->allocsize;
+		s = size / block->allocsize;
+		c = cksum;
+	}
+	WT_RET(__wt_vpack_uint(pp, 0, o));
+	WT_RET(__wt_vpack_uint(pp, 0, s));
+	WT_RET(__wt_vpack_uint(pp, 0, c));
 	return (0);
 }
 
 /*
+ * __wt_block_buffer_to_addr --
+ *	Convert a filesystem address cookie into its components NOT UPDATING
+ * the caller's buffer reference.
+ */
+int
+__wt_block_buffer_to_addr(WT_BLOCK *block,
+    const uint8_t *p, off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+	return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+}
+
+/*
  * __wt_block_addr_valid --
  *	Return if an address cookie is valid.
  */
@@ -60,13 +92,13 @@ __wt_block_addr_valid(WT_SESSION_IMPL *session,
     WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size)
 {
 	off_t offset;
-	uint32_t size;
+	uint32_t cksum, size;
 
 	WT_UNUSED(session);
 	WT_UNUSED(addr_size);
 
 	/* Crack the cookie. */
-	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, NULL));
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
 
 	/* All we care about is if it's past the end of the file. */
 	return (offset + size > block->fh->file_size ? 0 : 1);
@@ -95,3 +127,71 @@ __wt_block_addr_string(WT_SESSION_IMPL *session,
 
 	return (0);
 }
+
+/*
+ * __wt_block_buffer_to_snapshot --
+ *	Convert a filesystem snapshot cookie into its components.
+ */
+int
+__wt_block_buffer_to_snapshot(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *p, WT_BLOCK_SNAPSHOT *si)
+{
+	uint64_t a;
+	const uint8_t **pp;
+
+	si->version = *p++;
+	if (si->version != WT_BM_SNAPSHOT_VERSION)
+		WT_RET_MSG(session, WT_ERROR, "illegal snapshot address");
+
+	pp = &p;
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &si->root_offset, &si->root_size, &si->root_cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &si->alloc.offset, &si->alloc.size, &si->alloc.cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &si->avail.offset, &si->avail.size, &si->avail.cksum));
+	WT_RET(__block_buffer_to_addr(block, pp,
+	    &si->discard.offset, &si->discard.size, &si->discard.cksum));
+	WT_RET(__wt_vunpack_uint(pp, 0, &a));
+	si->file_size = (off_t)a;
+	WT_RET(__wt_vunpack_uint(pp, 0, &a));
+	si->snapshot_size = a;
+	WT_RET(__wt_vunpack_uint(pp, 0, &a));
+	si->write_gen = a;
+
+	return (0);
+}
+
+/*
+ * __wt_block_snapshot_to_buffer --
+ *	Convert the filesystem components into its snapshot cookie.
+ */
+int
+__wt_block_snapshot_to_buffer(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, uint8_t **pp, WT_BLOCK_SNAPSHOT *si)
+{
+	uint64_t a;
+
+	if (si->version != WT_BM_SNAPSHOT_VERSION)
+		WT_RET_MSG(session, WT_ERROR, "illegal snapshot address");
+
+	(*pp)[0] = si->version;
+	(*pp)++;
+
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    si->root_offset, si->root_size, si->root_cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    si->alloc.offset, si->alloc.size, si->alloc.cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    si->avail.offset, si->avail.size, si->avail.cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, pp,
+	    si->discard.offset, si->discard.size, si->discard.cksum));
+	a = (uint64_t)si->file_size;
+	WT_RET(__wt_vpack_uint(pp, 0, a));
+	a = (uint64_t)si->snapshot_size;
+	WT_RET(__wt_vpack_uint(pp, 0, a));
+	a = si->write_gen;
+	WT_RET(__wt_vpack_uint(pp, 0, a));
+
+	return (0);
+}
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index a03101c74a5..5d7ae8af45b 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -7,13 +7,9 @@
 
 #include "wt_internal.h"
 
-static int __block_extend(WT_SESSION_IMPL *, WT_BLOCK *, off_t *, off_t);
 static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, off_t, off_t);
-static int __block_truncate(WT_SESSION_IMPL *, WT_BLOCK *);
-
-#ifdef HAVE_VERBOSE
-static void __block_extlist_dump(WT_SESSION_IMPL *, WT_EXTLIST *);
-#endif
+static int __block_ext_overlap(WT_SESSION_IMPL *,
+	WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
 
 /*
  * __block_off_srch --
@@ -68,12 +64,12 @@ __block_size_srch(WT_SIZE **head, off_t size, WT_SIZE ***stack)
 }
 
 /*
- * __block_off_pair_srch --
+ * __block_off_srch_pair --
  *	Search a by-offset skiplist for before/after records of the specified
  * offset.
  */
 static void
-__block_off_pair_srch(
+__block_off_srch_pair(
     WT_EXTLIST *el, off_t off, WT_EXT **beforep, WT_EXT **afterp)
 {
 	WT_EXT **head, **extp;
@@ -107,7 +103,7 @@ __block_off_pair_srch(
 
 /*
  * __block_extlist_last --
- *	Return the last extent range in the skiplist.
+ *	Return the last extent in the skiplist.
  */
 static WT_EXT *
 __block_extlist_last(WT_EXT **head)
@@ -130,11 +126,11 @@ __block_extlist_last(WT_EXT **head)
 }
 
 /*
- * __block_off_insert --
- *	Insert a record into an extent list.
+ * __block_ext_insert --
+ *	Insert an extent into an extent list.
  */
 static int
-__block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
+__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
 {
 	WT_EXT **astack[WT_SKIP_MAXDEPTH];
 	WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
@@ -144,7 +140,7 @@ __block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
 	 * If we are inserting a new size onto the size skiplist, we'll need
 	 * a new WT_EXT structure for that skiplist.
 	 */
-	__block_size_srch(el->size, ext->size, sstack);
+	__block_size_srch(el->sz, ext->size, sstack);
 	szp = *sstack[0];
 	if (szp == NULL || szp->size != ext->size) {
 		WT_RET(__wt_calloc(session, 1,
@@ -181,6 +177,28 @@ __block_off_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
 }
 
 /*
+ * __block_off_insert --
+ *	Insert a file range into an extent list.
+ */
+static int
+__block_off_insert(
+    WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+{
+	WT_EXT *ext;
+	u_int skipdepth;
+
+	/* Allocate a new WT_EXT structure. */
+	skipdepth = __wt_skip_choose_depth();
+	WT_RET(__wt_calloc(session, 1,
+	    sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
+
+	ext->off = off;
+	ext->size = size;
+	ext->depth = (uint8_t)skipdepth;
+	return (__block_ext_insert(session, el, ext));
+}
+
+/*
  * __block_off_remove --
  *	Remove a record from an extent list.
  */
@@ -196,7 +214,7 @@ __block_off_remove(
 	__block_off_srch(el->off, off, astack, 0);
 	ext = *astack[0];
 	if (ext == NULL || ext->off != off)
-		return (EINVAL);
+		goto corrupt;
 	for (i = 0; i < ext->depth; ++i)
 		*astack[i] = ext->next[i];
 
@@ -204,14 +222,14 @@ __block_off_remove(
 	 * Find and remove the record from the size's offset skiplist; if that
 	 * empties the by-size skiplist entry, remove it as well.
 	 */
-	__block_size_srch(el->size, ext->size, sstack);
+	__block_size_srch(el->sz, ext->size, sstack);
 	szp = *sstack[0];
 	if (szp == NULL || szp->size != ext->size)
 		return (EINVAL);
 	__block_off_srch(szp->off, off, astack, 1);
 	ext = *astack[0];
 	if (ext == NULL || ext->off != off)
-		return (EINVAL);
+		goto corrupt;
 	for (i = 0; i < ext->depth; ++i)
 		*astack[i] = ext->next[i + ext->depth];
 	if (szp->off[0] == NULL) {
@@ -230,6 +248,73 @@ __block_off_remove(
 		*extp = ext;
 
 	return (0);
+
+corrupt:
+	WT_RET_MSG(session, EINVAL,
+	    "attempt to remove non-existent offset from an extent list");
+}
+
+/*
+ * __wt_block_off_remove_overlap --
+ *	Remove a range from an extent list, where the range may be part of an
+ * overlapping entry.
+ */
+int
+__wt_block_off_remove_overlap(
+    WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+{
+	WT_EXT *before, *after, *ext;
+	off_t a_off, a_size, b_off, b_size;
+
+	/* Search for before and after entries for the offset. */
+	__block_off_srch_pair(el, off, &before, &after);
+
+	/* If "before" or "after" overlaps, retrieve the overlapping entry. */
+	if (before != NULL && before->off + before->size > off) {
+		WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+		/* Calculate overlapping extents. */
+		a_off = ext->off;
+		a_size = off - ext->off;
+		b_off = off + size;
+		b_size = ext->size - (a_size + size);
+	} else if (after != NULL && off + size > after->off) {
+		WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+		/*
+		 * Calculate overlapping extents.  There's no initial overlap
+		 * since the after extent presumably cannot begin before "off".
+		 */
+		a_off = WT_BLOCK_INVALID_OFFSET;
+		a_size = 0;
+		b_off = off + size;
+		b_size = ext->size - (b_off - ext->off);
+	} else
+		return (WT_NOTFOUND);
+
+	/*
+	 * If there are overlaps, insert the item; re-use the extent structure
+	 * and save the allocation (we know there's no need to merge).
+	 */
+	if (a_size != 0) {
+		ext->off = a_off;
+		ext->size = a_size;
+		WT_RET(__block_ext_insert(session, el, ext));
+		ext = NULL;
+	}
+	if (b_size != 0) {
+		if (ext == NULL)
+			WT_RET(__block_off_insert(session, el, b_off, b_size));
+		else {
+			ext->off = b_off;
+			ext->size = b_size;
+			WT_RET(__block_ext_insert(session, el, ext));
+			ext = NULL;
+		}
+	}
+	if (ext != NULL)
+		__wt_free(session, ext);
+	return (0);
 }
 
 /*
@@ -240,11 +325,9 @@ int
 __wt_block_alloc(
     WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size)
 {
+	WT_DECL_RET;
 	WT_EXT *ext;
 	WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
-	int ret;
-
-	ret = 0;
 
 	WT_BSTAT_INCR(session, alloc);
 	if (size % block->allocsize != 0)
@@ -253,28 +336,28 @@ __wt_block_alloc(
 		    "a multiple of the allocation size %" PRIu32,
 		    (intmax_t)size, block->allocsize);
 
-	__wt_spin_lock(session, &block->freelist_lock);
+	__wt_spin_lock(session, &block->live_lock);
 
 	/*
 	 * Allocation is first-fit by size: search the by-size skiplist for the
 	 * requested size and take the first entry on the by-size offset list.
 	 * If we don't have anything large enough, extend the file.
 	 */
-	__block_size_srch(block->free.size, size, sstack);
+	__block_size_srch(block->live.avail.sz, size, sstack);
 	szp = *sstack[0];
 	if (szp == NULL) {
-		WT_ERR(__block_extend(session, block, offp, size));
+		WT_ERR(__wt_block_extend(session, block, offp, size));
 		goto done;
 	}
 
 	/* Remove the first record, and set the returned offset. */
 	ext = szp->off[0];
-	WT_ERR(__block_off_remove(session, &block->free, ext->off, &ext));
+	WT_ERR(__block_off_remove(session, &block->live.avail, ext->off, &ext));
 	*offp = ext->off;
 
 	/* If doing a partial allocation, adjust the record and put it back. */
 	if (ext->size > size) {
-		WT_VERBOSE(session, block,
+		WT_VERBOSE_ERR(session, block,
 		    "allocate %" PRIdMAX " from range %" PRIdMAX "-%"
 		    PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
 		    (intmax_t)size,
@@ -284,33 +367,41 @@ __wt_block_alloc(
 
 		ext->off += size;
 		ext->size -= size;
-		WT_ERR(__block_off_insert(session, &block->free, ext));
+		WT_ERR(__block_ext_insert(session, &block->live.avail, ext));
 	} else {
-		WT_VERBOSE(session, block,
+		WT_VERBOSE_ERR(session, block,
 		    "allocate range %" PRIdMAX "-%" PRIdMAX,
 		    (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
 
 		__wt_free(session, ext);
 	}
 
-done: err:
-	__wt_spin_unlock(session, &block->freelist_lock);
+	/* Add the newly allocated extent to the list of allocations. */
+done:	WT_ERR(__block_merge(
+	    session, &block->live.alloc, *offp, (off_t)size));
+
+err:	__wt_spin_unlock(session, &block->live_lock);
 	return (ret);
 }
 
 /*
- * __block_extend --
+ * __wt_block_extend --
  *	Extend the file to allocate space.
  */
-static int
-__block_extend(
+int
+__wt_block_extend(
     WT_SESSION_IMPL *session, WT_BLOCK *block, off_t *offp, off_t size)
 {
 	WT_FH *fh;
 
 	fh = block->fh;
 
-	/* We should never be allocating from an empty file. */
+	/*
+	 * Callers of this function are expected to be holding any locks
+	 * required to extend the file.
+	 *
+	 * We should never be allocating from an empty file.
+	 */
 	if (fh->file_size < WT_BLOCK_DESC_SECTOR)
 		WT_RET_MSG(session, EINVAL,
 		    "cannot allocate from a file with no description "
@@ -321,15 +412,6 @@ __block_extend(
 	 * easy way to know the maximum off_t on a system, limit growth to 8B
 	 * bits (we currently check an off_t is 8B in verify_build.h).  I don't
 	 * think we're likely to see anything bigger for awhile.
-	 *
-	 * XXX
-	 * This isn't sufficient: if we grow the file to the end, there isn't
-	 * enough room to write the free-list out when we close the file.  It
-	 * is vanishingly unlikely to happen (we use free blocks where they're
-	 * available to write the free list), but if the free-list is a bunch
-	 * of small blocks, each group of which are insufficient to hold the
-	 * free list, and the file has been fully populated, file close will
-	 * fail because we can't write the free list.
 	 */
 	if (fh->file_size > (off_t)INT64_MAX - size)
 		WT_RET_MSG(session, WT_ERROR,
@@ -339,7 +421,7 @@ __block_extend(
 	fh->file_size += size;
 
 	WT_BSTAT_INCR(session, extend);
-	WT_VERBOSE(session, block,
+	WT_VERBOSE_RET(session, block,
 	    "file extend %" PRIdMAX "B @ %" PRIdMAX,
 	    (intmax_t)size, (intmax_t)*offp);
 
@@ -347,61 +429,370 @@ __block_extend(
 }
 
 /*
- * __wt_block_free_buf --
+ * __wt_block_free --
  *	Free a cookie-referenced chunk of space to the underlying file.
  */
 int
-__wt_block_free_buf(WT_SESSION_IMPL *session,
+__wt_block_free(WT_SESSION_IMPL *session,
     WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size)
 {
+	WT_DECL_RET;
 	off_t off;
-	uint32_t size;
+	uint32_t cksum, size;
 
 	WT_UNUSED(addr_size);
+	WT_BSTAT_INCR(session, free);
 
 	/* Crack the cookie. */
-	WT_RET(__wt_block_buffer_to_addr(block, addr, &off, &size, NULL));
-	WT_RET(__wt_block_free(session, block, off, size));
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &off, &size, &cksum));
 
-	return (0);
+	WT_VERBOSE_RET(session, block,
+	    "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)off, (intmax_t)size);
+
+	__wt_spin_lock(session, &block->live_lock);
+	ret = __wt_block_off_free(session, block, off, (off_t)size);
+	__wt_spin_unlock(session, &block->live_lock);
+
+	return (ret);
 }
 
 /*
- * __wt_block_free --
- *	Free a chunk of space to the underlying file.
+ * __wt_block_off_free --
+ *	Free a file range to the underlying file.
  */
 int
-__wt_block_free(
+__wt_block_off_free(
     WT_SESSION_IMPL *session, WT_BLOCK *block, off_t off, off_t size)
 {
-	int ret;
+	WT_EXTLIST *el;
 
-	WT_BSTAT_INCR(session, free);
-	WT_VERBOSE(session, block,
-	    "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)off, (intmax_t)size);
+	/*
+	 * Callers of this function are expected to be holding any locks
+	 * required to manipulate the extent lists.
+	 *
+	 * We can reuse this extent immediately if it was allocated during this
+	 * snapshot: merge it into the avail list (which slows file growth in
+	 * workloads including repeated overflow record modification).  If this
+	 * extent is referenced in a previous snapshot, merge into the discard
+	 * list.
+	 */
+	el = __wt_block_off_remove_overlap(
+	    session, &block->live.alloc, off, size) == 0 ?
+	    &block->live.avail : &block->live.discard;
+	WT_RET(__block_merge(session, el, off, (off_t)size));
 
-	__wt_spin_lock(session, &block->freelist_lock);
-	ret = __block_merge(session, &block->free, off, (off_t)size);
-	__wt_spin_unlock(session, &block->freelist_lock);
+	return (0);
+}
 
-	return (ret);
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_extlist_check --
+ *	Return an error if the extent lists overlap.
+ */
+int
+__wt_block_extlist_check(
+    WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
+{
+	WT_EXT *a, *b;
+
+	a = al->off[0];
+	b = bl->off[0];
+
+	/* Walk the lists in parallel, looking for overlaps. */
+	while (a != NULL && b != NULL) {
+		/*
+		 * If there's no overlap, move the lower-offset entry to the
+		 * next entry in its list.
+		 */
+		if (a->off + a->size <= b->off) {
+			a = a->next[0];
+			continue;
+		}
+		if (b->off + b->size <= a->off) {
+			b = b->next[0];
+			continue;
+		}
+		WT_RET_MSG(session, EINVAL,
+		    "snapshot merge check: %s list overlaps the %s list",
+		    al->name, bl->name);
+	}
+	return (0);
+}
+#endif
+
+/*
+ * __wt_block_extlist_overlap --
+ *	Review a snapshot's alloc/discard extent lists, move overlaps into the
+ * live system's snapshot-avail list.
+ */
+int
+__wt_block_extlist_overlap(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si)
+{
+	WT_EXT *alloc, *discard;
+
+	alloc = si->alloc.off[0];
+	discard = si->discard.off[0];
+
+	/* Walk the lists in parallel, looking for overlaps. */
+	while (alloc != NULL && discard != NULL) {
+		/*
+		 * If there's no overlap, move the lower-offset entry to the
+		 * next entry in its list.
+		 */
+		if (alloc->off + alloc->size <= discard->off) {
+			alloc = alloc->next[0];
+			continue;
+		}
+		if (discard->off + discard->size <= alloc->off) {
+			discard = discard->next[0];
+			continue;
+		}
+
+		/* Reconcile the overlap. */
+		WT_RET(__block_ext_overlap(session, block,
+		    &si->alloc, &alloc, &si->discard, &discard));
+	}
+	return (0);
+}
+
+/*
+ * __block_ext_overlap --
+ *	Reconcile two overlapping ranges.
+ */
+static int
+__block_ext_overlap(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+{
+	WT_EXT *a, *b, **ext;
+	WT_EXTLIST *avail, *el;
+	off_t off, size;
+
+	avail = &block->live.snapshot_avail;
+
+	/*
+	 * The ranges overlap, choose the range we're going to take from each.
+	 *
+	 * We can think of the overlap possibilities as 11 different cases:
+	 *
+	 *		AAAAAAAAAAAAAAAAAA
+	 * #1		BBBBBBBBBBBBBBBBBB		ranges are the same
+	 * #2	BBBBBBBBBBBBB				overlaps the beginning
+	 * #3			BBBBBBBBBBBBBBBB	overlaps the end
+	 * #4		BBBBB				B is a prefix of A
+	 * #5			BBBBBB			B is middle of A
+	 * #6			BBBBBBBBBB		B is a suffix of A
+	 *
+	 * and:
+	 *
+	 *		BBBBBBBBBBBBBBBBBB
+	 * #7	AAAAAAAAAAAAA				same as #3
+	 * #8			AAAAAAAAAAAAAAAA	same as #2
+	 * #9		AAAAA				A is a prefix of B
+	 * #10			AAAAAA			A is middle of B
+	 * #11			AAAAAAAAAA		A is a suffix of B
+	 *
+	 *
+	 * By swapping the arguments so "A" is always the lower range, we can
+	 * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
+	 *
+	 *		AAAAAAAAAAAAAAAAAA
+	 * #1		BBBBBBBBBBBBBBBBBB		ranges are the same
+	 * #3			BBBBBBBBBBBBBBBB	overlaps the end
+	 * #4		BBBBB				B is a prefix of A
+	 * #5			BBBBBB			B is middle of A
+	 * #6			BBBBBBBBBB		B is a suffix of A
+	 *
+	 * and:
+	 *
+	 *		BBBBBBBBBBBBBBBBBB
+	 * #7	AAAAAAAAAAAAA				same as #3
+	 * #9		AAAAA				A is a prefix of B
+	 */
+	a = *ap;
+	b = *bp;
+	if (a->off > b->off) {				/* Swap */
+		b = *ap;
+		a = *bp;
+		ext = ap; ap = bp; bp = ext;
+		el = ael; ael = bel; bel = el;
+	}
+
+	if (a->off == b->off) {				/* Case #1, #4, #9 */
+		if (a->size == b->size) {		/* Case #1 */
+			/*
+			 * Move caller's A and B to the next element
+			 * Add that A and B range to the avail list
+			 * Delete A and B
+			 */
+			*ap = (*ap)->next[0];
+			*bp = (*bp)->next[0];
+			WT_RET(__block_merge(session, avail, b->off, b->size));
+			WT_RET(__block_off_remove(session, ael, a->off, NULL));
+			WT_RET(__block_off_remove(session, bel, b->off, NULL));
+		}
+		else if (a->size > b->size) {		/* Case #4 */
+			/*
+			 * Remove A from its list
+			 * Increment/Decrement A's offset/size by the size of B
+			 * Insert A on its list
+			 */
+			WT_RET(__block_off_remove(session, ael, a->off, &a));
+			a->off += b->size;
+			a->size -= b->size;
+			WT_RET(__block_ext_insert(session, ael, a));
+
+			/*
+			 * Move caller's B to the next element
+			 * Add B's range to the avail list
+			 * Delete B
+			 */
+			*bp = (*bp)->next[0];
+			WT_RET(__block_merge(session, avail, b->off, b->size));
+			WT_RET(__block_off_remove(session, bel, b->off, NULL));
+		} else {				/* Case #9 */
+			/*
+			 * Remove B from its list
+			 * Increment/Decrement B's offset/size by the size of A
+			 * Insert B on its list
+			 */
+			WT_RET(__block_off_remove(session, bel, b->off, &b));
+			b->off += a->size;
+			b->size -= a->size;
+			WT_RET(__block_ext_insert(session, bel, b));
+
+			/*
+			 * Move caller's A to the next element
+			 * Add A's range to the avail list
+			 * Delete A
+			 */
+			*ap = (*ap)->next[0];
+			WT_RET(__block_merge(session, avail, a->off, a->size));
+			WT_RET(__block_off_remove(session, ael, a->off, NULL));
+		}					/* Case #6 */
+	} else if (a->off + a->size == b->off + b->size) {
+		/*
+		 * Remove A from its list
+		 * Decrement A's size by the size of B
+		 * Insert A on its list
+		 */
+		WT_RET(__block_off_remove(session, ael, a->off, &a));
+		a->size -= b->size;
+		WT_RET(__block_ext_insert(session, ael, a));
+
+		/*
+		 * Move caller's B to the next element
+		 * Add B's range to the avail list
+		 * Delete B
+		 */
+		*bp = (*bp)->next[0];
+		WT_RET(__block_merge(session, avail, b->off, b->size));
+		WT_RET(__block_off_remove(session, bel, b->off, NULL));
+	} else if					/* Case #3, #7 */
+	    (a->off + a->size < b->off + b->size) {
+		/*
+		 * Add overlap to the avail list
+		 */
+		off = b->off;
+		size = (a->off + a->size) - b->off;
+		WT_RET(__block_merge(session, avail, off, size));
+
+		/*
+		 * Remove A from its list
+		 * Decrement A's size by the overlap
+		 * Insert A on its list
+		 */
+		WT_RET(__block_off_remove(session, ael, a->off, &a));
+		a->size -= size;
+		WT_RET(__block_ext_insert(session, ael, a));
+
+		/*
+		 * Remove B from its list
+		 * Increment/Decrement B's offset/size by the overlap
+		 * Insert B on its list
+		 */
+		WT_RET(__block_off_remove(session, bel, b->off, &b));
+		b->off += size;
+		b->size -= size;
+		WT_RET(__block_ext_insert(session, bel, b));
+	} else {					/* Case #5 */
+		/* Calculate the offset/size of the trailing part of A. */
+		off = b->off + b->size;
+		size = (a->off + a->size) - off;
+
+		/*
+		 * Remove A from its list
+		 * Decrement A's size by trailing part of A plus B's size
+		 * Insert A on its list
+		 */
+		WT_RET(__block_off_remove(session, ael, a->off, &a));
+		a->size = b->off - a->off;
+		WT_RET(__block_ext_insert(session, ael, a));
+
+		/* Add trailing part of A to A's list as a new element. */
+		WT_RET(__block_merge(session, ael, off, size));
+
+		/*
+		 * Move caller's B to the next element
+		 * Add B's range to the avail list
+		 * Delete B
+		 */
+		*bp = (*bp)->next[0];
+		WT_RET(__block_merge(session, avail, b->off, b->size));
+		WT_RET(__block_off_remove(session, bel, b->off, NULL));
+	}
+
+	return (0);
 }
 
 /*
- * __block_merge --
+ * __wt_block_extlist_merge --
+ *	Merge one extent list into another.
+ */
+int
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+{
+	WT_EXT *ext;
+
+	WT_VERBOSE_RET(session, block, "merging %s into %s", a->name, b->name);
+
+	WT_EXT_FOREACH(ext, a->off)
+		WT_RET(__block_merge(session, b, ext->off, ext->size));
+
+	return (0);
+}
+
+/*
+ * __wt_block_insert_ext, __block_merge --
  *	Insert an extent into an extent list, merging if possible.
  */
+int
+__wt_block_insert_ext(
+    WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
+{
+	/*
+	 * There are currently two copies of this function (this code is a
+	 * one-liner that calls the internal version of the function, which
+	 * means the compiler should compress out the function call).  It's
+	 * that way because the interface is still fluid, I'm not convinced
+	 * there won't be a need for a functional split between the internal
+	 * and external versions in the future.
+	 *
+	 * Callers of this function are expected to be holding any locks
+	 * required to manipulate the extent list.
+	 */
+	return (__block_merge(session, el, off, size));
+}
 static int
 __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
 {
 	WT_EXT *ext, *after, *before;
-	u_int skipdepth;
 
 	/*
 	 * Retrieve the records preceding/following the offset.  If the records
 	 * are contiguous with the free'd offset, combine records.
 	 */
-	__block_off_pair_srch(el, off, &before, &after);
+	__block_off_srch_pair(el, off, &before, &after);
 	if (before != NULL) {
 		if (before->off + before->size > off)
 			WT_RET_MSG(session, EINVAL,
@@ -428,14 +819,11 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
 			after = NULL;
 	}
 	if (before == NULL && after == NULL) {
-		/* Allocate a new WT_EXT structure. */
-		skipdepth = __wt_skip_choose_depth();
-		WT_RET(__wt_calloc(session, 1,
-		    sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
-		ext->off = off;
-		ext->size = size;
-		ext->depth = (uint8_t)skipdepth;
-		return (__block_off_insert(session, el, ext));
+		WT_VERBOSE_RET(session, block,
+		    "%s: insert range %" PRIdMAX "-%" PRIdMAX,
+		    el->name, (intmax_t)off, (intmax_t)(off + size));
+
+		return (__block_off_insert(session, el, off, size));
 	}
 
 	/*
@@ -448,7 +836,7 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
 	if (before == NULL) {
 		WT_RET(__block_off_remove(session, el, after->off, &ext));
 
-		WT_VERBOSE(session, block,
+		WT_VERBOSE_RET(session, block,
 		    "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
 		    PRIdMAX "-%" PRIdMAX,
 		    el->name,
@@ -465,7 +853,7 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
 		}
 		WT_RET(__block_off_remove(session, el, before->off, &ext));
 
-		WT_VERBOSE(session, block,
+		WT_VERBOSE_RET(session, block,
 		    "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
 		    PRIdMAX "-%" PRIdMAX,
 		    el->name,
@@ -475,7 +863,7 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
 
 		ext->size += size;
 	}
-	return (__block_off_insert(session, el, ext));
+	return (__block_ext_insert(session, el, ext));
 }
 
 /*
@@ -483,19 +871,21 @@ __block_merge(WT_SESSION_IMPL *session, WT_EXTLIST *el, off_t off, off_t size)
  *	Read an extent list.
  */
 int
-__wt_block_extlist_read(WT_SESSION_IMPL *session,
-    WT_BLOCK *block, WT_EXTLIST *el, off_t off, uint32_t size, uint32_t cksum)
+__wt_block_extlist_read(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
 {
-	WT_ITEM *tmp;
-	off_t loff, lsize;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	off_t off, size;
 	uint8_t *p;
-	int ret;
 
-	tmp = NULL;
-	ret = 0;
+	/* If there isn't a list, we're done. */
+	if (el->offset == WT_BLOCK_INVALID_OFFSET)
+		return (0);
 
-	WT_RET(__wt_scr_alloc(session, size, &tmp));
-	WT_ERR(__wt_block_read(session, block, tmp, off, size, cksum));
+	WT_RET(__wt_scr_alloc(session, el->size, &tmp));
+	WT_ERR(__wt_block_read_off(
+	    session, block, tmp, el->offset, el->size, el->cksum));
 
 #define	WT_EXTLIST_READ(p, v) do {					\
 	(v) = *(off_t *)(p);						\
@@ -503,14 +893,14 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session,
 } while (0)
 
 	p = WT_BLOCK_HEADER_BYTE(tmp->mem);
-	WT_EXTLIST_READ(p, loff);
-	WT_EXTLIST_READ(p, lsize);
-	if (loff != WT_BLOCK_EXTLIST_MAGIC || lsize != 0)
+	WT_EXTLIST_READ(p, off);
+	WT_EXTLIST_READ(p, size);
+	if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
 		goto corrupted;
 	for (;;) {
-		WT_EXTLIST_READ(p, loff);
-		WT_EXTLIST_READ(p, lsize);
-		if (loff == WT_BLOCK_INVALID_OFFSET)
+		WT_EXTLIST_READ(p, off);
+		WT_EXTLIST_READ(p, size);
+		if (off == WT_BLOCK_INVALID_OFFSET)
 			break;
 
 		/*
@@ -520,103 +910,80 @@ __wt_block_extlist_read(WT_SESSION_IMPL *session,
 		 * and easy test to do here and we'd have to do the check as
 		 * part of file verification, regardless.
 		 */
-		if ((loff - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 ||
-		    lsize % block->allocsize != 0 ||
-		    loff + lsize > block->fh->file_size)
+		if (off < WT_BLOCK_DESC_SECTOR ||
+		    (off - WT_BLOCK_DESC_SECTOR) % block->allocsize != 0 ||
+		    size % block->allocsize != 0 ||
+		    off + size > block->fh->file_size)
 corrupted:		WT_ERR_MSG(session, WT_ERROR,
 			    "file contains a corrupted %s extent list, range %"
 			    PRIdMAX "-%" PRIdMAX " past end-of-file",
 			    el->name,
-			    (intmax_t)loff, (intmax_t)(loff + lsize));
+			    (intmax_t)off, (intmax_t)(off + size));
 
 		/*
 		 * We could insert instead of merge, because ranges shouldn't
 		 * overlap, but merge knows how to allocate WT_EXT structures,
-		 * and a little paranoia is a good thing.
+		 * and a little paranoia is a good thing (if we corrupted the
+		 * list and crashed, and rolled back to a corrupted snapshot,
+		 * this might save us?)
 		 */
-		WT_ERR(__block_merge(session, el, loff, lsize));
+		WT_ERR(__block_merge(session, el, off, size));
 	}
 
-	WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
+	if (WT_VERBOSE_ISSET(session, block))
+		WT_ERR(__wt_block_extlist_dump(session, "read extlist", el, 0));
 
 err:	__wt_scr_free(&tmp);
 	return (ret);
 }
 
 /*
- * __wt_block_freelist_open --
- *	Initialize the free-list structures.
- */
-void
-__wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block)
-{
-	__wt_spin_init(session, &block->freelist_lock);
-
-	memset(&block->free, 0, sizeof(block->free));
-	block->free.name = "freelist";
-
-	block->free_offset = WT_BLOCK_INVALID_OFFSET;
-	block->free_size = block->free_cksum = 0;
-}
-
-/*
- * __wt_block_freelist_close --
- *	Write the free-list at the tail of the file.
- */
-void
-__wt_block_freelist_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
-{
-	__wt_spin_destroy(session, &block->freelist_lock);
-}
-
-/*
  * __wt_block_extlist_write --
  *	Write an extent list at the tail of the file.
  */
 int
-__wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_EXTLIST *el, off_t *offp, uint32_t *sizep, uint32_t *cksump)
+__wt_block_extlist_write(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
 {
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_EXT *ext;
-	WT_ITEM *tmp;
 	WT_PAGE_HEADER *dsk;
-	uint32_t datasize, size;
+	uint32_t datasize, entries, size;
 	uint8_t *p;
-	int ret;
-	const char *name;
-
-	tmp = NULL;
-	ret = 0;
 
-	WT_VERBOSE_CALL(session, block, __block_extlist_dump(session, el));
+	if (WT_VERBOSE_ISSET(session, block))
+		WT_RET(
+		    __wt_block_extlist_dump(session, "write extlist", el, 0));
 
-	/* If there aren't any free-list entries, we're done. */
-	if (el->entries == 0) {
-		*offp = WT_BLOCK_INVALID_OFFSET;
-		*sizep = *cksump = 0;
+	/*
+	 * Figure out how many entries we're writing -- if there aren't any
+	 * entries, we're done.
+	 */
+	entries = el->entries + (additional == NULL ? 0 : additional->entries);
+	if (entries == 0) {
+		el->offset = WT_BLOCK_INVALID_OFFSET;
+		el->cksum = el->size = 0;
 		return (0);
 	}
 
-	/* Truncate the file if possible. */
-	WT_RET(__block_truncate(session, block));
-
 	/*
 	 * Get a scratch buffer, clear the page's header and data, initialize
 	 * the header.  Allocate an allocation-sized aligned buffer so the
 	 * block write function can zero-out unused bytes and write it without
 	 * copying to something larger.
 	 *
-	 * Allocate room for the free-list entries, plus 2 additional entries:
-	 * the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-terminating
-	 * WT_BLOCK_INVALID_OFFSET/0 pair.
+	 * Allocate memory for the extent list entries plus two additional
+	 * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
+	 * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
 	 */
-	datasize = size = (el->entries + 2) * WT_STORE_SIZE(sizeof(off_t)  * 2);
+	datasize = size = (entries + 2) * WT_STORE_SIZE(sizeof(off_t)  * 2);
 	WT_RET(__wt_block_write_size(session, block, &size));
 	WT_RET(__wt_scr_alloc(session, size, &tmp));
 	dsk = tmp->mem;
 	memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
 	dsk->u.datalen = WT_STORE_SIZE(datasize);
-	dsk->type = WT_PAGE_FREELIST;
+	dsk->type = WT_PAGE_BLOCK_MANAGER;
 	tmp->size = WT_STORE_SIZE(WT_BLOCK_HEADER_BYTE_SIZE + datasize);
 
 #define	WT_EXTLIST_WRITE(p, v) do {					\
@@ -628,44 +995,37 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	p = WT_BLOCK_HEADER_BYTE(dsk);
 	WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC);	/* Initial value */
 	WT_EXTLIST_WRITE(p, 0);
-	WT_EXT_FOREACH(ext, el->off) {		/* Free ranges */
+	WT_EXT_FOREACH(ext, el->off) {			/* Free ranges */
 		WT_EXTLIST_WRITE(p, ext->off);
 		WT_EXTLIST_WRITE(p, ext->size);
 	}
+	if (additional != NULL)
+		WT_EXT_FOREACH(ext, additional->off) {	/* Free ranges */
+			WT_EXTLIST_WRITE(p, ext->off);
+			WT_EXTLIST_WRITE(p, ext->size);
+		}
 	WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET);	/* Ending value */
 	WT_EXTLIST_WRITE(p, 0);
 
-	/*
-	 * XXX
-	 * Discard the in-memory free-list: this has to happen before writing
-	 * the free-list because the underlying block write function is going
-	 * to allocate file space for the free-list block(s), and allocating
-	 * from the blocks on the free-list we just wrote won't work out well.
-	 * A workaround would be to not compress the free-list, which implies
-	 * some kind of "write but don't compress" code path, and that's more
-	 * complex than ordering these operations so the eventual allocation
-	 * in the write code always extends the file.
-	 */
-	name = el->name;
-	el = NULL;
-	__wt_block_discard(session, block);
-
 	/* Write the extent list to disk. */
-	WT_ERR(__wt_block_write(session, block, tmp, offp, sizep, cksump));
+	WT_ERR(__wt_block_write_off(
+	    session, block, tmp, &el->offset, &el->size, &el->cksum, 1));
 
-	WT_VERBOSE(session, block,
-	    "%s written %" PRIdMAX "/%" PRIu32, name, (intmax_t)*offp, *sizep);
+	WT_VERBOSE_ERR(session, block,
+	    "%s written %" PRIdMAX "/%" PRIu32,
+	    el->name, (intmax_t)el->offset, el->size);
 
 err:	__wt_scr_free(&tmp);
 	return (ret);
 }
 
 /*
- * __block_truncate --
- *	Truncate the file if the last part of the file isn't in use.
+ * __wt_block_extlist_truncate --
+ *	Truncate the file based on the last available extent in the list.
  */
-static int
-__block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block)
+int
+__wt_block_extlist_truncate(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
 {
 	WT_EXT *ext;
 	WT_FH *fh;
@@ -673,90 +1033,85 @@ __block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block)
 	fh = block->fh;
 
 	/*
-	 * Check if the last free range is at the end of the file, and if so,
-	 * truncate the file and discard the range.
+	 * Check if the last available extent is at the end of the file, and if
+	 * so, truncate the file and discard the extent.
 	 */
-	if ((ext = __block_extlist_last(block->free.off)) == NULL)
+	if ((ext = __block_extlist_last(el->off)) == NULL)
 		return (0);
 	if (ext->off + ext->size != fh->file_size)
 		return (0);
 
-	WT_VERBOSE(session, block,
+	WT_VERBOSE_RET(session, block,
 	    "truncate file from %" PRIdMAX " to %" PRIdMAX,
 	    (intmax_t)fh->file_size, (intmax_t)ext->off);
 
 	fh->file_size = ext->off;
 	WT_RET(__wt_ftruncate(session, fh, fh->file_size));
 
-	WT_RET(__block_off_remove(session, &block->free, ext->off, NULL));
+	WT_RET(__block_off_remove(session, el, ext->off, NULL));
 
 	return (0);
 }
-
 /*
- * __wt_block_discard --
- *	Discard any extent-list entries.
+ * __wt_block_extlist_free --
+ *	Discard an extent list.
  */
 void
-__wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
 {
 	WT_EXT *ext, *next;
 	WT_SIZE *szp, *nszp;
 
-	for (ext = block->free.off[0]; ext != NULL; ext = next) {
+	for (ext = el->off[0]; ext != NULL; ext = next) {
 		next = ext->next[0];
 		__wt_free(session, ext);
 	}
-	for (szp = block->free.size[0]; szp != NULL; szp = nszp) {
+	memset(el->off, 0, sizeof(el->off));
+	for (szp = el->sz[0]; szp != NULL; szp = nszp) {
 		nszp = szp->next[0];
 		__wt_free(session, szp);
 	}
+	memset(el->sz, 0, sizeof(el->sz));
 
-	memset(&block->free, 0, sizeof(block->free));
-}
-
-/*
- * __wt_block_stat --
- *	Free-list statistics.
- */
-void
-__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
-{
-	WT_BSTAT_SET(session, file_freelist_bytes, block->free.bytes);
-	WT_BSTAT_SET(session, file_freelist_entries, block->free.entries);
-	WT_BSTAT_SET(session, file_size, block->fh->file_size);
-	WT_BSTAT_SET(session, file_magic, WT_BLOCK_MAGIC);
-	WT_BSTAT_SET(session, file_major, WT_BLOCK_MAJOR_VERSION);
-	WT_BSTAT_SET(session, file_minor, WT_BLOCK_MINOR_VERSION);
+	el->bytes = 0;
+	el->entries = 0;
 }
 
 #ifdef HAVE_VERBOSE
-static void
-__block_extlist_dump(WT_SESSION_IMPL *session, WT_EXTLIST *el)
+int
+__wt_block_extlist_dump(
+    WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
 {
 	WT_EXT *ext;
 	WT_SIZE *szp;
 
-	if (el->entries == 0) {
-		WT_VERBOSE(session, block, "%s: [Empty]", el->name);
-		return;
-	}
+	WT_RET(__wt_verbose(
+	    session, "%s: %s: %" PRIu64 " bytes, by offset:%s",
+	    tag, el->name, el->bytes, el->entries == 0 ? " [Empty]" : ""));
+	if (el->entries == 0)
+		return (0);
 
-	WT_VERBOSE(session, block, "%s: list by offset:", el->name);
 	WT_EXT_FOREACH(ext, el->off)
-		WT_VERBOSE(session, block,
+		WT_RET(__wt_verbose(session,
 		    "\t{%" PRIuMAX "/%" PRIuMAX "}",
-		    (uintmax_t)ext->off, (uintmax_t)ext->size);
+		    (uintmax_t)ext->off, (uintmax_t)ext->size));
+
+	if (!show_size)
+		return (0);
 
-	WT_VERBOSE(session, block, "%s: list by size:", el->name);
-	WT_EXT_FOREACH(szp, el->size) {
-		WT_VERBOSE(session, block,
-		    "\t{%" PRIuMAX "}",
-		    (uintmax_t)szp->size);
+	WT_RET(__wt_verbose(session, "%s: %s: by size:%s",
+	    tag, el->name, el->entries == 0 ? " [Empty]" : ""));
+	if (el->entries == 0)
+		return (0);
+
+	WT_EXT_FOREACH(szp, el->sz) {
+		WT_RET(__wt_verbose(session,
+		    "\t{%" PRIuMAX "}", (uintmax_t)szp->size));
 		WT_EXT_FOREACH_OFF(ext, szp->off)
-			WT_VERBOSE(session, block,
+			WT_RET(__wt_verbose(session,
 			    "\t\t{%" PRIuMAX "/%" PRIuMAX "}",
-			    (uintmax_t)ext->off, (uintmax_t)ext->size);
+			    (uintmax_t)ext->off, (uintmax_t)ext->size));
 	}
+	return (0);
 }
 #endif
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 5d3295205dc..b1bc28766f6 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -39,8 +39,8 @@ __wt_bm_addr_stderr(
     WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t addr_size)
 {
 	WT_BLOCK *block;
-	WT_ITEM *buf;
-	int ret;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
@@ -86,22 +86,30 @@ __wt_bm_create(WT_SESSION_IMPL *session, const char *filename)
  *	Open a file.
  */
 int
-__wt_bm_open(WT_SESSION_IMPL *session,
-    const char *filename, const char *config, const char *cfg[], int salvage)
+__wt_bm_open(WT_SESSION_IMPL *session, const char *filename,
+    const char *config, const char *cfg[], int forced_salvage)
 {
+	WT_BTREE *btree;
+
+	btree = session->btree;
+
+	WT_RET(__wt_block_open(
+	    session, filename, config, cfg, forced_salvage, &btree->block));
+
 	/*
 	 * !!!
 	 * As part of block-manager configuration, we need to return the maximum
 	 * sized address cookie that a block manager will ever return.  There's
-	 * a limit of WT_BM_MAX_ADDR_COOKIE, but at 255B, WT_BM_MAX_ADDR_COOKIE
-	 * is too large for a Btree with 512B internal pages.  The default block
-	 * manager packs an off_t and 2 uint32_t's into its cookie, so there's
-	 * no problem now, but when we create a block manager extension API,
-	 * we need some way to consider the block manager's maximum cookie size
-	 * vs. the minimum Btree internal node size.
+	 * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
+	 * a Btree with 512B internal pages.  The default block manager packs
+	 * an off_t and 2 uint32_t's into its cookie, so there's no problem now,
+	 * but when we create a block manager extension API, we need some way to
+	 * consider the block manager's maximum cookie size versus the minimum
+	 * Btree internal node size.
 	 */
-	return (__wt_block_open(
-	    session, filename, config, cfg, salvage, &session->btree->block));
+	btree->block_header = __wt_block_header(session);
+
+	return (0);
 }
 
 /*
@@ -112,7 +120,7 @@ int
 __wt_bm_close(WT_SESSION_IMPL *session)
 {
 	WT_BLOCK *block;
-	int ret;
+	WT_DECL_RET;
 
 	if ((block = session->btree->block) == NULL)
 		return (0);
@@ -124,59 +132,106 @@ __wt_bm_close(WT_SESSION_IMPL *session)
 }
 
 /*
- * __wt_bm_truncate --
- *	Truncate a file.
+ * __wt_bm_snapshot --
+ *	Write a buffer into a block, creating a snapshot.
  */
 int
-__wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename)
+__wt_bm_snapshot(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_SNAPSHOT *snapbase)
 {
-	return (__wt_block_truncate(session, filename));
+	WT_BLOCK *block;
+
+	if ((block = session->btree->block) == NULL)
+		return (__bm_invalid(session));
+
+	return (__wt_block_snapshot(session, block, buf, snapbase));
 }
 
 /*
- * __wt_bm_free --
- *	Free a block of space to the underlying file.
+ * __wt_bm_snapshot_resolve --
+ *	Resolve the snapshot.
  */
 int
-__wt_bm_free(WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t addr_size)
+__wt_bm_snapshot_resolve(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase)
 {
 	WT_BLOCK *block;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_free_buf(session, block, addr, addr_size));
+	return (__wt_block_snapshot_resolve(session, block, snapbase));
 }
 
 /*
- * __wt_bm_read --
- *	Read a address cookie-referenced block into a buffer.
+ * __wt_bm_snapshot_load --
+ *	Load a snapshot point.
  */
 int
-__wt_bm_read(WT_SESSION_IMPL *session,
-    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
+__wt_bm_snapshot_load(WT_SESSION_IMPL *session,
+    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size, int readonly)
+{
+	WT_BLOCK *block;
+
+	if ((block = session->btree->block) == NULL)
+		return (__bm_invalid(session));
+
+	return (__wt_block_snapshot_load(
+	    session, block, buf, addr, addr_size, readonly));
+}
+
+/*
+ * __wt_bm_snapshot_unload --
+ *	Unload a snapshot point.
+ */
+int
+__wt_bm_snapshot_unload(WT_SESSION_IMPL *session)
+{
+	WT_BLOCK *block;
+
+	if ((block = session->btree->block) == NULL)
+		return (__bm_invalid(session));
+
+	return (__wt_block_snapshot_unload(session, block));
+}
+
+/*
+ * __wt_bm_truncate --
+ *	Truncate a file.
+ */
+int
+__wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename)
+{
+	return (__wt_block_truncate(session, filename));
+}
+
+/*
+ * __wt_bm_free --
+ *	Free a block of space to the underlying file.
+ */
+int
+__wt_bm_free(WT_SESSION_IMPL *session, const uint8_t *addr, uint32_t addr_size)
 {
 	WT_BLOCK *block;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_read_buf(session, block, buf, addr, addr_size));
+	return (__wt_block_free(session, block, addr, addr_size));
 }
 
 /*
- * __wt_bm_block_header --
- *	Return the size of the block manager's header.
+ * __wt_bm_read --
+ *	Read an address cookie-referenced block into a buffer.
  */
 int
-__wt_bm_block_header(WT_SESSION_IMPL *session, uint32_t *headerp)
+__wt_bm_read(WT_SESSION_IMPL *session,
+    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
 {
 	WT_BLOCK *block;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_header(session, block, headerp));
+	return (__wt_block_read(session, block, buf, addr, addr_size));
 }
 
 /*
@@ -207,7 +262,7 @@ __wt_bm_write(
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_write_buf(session, block, buf, addr, addr_size));
+	return (__wt_block_write(session, block, buf, addr, addr_size));
 }
 
 /*
@@ -263,14 +318,14 @@ __wt_bm_salvage_next(WT_SESSION_IMPL *session, WT_ITEM *buf,
  *	End a block manager salvage.
  */
 int
-__wt_bm_salvage_end(WT_SESSION_IMPL *session, int success)
+__wt_bm_salvage_end(WT_SESSION_IMPL *session)
 {
 	WT_BLOCK *block;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_salvage_end(session, block, success));
+	return (__wt_block_salvage_end(session, block));
 }
 
 /*
@@ -278,14 +333,14 @@ __wt_bm_salvage_end(WT_SESSION_IMPL *session, int success)
  *	Start a block manager salvage.
  */
 int
-__wt_bm_verify_start(WT_SESSION_IMPL *session, int *emptyp)
+__wt_bm_verify_start(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase)
 {
 	WT_BLOCK *block;
 
 	if ((block = session->btree->block) == NULL)
 		return (__bm_invalid(session));
 
-	return (__wt_block_verify_start(session, block, emptyp));
+	return (__wt_block_verify_start(session, block, snapbase));
 }
 
 /*
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 85e3a5f8bea..784384119a0 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -7,8 +7,7 @@
 
 #include "wt_internal.h"
 
-static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *, int);
-static int __desc_update(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
 
 /*
  * __wt_block_truncate --
@@ -17,8 +16,8 @@ static int __desc_update(WT_SESSION_IMPL *, WT_BLOCK *);
 int
 __wt_block_truncate(WT_SESSION_IMPL *session, const char *filename)
 {
+	WT_DECL_RET;
 	WT_FH *fh;
-	int ret;
 
 	/* Open the underlying file handle. */
 	WT_RET(__wt_open(session, filename, 0, 0, 1, &fh));
@@ -42,8 +41,8 @@ err:	WT_TRET(__wt_close(session, fh));
 int
 __wt_block_create(WT_SESSION_IMPL *session, const char *filename)
 {
+	WT_DECL_RET;
 	WT_FH *fh;
-	int ret;
 
 	/* Create the underlying file and open a handle. */
 	WT_RET(__wt_open(session, filename, 1, 1, 1, &fh));
@@ -67,15 +66,14 @@ __wt_block_create(WT_SESSION_IMPL *session, const char *filename)
  */
 int
 __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
-    const char *config, const char *cfg[], int salvage, void *retp)
+    const char *config, const char *cfg[], int forced_salvage, void *blockp)
 {
 	WT_BLOCK *block;
 	WT_CONFIG_ITEM cval;
-	WT_CONNECTION_IMPL *conn;
-	WT_NAMED_COMPRESSOR *ncomp;
-	int ret;
+	WT_DECL_RET;
 
-	conn = S2C(session);
+	WT_UNUSED(cfg);
+	*(void **)blockp = NULL;
 
 	/*
 	 * Allocate the structure, connect (so error close works), copy the
@@ -84,12 +82,6 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
 	WT_RET(__wt_calloc_def(session, 1, &block));
 	WT_ERR(__wt_strdup(session, filename, &block->name));
 
-	/* Initialize the free-list structures. */
-	__wt_block_freelist_open(session, block);
-
-	/* Open the underlying file handle. */
-	WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh));
-
 	/* Get the allocation size. */
 	WT_ERR(__wt_config_getones(session, config, "allocation_size", &cval));
 	block->allocsize = (uint32_t)cval.val;
@@ -99,8 +91,12 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
 	block->checksum = cval.val == 0 ? 0 : 1;
 
 	/* Page compressor */
-	WT_RET(__wt_config_getones(session, config, "block_compressor", &cval));
+	WT_ERR(__wt_config_getones(session, config, "block_compressor", &cval));
 	if (cval.len > 0) {
+		WT_CONNECTION_IMPL *conn;
+		WT_NAMED_COMPRESSOR *ncomp;
+
+		conn = S2C(session);
 		TAILQ_FOREACH(ncomp, &conn->compqh, q) {
 			if (strncmp(ncomp->name, cval.str, cval.len) == 0) {
 				block->compressor = ncomp->compressor;
@@ -113,32 +109,22 @@ __wt_block_open(WT_SESSION_IMPL *session, const char *filename,
 			    (int)cval.len, cval.str);
 	}
 
-	/*
-	 * Normally we read the file's meta-data to see if this is a WiredTiger
-	 * file.  But, if it's a salvage operation and force is set, we ignore
-	 * the file's format entirely.
-	 */
-	cval.val = 0;
-	if (salvage) {
-		ret = __wt_config_gets(session, cfg, "force", &cval);
-		if (ret != 0 && ret != WT_NOTFOUND)
-			WT_RET(ret);
-	}
-	if (cval.val == 0)
-		WT_ERR(__desc_read(session, block, salvage));
+	/* Open the underlying file handle. */
+	WT_ERR(__wt_open(session, filename, 0, 0, 1, &block->fh));
 
-	/* If not an open for a salvage operation, read the freelist. */
-	if (!salvage && block->free_offset != WT_BLOCK_INVALID_OFFSET) {
-		WT_ERR(__wt_block_extlist_read(session, block, &block->free,
-		    block->free_offset, block->free_size, block->free_cksum));
+	/* Initialize the live snapshot lock. */
+	__wt_spin_init(session, &block->live_lock);
 
-		/* Insert the free-list itself into the linked list. */
-		WT_ERR(__wt_block_free(session,
-		    block, block->free_offset, (off_t)block->free_size));
-	}
+	/*
+	 * Read the description sector.
+	 *
+	 * Salvage is a special case -- if we're forcing the salvage, we don't
+	 * even look at the description sector.
+	 */
+	if (!forced_salvage)
+		WT_ERR(__desc_read(session, block));
 
-	F_SET(block, WT_BLOCK_OK);
-	*(void **)retp = block;
+	*(void **)blockp = block;
 	return (0);
 
 err:	(void)__wt_block_close(session, block);
@@ -152,70 +138,72 @@ err:	(void)__wt_block_close(session, block);
 int
 __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
-	int ret;
+	WT_DECL_RET;
 
-	ret = 0;
+	WT_VERBOSE_RETVAL(session, block, ret, "close");
 
-	/*
-	 * If the file was active, write out the free-list and update the
-	 * file's description.
-	 */
-	if (F_ISSET(block, WT_BLOCK_OK)) {
-		WT_TRET(__wt_block_extlist_write(session, block,
-		    &block->free, &block->free_offset,
-		    &block->free_size, &block->free_cksum));
-		WT_TRET(__desc_update(session, block));
-	}
+	WT_TRET(__wt_block_snapshot_unload(session, block));
 
 	if (block->name != NULL)
 		__wt_free(session, block->name);
 
 	if (block->fh != NULL)
-		WT_RET(__wt_close(session, block->fh));
-
-	__wt_block_freelist_close(session, block);
+		WT_TRET(__wt_close(session, block->fh));
 
-	__wt_free(session, block->fragbits);
+	__wt_spin_destroy(session, &block->live_lock);
 
 	__wt_free(session, block);
+
 	return (ret);
 }
 
 /*
+ * __wt_desc_init --
+ *	Write a file's initial descriptor structure.
+ */
+int
+__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+	WT_BLOCK_DESC *desc;
+	uint8_t buf[WT_BLOCK_DESC_SECTOR];
+
+	memset(buf, 0, sizeof(buf));
+	desc = (void *)buf;
+	desc->magic = WT_BLOCK_MAGIC;
+	desc->majorv = WT_BLOCK_MAJOR_VERSION;
+	desc->minorv = WT_BLOCK_MINOR_VERSION;
+
+	/* Update the checksum. */
+	desc->cksum = 0;
+	desc->cksum = __wt_cksum(desc, WT_BLOCK_DESC_SECTOR);
+
+	return (__wt_write(session, fh, (off_t)0, WT_BLOCK_DESC_SECTOR, desc));
+}
+
+/*
  * __desc_read --
  *	Read and verify the file's metadata.
  */
 static int
-__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block, int salvage)
+__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
 	WT_BLOCK_DESC *desc;
-	WT_ITEM *buf;
 	uint32_t cksum;
-	int ret;
+	uint8_t buf[WT_BLOCK_DESC_SECTOR];
 
-	WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf));
+	/* Read the first sector and verify the file's format. */
+	memset(buf, 0, sizeof(buf));
+	WT_RET(__wt_read(
+	    session, block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf));
 
-	/*
-	 * We currently always do the verification step, because it's cheap
-	 * and we only do it the first time a file is opened.
-	 *
-	 * Read the first sector.
-	 */
-	WT_ERR(__wt_read(
-	    session, block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem));
-
-	desc = (WT_BLOCK_DESC *)buf->mem;
-	WT_VERBOSE(session, block,
+	desc = (void *)buf;
+	WT_VERBOSE_RET(session, block,
 	    "open: magic %" PRIu32
 	    ", major/minor: %" PRIu32 "/%" PRIu32
-	    ", checksum %#" PRIx32
-	    ", free offset/size %" PRIu64 "/%" PRIu32
-	    ", write-generation %" PRIu64,
+	    ", checksum %#" PRIx32,
 	    desc->magic,
 	    desc->majorv, desc->minorv,
-	    desc->cksum,
-	    desc->free_offset, desc->free_size,
-	    desc->write_gen);
+	    desc->cksum);
 
 	/*
 	 * We fail the open if the checksum fails, or the magic number is wrong
@@ -229,113 +217,29 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block, int salvage)
 	cksum = desc->cksum;
 	desc->cksum = 0;
 	if (desc->magic != WT_BLOCK_MAGIC ||
-	    cksum != __wt_cksum(buf->mem, WT_BLOCK_DESC_SECTOR))
-		WT_ERR_MSG(session, WT_ERROR, "%s %s%s",
-		    "does not appear to be a WiredTiger file",
-		    block->name,
-		    salvage ? "; to salvage this file, configure the salvage "
-		    "operation with the force flag" : "");
+	    cksum != __wt_cksum(desc, WT_BLOCK_DESC_SECTOR))
+		WT_RET_MSG(session, WT_ERROR,
+		    "%s does not appear to be a WiredTiger file", block->name);
 
 	if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
 	    (desc->majorv == WT_BLOCK_MAJOR_VERSION &&
 	    desc->minorv > WT_BLOCK_MINOR_VERSION))
-		WT_ERR_MSG(session, WT_ERROR,
+		WT_RET_MSG(session, WT_ERROR,
 		    "%s is an unsupported version of a WiredTiger file",
 		    block->name);
 
-	block->write_gen = desc->write_gen;
-
-	/* That's all we check for salvage. */
-	if (!salvage) {
-		if ((desc->free_offset != WT_BLOCK_INVALID_OFFSET &&
-		    desc->free_offset + desc->free_size >
-		    (uint64_t)block->fh->file_size))
-			WT_RET_MSG(session, WT_ERROR,
-			    "free list offset references "
-			    "non-existent file space");
-
-		block->free_offset = (off_t)desc->free_offset;
-		block->free_size = desc->free_size;
-		block->free_cksum = desc->free_cksum;
-	}
-
-err:	__wt_scr_free(&buf);
-	return (ret);
-}
-
-/*
- * __wt_desc_init --
- *	Write a file's initial descriptor structure.
- */
-int
-__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh)
-{
-	WT_BLOCK_DESC *desc;
-	WT_ITEM *buf;
-	int ret;
-
-	WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf));
-	desc = (WT_BLOCK_DESC *)buf->mem;
-
-	desc->magic = WT_BLOCK_MAGIC;
-	desc->majorv = WT_BLOCK_MAJOR_VERSION;
-	desc->minorv = WT_BLOCK_MINOR_VERSION;
-
-	desc->free_offset = WT_BLOCK_INVALID_OFFSET;
-	desc->free_size = desc->free_cksum = 0;
-
-	/* Update the checksum. */
-	desc->cksum = 0;
-	desc->cksum = __wt_cksum(buf->mem, WT_BLOCK_DESC_SECTOR);
-
-	ret = __wt_write(session, fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem);
-
-	__wt_scr_free(&buf);
-	return (ret);
+	return (0);
 }
 
 /*
- * __desc_update --
- *	Update the file's descriptor structure.
+ * __wt_block_stat --
+ *	Block statistics
  */
-static int
-__desc_update(WT_SESSION_IMPL *session, WT_BLOCK *block)
+void
+__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
-	WT_BLOCK_DESC *desc;
-	WT_ITEM *buf;
-	int ret;
-
-	WT_RET(__wt_scr_alloc(session, WT_BLOCK_DESC_SECTOR, &buf));
-
-	/* Read the first sector. */
-	WT_ERR(__wt_read(
-	    session, block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem));
-	desc = (WT_BLOCK_DESC *)buf->mem;
-
-	/* See if anything has changed. */
-	if (desc->free_offset == (uint64_t)block->free_offset &&
-	    desc->free_size == block->free_size &&
-	    desc->write_gen == block->write_gen)
-		return (0);
-
-	WT_VERBOSE(session, block,
-	    "resetting free list [offset %" PRIuMAX ", size %" PRIu32 "]",
-	    (uintmax_t)block->free_offset, block->free_size);
-
-	desc->free_offset = (uint64_t)block->free_offset;
-	desc->free_size = block->free_size;
-	desc->free_cksum = block->free_cksum;
-
-	desc->write_gen = block->write_gen;
-
-	/* Update the checksum. */
-	desc->cksum = 0;
-	desc->cksum = __wt_cksum(buf->mem, WT_BLOCK_DESC_SECTOR);
-
-	/* Write the first sector. */
-	ret = __wt_write(session,
-	    block->fh, (off_t)0, WT_BLOCK_DESC_SECTOR, buf->mem);
-
-err:	__wt_scr_free(&buf);
-	return (ret);
+	WT_BSTAT_SET(session, file_size, block->fh->file_size);
+	WT_BSTAT_SET(session, file_magic, WT_BLOCK_MAGIC);
+	WT_BSTAT_SET(session, file_major, WT_BLOCK_MAJOR_VERSION);
+	WT_BSTAT_SET(session, file_minor, WT_BLOCK_MINOR_VERSION);
 }
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 4d36a56073e..781089eae09 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -8,59 +8,46 @@
 #include "wt_internal.h"
 
 /*
- * __wt_block_read_buf --
+ * __wt_block_read --
  *	Read filesystem cookie referenced block into a buffer.
  */
 int
-__wt_block_read_buf(WT_SESSION_IMPL *session, WT_BLOCK *block,
+__wt_block_read(WT_SESSION_IMPL *session, WT_BLOCK *block,
     WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
 {
-	WT_ITEM *tmp;
 	off_t offset;
 	uint32_t size, cksum;
-	int ret;
-
-	ret = 0;
 
 	/* Crack the cookie. */
 	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
 
 	/* Read the block. */
-	WT_RET(__wt_block_read(session, block, buf, offset, size, cksum));
+	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
 
 	/* Optionally verify the page. */
-	if (block->fragbits == NULL)
-		return (0);
-
-	WT_RET(__wt_scr_alloc(session, 0, &tmp));
-	WT_ERR(__wt_block_addr_string(session, block, tmp, addr, addr_size));
-	WT_ERR(__wt_verify_dsk(
-	    session, (char *)tmp->data, buf->mem, buf->size));
-
-err:	__wt_scr_free(&tmp);
+	if (block->verify)
+		WT_RET(__wt_block_verify(
+		    session, block, buf, addr, addr_size, offset, size));
 
-	return (ret);
+	return (0);
 }
 
 /*
- * __wt_block_read --
+ * __wt_block_read_off --
  *	Read an addr/size pair referenced block into a buffer.
  */
 int
-__wt_block_read(WT_SESSION_IMPL *session, WT_BLOCK *block,
+__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
     WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
 {
 	WT_BLOCK_HEADER *blk;
-	WT_ITEM *tmp;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_PAGE_HEADER *dsk;
 	size_t result_len;
 	uint32_t page_cksum;
-	int ret;
-
-	tmp = NULL;
-	ret = 0;
 
-	WT_VERBOSE(session, read,
+	WT_VERBOSE_RET(session, read,
 	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
 	    (uintmax_t)offset, size, cksum);
 
@@ -114,9 +101,9 @@ __wt_block_read(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 */
 	if (blk->disk_size < dsk->size) {
 		if (block->compressor == NULL)
-			WT_ERR(__wt_illegal_value(session));
+			WT_ERR(__wt_illegal_value(session, block->name));
 
-		WT_RET(__wt_buf_init(session, buf, dsk->size));
+		WT_ERR(__wt_buf_init(session, buf, dsk->size));
 		buf->size = dsk->size;
 
 		/*
@@ -138,7 +125,7 @@ __wt_block_read(WT_SESSION_IMPL *session, WT_BLOCK *block,
 		    dsk->size - WT_BLOCK_COMPRESS_SKIP,
 		    &result_len));
 		if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP)
-			WT_ERR(__wt_illegal_value(session));
+			WT_ERR(__wt_illegal_value(session, block->name));
 	} else
 		if (block->compressor == NULL)
 			buf->size = dsk->size;
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 5d0e3a603f9..9ecad47173c 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -17,6 +17,15 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
 	off_t len;
 	uint32_t allocsize;
 
+	/* Reset the description sector. */
+	WT_RET(__wt_desc_init(session, block->fh));
+
+	/*
+	 * Salvage creates a new snapshot when it's finished, set up for
+	 * rolling an empty file forward.
+	 */
+	WT_RET(__wt_block_snap_init(session, block, &block->live, 1));
+
 	/*
 	 * Truncate the file to an initial sector plus N allocation size
 	 * units (bytes trailing the last multiple of an allocation size
@@ -29,19 +38,24 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
 		len += WT_BLOCK_DESC_SECTOR;
 		if (len != block->fh->file_size)
 			WT_RET(__wt_ftruncate(session, block->fh, len));
-	}
-
-	/* Reset the description sector. */
-	WT_RET(__wt_desc_init(session, block->fh));
+	} else
+		len = WT_BLOCK_DESC_SECTOR;
 
-	/* The first sector of the file is the description record, skip it. */
+	/*
+	 * The first sector of the file is the description record, skip it as
+	 * we read the file.
+	 */
 	block->slvg_off = WT_BLOCK_DESC_SECTOR;
 
 	/*
-	 * We don't currently need to do anything about the freelist because
-	 * we don't read it for salvage operations.
+	 * The only snapshot extent we care about is the allocation list.  Start
+	 * with the entire file on the allocation list, we'll "free" any blocks
+	 * we don't want as we process the file.
 	 */
+	WT_RET(__wt_block_insert_ext(session, &block->live.alloc,
+	    WT_BLOCK_DESC_SECTOR, len - WT_BLOCK_DESC_SECTOR));
 
+	block->slvg = 1;
 	return (0);
 }
 
@@ -50,17 +64,14 @@ __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
  *	End a file salvage.
  */
 int
-__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block, int success)
+__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
-	/*
-	 * If not successful, discard the free-list, it's not useful, and
-	 * don't write back an updated description block.
-	 */
-	if (!success) {
-		F_CLR(block, WT_BLOCK_OK);
-		__wt_block_discard(session, block);
-	}
-	return (0);
+	/*
+	 * Clear the salvage flag set by __wt_block_salvage_start, then discard
+	 * the snapshot.
+	 */
+	block->slvg = 0;
+	return (__wt_block_snapshot_unload(session, block));
 }
 
 /*
@@ -117,8 +128,9 @@ __wt_block_salvage_next(
 		 * needed.  If reading the page fails, it's probably corruption,
 		 * we ignore this block.
 		 */
-		if (__wt_block_read(session, block, buf, offset, size, cksum)) {
-skip:			WT_VERBOSE(session, salvage,
+		if (__wt_block_read_off(
+		    session, block, buf, offset, size, cksum)) {
+skip:			WT_VERBOSE_RET(session, salvage,
 			    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
 			    allocsize, (uintmax_t)offset);
 
@@ -126,7 +138,7 @@ skip:			WT_VERBOSE(session, salvage,
 			 * Free the block and make sure we don't return it more
 			 * than once.
 			 */
-			WT_RET(__wt_block_free(
+			WT_RET(__wt_block_off_free(
 			    session, block, offset, (off_t)allocsize));
 			block->slvg_off = offset += allocsize;
 			continue;
@@ -147,8 +159,8 @@ skip:			WT_VERBOSE(session, salvage,
 	 * writes, done after salvage completes, are preferred to these blocks.
 	 */
 	*write_genp = blk->write_gen;
-	if (block->write_gen < blk->write_gen)
-		block->write_gen = blk->write_gen;
+	if (block->live.write_gen < blk->write_gen)
+		block->live.write_gen = blk->write_gen;
 
 	/* Re-create the address cookie that should reference this block. */
 	endp = addr;
diff --git a/src/block/block_snap.c b/src/block/block_snap.c
new file mode 100644
index 00000000000..bf67160e460
--- /dev/null
+++ b/src/block/block_snap.c
@@ -0,0 +1,690 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __snapshot_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *);
+static int __snapshot_string(
+	WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __snapshot_update(WT_SESSION_IMPL *,
+	WT_BLOCK *, WT_SNAPSHOT *, WT_BLOCK_SNAPSHOT *, uint64_t, int);
+
+/*
+ * __wt_block_snap_init --
+ *	Initialize a snapshot structure; if is_live is set, first mark the live
+ * snapshot as loaded, failing with EINVAL if one is already loaded.
+ */
+int
+__wt_block_snap_init(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si, int is_live)
+{
+	WT_DECL_RET;
+
+	/*
+	 * If we're loading a new live snapshot, there shouldn't be one already
+	 * loaded.  The btree engine should prevent this from ever happening,
+	 * but paranoia is a healthy thing.
+	 */
+	if (is_live) {
+		__wt_spin_lock(session, &block->live_lock);
+		if (block->live_load)
+			ret = EINVAL;
+		else
+			block->live_load = 1;
+		__wt_spin_unlock(session, &block->live_lock);
+		if (ret)
+			WT_RET_MSG(session, EINVAL, "snapshot already loaded");
+	}
+
+	memset(si, 0, sizeof(*si));
+
+	/* Neither a root page nor any extent list has a file location yet. */
+	si->root_offset = WT_BLOCK_INVALID_OFFSET;
+	si->alloc.name = "alloc";
+	si->alloc.offset = WT_BLOCK_INVALID_OFFSET;
+	si->avail.name = "avail";
+	si->avail.offset = WT_BLOCK_INVALID_OFFSET;
+	si->discard.name = "discard";
+	si->discard.offset = WT_BLOCK_INVALID_OFFSET;
+
+	/* The initial file size covers only the description sector. */
+	si->file_size = WT_BLOCK_DESC_SECTOR;
+
+	return (0);
+}
+
+/*
+ * __wt_block_snapshot_load --
+ *	Load a snapshot as the live snapshot.  The root page is returned in dsk
+ * (with a size of 0 if there isn't one); unless readonly is set, the avail
+ * list is read and the file is truncated to the snapshot's file size.  If
+ * the load fails after the snapshot is initialized, it is unloaded again.
+ */
+int
+__wt_block_snapshot_load(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
+    int readonly)
+{
+	WT_BLOCK_SNAPSHOT *si;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+
+	/*
+	 * Sometimes we don't find a root page (we weren't given a snapshot,
+	 * or the referenced snapshot was empty).  In that case we return a
+	 * root page size of 0.  Set that up now.
+	 */
+	dsk->size = 0;
+
+	si = &block->live;
+	WT_RET(__wt_block_snap_init(session, block, si, 1));
+
+	if (WT_VERBOSE_ISSET(session, snapshot)) {
+		if (addr != NULL) {
+			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+			WT_ERR(__snapshot_string(session, block, addr, tmp));
+		}
+		WT_VERBOSE_ERR(session, snapshot,
+		    "%s: load-snapshot: %s", block->name,
+		    addr == NULL ? "[Empty]" : (char *)tmp->data);
+	}
+
+	/* If not loading a snapshot from disk, only cleanup remains. */
+	if (addr == NULL || addr_size == 0)
+		goto done;
+
+	/* Crack the snapshot cookie. */
+	WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));
+
+	/* Verify sets up next. */
+	if (block->verify)
+		WT_ERR(__wt_verify_snap_load(session, block, si));
+
+	/* Read, and optionally verify, any root page. */
+	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
+		WT_ERR(__wt_block_read_off(session, block,
+		    dsk, si->root_offset, si->root_size, si->root_cksum));
+		if (block->verify) {
+			if (tmp == NULL) {
+				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+				WT_ERR(__snapshot_string(
+				    session, block, addr, tmp));
+			}
+			WT_ERR(
+			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
+		}
+	}
+
+	/*
+	 * Rolling a snapshot forward requires the avail list, the blocks from
+	 * which we can allocate.
+	 */
+	if (!readonly)
+		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));
+
+	/*
+	 * If the snapshot can be written, that means anything written after
+	 * the snapshot is no longer interesting.  Truncate the file.
+	 */
+	if (!readonly) {
+		WT_VERBOSE_ERR(session, snapshot,
+		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
+		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
+	}
+
+	if (0) {
+err:		(void)__wt_block_snapshot_unload(session, block);
+	}
+
+done:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_block_snapshot_unload --
+ *	Unload the live snapshot: discard its extent lists and clear the flag
+ * recording that a live snapshot is loaded.
+ */
+int
+__wt_block_snapshot_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_BLOCK_SNAPSHOT *si;
+	WT_DECL_RET;
+
+	WT_VERBOSE_RETVAL(
+	    session, snapshot, ret, "%s: unload snapshot", block->name);
+
+	si = &block->live;
+
+	/* Verify cleanup. */
+	if (block->verify)
+		WT_TRET(__wt_verify_snap_unload(session, block, si));
+
+	/* Discard the extent lists, including the newly-available extents. */
+	__wt_block_extlist_free(session, &si->alloc);
+	__wt_block_extlist_free(session, &si->avail);
+	__wt_block_extlist_free(session, &si->discard);
+	__wt_block_extlist_free(session, &si->snapshot_avail);
+
+	/* NOTE(review): cleared without live_lock held -- confirm that's OK. */
+	block->live_load = 0;
+	return (ret);
+}
+
+/*
+ * __wt_block_snapshot --
+ *	Create a new snapshot: write the root page (a NULL buf means an empty
+ * tree), process the snapshot list and fsync unless WT_CONN_NOSYNC is set.
+ */
+int
+__wt_block_snapshot(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_ITEM *buf, WT_SNAPSHOT *snapbase)
+{
+	WT_BLOCK_SNAPSHOT *si;
+
+	si = &block->live;
+	si->version = WT_BM_SNAPSHOT_VERSION;
+
+	/*
+	 * Write the root page: it's possible for there to be a snapshot of
+	 * an empty tree, in which case, we store an illegal root offset.
+	 *
+	 * XXX
+	 * We happen to know that snapshots are single-threaded above us in
+	 * the btree engine.  That's probably something we want to guarantee
+	 * for any WiredTiger block manager.
+	 */
+	if (buf == NULL) {
+		si->root_offset = WT_BLOCK_INVALID_OFFSET;
+		si->root_size = si->root_cksum = 0;
+	} else
+		WT_RET(__wt_block_write_off(session, block, buf,
+		    &si->root_offset, &si->root_size, &si->root_cksum, 0));
+
+	/* Process the list of snapshots, deleting and updating as required. */
+	WT_RET(__snapshot_process(session, block, snapbase));
+
+	/*
+	 * Snapshots have to hit disk (it would be reasonable to configure for
+	 * lazy snapshots, but we don't support them yet).  Regardless, we're
+	 * not holding any locks, other writers can proceed while we wait.
+	 */
+	if (!F_ISSET(S2C(session), WT_CONN_NOSYNC))
+		WT_RET(__wt_fsync(session, block->fh));
+	return (0);
+}
+
+/*
+ * __snapshot_extlist_fblocks --
+ *	Add the file blocks holding an extent list to the live avail list; a
+ * list with no file location (an invalid offset) is skipped.
+ */
+static inline int
+__snapshot_extlist_fblocks(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+	if (el->offset == WT_BLOCK_INVALID_OFFSET)
+		return (0);
+	return (__wt_block_insert_ext(
+	    session, &block->live.avail, el->offset, el->size));
+}
+
+/*
+ * __snapshot_process --
+ *	Process the list of snapshots: delete, update and add as flagged.
+ */
+static int
+__snapshot_process(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
+{
+	WT_BLOCK_SNAPSHOT *a, *b, *si;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_SNAPSHOT *snap;
+	uint64_t snapshot_size;
+	int deleting, locked;
+
+	si = &block->live;
+	locked = 0;
+
+	/*
+	 * We've allocated our last page, update the snapshot size.  We need to
+	 * calculate the live system's snapshot size before reading and merging
+	 * snapshot allocation and discard information from the snapshots we're
+	 * deleting, those operations will change the underlying byte counts.
+	 */
+	snapshot_size = si->snapshot_size;
+	snapshot_size += si->alloc.bytes;
+	snapshot_size -= si->discard.bytes;
+
+	/*
+	 * Extents that become newly available as a result of deleting previous
+	 * snapshots are added to a list of extents.  The list should be empty,
+	 * but there's no explicit "free the snapshot information" call into the
+	 * block manager; if there was an error in an upper level resulting in
+	 * the snapshot never being "resolved", the list might not be empty.
+	 */
+	__wt_block_extlist_free(session, &si->snapshot_avail);
+
+	/*
+	 * First pass, done without the live lock: read from disk the snapshot
+	 * information needed to delete snapshots.
+	 */
+	deleting = 0;
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		/*
+		 * To delete a snapshot, we'll need snapshot information for it
+		 * and the subsequent snapshot.  The test is tricky, we have to
+		 * load the current snapshot's information if it's marked for
+		 * deletion, or if it follows a snapshot marked for deletion,
+		 * where the boundary cases are the first snapshot in the list
+		 * and the last snapshot in the list: if we're deleting the last
+		 * snapshot in the list, there's no next snapshot, the snapshot
+		 * will be merged into the live tree.
+		 */
+		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
+		    (snap == snapbase ||
+		    F_ISSET(snap, WT_SNAP_ADD) ||
+		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
+			continue;
+		deleting = 1;
+
+		/*
+		 * Allocate a snapshot structure, crack the cookie and read the
+		 * snapshot's extent lists.
+		 *
+		 * Ignore the avail list: snapshot avail lists are only useful
+		 * if we are rolling forward from the particular snapshot and
+		 * they represent our best understanding of what blocks can be
+		 * allocated.  If we are not operating on the live snapshot,
+		 * subsequent snapshots might have allocated those blocks, and
+		 * the avail list is useless.  We don't discard it, because it
+		 * is useful as part of verification, but we don't re-write it
+		 * either.
+		 */
+		WT_ERR(__wt_calloc(
+		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
+		si = snap->bpriv;
+		WT_ERR(__wt_block_snap_init(session, block, si, 0));
+		WT_ERR(__wt_block_buffer_to_snapshot(
+		    session, block, snap->raw.data, si));
+		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
+		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
+	}
+
+	/*
+	 * Hold a lock so the live extent lists and the file size can't change
+	 * underneath us.  I suspect we'll tighten this if snapshots take too
+	 * much time away from real work: we read historic snapshot information
+	 * without a lock, but we could also merge and re-write the deleted
+	 * snapshot information without a lock, except for ranges merged into
+	 * the live tree.
+	 */
+	__wt_spin_lock(session, &block->live_lock);
+	locked = 1;
+
+	/* Skip the additional processing if we aren't deleting snapshots. */
+	if (!deleting)
+		goto live_update;
+
+	/*
+	 * Delete any no-longer-needed snapshots: we do this first as it frees
+	 * blocks to the live lists, and the freed blocks will then be included
+	 * when writing the live extent lists.
+	 */
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		if (!F_ISSET(snap, WT_SNAP_DELETE))
+			continue;
+
+		if (WT_VERBOSE_ISSET(session, snapshot)) {
+			if (tmp == NULL)
+				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+			WT_ERR(__snapshot_string(
+			    session, block, snap->raw.data, tmp));
+			WT_VERBOSE_ERR(session, snapshot,
+			    "%s: delete-snapshot: %s: %s",
+			    block->name, snap->name, (char *)tmp->data);
+		}
+
+		/*
+		 * Set the from/to snapshot structures, where the "to" value
+		 * may be the live tree.
+		 */
+		a = snap->bpriv;
+		if (F_ISSET(snap + 1, WT_SNAP_ADD))
+			b = &block->live;
+		else
+			b = (snap + 1)->bpriv;
+
+		/*
+		 * Free the root page: there's nothing special about this free,
+		 * the root page is allocated using normal rules, that is, it
+		 * may have been taken from the avail list, and was entered on
+		 * the live system's alloc list at that time.  We free it into
+		 * the snapshot's discard list, however, not the live system's
+		 * list because it appears on the snapshot's alloc list and so
+		 * must be paired in the snapshot.
+		 */
+		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+			WT_ERR(__wt_block_insert_ext(session,
+			    &a->discard, a->root_offset, a->root_size));
+
+		/*
+		 * Free the blocks used to hold the "from" snapshot's extent
+		 * lists directly to the live system's avail list, they were
+		 * never on any alloc list.  Include the "from" snapshot's
+		 * avail list, it's going away.
+		 */
+		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
+		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
+		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));
+
+		/*
+		 * Roll the "from" alloc and discard extent lists into the "to"
+		 * snapshot's lists.
+		 */
+		if (a->alloc.entries != 0)
+			WT_ERR(__wt_block_extlist_merge(
+			    session, &a->alloc, &b->alloc));
+		if (a->discard.entries != 0)
+			WT_ERR(__wt_block_extlist_merge(
+			    session, &a->discard, &b->discard));
+
+		/*
+		 * If the "to" snapshot is also being deleted, we're done with
+		 * it, it's merged into some other snapshot in the next loop.
+		 * This means the extent lists may aggregate over a number of
+		 * snapshots, but that's OK, they're disjoint sets of ranges.
+		 */
+		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
+			continue;
+
+		/*
+		 * Find blocks for re-use: wherever the "to" snapshot's allocate
+		 * and discard lists overlap is fair game, move ranges appearing
+		 * on both lists to the live snapshot's newly available list.
+		 */
+		WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+		/*
+		 * If we're updating the live system's information, we're done.
+		 */
+		if (F_ISSET(snap + 1, WT_SNAP_ADD))
+			continue;
+
+		/*
+		 * We have to write the "to" snapshot's extent lists out in new
+		 * blocks, and update its cookie.
+		 *
+		 * Free the blocks used to hold the "to" snapshot's extent lists
+		 * directly to the live system's avail list, they were never on
+		 * any alloc list.  Do not include the "to" snapshot's avail
+		 * list, it's not changing.
+		 */
+		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
+		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));
+
+		F_SET(snap + 1, WT_SNAP_UPDATE);
+	}
+
+	/* Update snapshots marked for update. */
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
+			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
+			WT_ERR(__snapshot_update(
+			    session, block, snap, snap->bpriv, 0, 0));
+		}
+
+live_update:
+	si = &block->live;
+
+	/* Truncate the file if that's possible. */
+	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));
+
+	/* Update the final, added snapshot based on the live system. */
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		if (F_ISSET(snap, WT_SNAP_ADD)) {
+			WT_ERR(__snapshot_update(
+			    session, block, snap, si, snapshot_size, 1));
+
+			/*
+			 * XXX
+			 * Our caller wants two pieces of information: the time
+			 * the snapshot was taken and the final snapshot size.
+			 * This violates layering but the alternative is a call
+			 * for the btree layer to crack the snapshot cookie into
+			 * its components, and that's a fair amount of work.
+			 * (We could just read the system time in the session
+			 * layer when updating the metadata file, but that won't
+			 * work for the snapshot size, and so we do both here.)
+			 */
+			snap->snapshot_size = si->snapshot_size;
+			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
+		}
+
+	/*
+	 * Discard the live system's alloc and discard extent lists, leave the
+	 * avail list alone.
+	 */
+	__wt_block_extlist_free(session, &si->alloc);
+	__wt_block_extlist_free(session, &si->discard);
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * The first snapshot in the system should always have an empty discard
+	 * list.  If we've read that snapshot and/or created it, check.
+	 */
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		if (!F_ISSET(snap, WT_SNAP_DELETE))
+			break;
+	if ((a = snap->bpriv) == NULL)
+		a = &block->live;
+	if (a->discard.entries != 0) {
+		__wt_errx(session,
+		    "snapshot incorrectly has blocks on the discard list");
+		WT_ERR(WT_ERROR);
+	}
+#endif
+
+err:	if (locked)
+		__wt_spin_unlock(session, &block->live_lock);
+
+	/* Free the loaded extent lists; snap->bpriv is not freed here. */
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		if ((si = snap->bpriv) != NULL) {
+			__wt_block_extlist_free(session, &si->alloc);
+			__wt_block_extlist_free(session, &si->avail);
+			__wt_block_extlist_free(session, &si->discard);
+		}
+
+	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __snapshot_update --
+ *	Update a snapshot.
+ */
+static int
+__snapshot_update(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
+    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	uint8_t *endp;
+
+#ifdef HAVE_DIAGNOSTIC
+	/* Check the extent list combinations for overlaps. */
+	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
+	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
+	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
+#endif
+	/*
+	 * Write the snapshot's extent lists; we only write an avail list for
+	 * the live system, other snapshots' avail lists are static and never
+	 * change.  When we do write the avail list for the live system it's
+	 * two lists: the current avail list plus the list of blocks that are
+	 * being made available as of the new snapshot.  We can't merge that
+	 * second list into the real list yet, it's not truly available until
+	 * the new snapshot location has been saved to the metadata.
+	 */
+	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
+	if (is_live)
+		WT_RET(__wt_block_extlist_write(
+		    session, block, &si->avail, &si->snapshot_avail));
+	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));
+
+	/*
+	 * Set the file size for the live system.
+	 *
+	 * XXX
+	 * We do NOT set the file size when re-writing snapshots because we want
+	 * to test the snapshot's blocks against a reasonable maximum file size
+	 * during verification.  This is not good: imagine a snapshot appearing
+	 * early in the file, re-written, and then the snapshot requires blocks
+	 * at the end of the file, blocks after the listed file size.  If the
+	 * application opens that snapshot for writing (discarding subsequent
+	 * snapshots), we would truncate the file to the early chunk, discarding
+	 * the re-written snapshot information.  The alternative, updating the
+	 * file size has its own problems, in that case we'd work correctly, but
+	 * we'd lose all of the blocks between the original snapshot and the
+	 * re-written snapshot.  Currently, there's no API to roll-forward
+	 * intermediate snapshots, if there ever is, this will need to be fixed.
+	 */
+	if (is_live)
+		WT_RET(__wt_filesize(session, block->fh, &si->file_size));
+
+	/* Set the snapshot size for the live system. */
+	if (is_live)
+		si->snapshot_size = snapshot_size;
+
+	/*
+	 * Copy the snapshot information into the snapshot array's address
+	 * cookie.
+	 */
+	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
+	endp = snap->raw.mem;
+	WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si));
+	snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem);
+
+	if (WT_VERBOSE_ISSET(session, snapshot)) {
+		WT_RET(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp));
+		WT_VERBOSE_ERR(session, snapshot,
+		    "%s: create-snapshot: %s: %s",
+		    block->name, snap->name, (char *)tmp->data);
+	}
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_block_snapshot_resolve --
+ *	Resolve a snapshot.
+ */
+int
+__wt_block_snapshot_resolve(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
+{
+	WT_BLOCK_SNAPSHOT *si;
+	WT_DECL_RET;
+
+	si = &block->live;
+
+	/*
+	 * Snapshots are a two-step process: first, we write a new snapshot to
+	 * disk (including all the new extent lists for modified snapshots and
+	 * the live system).  As part of this we create a list of file blocks
+	 * newly available for re-allocation, based on snapshots being deleted.
+	 * We then return the locations of the new snapshot information to our
+	 * caller.  Our caller has to write that information into some kind of
+	 * stable storage, and once that's done, we can actually allocate from
+	 * that list of newly available file blocks.  (We can't allocate from
+	 * that list immediately because the allocation might happen before our
+	 * caller saves the new snapshot information, and if we crashed before
+	 * the new snapshot information was saved, we'd have overwritten blocks
+	 * still referenced by snapshots in the system.)  In summary, there is
+	 * a second step, after our caller saves the snapshot information, we
+	 * are called to add the newly available blocks into the live system's
+	 * available list.
+	 */
+	__wt_spin_lock(session, &block->live_lock);
+	ret =
+	    __wt_block_extlist_merge(session, &si->snapshot_avail, &si->avail);
+	__wt_spin_unlock(session, &block->live_lock);
+
+	/* Discard the list. */
+	__wt_block_extlist_free(session, &si->snapshot_avail);
+
+	WT_UNUSED(snapbase);
+	return (ret);
+}
+
+/*
+ * __snapshot_string --
+ *	Return a printable string representation of a snapshot address cookie.
+ */
+static int
+__snapshot_string(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+	WT_BLOCK_SNAPSHOT *si, _si;
+
+	/* Initialize the snapshot, crack the cookie. */
+	si = &_si;
+	WT_RET(__wt_block_snap_init(session, block, si, 0));
+	WT_RET(__wt_block_buffer_to_snapshot(session, block, addr, si));
+
+	WT_RET(__wt_buf_fmt(session, buf,
+	    "version=%d",
+	    si->version));
+	if (si->root_offset == WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+	else
+		WT_RET(__wt_buf_catfmt(session, buf,
+		    ", root=[%"
+		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+		    (uintmax_t)si->root_offset,
+		    (uintmax_t)(si->root_offset + si->root_size),
+		    si->root_size, si->root_cksum));
+	if (si->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+	else
+		WT_RET(__wt_buf_catfmt(session, buf,
+		    ", alloc=[%"
+		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+		    (uintmax_t)si->alloc.offset,
+		    (uintmax_t)(si->alloc.offset + si->alloc.size),
+		    si->alloc.size, si->alloc.cksum));
+	if (si->avail.offset == WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+	else
+		WT_RET(__wt_buf_catfmt(session, buf,
+		    ", avail=[%"
+		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+		    (uintmax_t)si->avail.offset,
+		    (uintmax_t)(si->avail.offset + si->avail.size),
+		    si->avail.size, si->avail.cksum));
+	if (si->discard.offset == WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+	else
+		WT_RET(__wt_buf_catfmt(session, buf,
+		    ", discard=[%"
+		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+		    (uintmax_t)si->discard.offset,
+		    (uintmax_t)(si->discard.offset + si->discard.size),
+		    si->discard.size, si->discard.cksum));
+	WT_RET(__wt_buf_catfmt(session, buf,
+	    ", file size=%" PRIuMAX
+	    ", write generation=%" PRIu64,
+	    (uintmax_t)si->file_size,
+	    si->write_gen));
+
+	return (0);
+}
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index 62186fcf2fc..7ce7787c08c 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -7,40 +7,49 @@
 
 #include "wt_internal.h"
 
-static int __verify_addfrag(WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t);
-static int __verify_checkfrag(WT_SESSION_IMPL *, WT_BLOCK *);
-static int __verify_freelist(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_filefrag_add(
+	WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t, int);
+static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_snapfrag_add(WT_SESSION_IMPL *, WT_BLOCK *, off_t, off_t);
+static int __verify_snapfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_start_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *);
+static int __verify_start_filesize(
+	WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *, off_t *);
+
+/* The bit list ignores the first sector: convert to/from a frag/offset. */
+#define	WT_OFF_TO_FRAG(block, off)					\
+	(((off) - WT_BLOCK_DESC_SECTOR) / (block)->allocsize)
+#define	WT_FRAG_TO_OFF(block, frag)					\
+	(((off_t)(frag)) * (block)->allocsize + WT_BLOCK_DESC_SECTOR)
 
 /*
  * __wt_block_verify_start --
  *	Start file verification.
  */
 int
-__wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, int *emptyp)
+__wt_block_verify_start(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
 {
 	off_t file_size;
 
-	file_size = block->fh->file_size;
+	memset(&block->verify_alloc, 0, sizeof(block->verify_alloc));
+	block->verify_alloc.name = "verify_alloc";
+	block->verify_alloc.offset = WT_BLOCK_INVALID_OFFSET;
 
 	/*
-	 * We're done if the file has no data pages (this is what happens if
-	 * we verify a file immediately after creation).
+	 * We're done if the file has no data pages (this happens if we verify
+	 * a file immediately after creation).
 	 */
-	if (file_size == WT_BLOCK_DESC_SECTOR) {
-		*emptyp = 1;
+	if (block->fh->file_size == WT_BLOCK_DESC_SECTOR)
 		return (0);
-	}
-	*emptyp = 0;
 
 	/*
-	 * The file size should be a multiple of the allocsize, offset by the
-	 * size of the descriptor sector, the first 512B of the file.
+	 * Opening a WiredTiger file truncates it back to the snapshot we are
+	 * rolling forward, which means it's OK if there are blocks written
+	 * after that snapshot, they'll be ignored.  Find the largest file size
+	 * referenced by any snapshot.
 	 */
-	if (file_size > WT_BLOCK_DESC_SECTOR)
-		file_size -= WT_BLOCK_DESC_SECTOR;
-	if (file_size % block->allocsize != 0)
-		WT_RET_MSG(session, WT_ERROR,
-		    "the file size is not a multiple of the allocation size");
+	WT_RET(__verify_start_filesize(session, block, snapbase, &file_size));
 
 	/*
 	 * Allocate a bit array, where each bit represents a single allocation
@@ -54,6 +63,11 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, int *emptyp)
 	 * To verify larger files than we can handle in this way, we'd have to
 	 * write parts of the bit array into a disk file.
 	 *
+	 * Alternatively, we could switch to maintaining ranges of the file as
+	 * we do with the extents, but that has its own failure mode, where we
+	 * verify many non-contiguous blocks creating too many entries on the
+	 * list to fit into memory.
+	 *
 	 * We also have a minimum maximum verifiable file size of 16TB because
 	 * the underlying bit package takes a 32-bit count of bits to allocate:
 	 *
@@ -64,145 +78,463 @@ __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, int *emptyp)
 		    session, WT_ERROR, "the file is too large to verify");
 
 	block->frags = (uint32_t)(file_size / block->allocsize);
-	WT_RET(__bit_alloc(session, block->frags, &block->fragbits));
+	WT_RET(__bit_alloc(session, block->frags, &block->fragfile));
+
+	/*
+	 * The only snapshot avail list we care about is the last one written;
+	 * get it now and initialize the list of file fragments.
+	 */
+	WT_RET(__verify_start_avail(session, block, snapbase));
+
+	block->verify = 1;
+	return (0);
+}
+
+/*
+ * __verify_start_filesize --
+ *	Find the largest file size referenced by any snapshot.
+ */
+static int
+__verify_start_filesize(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, WT_SNAPSHOT *snapbase, off_t *file_sizep)
+{
+	WT_BLOCK_SNAPSHOT *si, _si;
+	WT_SNAPSHOT *snap;
+	off_t file_size;
+
+	si = &_si;
+
+	/*
+	 * Find the largest file size referenced by any snapshot -- that should
+	 * be the last snapshot taken, but out of sheer, raving paranoia, look
+	 * through the list, future changes to snapshots might break this code
+	 * if we make that assumption.
+	 */
+	file_size = 0;
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		WT_RET(__wt_block_buffer_to_snapshot(
+		    session, block, snap->raw.data, si));
+		if (si->file_size > file_size)
+			file_size = si->file_size;
+	}
 
-	/* Verify the free-list. */
-	WT_RET(__verify_freelist(session, block));
+	/* Verify doesn't make any sense if we don't have a snapshot. */
+	if (file_size <= WT_BLOCK_DESC_SECTOR)
+		WT_RET_MSG(session, WT_ERROR,
+		    "%s has no snapshots to verify", block->name);
+
+	/*
+	 * The file size should be a multiple of the allocsize, offset by the
+	 * size of the descriptor sector, the first 512B of the file.
+	 */
+	file_size -= WT_BLOCK_DESC_SECTOR;
+	if (file_size % block->allocsize != 0)
+		WT_RET_MSG(session, WT_ERROR,
+		    "the snapshot file size is not a multiple of the "
+		    "allocation size");
 
+	*file_sizep = file_size;
 	return (0);
 }
 
 /*
+ * __verify_start_avail --
+ *	Get the last snapshot's avail list and load it into the list of file
+ * fragments.
+ */
+static int
+__verify_start_avail(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
+{
+	WT_BLOCK_SNAPSHOT *si, _si;
+	WT_DECL_RET;
+	WT_EXT *ext;
+	WT_EXTLIST *el;
+	WT_SNAPSHOT *snap;
+
+	/* Get the last on-disk snapshot, if one exists. */
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		;
+	if (snap == snapbase)
+		return (0);
+	--snap;
+
+	si = &_si;
+	WT_RET(__wt_block_snap_init(session, block, si, 0));
+	WT_RET(__wt_block_buffer_to_snapshot(
+	    session, block, snap->raw.data, si));
+	el = &si->avail;
+	if (el->offset == WT_BLOCK_INVALID_OFFSET)
+		return (0);
+
+	WT_RET(__wt_block_extlist_read(session, block, el));
+	WT_EXT_FOREACH(ext, el->off)
+		if ((ret = __verify_filefrag_add(
+		    session, block, ext->off, ext->size, 1)) != 0)
+			break;
+
+	__wt_block_extlist_free(session, el);
+	return (ret);
+}
+
+/*
  * __wt_block_verify_end --
  *	End file verification.
  */
 int
 __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
-	int ret;
+	WT_DECL_RET;
+
+	/* Confirm we verified every file block. */
+	ret = __verify_filefrag_chk(session, block);
 
-	/* Verify we read every file block. */
-	ret = __verify_checkfrag(session, block);
+	/* Discard the accumulated allocation list. */
+	__wt_block_extlist_free(session, &block->verify_alloc);
 
-	__wt_free(session, block->fragbits);
+	/* Discard the fragment tracking lists. */
+	__wt_free(session, block->fragfile);
+	__wt_free(session, block->fragsnap);
 
+	block->verify = 0;
+	return (ret);
+}
+
+/*
+ * __wt_verify_snap_load --
+ *	Verify work done when a snapshot is loaded.
+ */
+int
+__wt_verify_snap_load(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si)
+{
+	WT_EXTLIST *el;
+	WT_EXT *ext;
+	uint32_t frag, frags;
+
+	/* Set the maximum file size for this snapshot. */
+	block->verify_size = si->file_size;
+
+	/*
+	 * Add the root page and disk blocks used to store the extent lists to
+	 * the list of blocks we've "seen" from the file.
+	 */
+	if (si->root_offset != WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__verify_filefrag_add(session,
+		    block, si->root_offset, (off_t)si->root_size, 1));
+	if (si->alloc.offset != WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__verify_filefrag_add(session,
+		    block, si->alloc.offset, (off_t)si->alloc.size, 1));
+	if (si->avail.offset != WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__verify_filefrag_add(session,
+		    block, si->avail.offset, (off_t)si->avail.size, 1));
+	if (si->discard.offset != WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__verify_filefrag_add(session,
+		    block, si->discard.offset, (off_t)si->discard.size, 1));
+
+	/*
+	 * Snapshot verification is similar to deleting snapshots.  As we read
+	 * each new snapshot, we merge the allocation lists (accumulating all
+	 * allocated pages as we move through the system), and then remove any
+	 * pages found in the discard list.   The result should be a one-to-one
+	 * mapping to the pages we find in this particular snapshot.
+	 */
+	el = &si->alloc;
+	if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+		WT_RET(__wt_block_extlist_read(session, block, el));
+		WT_RET(__wt_block_extlist_merge(
+		    session, el, &block->verify_alloc));
+		__wt_block_extlist_free(session, el);
+	}
+	el = &si->discard;
+	if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+		WT_RET(__wt_block_extlist_read(session, block, el));
+		WT_EXT_FOREACH(ext, el->off)
+			WT_RET(__wt_block_off_remove_overlap(session,
+			    &block->verify_alloc, ext->off, ext->size));
+		__wt_block_extlist_free(session, el);
+	}
+
+	/*
+	 * The root page of the snapshot appears on the alloc list, but not, at
+	 * least until the snapshot is deleted, on a discard list.   To handle
+	 * this case, remove the root page from the accumulated list of snapshot
+	 * pages, so it doesn't add a new requirement for subsequent snapshots.
+	 */
+	if (si->root_offset != WT_BLOCK_INVALID_OFFSET)
+		WT_RET(__wt_block_off_remove_overlap(session,
+		    &block->verify_alloc, si->root_offset, si->root_size));
+
+	/*
+	 * Allocate the per-snapshot bit map.  The per-snapshot bit map is the
+	 * opposite of the per-file bit map, that is, we set all the bits that
+	 * we expect to be set based on the snapshot's allocation and discard
+	 * lists, then clear bits as we verify blocks.  When finished verifying
+	 * the snapshot, the bit list should be empty.
+	 */
+	WT_RET(__bit_alloc(session, block->frags, &block->fragsnap));
+	el = &block->verify_alloc;
+	WT_EXT_FOREACH(ext, el->off) {
+		frag = (uint32_t)WT_OFF_TO_FRAG(block, ext->off);
+		frags = (uint32_t)(ext->size / block->allocsize);
+		__bit_nset(block->fragsnap, frag, frag + (frags - 1));
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_verify_snap_unload --
+ *	Verify work done when a snapshot is unloaded.
+ */
+int
+__wt_verify_snap_unload(
+    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si)
+{
+	WT_DECL_RET;
+
+	WT_UNUSED(si);
+
+	/* Confirm we verified every snapshot block. */
+	ret = __verify_snapfrag_chk(session, block);
+
+	/* Discard the per-snapshot fragment list. */
+	__wt_free(session, block->fragsnap);
+
+	return (ret);
+}
+
+/*
+ * __wt_block_verify --
+ *	Physically verify a disk block, if we haven't already verified it.
+ */
+int
+__wt_block_verify(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+    const uint8_t *addr, uint32_t addr_size, off_t offset, uint32_t size)
+{
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	uint32_t frag, frags, i, match;
+
+	/*
+	 * If we've already verified this block's physical image, we know it's
+	 * good, we don't have to verify it again.
+	 */
+	frag = (uint32_t)WT_OFF_TO_FRAG(block, offset);
+	frags = (uint32_t)(size / block->allocsize);
+	for (match = i = 0; i < frags; ++i)
+		if (__bit_test(block->fragfile, frag++))
+			++match;
+	if (match == frags) {
+		WT_VERBOSE_RET(session, verify,
+		    "skipping block at %" PRIuMAX "-%" PRIuMAX ", already "
+		    "verified",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
+		return (0);
+	}
+	if (match != 0)
+		WT_RET_MSG(session, WT_ERROR,
+		    "block at %" PRIuMAX "-%" PRIuMAX " partially verified",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
+
+	/*
+	 * Create a string representation of the address cookie and verify the
+	 * block.
+	 */
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
+	WT_ERR(__wt_block_addr_string(session, block, tmp, addr, addr_size));
+	WT_ERR(__wt_verify_dsk(session, (char *)tmp->data, buf));
+
+err:	__wt_scr_free(&tmp);
 	return (ret);
 }
 
 /*
  * __wt_block_verify_addr --
- *	Verify an address.
+ *	Mark an address in a snapshot as verified.
  */
 int
 __wt_block_verify_addr(WT_SESSION_IMPL *session,
     WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size)
 {
 	off_t offset;
-	uint32_t size;
+	uint32_t cksum, size;
 
 	WT_UNUSED(addr_size);
 
 	/* Crack the cookie. */
-	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, NULL));
+	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
 
-	WT_RET(__verify_addfrag(session, block, offset, (off_t)size));
+	/* Add to the per-file list. */ 
+	WT_RET(__verify_filefrag_add(session, block, offset, size, 0));
+
+	/*
+	 * It's tempting to try and flag a page as "verified" when we read it.
+	 * That doesn't work because we may visit a page multiple times when
+	 * verifying a single snapshot (for example, when verifying the physical
+	 * image of a row-store leaf page with overflow keys, the overflow keys
+	 * are read when checking for key sort issues, and read again when more
+	 * general overflow item checking is done).  This function is called by
+	 * the btree verification code, once per logical visit in a snapshot, so
+	 * we can detect if a page is referenced multiple times within a single
+	 * snapshot.  This doesn't apply to the per-file list, because it is
+	 * expected for the same btree blocks to appear in multiple snapshots.
+	 *
+	 * Add the block to the per-snapshot list.
+	 */
+	WT_RET(__verify_snapfrag_add(session, block, offset, size));
 
 	return (0);
 }
 
 /*
- * __verify_freelist --
- *	Add the freelist fragments to the list of verified fragments.
+ * __verify_filefrag_add --
+ *	Add the fragments to the per-file fragment list, optionally complain if
+ * we've already verified this chunk of the file.
  */
 static int
-__verify_freelist(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__verify_filefrag_add(WT_SESSION_IMPL *session,
+    WT_BLOCK *block, off_t offset, off_t size, int nodup)
 {
-	WT_EXT *ext;
-	int ret;
+	uint32_t f, frag, frags, i;
 
-	ret = 0;
+	WT_VERBOSE_RET(session, verify,
+	    "adding file block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+	    (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size);
 
-	WT_EXT_FOREACH(ext, block->free.off) {
-		if (ext->off + (off_t)ext->size > block->fh->file_size)
-			WT_RET_MSG(session, WT_ERROR,
-			    "free-list entry offset %" PRIuMAX "references "
-			    "non-existent file pages",
-			    (uintmax_t)ext->off);
+	/* Check each chunk against the total file size. */
+	if (offset + size > block->fh->file_size)
+		WT_RET_MSG(session, WT_ERROR,
+		    "fragment %" PRIuMAX "-%" PRIuMAX " references "
+		    "non-existent file blocks",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
 
-		WT_VERBOSE(session, verify,
-		    "free-list range %" PRIdMAX "-%" PRIdMAX,
-		    (intmax_t)ext->off, (intmax_t)(ext->off + ext->size));
+	frag = (uint32_t)WT_OFF_TO_FRAG(block, offset);
+	frags = (uint32_t)(size / block->allocsize);
 
-		WT_TRET(__verify_addfrag(session, block, ext->off, ext->size));
-	}
+	/* It may be illegal to reference a particular chunk more than once. */
+	if (nodup)
+		for (f = frag, i = 0; i < frags; ++f, ++i)
+			if (__bit_test(block->fragfile, f))
+				WT_RET_MSG(session, WT_ERROR,
+				    "file fragment at %" PRIuMAX " referenced "
+				    "multiple times",
+				    (uintmax_t)offset);
 
-	return (ret);
+	/* Add fragments to the file's fragment list. */
+	__bit_nset(block->fragfile, frag, frag + (frags - 1));
+
+	return (0);
 }
 
-/* The bit list ignores the first sector: convert to/from an frag/offset. */
-#define	WT_OFF_TO_FRAG(block, off)					\
-	(((off) - WT_BLOCK_DESC_SECTOR) / (block)->allocsize)
-#define	WT_FRAG_TO_OFF(block, frag)					\
-	(((off_t)(frag)) * (block)->allocsize + WT_BLOCK_DESC_SECTOR)
+/*
+ * __verify_filefrag_chk --
+ *	Verify we've checked all the fragments in the file.
+ */
+static int
+__verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+	WT_DECL_RET;
+	uint32_t first, last;
+
+	/*
+	 * Check for file fragments we haven't verified -- every time we find
+	 * a bit that's clear, complain.  We re-start the search each time
+	 * after setting the clear bit(s) we found: it's simpler and this isn't
+	 * supposed to happen a lot.
+	 */
+	for (;;) {
+		if (__bit_ffc(block->fragfile, block->frags, &first) != 0)
+			break;
+		__bit_set(block->fragfile, first);
+		for (last = first + 1; last < block->frags; ++last) {
+			if (__bit_test(block->fragfile, last))
+				break;
+			__bit_set(block->fragfile, last);
+		}
+
+		__wt_errx(session,
+		    "file range %" PRIuMAX "-%" PRIuMAX " was never verified",
+		    (uintmax_t)WT_FRAG_TO_OFF(block, first),
+		    (uintmax_t)WT_FRAG_TO_OFF(block, last));
+		ret = WT_ERROR;
+	}
+	return (ret);
+}
 
 /*
- * __verify_addfrag --
- *	Add the fragments to the list, and complain if we've already verified
- *	this chunk of the file.
+ * __verify_snapfrag_add --
+ *	Clear the fragments in the per-snapshot fragment list, and complain if
+ * we've already verified this chunk of the snapshot.
  */
 static int
-__verify_addfrag(
+__verify_snapfrag_add(
     WT_SESSION_IMPL *session, WT_BLOCK *block, off_t offset, off_t size)
 {
-	uint32_t frag, frags, i;
+	uint32_t f, frag, frags, i;
+
+	WT_VERBOSE_RET(session, verify,
+	    "adding snapshot block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+	    (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size);
+
+	/*
+	 * Check each chunk against the snapshot's size, a snapshot should never
+	 * reference a block outside of the snapshot's stored size.
+	 */
+	if (offset + size > block->verify_size)
+		WT_RET_MSG(session, WT_ERROR,
+		    "fragment %" PRIuMAX "-%" PRIuMAX " references "
+		    "file blocks outside the snapshot",
+		    (uintmax_t)offset, (uintmax_t)(offset + size));
 
 	frag = (uint32_t)WT_OFF_TO_FRAG(block, offset);
 	frags = (uint32_t)(size / block->allocsize);
 
-	for (i = 0; i < frags; ++i)
-		if (__bit_test(block->fragbits, frag + i))
+	/* It is illegal to reference a particular chunk more than once. */
+	for (f = frag, i = 0; i < frags; ++f, ++i)
+		if (!__bit_test(block->fragsnap, f))
 			WT_RET_MSG(session, WT_ERROR,
-			    "file fragment at offset %" PRIuMAX
-			    " already verified",
+			    "snapshot fragment at %" PRIuMAX " referenced "
+			    "multiple times in a single snapshot or found in "
+			    "the snapshot but not listed in the snapshot's "
+			    "allocation list",
 			    (uintmax_t)offset);
 
-	__bit_nset(block->fragbits, frag, frag + (frags - 1));
+	/* Remove fragments from the snapshot's allocation list. */
+	__bit_nclr(block->fragsnap, frag, frag + (frags - 1));
+
 	return (0);
 }
 
 /*
- * __verify_checkfrag --
- *	Verify we've checked all the fragments in the file.
+ * __verify_snapfrag_chk --
+ *	Verify we've checked all the fragments in the snapshot.
  */
 static int
-__verify_checkfrag(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__verify_snapfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
 {
-	uint32_t first, last, frags;
-	uint8_t *fragbits;
-	int ret;
-
-	fragbits = block->fragbits;
-	frags = block->frags;
-	ret = 0;
+	WT_DECL_RET;
+	uint32_t first, last;
 
 	/*
-	 * Check for file fragments we haven't verified -- every time we find
-	 * a bit that's clear, complain.  We re-start the search each time
-	 * after setting the clear bit(s) we found: it's simpler and this isn't
+	 * Check for snapshot fragments we haven't verified -- every time we
+	 * find a bit that's set, complain.  We re-start the search each time
+	 * after clearing the set bit(s) we found: it's simpler and this isn't
 	 * supposed to happen a lot.
 	 */
 	for (;;) {
-		if (__bit_ffc(fragbits, frags, &first) != 0)
+		if (__bit_ffs(block->fragsnap, block->frags, &first) != 0)
 			break;
-		__bit_set(fragbits, first);
-		for (last = first + 1; last < frags; ++last) {
-			if (__bit_test(fragbits, last))
+		__bit_clear(block->fragsnap, first);
+		for (last = first + 1; last < block->frags; ++last) {
+			if (!__bit_test(block->fragsnap, last))
 				break;
-			__bit_set(fragbits, last);
+			__bit_clear(block->fragsnap, last);
 		}
 
 		__wt_errx(session,
-		    "file range %" PRIuMAX "-%" PRIuMAX " was never verified",
+		    "snapshot range %" PRIuMAX "-%" PRIuMAX " was never "
+		    "verified",
 		    (uintmax_t)WT_FRAG_TO_OFF(block, first),
 		    (uintmax_t)WT_FRAG_TO_OFF(block, last));
 		ret = WT_ERROR;
diff --git a/src/block/block_write.c b/src/block/block_write.c
index f4194ccc460..ec2cd85a9dd 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -11,14 +11,12 @@
  * __wt_block_header --
  *	Return the size of the block-specific header.
  */
-int
-__wt_block_header(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t *headerp)
+u_int
+__wt_block_header(WT_SESSION_IMPL *session)
 {
 	WT_UNUSED(session);
-	WT_UNUSED(block);
 
-	*headerp = WT_BLOCK_HEADER_SIZE;
-	return (0);
+	return ((u_int)WT_BLOCK_HEADER_SIZE);
 }
 
 /*
@@ -36,11 +34,11 @@ __wt_block_write_size(
 }
 
 /*
- * __wt_block_write_buf --
+ * __wt_block_write --
  *	Write a buffer into a block, returning the block's address cookie.
  */
 int
-__wt_block_write_buf(WT_SESSION_IMPL *session,
+__wt_block_write(WT_SESSION_IMPL *session,
     WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr, uint32_t *addr_size)
 {
 	off_t offset;
@@ -49,7 +47,8 @@ __wt_block_write_buf(WT_SESSION_IMPL *session,
 
 	WT_UNUSED(addr_size);
 
-	WT_RET(__wt_block_write(session, block, buf, &offset, &size, &cksum));
+	WT_RET(__wt_block_write_off(
+	    session, block, buf, &offset, &size, &cksum, 0));
 
 	endp = addr;
 	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
@@ -59,26 +58,24 @@ __wt_block_write_buf(WT_SESSION_IMPL *session,
 }
 
 /*
- * __wt_block_write --
+ * __wt_block_write_off --
  *	Write a buffer into a block, returning the block's addr/size and
  * checksum.
  */
 int
-__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+    off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int force_extend)
 {
 	WT_BLOCK_HEADER *blk;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_PAGE_HEADER *dsk;
-	WT_ITEM *tmp;
 	off_t offset;
 	uint32_t align_size, size;
-	int compression_failed, ret;
+	int compression_failed;
 	uint8_t *src, *dst;
 	size_t len, src_len, dst_len, result_len;
 
-	tmp = NULL;
-	ret = 0;
-
 	/*
 	 * Set the block's in-memory size.
 	 *
@@ -94,12 +91,12 @@ __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 *
 	 * Diagnostics: verify the disk page: this violates layering, but it's
 	 * the place we can ensure we never write a corrupted page.  Note that
-	 * we are verifying the free-list page, too.  (We created a "page" for
-	 * the free-list, it was simpler than creating another type of object
-	 * in the file.)
+	 * we are verifying the extent list pages, too.  (We created a "page"
+	 * type for the extent lists, it was simpler than creating another type
+	 * of object in the file.)
 	 */
 #ifdef HAVE_DIAGNOSTIC
-	WT_RET(__wt_verify_dsk(session, "[write-check]", buf->mem, buf->size));
+	WT_RET(__wt_verify_dsk(session, "[write-check]", buf));
 #endif
 
 	/*
@@ -149,7 +146,7 @@ not_compressed:	/*
 		if (block->compressor->pre_size == NULL)
 			len = src_len;
 		else
-			WT_ERR(block->compressor->pre_size(block->compressor,
+			WT_RET(block->compressor->pre_size(block->compressor,
 			    &session->iface, src, src_len, &len));
 		WT_RET(__wt_scr_alloc(
 		    session, (uint32_t)len + WT_BLOCK_COMPRESS_SKIP, &tmp));
@@ -208,7 +205,7 @@ not_compressed:	/*
 	 * internal page may not have been written to disk after the leaf page
 	 * was updated.  So, write generations it is.)
 	 */
-	blk->write_gen = ++block->write_gen;
+	blk->write_gen = ++block->live.write_gen;
 
 	blk->disk_size = align_size;
 
@@ -226,14 +223,37 @@ not_compressed:	/*
 	} else
 		blk->cksum = WT_BLOCK_CHECKSUM_NOT_SET;
 
-	/* Allocate space from the underlying file and write the block. */
-	WT_ERR(__wt_block_alloc(session, block, &offset, (off_t)align_size));
-	WT_ERR(__wt_write(session, block->fh, offset, align_size, dsk));
+	/*
+	 * Allocate space from the underlying file and write the block.  Always
+	 * extend the file when writing snapshot extents, that's easier than
+	 * distinguishing between extents allocated from the live avail list,
+	 * and those which can't be allocated from the live avail list such as
+	 * blocks for writing the live avail list itself.
+	 *
+	 * In the case of forced extension, we're holding the necessary locks,
+	 * don't re-acquire them (including when we have to free the blocks
+	 * should the write fail).
+	 */
+	if (force_extend)
+		WT_ERR(__wt_block_extend(
+		    session, block, &offset, (off_t)align_size));
+	else
+		WT_ERR(__wt_block_alloc(
+		    session, block, &offset, (off_t)align_size));
+	if ((ret =
+	    __wt_write(session, block->fh, offset, align_size, dsk)) != 0) {
+		if (!force_extend)
+			__wt_spin_lock(session, &block->live_lock);
+		(void)__wt_block_off_free(session, block, offset, align_size);
+		if (!force_extend)
+			__wt_spin_unlock(session, &block->live_lock);
+		WT_ERR(ret);
+	}
 
 	WT_BSTAT_INCR(session, page_write);
 	WT_CSTAT_INCR(session, block_write);
 
-	WT_VERBOSE(session, write,
+	WT_VERBOSE_ERR(session, write,
 	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
 	    (uintmax_t)offset, align_size, blk->cksum);
 
diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c
index c4865f3c034..1ae10ced600 100644
--- a/src/btree/bt_bulk.c
+++ b/src/btree/bt_bulk.c
@@ -16,8 +16,8 @@ static int __bulk_row_keycmp_err(WT_CURSOR_BULK *);
 int
 __wt_bulk_init(WT_CURSOR_BULK *cbulk)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
 
@@ -130,23 +130,29 @@ __wt_bulk_end(WT_CURSOR_BULK *cbulk)
 static int
 __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
 {
-	WT_ITEM a, b;
 	WT_CURSOR *cursor;
+	WT_DECL_ITEM(a);
+	WT_DECL_ITEM(b);
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 
 	session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
 	cursor = &cbulk->cbt.iface;
 
-	WT_CLEAR(a);
-	WT_CLEAR(b);
+	WT_ERR(__wt_scr_alloc(session, 512, &a));
+	WT_ERR(__wt_scr_alloc(session, 512, &b));
 
-	WT_RET(__wt_buf_set_printable(
-	    session, &a, cursor->key.data, cursor->key.size));
-	WT_RET(__wt_buf_set_printable(
-	    session, &b, cbulk->cmp.data, cbulk->cmp.size));
+	WT_ERR(__wt_buf_set_printable(
+	    session, a, cursor->key.data, cursor->key.size));
+	WT_ERR(__wt_buf_set_printable(
+	    session, b, cbulk->cmp.data, cbulk->cmp.size));
 
-	WT_RET_MSG(session, EINVAL,
+	WT_ERR_MSG(session, EINVAL,
 	    "bulk-load presented with out-of-order keys: %.*s compares smaller "
 	    "than previously inserted key %.*s",
-	    (int)a.size, (char *)a.data, (int)b.size, (char *)b.data);
+	    (int)a->size, (char *)a->data, (int)b->size, (char *)b->data);
+
+err:	__wt_scr_free(&a);
+	__wt_scr_free(&b);
+	return (ret);
 }
diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c
index 44b63890d2b..f18855b74fb 100644
--- a/src/btree/bt_cache.c
+++ b/src/btree/bt_cache.c
@@ -16,11 +16,10 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
 {
 	WT_CACHE *cache;
 	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	session = &conn->default_session;
-	ret = 0;
+	session = conn->default_session;
 
 	WT_RET(__wt_calloc_def(session, 1, &conn->cache));
 	cache = conn->cache;
@@ -41,11 +40,11 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
 
 	WT_ERR(__wt_cond_alloc(session,
 	    "cache eviction server", 1, &cache->evict_cond));
-	__wt_spin_init(session, &cache->lru_lock);
+	__wt_spin_init(session, &cache->evict_lock);
 
 	/*
-	 * Allocate the eviction request array.  We size it to allow one
-	 * eviction request request per session.
+	 * Allocate the forced page eviction request array.  We size it to
+	 * allow one eviction page request per session.
 	 */
 	cache->max_evict_request = conn->session_size;
 	WT_ERR(__wt_calloc_def(
@@ -91,7 +90,7 @@ __wt_cache_destroy(WT_CONNECTION_IMPL *conn)
 	WT_SESSION_IMPL *session;
 	WT_CACHE *cache;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 	cache = conn->cache;
 
 	if (cache == NULL)
@@ -99,8 +98,9 @@ __wt_cache_destroy(WT_CONNECTION_IMPL *conn)
 
 	if (cache->evict_cond != NULL)
 		(void)__wt_cond_destroy(session, cache->evict_cond);
-	__wt_spin_destroy(session, &cache->lru_lock);
+	__wt_spin_destroy(session, &cache->evict_lock);
 
 	__wt_free(session, cache->evict_request);
+
 	__wt_free(session, conn->cache);
 }
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index d607da2f725..91007903477 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -15,7 +15,10 @@ static inline int
 __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
 {
 	WT_ITEM *val;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	val = &cbt->iface.value;
 
 	if (newpage) {
@@ -26,12 +29,27 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
 		    (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
 			return (WT_NOTFOUND);
 
+	/*
+	 * Column store appends are inherently non-transactional.
+	 *
+	 * Even a non-visible update by a concurrent or aborted transaction
+	 * changes the effective end of the data.  The effect is subtle because
+	 * of the blurring between deleted and empty values, but ideally we
+	 * would skip all uncommitted changes at the end of the data.
+	 *
+	 * The problem is that we don't know at this point whether there may be
+	 * multiple uncommitted changes at the end of the data, and it would be
+	 * expensive to check every time we hit an aborted update.  If an
+	 * insert is aborted, we simply return zero (empty), regardless of
+	 * whether we are at the end of the data.
+	 */
 	cbt->iface.recno = ++cbt->recno;
-	if (cbt->recno < WT_INSERT_RECNO(cbt->ins)) {
+	if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
+	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
 		cbt->v = 0;
 		val->data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(cbt->ins->upd);
+		val->data = WT_UPDATE_DATA(upd);
 	val->size = 1;
 	return (0);
 }
@@ -47,6 +65,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
 	WT_INSERT *ins;
 	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 	uint64_t *recnop;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -73,8 +92,9 @@ new_page:	*recnop = cbt->recno;
 
 		/* Check any insert list for a matching record. */
 		if ((ins = __col_insert_search_match(
-		    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL) {
-			val->data = WT_UPDATE_DATA(ins->upd);
+		    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL &&
+		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
+			val->data = WT_UPDATE_DATA(upd);
 			val->size = 1;
 			return (0);
 		}
@@ -95,11 +115,15 @@ static inline int
 __cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
 {
 	WT_ITEM *val;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	val = &cbt->iface.value;
 
 	if (newpage) {
 		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+		WT_ASSERT(session, cbt->ins != NULL);
 		goto new_page;
 	}
 
@@ -108,10 +132,11 @@ __cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
 			return (WT_NOTFOUND);
 
 new_page:	cbt->iface.recno = WT_INSERT_RECNO(cbt->ins);
-		if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd))
+		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+		    WT_UPDATE_DELETED_ISSET(upd))
 			continue;
-		val->data = WT_UPDATE_DATA(cbt->ins->upd);
-		val->size = cbt->ins->upd->size;
+		val->data = WT_UPDATE_DATA(upd);
+		val->size = upd->size;
 		break;
 	}
 	return (0);
@@ -130,6 +155,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
 	WT_INSERT *ins;
 	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 	uint64_t *recnop;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -158,18 +184,21 @@ new_page:	*recnop = cbt->recno;
 
 		/* Check any insert list for a matching record. */
 		if ((ins = __col_insert_search_match(
-		    WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL) {
-			if (WT_UPDATE_DELETED_ISSET(ins->upd))
+		    WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL &&
+		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
+			if (WT_UPDATE_DELETED_ISSET(upd))
 				continue;
-			val->data = WT_UPDATE_DATA(ins->upd);
-			val->size = ins->upd->size;
+			val->data = WT_UPDATE_DATA(upd);
+			val->size = upd->size;
 			return (0);
 		}
 
 		/*
 		 * If we're at the same slot as the last reference and there's
-		 * no matching insert list item, re-use the return information.
-		 * Otherwise, unpack the cell and build the return information.
+		 * no matching insert list item, re-use the return information
+		 * (so encoded items with large repeat counts aren't repeatedly
+		 * decoded).  Otherwise, unpack the cell and build the return
+		 * information.
 		 */
 		if (cbt->cip_saved != cip) {
 			if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL)
@@ -205,12 +234,15 @@ new_page:	*recnop = cbt->recno;
 static inline int
 __cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
 {
+	WT_INSERT *ins;
 	WT_ITEM *key, *val;
 	WT_ROW *rip;
+	WT_SESSION_IMPL *session;
 	WT_UPDATE *upd;
 
 	key = &cbt->iface.key;
 	val = &cbt->iface.value;
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 
 	/*
 	 * For row-store pages, we need a single item that tells us the part
@@ -240,12 +272,12 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
 		if (cbt->ins != NULL)
 			cbt->ins = WT_SKIP_NEXT(cbt->ins);
 
-new_insert:	if (cbt->ins != NULL) {
-			upd = cbt->ins->upd;
-			if (WT_UPDATE_DELETED_ISSET(upd))
+new_insert:	if ((ins = cbt->ins) != NULL) {
+			if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+			    WT_UPDATE_DELETED_ISSET(upd))
 				continue;
-			key->data = WT_INSERT_KEY(cbt->ins);
-			key->size = WT_INSERT_KEY_SIZE(cbt->ins);
+			key->data = WT_INSERT_KEY(ins);
+			key->size = WT_INSERT_KEY_SIZE(ins);
 			val->data = WT_UPDATE_DATA(upd);
 			val->size = upd->size;
 			return (0);
@@ -270,7 +302,7 @@ new_insert:	if (cbt->ins != NULL) {
 		cbt->ins = NULL;
 
 		rip = &cbt->page->u.row.d[cbt->slot / 2 - 1];
-		upd = WT_ROW_UPDATE(cbt->page, rip);
+		upd = __wt_txn_read(session, WT_ROW_UPDATE(cbt->page, rip));
 		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
 			continue;
 
@@ -351,8 +383,9 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
 int
 __wt_btcur_next(WT_CURSOR_BTREE *cbt)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int newpage, ret;
+	int newpage;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	WT_BSTAT_INCR(session, cursor_read_next);
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 0347a3f836a..dff2b5361e9 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -107,8 +107,8 @@ restart:
 	}
 
 	/* If we found a previous node, the next one must be current. */
-	WT_ASSERT(session,
-	    cbt->ins_stack[0] == NULL || *cbt->ins_stack[0] == current);
+	if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
+		goto restart;
 
 	cbt->ins = PREV_INS(cbt, 0);
 }
@@ -121,7 +121,10 @@ static inline int
 __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
 {
 	WT_ITEM *val;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	val = &cbt->iface.value;
 
 	if (newpage) {
@@ -137,12 +140,21 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
 		--cbt->recno;
 	}
 
+	/*
+	 * Column store appends are inherently non-transactional.
+	 *
+	 * Even a non-visible update by a concurrent or aborted transaction
+	 * changes the effective end of the data.  The effect is subtle because
+	 * of the blurring between deleted and empty values, but ideally we
+	 * would skip all uncommitted changes at the end of the data.
+	 */
 	cbt->iface.recno = cbt->recno;
-	if (cbt->recno > WT_INSERT_RECNO(cbt->ins)) {
+	if (cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
+	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
 		cbt->v = 0;
 		val->data = &cbt->v;
 	} else
-		val->data = WT_UPDATE_DATA(cbt->ins->upd);
+		val->data = WT_UPDATE_DATA(upd);
 	val->size = 1;
 	return (0);
 }
@@ -158,6 +170,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
 	WT_INSERT *ins;
 	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 	uint64_t *recnop;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -184,8 +197,9 @@ new_page:	*recnop = cbt->recno;
 
 		/* Check any insert list for a matching record. */
 		if ((ins = cbt->ins = __col_insert_search_match(
-		    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL) {
-			val->data = WT_UPDATE_DATA(ins->upd);
+		    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL &&
+		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
+			val->data = WT_UPDATE_DATA(upd);
 			val->size = 1;
 			return (0);
 		}
@@ -206,7 +220,10 @@ static inline int
 __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
 {
 	WT_ITEM *val;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	val = &cbt->iface.value;
 
 	if (newpage) {
@@ -220,10 +237,11 @@ new_page:	if (cbt->ins == NULL)
 			return (WT_NOTFOUND);
 
 		cbt->iface.recno = WT_INSERT_RECNO(cbt->ins);
-		if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd))
+		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+		    WT_UPDATE_DELETED_ISSET(upd))
 			continue;
-		val->data = WT_UPDATE_DATA(cbt->ins->upd);
-		val->size = cbt->ins->upd->size;
+		val->data = WT_UPDATE_DATA(upd);
+		val->size = upd->size;
 		break;
 	}
 	return (0);
@@ -242,6 +260,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
 	WT_INSERT *ins;
 	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 	uint64_t *recnop;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -270,19 +289,22 @@ new_page:	*recnop = cbt->recno;
 
 		/* Check any insert list for a matching record. */
 		if ((ins = __col_insert_search_match(
-		    WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL) {
-			if (WT_UPDATE_DELETED_ISSET(ins->upd))
+		    WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL &&
+		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
+			if (WT_UPDATE_DELETED_ISSET(upd))
 				continue;
 			cbt->ins = ins;
-			val->data = WT_UPDATE_DATA(ins->upd);
-			val->size = ins->upd->size;
+			val->data = WT_UPDATE_DATA(upd);
+			val->size = upd->size;
 			return (0);
 		}
 
 		/*
 		 * If we're at the same slot as the last reference and there's
-		 * no matching insert list item, re-use the return information.
-		 * Otherwise, unpack the cell and build the return information.
+		 * no matching insert list item, re-use the return information
+		 * (so encoded items with large repeat counts aren't repeatedly
+		 * decoded).  Otherwise, unpack the cell and build the return
+		 * information.
 		 */
 		if (cbt->cip_saved != cip) {
 			if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL)
@@ -368,8 +390,8 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
 			__cursor_skip_prev(cbt);
 
 new_insert:	if ((ins = cbt->ins) != NULL) {
-			upd = ins->upd;
-			if (WT_UPDATE_DELETED_ISSET(upd))
+			if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+			    WT_UPDATE_DELETED_ISSET(upd))
 				continue;
 			key->data = WT_INSERT_KEY(ins);
 			key->size = WT_INSERT_KEY_SIZE(ins);
@@ -398,7 +420,7 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 		cbt->ins = NULL;
 
 		rip = &cbt->page->u.row.d[cbt->slot / 2 - 1];
-		upd = WT_ROW_UPDATE(cbt->page, rip);
+		upd = __wt_txn_read(session, WT_ROW_UPDATE(cbt->page, rip));
 		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
 			continue;
 
@@ -414,8 +436,9 @@ new_insert:	if ((ins = cbt->ins) != NULL) {
 int
 __wt_btcur_prev(WT_CURSOR_BTREE *cbt)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int newpage, ret;
+	int newpage;
 
 	session = (WT_SESSION_IMPL *)cbt->iface.session;
 	WT_BSTAT_INCR(session, cursor_read_prev);
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 132c6da755d..834ddeafadf 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -60,14 +60,17 @@ __cursor_invalid(WT_CURSOR_BTREE *cbt)
 	WT_COL *cip;
 	WT_INSERT *ins;
 	WT_PAGE *page;
+	WT_SESSION_IMPL *session;
+	WT_UPDATE *upd;
 
 	btree = cbt->btree;
 	ins = cbt->ins;
 	page = cbt->page;
+	session = (WT_SESSION_IMPL *)cbt->iface.session;
 
 	/* If we found an item on an insert list, check there. */
-	if (ins != NULL)
-		return (WT_UPDATE_DELETED_ISSET(ins->upd) ? 1 : 0);
+	if (ins != NULL && (upd = __wt_txn_read(session, ins->upd)) != NULL)
+		return (WT_UPDATE_DELETED_ISSET(upd) ? 1 : 0);
 
 	/* The page may be empty, the search routine doesn't check. */
 	if (page->entries == 0)
@@ -86,9 +89,9 @@ __cursor_invalid(WT_CURSOR_BTREE *cbt)
 			return (1);
 		break;
 	case BTREE_ROW:
-		if (page->u.row.upd != NULL &&
-		    page->u.row.upd[cbt->slot] != NULL &&
-		    WT_UPDATE_DELETED_ISSET(page->u.row.upd[cbt->slot]))
+		if (page->u.row.upd != NULL && (upd = __wt_txn_read(session,
+		    page->u.row.upd[cbt->slot])) != NULL &&
+		    WT_UPDATE_DELETED_ISSET(upd))
 			return (1);
 		break;
 	}
@@ -122,9 +125,9 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
 {
 	WT_BTREE *btree;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	btree = cbt->btree;
 	cursor = &cbt->iface;
@@ -167,10 +170,10 @@ int
 __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
 {
 	WT_BTREE *btree;
-	WT_ITEM *val;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_ITEM *val;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	btree = cbt->btree;
 	cursor = &cbt->iface;
@@ -236,8 +239,8 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
 {
 	WT_BTREE *btree;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	btree = cbt->btree;
 	cursor = &cbt->iface;
@@ -306,7 +309,7 @@ retry:	__cursor_func_init(cbt, 1);
 		if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
 			goto retry;
 		break;
-	WT_ILLEGAL_VALUE(session);
+	WT_ILLEGAL_VALUE_ERR(session);
 	}
 
 err:	__cursor_func_resolve(cbt, ret);
@@ -323,8 +326,8 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
 {
 	WT_BTREE *btree;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	btree = cbt->btree;
 	cursor = &cbt->iface;
@@ -361,7 +364,7 @@ retry:	__cursor_func_init(cbt, 1);
 		else if ((ret = __wt_row_modify(session, cbt, 1)) == WT_RESTART)
 			goto retry;
 		break;
-	WT_ILLEGAL_VALUE(session);
+	WT_ILLEGAL_VALUE_ERR(session);
 	}
 
 err:	__cursor_func_resolve(cbt, ret);
@@ -378,8 +381,8 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
 {
 	WT_BTREE *btree;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	btree = cbt->btree;
 	cursor = &cbt->iface;
@@ -423,7 +426,7 @@ retry:	__cursor_func_init(cbt, 1);
 		else if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
 			goto retry;
 		break;
-	WT_ILLEGAL_VALUE(session);
+	WT_ILLEGAL_VALUE_ERR(session);
 	}
 
 err:	__cursor_func_resolve(cbt, ret);
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index f4963ad516e..2ae7814c6df 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -112,7 +112,7 @@ __dmsg_wrapup(WT_DBG *ds)
 	 */
 	if (msg != NULL) {
 		if (msg->size != 0)
-			__wt_msg(session, "%s", (char *)msg->mem);
+			(void)__wt_msg(session, "%s", (char *)msg->mem);
 		__wt_scr_free(&ds->msg);
 	}
 
@@ -170,7 +170,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...)
 		}
 		if (((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
 			((uint8_t *)msg->mem)[msg->size - 1] = '\0';
-			__wt_msg(session, "%s", (char *)msg->mem);
+			(void)__wt_msg(session, "%s", (char *)msg->mem);
 			msg->size = 0;
 		}
 	} else {
@@ -182,21 +182,38 @@ __dmsg(WT_DBG *ds, const char *fmt, ...)
 
 /*
  * __wt_debug_addr --
- *	Read and dump a disk page in debugging mode.
+ *	Read and dump a disk page in debugging mode, using an addr/size pair.
  */
 int
-__wt_debug_addr(
-    WT_SESSION_IMPL *session, uint32_t addr, uint32_t size, const char *ofile)
+__wt_debug_addr(WT_SESSION_IMPL *session,
+    const uint8_t *addr, uint32_t addr_size, const char *ofile)
 {
-	WT_ITEM *buf;
-	int ret;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 
-	buf = NULL;
-	ret = 0;
+	WT_RET(__wt_scr_alloc(session, 1024, &buf));
+	WT_ERR(__wt_block_read(
+	    session, session->btree->block, buf, addr, addr_size));
+	ret = __wt_debug_disk(session, buf->mem, ofile);
+err:	__wt_scr_free(&buf);
+
+	return (ret);
+}
+
+/*
+ * __wt_debug_off --
+ *	Read and dump a disk page in debugging mode, using an offset/size pair.
+ */
+int
+__wt_debug_off(
+    WT_SESSION_IMPL *session, uint32_t offset, uint32_t size, const char *ofile)
+{
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 
 	WT_RET(__wt_scr_alloc(session, size, &buf));
-	WT_ERR(__wt_block_read(
-	    session, session->btree->block, buf, addr, size, 0));
+	WT_ERR(__wt_block_read_off(
+	    session, session->btree->block, buf, offset, size, 0));
 	ret = __wt_debug_disk(session, buf->mem, ofile);
 err:	__wt_scr_free(&buf);
 
@@ -212,9 +229,7 @@ __wt_debug_disk(
     WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, const char *ofile)
 {
 	WT_DBG *ds, _ds;
-	int ret;
-
-	ret = 0;
+	WT_DECL_RET;
 
 	ds = &_ds;
 	WT_RET(__debug_config(session, ds, ofile));
@@ -329,7 +344,7 @@ int
 __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
 {
 	WT_DBG *ds, _ds;
-	int ret;
+	WT_DECL_RET;
 
 	ds = &_ds;
 	WT_RET(__debug_config(session, ds, ofile));
@@ -350,7 +365,7 @@ __debug_tree(
     WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags)
 {
 	WT_DBG *ds, _ds;
-	int ret;
+	WT_DECL_RET;
 
 	ds = &_ds;
 	WT_RET(__debug_config(session, ds, ofile));
@@ -477,6 +492,7 @@ __debug_page_modify(WT_DBG *ds, WT_PAGE *page)
 	WT_PAGE_TRACK *track;
 	WT_SESSION_IMPL *session;
 	uint32_t i;
+	char buf[64];
 
 	session = ds->session;
 
@@ -510,27 +526,13 @@ __debug_page_modify(WT_DBG *ds, WT_PAGE *page)
 
 	if (mod->track_entries != 0)
 		__dmsg(ds, "\t" "tracking list:\n");
-	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
-		switch (track->type) {
-		case WT_PT_BLOCK:
-			__dmsg(ds, "\t\t" "block");
-			break;
-		case WT_PT_BLOCK_EVICT:
-			__dmsg(ds, "\t\t" "block-evict");
-			break;
-		case WT_PT_OVFL:
-			__dmsg(ds, "\t\t" "overflow (on)");
-			break;
-		case WT_PT_OVFL_DISCARD:
-			__dmsg(ds, "\t\t" "overflow (off)");
-			break;
-		case WT_PT_EMPTY:
-			continue;
-		WT_ILLEGAL_VALUE(session);
+	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
+		if (F_ISSET(track, WT_TRK_OBJECT)) {
+			__dmsg(ds, "\t\t%s %s\n",
+			    __wt_track_string(track, buf, sizeof(buf)),
+			    __wt_addr_string(session,
+			    ds->tmp, track->addr.addr, track->addr.size));
 		}
-		__dmsg(ds, " %s\n", __wt_addr_string(
-		    session, ds->tmp, track->addr.addr, track->addr.size));
-	}
 
 	return (0);
 }
@@ -688,6 +690,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
 	WT_ROW *rip;
 	WT_UPDATE *upd;
 	uint32_t i;
+	void *ripkey;
 
 	unpack = &_unpack;
 
@@ -700,10 +703,11 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
 
 	/* Dump the page's K/V pairs. */
 	WT_ROW_FOREACH(page, rip, i) {
-		if (__wt_off_page(page, rip->key))
-			__debug_ikey(ds, rip->key);
+		ripkey = WT_ROW_KEY_COPY(rip);
+		if (__wt_off_page(page, ripkey))
+			__debug_ikey(ds, ripkey);
 		else {
-			__wt_cell_unpack(rip->key, unpack);
+			__wt_cell_unpack(ripkey, unpack);
 			WT_RET(__debug_cell_data(ds, "K", unpack));
 		}
 
@@ -742,7 +746,7 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, int hexbyte)
 
 /*
  * __debug_row_skip --
- *	Dump an insert array.
+ *	Dump an insert list.
  */
 static void
 __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
@@ -758,7 +762,7 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
 
 /*
  * __debug_update --
- *	Dump an update array.
+ *	Dump an update list.
  */
 static void
 __debug_update(WT_DBG *ds, WT_UPDATE *upd, int hexbyte)
@@ -829,32 +833,43 @@ __debug_ref(WT_DBG *ds, WT_REF *ref, WT_PAGE *page)
 static int
 __debug_cell(WT_DBG *ds, WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
 {
-	WT_ITEM *buf;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = ds->session;
-	buf = NULL;
 
 	__dmsg(ds, "\t%s: len %" PRIu32,
 	    __wt_cell_type_string(unpack->raw), unpack->size);
 
-	switch (unpack->type) {
-	case WT_CELL_DEL:
-	case WT_CELL_VALUE:
-		/*
-		 * Column-store internal page value cells include a record
-		 * number, column-store leaf page value cells include a RLE;
-		 * row-store leaf page value cells have no associated value.
-		 */
-		if (unpack->v != 0)
-			__dmsg(ds, ", %s: %" PRIu64,
-			    dsk->type == WT_PAGE_COL_INT ? "recno" : "rle",
-			    unpack->v);
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+		switch (unpack->type) {
+		case WT_CELL_VALUE:
+			__dmsg(ds, ", recno: %" PRIu64, unpack->v);
+			break;
+		}
 		break;
-	case WT_CELL_KEY:
-		__dmsg(ds, ", pfx: %" PRIu8, unpack->prefix);
+	case WT_PAGE_COL_VAR:
+		switch (unpack->type) {
+		case WT_CELL_DEL:
+		case WT_CELL_VALUE:
+		case WT_CELL_VALUE_OVFL:
+			__dmsg(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack));
+			break;
+		}
 		break;
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		switch (unpack->type) {
+		case WT_CELL_KEY:
+			__dmsg(ds, ", pfx: %" PRIu8, unpack->prefix);
+			break;
+		}
+		break;
+	}
+
+	switch (unpack->type) {
 	case WT_CELL_ADDR:
 	case WT_CELL_KEY_OVFL:
 	case WT_CELL_VALUE_OVFL:
@@ -867,7 +882,6 @@ __debug_cell(WT_DBG *ds, WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
 		__wt_scr_free(&buf);
 		WT_RET(ret);
 		break;
-	WT_ILLEGAL_VALUE(session);
 	}
 	__dmsg(ds, "\n");
 
@@ -881,13 +895,11 @@ __debug_cell(WT_DBG *ds, WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
 static int
 __debug_cell_data(WT_DBG *ds, const char *tag, WT_CELL_UNPACK *unpack)
 {
-	WT_ITEM *buf;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = ds->session;
-	buf = NULL;
-	ret = 0;
 
 	/*
 	 * Column-store references to deleted cells return a NULL cell
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index a564e3cbc9c..c640e0cb96f 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -7,7 +7,7 @@
 
 #include "wt_internal.h"
 
-static void __free_page_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
 static void __free_page_col_int(WT_SESSION_IMPL *, WT_PAGE *);
 static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
 static void __free_page_row_int(WT_SESSION_IMPL *, WT_PAGE *);
@@ -22,37 +22,55 @@ static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
  *	Discard an in-memory page, freeing all memory associated with it.
  */
 void
-__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
+__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags)
 {
+	WT_PAGE *page;
+	WT_PAGE_MODIFY *mod;
+
 	/*
 	 * When a page is discarded, it's been disconnected from its parent and
-	 * parent's WT_REF structure may now point to a different page.   Make
-	 * sure we don't use any of that information by accident.
+	 * its parent's WT_REF structure may now point to a different page.
+	 * Make sure we don't accidentally use the page itself or any other
+	 * information.
 	 */
+	page = *pagep;
+	*pagep = NULL;
 	page->parent = NULL;
 	page->ref = NULL;
 
 	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
 
-	/* If not a split merged into its parent, the page must be clean. */
-	WT_ASSERT(session,
-	    !__wt_page_is_modified(page) ||
-	    F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE));
-
 #ifdef HAVE_DIAGNOSTIC
-	__wt_hazard_validate(session, page);
+	{
+	WT_HAZARD *hp;
+	if ((hp = __wt_page_hazard_check(session, page)) != NULL)
+		__wt_errx(session,
+		    "discarded page has hazard reference: (%p: %s, line %d)",
+		    hp->page, hp->file, hp->line);
+	}
 #endif
-
 	/*
-	 * If this page has a memory footprint associated with it, update
-	 * the cache information.
+	 * Pages without a memory footprint aren't associated with the cache
+	 * and were never counted as "pages read".  If the page has a memory
+	 * footprint, update the cache information based on the discard.
 	 */
 	if (page->memory_footprint != 0)
 		__wt_cache_page_evict(session, page);
 
+	/* Clean up page modifications. */
+	if ((mod = page->modify) != NULL) {
+		/*
+		 * If the page split, there may be one or more pages linked
+		 * from the page; walk the list, discarding pages.
+		 */
+		if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_SPLIT)
+			__wt_page_out(session, &mod->u.split, 0);
+
+		__free_page_modify(session, page);
+	}
+
 	switch (page->type) {
 	case WT_PAGE_COL_FIX:
-		__free_page_col_fix(session, page);
 		break;
 	case WT_PAGE_COL_INT:
 		__free_page_col_int(session, page);
@@ -71,36 +89,37 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
 	if (!LF_ISSET(WT_PAGE_FREE_IGNORE_DISK))	/* Disk image */
 		__wt_free(session, page->dsk);
 
-	if (page->modify != NULL) {			/* WT_PAGE_MODIFY */
-		__wt_free(session, page->modify->track);
-		__wt_free(session, page->modify);
-	}
-
-#ifdef HAVE_DIAGNOSTIC
-	memset(page, WT_DEBUG_BYTE, sizeof(WT_PAGE));
-#endif
-	__wt_free(session, page);
+	__wt_overwrite_and_free(session, page);
 }
 
 /*
- * __free_page_col_fix --
- *	Discard a WT_PAGE_COL_FIX page.
+ * __free_page_modify --
+ *	Discard the page's associated modification structures.
  */
 static void
-__free_page_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
+__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_INSERT_HEAD *append;
+	WT_PAGE_MODIFY *mod;
+
+	mod = page->modify;
 
 	/* Free the append array. */
 	if ((append = WT_COL_APPEND(page)) != NULL) {
 		__free_skip_list(session, WT_SKIP_FIRST(append));
 		__wt_free(session, append);
-		__wt_free(session, page->modify->append);
+		__wt_free(session, mod->append);
 	}
 
-	/* Free the update array. */
-	if (page->modify != NULL && page->modify->update != NULL)
-		__free_skip_array(session, page->modify->update, 1);
+	/* Free the insert/update array. */
+	if (mod->update != NULL)
+		__free_skip_array(session, mod->update,
+		    page->type == WT_PAGE_COL_FIX ? 1 : page->entries);
+
+	/* Discard any objects the page was tracking plus associated memory. */
+	__wt_rec_track_discard(session, page);
+	__wt_free(session, mod->track);
+	__wt_free(session, page->modify);
 }
 
 /*
@@ -135,24 +154,11 @@ __free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 static void
 __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
-	WT_INSERT_HEAD *append;
-
 	/* Free the in-memory index array. */
 	__wt_free(session, page->u.col_var.d);
 
 	/* Free the RLE lookup array. */
 	__wt_free(session, page->u.col_var.repeats);
-
-	/* Free the append array. */
-	if ((append = WT_COL_APPEND(page)) != NULL) {
-		__free_skip_list(session, WT_SKIP_FIRST(append));
-		__wt_free(session, append);
-		__wt_free(session, page->modify->append);
-	}
-
-	/* Free the insert array. */
-	if (page->modify != NULL && page->modify->update != NULL)
-		__free_skip_array(session, page->modify->update, page->entries);
 }
 
 /*
@@ -204,9 +210,11 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
 	 * points somewhere other than the original page), and if so, free
 	 * the memory.
 	 */
-	WT_ROW_FOREACH(page, rip, i)
-		if ((ikey = rip->key) != NULL && __wt_off_page(page, ikey))
+	WT_ROW_FOREACH(page, rip, i) {
+		ikey = WT_ROW_KEY_COPY(rip);
+		if (ikey != NULL && __wt_off_page(page, ikey))
 			__wt_free(session, ikey);
+	}
 	__wt_free(session, page->u.row.d);
 
 	/*
@@ -244,7 +252,7 @@ __free_skip_array(
 			__wt_free(session, *head);
 		}
 
-	/* Free the page's array of inserts. */
+	/* Free the header array. */
 	__wt_free(session, head_arg);
 }
 
@@ -284,7 +292,7 @@ __free_update(
 		if (*updp != NULL)
 			__free_update_list(session, *updp);
 
-	/* Free the page's array of updates. */
+	/* Free the update array. */
 	__wt_free(session, update_head);
 }
 
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index ac8f888f2a5..a478cf16edf 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -7,12 +7,13 @@
 
 #include "wt_internal.h"
 
-static int  __evict_file(WT_SESSION_IMPL *, WT_EVICT_REQ *);
+static void __evict_clear_tree_walk(WT_SESSION_IMPL *, WT_PAGE *);
+static int  __evict_file_request(WT_SESSION_IMPL *, int);
+static int  __evict_file_request_walk(WT_SESSION_IMPL *);
 static int  __evict_lru(WT_SESSION_IMPL *);
 static int  __evict_lru_cmp(const void *, const void *);
 static void __evict_lru_sort(WT_SESSION_IMPL *);
-static void __evict_pages(WT_SESSION_IMPL *);
-static int  __evict_request_walk(WT_SESSION_IMPL *);
+static int  __evict_page_request_walk(WT_SESSION_IMPL *);
 static int  __evict_walk(WT_SESSION_IMPL *);
 static int  __evict_walk_file(WT_SESSION_IMPL *, u_int *);
 static int  __evict_worker(WT_SESSION_IMPL *);
@@ -27,7 +28,7 @@ static int  __evict_worker(WT_SESSION_IMPL *);
 
 /*
  * WT_EVICT_REQ_FOREACH --
- *	Walk a list of eviction requests.
+ *	Walk the list of forced page eviction requests.
  */
 #define	WT_EVICT_REQ_FOREACH(er, er_end, cache)				\
 	for ((er) = (cache)->evict_request,				\
@@ -35,11 +36,11 @@ static int  __evict_worker(WT_SESSION_IMPL *);
 	    (er) < (er_end); ++(er))
 
 /*
- * __evict_clr --
- *	Clear an entry in the eviction list.
+ * __evict_list_clr --
+ *	Clear an entry in the LRU eviction list.
  */
 static inline void
-__evict_clr(WT_SESSION_IMPL *session, WT_EVICT_LIST *e)
+__evict_list_clr(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
 {
 	if (e->page != NULL) {
 		WT_ASSERT(session, F_ISSET_ATOMIC(e->page, WT_PAGE_EVICT_LRU));
@@ -50,35 +51,35 @@ __evict_clr(WT_SESSION_IMPL *session, WT_EVICT_LIST *e)
 }
 
 /*
- * __evict_clr_all --
- *	Clear all entries in the eviction list.
+ * __evict_list_clr_all --
+ *	Clear all entries in the LRU eviction list.
  */
 static inline void
-__evict_clr_all(WT_SESSION_IMPL *session, u_int start)
+__evict_list_clr_all(WT_SESSION_IMPL *session, u_int start)
 {
 	WT_CACHE *cache;
-	WT_EVICT_LIST *evict;
+	WT_EVICT_ENTRY *evict;
 	uint32_t i, elem;
 
 	cache = S2C(session)->cache;
 
 	elem = cache->evict_entries;
 	for (i = start, evict = cache->evict + i; i < elem; i++, evict++)
-		__evict_clr(session, evict);
+		__evict_list_clr(session, evict);
 }
 
 /*
- * __wt_evict_clr_page --
- *	Make sure a page is not in the eviction request list.  This called
- *	from inside __rec_review to make sure there is no attempt to evict
- *	child pages multiple times.
+ * __wt_evict_list_clr_page --
+ *	Make sure a page is not in the LRU eviction list.  This is called
+ *	from the page eviction code to make sure there is no attempt to evict
+ *	a child page multiple times.
  */
 void
-__wt_evict_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page)
+__wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_CACHE *cache;
-	WT_EVICT_LIST *evict;
-	int i, elem;
+	WT_EVICT_ENTRY *evict;
+	uint32_t i, elem;
 
 	WT_ASSERT(session, WT_PAGE_IS_ROOT(page) ||
 	    page->ref->page != page ||
@@ -89,57 +90,48 @@ __wt_evict_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page)
 		return;
 
 	cache = S2C(session)->cache;
-	__wt_spin_lock(session, &cache->lru_lock);
+	__wt_spin_lock(session, &cache->evict_lock);
 
 	elem = cache->evict_entries;
 	for (evict = cache->evict, i = 0; i < elem; i++, evict++)
 		if (evict->page == page) {
-			__evict_clr(session, evict);
+			__evict_list_clr(session, evict);
 			break;
 		}
 
 	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
 
-	__wt_spin_unlock(session, &cache->lru_lock);
+	__wt_spin_unlock(session, &cache->evict_lock);
 }
 
 /*
  * __evict_req_set --
- *	Set an entry in the eviction request list.
+ *	Set an entry in the forced page eviction request list.
  */
 static inline void
-__evict_req_set(
-    WT_SESSION_IMPL *session, WT_EVICT_REQ *r, WT_PAGE *page, uint32_t flags)
+__evict_req_set(WT_EVICT_ENTRY *r, WT_BTREE *btree, WT_PAGE *page)
 {
-					/* Should be empty */
-	WT_ASSERT(session, r->session == NULL);
-
-	WT_CLEAR(*r);
-	r->btree = session->btree;
-	r->page = page;
-	r->flags = flags;
-
+	r->btree = btree;
 	/*
 	 * Publish: there must be a barrier to ensure the structure fields are
 	 * set before the eviction thread can see the request.
 	 */
-	WT_PUBLISH(r->session, session);
+	WT_PUBLISH(r->page, page);
 }
 
 /*
  * __evict_req_clr --
- *	Clear an entry in the eviction request list.
+ *	Clear an entry in the forced page eviction request list.
  */
 static inline void
-__evict_req_clr(WT_SESSION_IMPL *session, WT_EVICT_REQ *r)
+__evict_req_clr(WT_EVICT_ENTRY *r)
 {
-	WT_UNUSED(session);
-
+	r->btree = NULL;
+	r->page = NULL;
 	/*
-	 * Publish; there must be a barrier to ensure the structure fields are
-	 * set before the entry is made available for re-use.
+	 * No publication necessary, all we care about is the page value and
+	 * whenever it's cleared is fine.
 	 */
-	WT_PUBLISH(r->session, NULL);
 }
 
 /*
@@ -158,7 +150,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
 	bytes_inuse = __wt_cache_bytes_inuse(cache);
 	bytes_max = conn->cache_size;
 
-	WT_VERBOSE(session, evictserver,
+	WT_VERBOSE_VOID(session, evictserver,
 	    "waking, bytes inuse %s max (%" PRIu64 "MB %s %" PRIu64 "MB), ",
 	    bytes_inuse <= bytes_max ? "<=" : ">",
 	    bytes_inuse / WT_MEGABYTE,
@@ -169,50 +161,48 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session)
 }
 
 /*
- * __evict_file_serial_func --
+ * __wt_sync_file_serial_func --
  *	Eviction serialization function called when a tree is being flushed
  *	or closed.
  */
 void
-__wt_evict_file_serial_func(WT_SESSION_IMPL *session)
+__wt_sync_file_serial_func(WT_SESSION_IMPL *session)
 {
 	WT_CACHE *cache;
-	WT_EVICT_REQ *er, *er_end;
-	int discard;
+	int syncop;
 
-	__wt_evict_file_unpack(session, &discard);
+	__wt_sync_file_unpack(session, &syncop);
 
-	cache = S2C(session)->cache;
-
-	/* Find an empty slot and enter the eviction request. */
-	WT_EVICT_REQ_FOREACH(er, er_end, cache)
-		if (er->session == NULL) {
-			__evict_req_set(session,
-			    er, NULL, discard ? WT_EVICT_REQ_CLOSE : 0);
-			return;
-		}
+	/*
+	 * Publish: there must be a barrier to ensure the structure fields are
+	 * set before the eviction thread can see the request.
+	 */
+	WT_PUBLISH(session->syncop, syncop);
 
-	__wt_errx(session, "eviction server request table full");
-	__wt_session_serialize_wrapup(session, NULL, WT_ERROR);
+	/* We're serialized at this point, no lock needed. */
+	cache = S2C(session)->cache;
+	++cache->sync_request;
 }
 
 /*
  * __wt_evict_page_request --
  *	Schedule a page for forced eviction due to a high volume of inserts or
  *	updates.
- *
- *	NOTE: this function is called from inside serialized functions, so it
- *	is holding the serial lock.
  */
-int
+void
 __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_CACHE *cache;
-	WT_EVICT_REQ *er, *er_end;
-	int first;
+	WT_EVICT_ENTRY *er, *er_end;
+	int set;
 
 	cache = S2C(session)->cache;
-	first = 1;
+
+	/* Do a cheap test before acquiring the lock. */
+	if (page->ref->state != WT_REF_MEM)
+		return;
+
+	__wt_spin_lock(session, &cache->evict_lock);
 
 	/*
 	 * Application threads request forced eviction of pages when they
@@ -234,29 +224,32 @@ __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page)
 	 * put it on the request queue because the memory may be freed by the
 	 * time the eviction thread sees it.
 	 */
-	if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_EVICT_FORCE))
-		return (0);
+	if (!WT_ATOMIC_CAS(page->ref->state, WT_REF_MEM, WT_REF_EVICT_FORCE)) {
+		__wt_spin_unlock(session, &cache->evict_lock);
+		return;
+	}
+
+	set = 0;
 
 	/* Find an empty slot and enter the eviction request. */
 	WT_EVICT_REQ_FOREACH(er, er_end, cache)
-		if (er->session == NULL) {
-			/* Always leave one empty slot */
-			if (first) {
-				first = 0;
-				continue;
-			}
-			__evict_req_set(session, er, page, WT_EVICT_REQ_PAGE);
-			__wt_evict_server_wake(session);
-			return (0);
+		if (er->page == NULL) {
+			__evict_req_set(er, session->btree, page);
+			set = 1;
+			break;
 		}
 
-	/*
-	 * The request table is full, that's okay for page requests: another
-	 * thread will see this later.
-	 */
-	WT_VERBOSE(session, evictserver, "eviction server request table full");
-	page->ref->state = WT_REF_MEM;
-	return (WT_RESTART);
+	if (!set) {
+		/*
+		 * The request table is full, that's okay for page requests:
+		 * another thread will see this later.
+		 */
+		WT_VERBOSE_VOID(session, evictserver,
+		    "page eviction request table is full");
+		page->ref->state = WT_REF_MEM;
+	}
+
+	__wt_spin_unlock(session, &cache->evict_lock);
 }
 
 /*
@@ -266,21 +259,22 @@ __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page)
 void *
 __wt_cache_evict_server(void *arg)
 {
+	WT_CACHE *cache;
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	WT_CACHE *cache;
-	int read_lockout, ret;
+	int read_lockout;
 
 	conn = arg;
 	cache = conn->cache;
-	ret = 0;
 
 	/*
 	 * We need a session handle because we're reading/writing pages.
 	 * Start with the default session to keep error handling simple.
 	 */
-	session = &conn->default_session;
+	session = conn->default_session;
 	WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &session));
+	session->name = "session.eviction-server";
 
 	while (F_ISSET(conn, WT_SERVER_RUN)) {
 		/*
@@ -290,18 +284,20 @@ __wt_cache_evict_server(void *arg)
 		__wt_eviction_check(session, &read_lockout, 0);
 
 		if (!read_lockout) {
-			WT_VERBOSE(session, evictserver, "sleeping");
+			WT_VERBOSE_ERR(session, evictserver, "sleeping");
 			__wt_cond_wait(session, cache->evict_cond);
 		}
 
 		if (!F_ISSET(conn, WT_SERVER_RUN))
 			break;
-		WT_VERBOSE(session, evictserver, "waking");
+		WT_VERBOSE_ERR(session, evictserver, "waking");
 
 		/* Evict pages from the cache as needed. */
 		WT_ERR(__evict_worker(session));
 	}
 
+	WT_VERBOSE_ERR(session, evictserver, "exiting");
+
 	if (ret == 0) {
 		if (__wt_cache_bytes_inuse(cache) != 0) {
 			__wt_errx(session,
@@ -313,12 +309,12 @@ __wt_cache_evict_server(void *arg)
 	} else
 err:		__wt_err(session, ret, "eviction server error");
 
-	WT_VERBOSE(session, evictserver, "exiting");
-
 	__wt_free(session, cache->evict);
 
-	if (session != &conn->default_session)
+	if (session != conn->default_session) {
 		(void)session->iface.close(&session->iface, NULL);
+		__wt_free(conn->default_session, session->hazard);
+	}
 
 	return (NULL);
 }
@@ -340,8 +336,23 @@ __evict_worker(WT_SESSION_IMPL *session)
 
 	/* Evict pages from the cache. */
 	for (loop = 0;; loop++) {
-		/* Walk the eviction-request queue. */
-		WT_RET(__evict_request_walk(session));
+		/*
+		 * Block out concurrent eviction while we are handling requests.
+		 */
+		__wt_spin_lock(session, &cache->evict_lock);
+
+		/*
+		 * Walk the eviction-request queue.  It is important to do this
+		 * before closing files, in case a page scheduled for eviction
+		 * is freed by closing a file.
+		 */
+		WT_RET(__evict_page_request_walk(session));
+
+		/* If there is a file sync request, satisfy it. */
+		while (cache->sync_complete != cache->sync_request)
+			WT_RET(__evict_file_request_walk(session));
+
+		__wt_spin_unlock(session, &cache->evict_lock);
 
 		/*
 		 * Keep evicting until we hit the target cache usage.
@@ -363,7 +374,7 @@ __evict_worker(WT_SESSION_IMPL *session)
 		if (bytes_start == bytes_inuse) {
 			if (loop == 10) {
 				WT_STAT_INCR(conn->stats, cache_evict_slow);
-				WT_VERBOSE(session, evictserver,
+				WT_VERBOSE_RET(session, evictserver,
 				    "unable to reach eviction goal");
 				break;
 			}
@@ -374,131 +385,166 @@ __evict_worker(WT_SESSION_IMPL *session)
 }
 
 /*
- * __evict_request_walk --
- *	Walk the eviction request queue.
+ * __evict_clear_tree_walk --
+ *	Clear the tree's current eviction point.
  */
-static int
-__evict_request_walk(WT_SESSION_IMPL *session)
+static void
+__evict_clear_tree_walk(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
-	WT_SESSION_IMPL *request_session;
-	WT_CACHE *cache;
-	WT_EVICT_REQ *er, *er_end;
-	WT_PAGE *page;
 	WT_REF *ref;
-	int ret;
 
-	cache = S2C(session)->cache;
+	/* If no page stack specified, clear the standard eviction stack. */
+	if (page == NULL) {
+		page = session->btree->evict_page;
+		session->btree->evict_page = NULL;
+	}
+
+	/* Clear the current eviction point. */
+	while (page != NULL && !WT_PAGE_IS_ROOT(page)) {
+		ref = page->ref;
+		page = page->parent;
+		if (ref->state == WT_REF_EVICT_WALK)
+			ref->state = WT_REF_MEM;
+	}
+}
+
+/*
+ * __evict_page --
+ *	Evict a given page.
+ */
+static int
+__evict_page(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_DECL_RET;
+	WT_TXN_GLOBAL *txn_global;
+	WT_TXN saved_txn, *txn;
+	int was_running;
 
 	/*
-	 * Walk the eviction request queue, looking for sync/close or page flush
-	 * requests.  If we find a request, perform it, clear the request slot
-	 * and wake up the requesting thread if necessary.
+	 * We have to take care when evicting pages not to write a change that:
+	 *  (a) is not yet committed; or
+	 *  (b) is committed more recently than an in-progress checkpoint.
+	 *
+	 * We handle both of these cases by setting up the transaction context
+	 * before evicting.  If a checkpoint is in progress, copy the
+	 * checkpoint's transaction.  Otherwise, we need a snapshot to avoid
+	 * uncommitted changes.  If a transaction is in progress in the
+	 * evicting session, we save and restore its state.
 	 */
-	WT_EVICT_REQ_FOREACH(er, er_end, cache) {
-		if ((request_session = er->session) == NULL)
-			continue;
+	txn = &session->txn;
+	saved_txn = *txn;
+	was_running = (F_ISSET(txn, TXN_RUNNING) != 0);
 
-		/* Reference the correct WT_BTREE handle. */
-		WT_SET_BTREE_IN_SESSION(session, er->btree);
+	txn_global = &S2C(session)->txn_global;
+	if (was_running)
+		WT_RET(__wt_txn_init(session));
 
-		/*
-		 * Block out concurrent eviction while we are handling this
-		 * request.
-		 */
-		__wt_spin_lock(session, &cache->lru_lock);
+	WT_ERR(__wt_txn_get_snapshot(session, txn_global->ckpt_txnid));
 
-		/*
-		 * The eviction candidate list might reference pages we are
-		 * about to discard; clear it.
-		 */
-		__evict_clr_all(session, 0);
+	ret = __wt_rec_evict(session, page, 0);
 
-		/* Clear the current eviction point. */
-		page = session->btree->evict_page;
-		while (page != NULL && !WT_PAGE_IS_ROOT(page)) {
-			ref = page->ref;
-			page = page->parent;
-			if (ref->state == WT_REF_EVICT_WALK)
-				ref->state = WT_REF_MEM;
-		}
-		session->btree->evict_page = NULL;
+err:	if (was_running) {
+		WT_ASSERT(session, txn->snapshot == NULL ||
+		    txn->snapshot != saved_txn.snapshot);
+		__wt_txn_destroy(session);
+	}
 
-		/*
-		 * Wait for LRU eviction activity to drain.  It is much easier
-		 * to reason about sync or forced eviction if we can be sure
-		 * there are no other threads evicting in the tree.
-		 */
-		while (session->btree->lru_count > 0)
-			__wt_yield();
+	session->txn = saved_txn;
 
-		if (F_ISSET(er, WT_EVICT_REQ_PAGE)) {
-			WT_VERBOSE(session, evictserver,
-			    "forcing eviction of page %p", er->page);
+	return (ret);
+}
 
-			ref = er->page->ref;
-			WT_ASSERT(session, ref->page == er->page);
-			WT_ASSERT(session, ref->state == WT_REF_EVICT_FORCE);
-			ref->state = WT_REF_LOCKED;
+/*
+ * __evict_file_request_walk --
+ *      Walk the session list looking for sync/close requests.  If we find a
+ *      request, perform it, clear the request, and wake up the requesting
+ *      thread.
+ */
+static int
+__evict_file_request_walk(WT_SESSION_IMPL *session)
+{
+	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *request_session;
+	WT_DECL_RET;
+	uint32_t i, session_cnt;
+	int syncop;
 
-			/*
-			 * At this point, the page is locked, which stalls new
-			 * readers.  Pause before attempting to evict it to
-			 * give existing readers a chance to drop their
-			 * references.
-			 */
-			__wt_yield();
+	conn = S2C(session);
+	cache = conn->cache;
 
-			/*
-			 * If eviction fails, it will free up the page: hope it
-			 * works next time.  Application threads may be holding
-			 * a reference while trying to get another (e.g., if
-			 * they have two cursors open), so blocking
-			 * indefinitely leads to deadlock.
-			 */
-			ret = __wt_rec_evict(session, er->page, 0);
-		} else {
-			/*
-			 * If we're about to do a walk of the file tree (and
-			 * possibly close the file), any page we're referencing
-			 * won't be useful; Discard any page we're holding and
-			 * we can restart our walk as needed.
-			 */
-			ret = __evict_file(session, er);
-		}
+	/* Make progress, regardless of success or failure. */
+	++cache->sync_complete;
 
-		__wt_spin_unlock(session, &cache->lru_lock);
+	/*
+	 * No lock is required because the session array is fixed size, but it
+	 * may contain inactive entries.
+	 *
+	 * If we don't find a request, something went wrong; complain, but don't
+	 * return an error code, the eviction thread doesn't need to exit.
+	 */
+	WT_ORDERED_READ(session_cnt, conn->session_cnt);
+	for (request_session = conn->sessions,
+	    i = 0; i < session_cnt; ++request_session, ++i)
+		if (request_session->active && request_session->syncop != 0)
+			break;
+	if (i == session_cnt) {
+		__wt_errx(session,
+		    "failed to find handle's sync operation request");
+		return (0);
+	}
 
-		/* Clear the reference to the btree handle. */
-		WT_CLEAR_BTREE_IN_SESSION(session);
+	/*
+	 * Clear the session's request (we don't want to find it again
+	 * on our next walk, and doing it now should help avoid coding
+	 * errors later).  No publish is required, all we care about is
+	 * that we see it change.
+	 */
+	syncop = request_session->syncop;
+	request_session->syncop = 0;
 
-		/*
-		 * Resolve the request and clear the slot.
-		 *
-		 * !!!
-		 * Page eviction is special: the requesting thread is already
-		 * inside wrapup.
-		 */
-		if (!F_ISSET(er, WT_EVICT_REQ_PAGE))
-			__wt_session_serialize_wrapup(
-			    request_session, NULL, ret);
+	WT_VERBOSE_RET(session, evictserver,
+	    "file request: %s",
+	    (syncop == WT_SYNC ? "sync" :
+	    (syncop == WT_SYNC_DISCARD ?
+	    "sync-discard" : "sync-discard-nowrite")));
+
+	/*
+	 * The eviction candidate list might reference pages we are
+	 * about to discard; clear it.
+	 */
+	__evict_list_clr_all(session, 0);
 
-		__evict_req_clr(session, er);
+	/*
+	 * Wait for LRU eviction activity to drain.  It is much easier
+	 * to reason about sync or forced eviction if we know there are
+	 * no other threads evicting in the tree.
+	 */
+	while (request_session->btree->lru_count > 0) {
+		__wt_spin_unlock(session, &cache->evict_lock);
+		__wt_yield();
+		__wt_spin_lock(session, &cache->evict_lock);
 	}
+
+	ret = __evict_file_request(request_session, syncop);
+
+	__wt_session_serialize_wrapup(request_session, NULL, ret);
+
 	return (0);
 }
 
 /*
- * __evict_file --
+ * __evict_file_request --
  *	Flush pages for a specific file as part of a close/sync operation.
  */
 static int
-__evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er)
+__evict_file_request(WT_SESSION_IMPL *session, int syncop)
 {
+	WT_DECL_RET;
 	WT_PAGE *next_page, *page;
 
-	WT_VERBOSE(session, evictserver,
-	    "file request: %s",
-	   (F_ISSET(er, WT_EVICT_REQ_CLOSE) ? "close" : "sync"));
+	/* Clear any existing tree walk, we may be about to discard the tree. */
+	__evict_clear_tree_walk(session, NULL);
 
 	/*
 	 * We can't evict the page just returned to us, it marks our place in
@@ -509,36 +555,123 @@ __evict_file(WT_SESSION_IMPL *session, WT_EVICT_REQ *er)
 	for (;;) {
 		if ((page = next_page) == NULL)
 			break;
-		WT_RET(__wt_tree_np(session, &next_page, 1, 1));
+		WT_ERR(__wt_tree_np(session, &next_page, 1, 1));
+
+		/* Write dirty pages for sync, and sync with discard. */
+		switch (syncop) {
+		case WT_SYNC:
+		case WT_SYNC_DISCARD:
+			if (__wt_page_is_modified(page))
+				WT_ERR(__wt_rec_write(session, page, NULL));
+			break;
+		case WT_SYNC_DISCARD_NOWRITE:
+			break;
+		}
 
 		/*
-		 * Close: discarding all of the file's pages from the cache.
-		 *  Sync: only dirty pages need to be written.
-		 *
-		 * First, write the dirty pages: if we're closing the file, we
-		 * will be evicting all of the pages, and all child pages have
-		 * to be in their final, clean state, to evict the parent.
-		 *
-		 * The specific problem this solves is an empty page, which is
-		 * dirty because new material was added: reconciling it clears
-		 * the empty flag, and then we evict it.
+		 * Evict the page for sync with discard, simply discard the page
+		 * for discard alone.
 		 */
-		if (__wt_page_is_modified(page))
-			WT_RET(__wt_rec_write(session, page, NULL));
-		if (!F_ISSET(er, WT_EVICT_REQ_CLOSE))
+		switch (syncop) {
+		case WT_SYNC:
+			break;
+		case WT_SYNC_DISCARD:
+			/*
+			 * Do not attempt to evict pages expected to be merged
+			 * into their parents, with the single exception that
+			 * the root page can't be merged into anything, it must
+			 * be written.
+			 */
+			if (WT_PAGE_IS_ROOT(page) || page->modify == NULL ||
+			    !F_ISSET(page->modify, WT_PM_REC_EMPTY |
+			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
+				WT_ERR(__wt_rec_evict(
+				    session, page, WT_REC_SINGLE));
+			break;
+		case WT_SYNC_DISCARD_NOWRITE:
+			__wt_page_out(session, &page, 0);
+			break;
+		}
+	}
+
+	return (0);
+
+	/* On error, clear any left-over tree walk. */
+err:	if (next_page != NULL)
+		__evict_clear_tree_walk(session, next_page);
+	return (ret);
+}
+
+/*
+ * __evict_page_request_walk --
+ *	Walk the forced page eviction request queue.
+ */
+static int
+__evict_page_request_walk(WT_SESSION_IMPL *session)
+{
+	WT_CACHE *cache;
+	WT_EVICT_ENTRY *er, *er_end;
+	WT_PAGE *page;
+	WT_REF *ref;
+
+	cache = S2C(session)->cache;
+
+	/*
+	 * Walk the forced page eviction request queue: if we find a request,
+	 * perform it and clear the request slot.
+	 */
+	WT_EVICT_REQ_FOREACH(er, er_end, cache) {
+		if ((page = er->page) == NULL)
 			continue;
 
+		/* Reference the correct WT_BTREE handle. */
+		WT_SET_BTREE_IN_SESSION(session, er->btree);
+
+		WT_VERBOSE_RET(session, evictserver,
+		    "forcing eviction of page %p", page);
+
 		/*
-		 * We do not attempt to evict pages expected to be merged into
-		 * their parents, with the single exception that the root page
-		 * can't be merged into anything, it must be written.
+		 * The eviction candidate list might reference pages we are
+		 * about to discard; clear it.
 		 */
-		if (WT_PAGE_IS_ROOT(page) ||
-		    page->modify == NULL || !F_ISSET(page->modify,
-		    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
-			WT_RET(__wt_rec_evict(session, page, WT_REC_SINGLE));
-	}
+		__evict_list_clr_all(session, 0);
+
+		/*
+		 * The eviction candidate might be part of the current tree's
+		 * walk; clear it.
+		 */
+		__evict_clear_tree_walk(session, NULL);
+
+		/*
+		 * Wait for LRU eviction activity to drain.  It is much easier
+		 * to reason about sync or forced eviction if we know there are
+		 * no other threads evicting in the tree.
+		 */
+		while (session->btree->lru_count > 0) {
+			__wt_spin_unlock(session, &cache->evict_lock);
+			__wt_yield();
+			__wt_spin_lock(session, &cache->evict_lock);
+		}
+
+		ref = page->ref;
+		WT_ASSERT(session, ref->page == page);
+		WT_ASSERT(session, ref->state == WT_REF_EVICT_FORCE);
+		ref->state = WT_REF_LOCKED;
+
+		/*
+		 * If eviction fails, it will free up the page: hope it works
+		 * next time.  Application threads may be holding a reference
+		 * while trying to get another (e.g., if they have two cursors
+		 * open), so blocking indefinitely leads to deadlock.
+		 */
+		(void)__evict_page(session, page);
+
+		/* Clear the reference to the btree handle. */
+		WT_CLEAR_BTREE_IN_SESSION(session);
 
+		/* Clear the request slot. */
+		__evict_req_clr(er);
+	}
 	return (0);
 }
 
@@ -557,15 +690,16 @@ __evict_lru(WT_SESSION_IMPL *session)
 	WT_RET(__evict_walk(session));
 
 	/* Sort the list into LRU order and restart. */
-	__wt_spin_lock(session, &cache->lru_lock);
+	__wt_spin_lock(session, &cache->evict_lock);
 	__evict_lru_sort(session);
-	__evict_clr_all(session, WT_EVICT_WALK_BASE);
+	__evict_list_clr_all(session, WT_EVICT_WALK_BASE);
 
 	cache->evict_current = cache->evict;
-	__wt_spin_unlock(session, &cache->lru_lock);
+	__wt_spin_unlock(session, &cache->evict_lock);
 
 	/* Reconcile and discard some pages. */
-	__evict_pages(session);
+	while (__wt_evict_lru_page(session, 0) == 0)
+		;
 
 	return (0);
 }
@@ -577,15 +711,14 @@ __evict_lru(WT_SESSION_IMPL *session)
 static int
 __evict_walk(WT_SESSION_IMPL *session)
 {
-	WT_CONNECTION_IMPL *conn;
 	WT_BTREE *btree;
 	WT_CACHE *cache;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	u_int elem, i;
-	int ret;
 
 	conn = S2C(session);
 	cache = S2C(session)->cache;
-	ret = 0;
 
 	/*
 	 * We hold a spinlock for the entire walk -- it's slow, but (1) how
@@ -602,18 +735,21 @@ __evict_walk(WT_SESSION_IMPL *session)
 	elem = WT_EVICT_WALK_BASE + (conn->btqcnt * WT_EVICT_WALK_PER_TABLE);
 	if (elem > cache->evict_entries) {
 		/* Save the offset of the eviction point. */
-		__wt_spin_lock(session, &cache->lru_lock);
+		__wt_spin_lock(session, &cache->evict_lock);
 		i = (u_int)(cache->evict_current - cache->evict);
 		WT_ERR(__wt_realloc(session, &cache->evict_allocated,
-		    elem * sizeof(WT_EVICT_LIST), &cache->evict));
+		    elem * sizeof(WT_EVICT_ENTRY), &cache->evict));
 		cache->evict_entries = elem;
 		if (cache->evict_current != NULL)
 			cache->evict_current = cache->evict + i;
-		__wt_spin_unlock(session, &cache->lru_lock);
+		__wt_spin_unlock(session, &cache->evict_lock);
 	}
 
 	i = WT_EVICT_WALK_BASE;
 	TAILQ_FOREACH(btree, &conn->btqh, q) {
+		if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+			continue;
+
 		/* Reference the correct WT_BTREE handle. */
 		WT_SET_BTREE_IN_SESSION(session, btree);
 
@@ -626,7 +762,7 @@ __evict_walk(WT_SESSION_IMPL *session)
 	}
 
 	if (0) {
-err:		__wt_spin_unlock(session, &cache->lru_lock);
+err:		__wt_spin_unlock(session, &cache->evict_lock);
 	}
 	__wt_spin_unlock(session, &conn->spinlock);
 	return (ret);
@@ -641,9 +777,10 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 {
 	WT_BTREE *btree;
 	WT_CACHE *cache;
-	WT_EVICT_LIST *end, *evict, *start;
+	WT_DECL_RET;
+	WT_EVICT_ENTRY *end, *evict, *start;
 	WT_PAGE *page;
-	int restarts, ret;
+	int restarts;
 
 	btree = session->btree;
 	cache = S2C(session)->cache;
@@ -656,7 +793,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 	 * We can't evict the page just returned to us, it marks our place in
 	 * the tree.  So, always stay one page ahead of the page being returned.
 	 */
-	for (evict = start, restarts = ret = 0;
+	for (evict = start, restarts = 0;
 	    evict < end && restarts <= 1 && ret == 0;
 	    ret = __wt_tree_np(session, &btree->evict_page, 1, 1)) {
 		if ((page = btree->evict_page) == NULL) {
@@ -671,8 +808,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 		 *
 		 * Don't skip pages marked WT_PM_REC_EMPTY or SPLIT: updates
 		 * after their last reconciliation may have changed their state
-		 * and only the eviction code can check whether they should
-		 * really be skipped.
+		 * and only the reconciliation/eviction code can confirm if they
+		 * should really be skipped.
 		 */
 		if (WT_PAGE_IS_ROOT(page) ||
 		    page->ref->state != WT_REF_EVICT_WALK ||
@@ -681,9 +818,6 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 		    F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)))
 			continue;
 
-		WT_VERBOSE(session, evictserver,
-		    "select: %p, size %" PRIu32, page, page->memory_footprint);
-
 		WT_ASSERT(session, evict->page == NULL);
 		evict->page = page;
 		evict->btree = btree;
@@ -691,6 +825,9 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
 
 		/* Mark the page on the list */
 		F_SET_ATOMIC(page, WT_PAGE_EVICT_LRU);
+
+		WT_VERBOSE_RET(session, evictserver,
+		    "select: %p, size %" PRIu32, page, page->memory_footprint);
 	}
 
 	*slotp += (u_int)(evict - start);
@@ -722,7 +859,7 @@ __evict_lru_sort(WT_SESSION_IMPL *session)
 	 */
 	cache = S2C(session)->cache;
 	qsort(cache->evict,
-	    cache->evict_entries, sizeof(WT_EVICT_LIST), __evict_lru_cmp);
+	    cache->evict_entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
 }
 
 /*
@@ -734,7 +871,7 @@ __evict_get_page(
     WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_PAGE **pagep)
 {
 	WT_CACHE *cache;
-	WT_EVICT_LIST *evict;
+	WT_EVICT_ENTRY *evict;
 	WT_REF *ref;
 	int candidates;
 
@@ -755,7 +892,7 @@ __evict_get_page(
 		if (cache->evict_current == NULL ||
 		    cache->evict_current >= cache->evict + candidates)
 			return;
-		if (__wt_spin_trylock(session, &cache->lru_lock) == 0)
+		if (__wt_spin_trylock(session, &cache->evict_lock) == 0)
 			break;
 		__wt_yield();
 	}
@@ -770,6 +907,17 @@ __evict_get_page(
 		++cache->evict_current;
 
 		/*
+		 * In case something goes wrong, don't pick the same set of
+		 * pages every time.
+		 *
+		 * We used to bump the page's read_gen only if eviction failed,
+		 * but that isn't safe: at that point, eviction has already
+		 * unlocked the page and some other thread may have evicted it
+		 * by the time we look at it.
+		 */
+		evict->page->read_gen = __wt_cache_read_gen(session);
+
+		/*
 		 * Lock the page while holding the eviction mutex to prevent
 		 * multiple attempts to evict it.  For pages that are already
 		 * being evicted, including pages on the request queue for
@@ -777,6 +925,7 @@ __evict_get_page(
 		 * on.
 		 */
 		ref = evict->page->ref;
+		WT_ASSERT(session, evict->page == ref->page);
 		if (!WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
 			continue;
 
@@ -793,13 +942,13 @@ __evict_get_page(
 		 * Remove the entry so we never try and reconcile the same page
 		 * on reconciliation error.
 		 */
-		__evict_clr(session, evict);
+		__evict_list_clr(session, evict);
 		break;
 	}
 
 	if (is_app && *pagep == NULL)
 		cache->evict_current = NULL;
-	__wt_spin_unlock(session, &cache->lru_lock);
+	__wt_spin_unlock(session, &cache->evict_lock);
 }
 
 /*
@@ -823,17 +972,11 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
 	WT_SET_BTREE_IN_SESSION(session, btree);
 
 	/*
-	 * We don't care why eviction failed (maybe the page was dirty and we're
-	 * out of disk space, or the page had an in-memory subtree already being
-	 * evicted).  Regardless, don't pick the same page every time.
-	 *
-	 * We used to bump the page's read_gen only if eviction failed, but
-	 * that isn't safe: at that point, eviction has already unlocked the
-	 * page and some other thread may have evicted it by the time we look
-	 * at it.
+	 * We don't care why eviction failed (maybe the page was dirty and
+	 * we're out of disk space, or the page had an in-memory subtree
+	 * already being evicted).
 	 */
-	page->read_gen = __wt_cache_read_gen(session);
-	(void)__wt_rec_evict(session, page, 0);
+	(void)__evict_page(session, page);
 
 	WT_ATOMIC_ADD(btree->lru_count, -1);
 
@@ -844,19 +987,8 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
 }
 
 /*
- * __evict_page --
- *	Reconcile and discard cache pages.
- */
-static void
-__evict_pages(WT_SESSION_IMPL *session)
-{
-	while (__wt_evict_lru_page(session, 0) == 0)
-		;
-}
-
-/*
  * __evict_lru_cmp --
- *	Qsort function: sort WT_EVICT_LIST array based on the page's read
+ *	Qsort function: sort LRU eviction array based on the page's read
  *	generation.
  */
 static int
@@ -869,8 +1001,8 @@ __evict_lru_cmp(const void *a, const void *b)
 	 * There may be NULL references in the array; sort them as greater than
 	 * anything else so they migrate to the end of the array.
 	 */
-	a_page = ((WT_EVICT_LIST *)a)->page;
-	b_page = ((WT_EVICT_LIST *)b)->page;
+	a_page = ((WT_EVICT_ENTRY *)a)->page;
+	b_page = ((WT_EVICT_ENTRY *)b)->page;
 	if (a_page == NULL)
 		return (b_page == NULL ? 0 : 1);
 	if (b_page == NULL)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 9b48588f1bd..3cc126b1c89 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -7,11 +7,10 @@
 
 #include "wt_internal.h"
 
-static int __btree_conf(WT_SESSION_IMPL *, uint32_t);
+static int __btree_conf(WT_SESSION_IMPL *);
 static int __btree_get_last_recno(WT_SESSION_IMPL *);
 static int __btree_page_sizes(WT_SESSION_IMPL *, const char *);
-static int __btree_root_init_empty(WT_SESSION_IMPL *);
-static int __btree_tree_init(WT_SESSION_IMPL *);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *);
 
 static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
 static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, uint32_t);
@@ -41,26 +40,67 @@ __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename)
  *	Open a Btree.
  */
 int
-__wt_btree_open(WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
+__wt_btree_open(WT_SESSION_IMPL *session,
+    const uint8_t *addr, uint32_t addr_size, const char *cfg[], int readonly)
 {
 	WT_BTREE *btree;
-	int ret;
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	WT_ITEM dsk;
+	const char *filename;
+	int forced_salvage;
 
 	btree = session->btree;
-	ret = 0;
+	WT_CLEAR(dsk);
 
 	/* Initialize and configure the WT_BTREE structure. */
-	WT_RET(__btree_conf(session, flags));
+	WT_ERR(__btree_conf(session));
+
+	forced_salvage = 0;
+	if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+		ret = __wt_config_gets(session, cfg, "force", &cval);
+		if (ret != 0 && ret != WT_NOTFOUND)
+			WT_ERR(ret);
+		if (ret == 0 && cval.val != 0)	/* Ignore cval if not found */
+			forced_salvage = 1;
+	}
+
+	/* Connect to the underlying block manager. */
+	filename = btree->name;
+	if (!WT_PREFIX_SKIP(filename, "file:"))
+		WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
 
-	/* Open the underlying block object. */
-	WT_RET(__wt_bm_open(session, btree->filename,
-	    btree->config, cfg, F_ISSET(btree, WT_BTREE_SALVAGE) ? 1 : 0));
-	WT_RET(__wt_bm_block_header(session, &btree->block_header));
+	WT_ERR(__wt_bm_open(
+	    session, filename, btree->config, cfg, forced_salvage));
 
-	/* Initialize the tree if not a special command. */
-	if (!F_ISSET(btree,
+	/*
+	 * Open the specified snapshot unless it's a special command (special
+	 * commands are responsible for loading their own snapshots, if any).
+	 */
+	if (F_ISSET(btree,
 	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
-		WT_RET(__btree_tree_init(session));
+		return (0);
+
+	/*
+	 * There are two reasons to load an empty tree rather than a snapshot:
+	 * either there is no snapshot (the file is being created), or the load
+	 * call returns no root page (the snapshot is empty).
+	 */
+	WT_ERR(__wt_bm_snapshot_load(session, &dsk, addr, addr_size, readonly));
+	if (addr == NULL || addr_size == 0 || dsk.size == 0)
+		WT_ERR(__btree_tree_open_empty(session));
+	else {
+		WT_ERR(__wt_btree_tree_open(session, &dsk));
+
+		/* Get the last record number in a column-store file. */
+		if (btree->type != BTREE_ROW)
+			WT_ERR(__btree_get_last_recno(session));
+	}
+
+	if (0) {
+err:		__wt_buf_free(session, &dsk);
+		(void)__wt_btree_close(session);
+	}
 
 	return (ret);
 }
@@ -73,30 +113,18 @@ int
 __wt_btree_close(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
-	int ret;
+	WT_DECL_RET;
 
 	btree = session->btree;
-	ret = 0;
-
-	/* Clear any cache. */
-	if (btree->root_page != NULL)
-		WT_TRET(__wt_evict_file_serial(session, 1));
-	WT_ASSERT(session, btree->root_page == NULL);
-
-	/* After all pages are evicted, update the root's address. */
-	if (btree->root_update) {
-		/*
-		 * Release the original blocks held by the root, that is,
-		 * the blocks listed in the schema file.
-		 */
-		WT_RET(__wt_btree_free_root(session));
-
-		WT_RET(__wt_btree_set_root(session, btree->filename,
-		    btree->root_addr.addr, btree->root_addr.size));
-		if (btree->root_addr.addr != NULL)
-			__wt_free(session, btree->root_addr.addr);
-		btree->root_update = 0;
-	}
+
+	/*
+	 * Discard the tree and, if the tree is modified, create a new snapshot
+	 * for the underlying object, unless it's a special command.
+	 */
+	if (F_ISSET(btree, WT_BTREE_OPEN) &&
+	    !F_ISSET(btree,
+	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+		WT_TRET(__wt_bm_snapshot_unload(session));
 
 	/* Close the underlying block manager reference. */
 	WT_TRET(__wt_bm_close(session));
@@ -104,6 +132,10 @@ __wt_btree_close(WT_SESSION_IMPL *session)
 	/* Close the Huffman tree. */
 	__wt_btree_huffman_close(session);
 
+	/* Snapshot lock. */
+	if (btree->snaplock != NULL)
+		(void)__wt_rwlock_destroy(session, btree->snaplock);
+
 	/* Free allocated memory. */
 	__wt_free(session, btree->key_format);
 	__wt_free(session, btree->key_plan);
@@ -120,7 +152,7 @@ __wt_btree_close(WT_SESSION_IMPL *session)
  *	Configure a WT_BTREE structure.
  */
 static int
-__btree_conf(WT_SESSION_IMPL *session, uint32_t flags)
+__btree_conf(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
 	WT_CONFIG_ITEM cval;
@@ -180,6 +212,9 @@ __btree_conf(WT_SESSION_IMPL *session, uint32_t flags)
 		}
 	}
 
+	/* Snapshot lock. */
+	WT_RET(__wt_rwlock_alloc(session, "btree snapshot", &btree->snaplock));
+
 	/* Page sizes */
 	WT_RET(__btree_page_sizes(session, config));
 
@@ -188,93 +223,42 @@ __btree_conf(WT_SESSION_IMPL *session, uint32_t flags)
 
 	WT_RET(__wt_stat_alloc_btree_stats(session, &btree->stats));
 
-	/* Take the config string: it will be freed with the btree handle. */
-	btree->config = config;
-
-	/* Set the flags. */
-	btree->flags = flags;
-
 	return (0);
 }
 
 /*
- * __btree_tree_init --
- *	Open the file in the block manager and read the root/last pages.
- */
-static int
-__btree_tree_init(WT_SESSION_IMPL *session)
-{
-	WT_BTREE *btree;
-	WT_ITEM *addr;
-	int ret;
-
-	btree = session->btree;
-	ret = 0;
-
-	WT_RET(__wt_scr_alloc(session, 0, &addr));
-	WT_ERR(__wt_btree_get_root(session, addr));
-
-	/*
-	 * If there's a root page in the file, read it in and pin it.
-	 * If there's no root page, create an empty in-memory page.
-	 */
-	if (addr->data == NULL)
-		WT_ERR(__btree_root_init_empty(session));
-	else
-		WT_ERR(__wt_btree_root_init(session, addr));
-
-	/* Get the last record number in a column-store file. */
-	if (btree->type != BTREE_ROW)
-		WT_ERR(__btree_get_last_recno(session));
-
-err:	__wt_scr_free(&addr);
-
-	return (ret);
-}
-
-/*
- * __wt_btree_root_init --
+ * __wt_btree_tree_open --
  *      Read in a tree from disk.
  */
 int
-__wt_btree_root_init(WT_SESSION_IMPL *session, WT_ITEM *addr)
+__wt_btree_tree_open(WT_SESSION_IMPL *session, WT_ITEM *dsk)
 {
 	WT_BTREE *btree;
-	WT_ITEM tmp;
 	WT_PAGE *page;
-	int ret;
 
 	btree = session->btree;
 
-	/* Read the root into memory. */
-	WT_CLEAR(tmp);
-	WT_RET(__wt_bm_read(session, &tmp, addr->data, addr->size));
-
 	/* Build the in-memory version of the page. */
-	WT_ERR(__wt_page_inmem(session, NULL, NULL, tmp.mem, NULL, &page));
-
+	WT_RET(__wt_page_inmem(session, NULL, NULL, dsk->mem, &page));
 	btree->root_page = page;
-	return (0);
 
-err:	__wt_buf_free(session, &tmp);
-	return (ret);
+	return (0);
 }
 
 /*
- * __btree_root_init_empty --
+ * __btree_tree_open_empty --
  *      Create an empty in-memory tree.
  */
 static int
-__btree_root_init_empty(WT_SESSION_IMPL *session)
+__btree_tree_open_empty(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_PAGE *root, *leaf;
 	WT_REF *ref;
-	int ret;
 
 	btree = session->btree;
 	root = leaf = NULL;
-	ret = 0;
 
 	/*
 	 * Create a leaf page -- this can be reconciled while the root stays
@@ -328,10 +312,9 @@ __btree_root_init_empty(WT_SESSION_IMPL *session)
 		ref->page = leaf;
 		ref->addr = NULL;
 		ref->state = WT_REF_MEM;
-		WT_ERR(__wt_row_ikey_alloc(
-		    session, 0, "", 1, (WT_IKEY **)&(ref->u.key)));
+		WT_ERR(__wt_row_ikey_alloc(session, 0, "", 1, &ref->u.key));
 		break;
-	WT_ILLEGAL_VALUE(session);
+	WT_ILLEGAL_VALUE_ERR(session);
 	}
 	root->entries = 1;
 	root->parent = NULL;
@@ -343,18 +326,19 @@ __btree_root_init_empty(WT_SESSION_IMPL *session)
 	btree->root_page = root;
 
 	/*
-	 * Mark the child page dirty so that if it is evicted, the tree ends
-	 * up sane.
+	 * Mark the child page empty so that if it is evicted, the tree ends up
+	 * sane.  The page should not be dirty, or we will always write empty
+	 * trees on close, including empty snapshots.
 	 */
 	WT_ERR(__wt_page_modify_init(session, leaf));
-	__wt_page_modify_set(leaf);
+	F_SET(leaf->modify, WT_PM_REC_EMPTY);
 
 	return (0);
 
 err:	if (leaf != NULL)
-		__wt_page_out(session, leaf, 0);
+		__wt_page_out(session, &leaf, 0);
 	if (root != NULL)
-		__wt_page_out(session, root, 0);
+		__wt_page_out(session, &root, 0);
 	return (ret);
 }
 
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index 13db305f73c..a5b65d61506 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -134,11 +134,11 @@ static int __wt_huffman_read(WT_SESSION_IMPL *,
 int
 __wt_btree_huffman_open(WT_SESSION_IMPL *session, const char *config)
 {
-	WT_BTREE *btree;
-	u_int entries, numbytes;
 	struct __wt_huffman_table *table;
+	WT_BTREE *btree;
 	WT_CONFIG_ITEM key_conf, value_conf;
-	int ret;
+	WT_DECL_RET;
+	u_int entries, numbytes;
 
 	btree = session->btree;
 
@@ -226,12 +226,11 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
 {
 	struct __wt_huffman_table *table, *tp;
 	FILE *fp;
+	WT_DECL_RET;
 	uint64_t symbol, frequency;
 	u_int entries, lineno;
-	int ret;
 	char *file;
 
-	ret = 0;
 	file = NULL;
 	table = NULL;
 
@@ -242,7 +241,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
 	if (strncasecmp(ip->str, "utf8", 4) == 0) {
 		entries = UINT8_MAX;
 		*numbytesp = 1;
-		WT_RET(__wt_calloc_def(session, entries, &table));
+		WT_ERR(__wt_calloc_def(session, entries, &table));
 
 		if (ip->len == 4)
 			WT_ERR_MSG(session, EINVAL,
@@ -252,7 +251,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
 	} else if (strncasecmp(ip->str, "utf16", 5) == 0) {
 		entries = UINT16_MAX;
 		*numbytesp = 2;
-		WT_RET(__wt_calloc_def(session, entries, &table));
+		WT_ERR(__wt_calloc_def(session, entries, &table));
 
 		if (ip->len == 5)
 			WT_ERR_MSG(session, EINVAL,
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index bc096bd768c..6952cbc4fb5 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -17,6 +17,8 @@ __wt_page_type_string(u_int type)
 	switch (type) {
 	case WT_PAGE_INVALID:
 		return ("invalid");
+	case WT_PAGE_BLOCK_MANAGER:
+		return ("block manager");
 	case WT_PAGE_COL_FIX:
 		return ("column-store fixed-length leaf");
 	case WT_PAGE_COL_INT:
@@ -29,8 +31,6 @@ __wt_page_type_string(u_int type)
 		return ("row-store internal");
 	case WT_PAGE_ROW_LEAF:
 		return ("row-store leaf");
-	case WT_PAGE_FREELIST:
-		return ("freelist");
 	default:
 		return ("unknown");
 	}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index a4cc355096a..c9200440051 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -7,7 +7,7 @@
 
 #include "wt_internal.h"
 
-static int  __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int  __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
 static int  __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
 static int  __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
@@ -26,7 +26,8 @@ __wt_page_in_func(
 #endif
     )
 {
-	int wake, read_lockout;
+	WT_PAGE *page;
+	int busy, read_lockout, wake;
 
 	/*
 	 * Only wake the eviction server the first time through here (if the
@@ -67,17 +68,37 @@ __wt_page_in_func(
 			 * can't get a hazard reference is because the page is
 			 * being evicted; yield and try again.
 			 */
-			if (__wt_hazard_set(session, ref
 #ifdef HAVE_DIAGNOSTIC
-			    , file, line
+			WT_RET(
+			    __wt_hazard_set(session, ref, &busy, file, line));
+#else
+			WT_RET(__wt_hazard_set(session, ref, &busy));
 #endif
-			    ) == 0) {
-				WT_ASSERT(session, !WT_PAGE_IS_ROOT(ref->page));
-				ref->page->read_gen =
-				    __wt_cache_read_gen(session);
-				return (0);
+			if (busy)
+				break;
+
+			page = ref->page;
+			WT_ASSERT(session, !WT_PAGE_IS_ROOT(page));
+
+			/*
+			 * Ensure the page doesn't have ancient updates on it.
+			 * If it did, reading the page could ignore committed
+			 * updates.  This should be extremely unlikely in real
+			 * applications, force eviction of the page to avoid
+			 * the issue.
+			 */
+			if (page->modify != NULL &&
+			    __wt_txn_ancient(session, page->modify->first_id)) {
+				__wt_evict_page_request(session, page);
+				__wt_hazard_clear(session, page);
+				__wt_evict_server_wake(session);
+				WT_VERBOSE_RET(session, read,
+				    "ancient updates, forcing eviction");
+				break;
 			}
-			break;
+
+			page->read_gen = __wt_cache_read_gen(session);
+			return (0);
 		WT_ILLEGAL_VALUE(session);
 		}
 
@@ -97,55 +118,55 @@ __wt_page_in_func(
  */
 int
 __wt_page_inmem(WT_SESSION_IMPL *session,
-    WT_PAGE *parent, WT_REF *parent_ref, WT_PAGE_HEADER *dsk,
-    size_t *inmem_sizep, WT_PAGE **pagep)
+    WT_PAGE *parent, WT_REF *parent_ref, WT_PAGE_HEADER *dsk, WT_PAGE **pagep)
 {
+	WT_DECL_RET;
 	WT_PAGE *page;
-	int ret;
+	size_t inmem_size;
 
 	WT_ASSERT_RET(session, dsk->u.entries > 0);
 
-	*pagep = NULL;
+	*pagep = page = NULL;
 
 	/*
 	 * Allocate and initialize the WT_PAGE.
 	 * Set the LRU so the page is not immediately selected for eviction.
+	 * Set the read generation (which can't match a search where the write
+	 * generation wasn't set, that is, remained 0).
 	 */
 	WT_RET(__wt_calloc_def(session, 1, &page));
-	if (inmem_sizep != NULL)
-		*inmem_sizep = sizeof(*page) + dsk->size;
-	page->type = dsk->type;
 	page->parent = parent;
 	page->ref = parent_ref;
 	page->dsk = dsk;
-	/*
-	 * Set the write generation to 1 (which can't match a search where the
-	 * write generation wasn't set, that is, remained 0).
-	 */
 	page->read_gen = __wt_cache_read_gen(session);
+	page->type = dsk->type;
 
+	inmem_size = 0;
 	switch (page->type) {
 	case WT_PAGE_COL_FIX:
 		page->u.col_fix.recno = dsk->recno;
-		WT_ERR(__inmem_col_fix(session, page, inmem_sizep));
+		WT_ERR(__inmem_col_fix(session, page));
 		break;
 	case WT_PAGE_COL_INT:
 		page->u.intl.recno = dsk->recno;
-		WT_ERR(__inmem_col_int(session, page, inmem_sizep));
+		WT_ERR(__inmem_col_int(session, page, &inmem_size));
 		break;
 	case WT_PAGE_COL_VAR:
 		page->u.col_var.recno = dsk->recno;
-		WT_ERR(__inmem_col_var(session, page, inmem_sizep));
+		WT_ERR(__inmem_col_var(session, page, &inmem_size));
 		break;
 	case WT_PAGE_ROW_INT:
-		WT_ERR(__inmem_row_int(session, page, inmem_sizep));
+		WT_ERR(__inmem_row_int(session, page, &inmem_size));
 		break;
 	case WT_PAGE_ROW_LEAF:
-		WT_ERR(__inmem_row_leaf(session, page, inmem_sizep));
+		WT_ERR(__inmem_row_leaf(session, page, &inmem_size));
 		break;
-	WT_ILLEGAL_VALUE(session);
+	WT_ILLEGAL_VALUE_ERR(session);
 	}
 
+	__wt_cache_page_read(
+	    session, page, sizeof(WT_PAGE) + dsk->size + inmem_size);
+
 	*pagep = page;
 	return (0);
 
@@ -158,13 +179,11 @@ err:	__wt_free(session, page);
  *	Build in-memory index for fixed-length column-store leaf pages.
  */
 static int
-__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
+__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_BTREE *btree;
 	WT_PAGE_HEADER *dsk;
 
-	WT_UNUSED(inmem_sizep);
-
 	btree = session->btree;
 	dsk = page->dsk;
 
@@ -298,20 +317,20 @@ static int
 __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 {
 	WT_BTREE *btree;
-	WT_ITEM *current, *last, *tmp;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_ITEM(current);
+	WT_DECL_ITEM(last);
+	WT_DECL_RET;
+	WT_ITEM *tmp;
 	WT_PAGE_HEADER *dsk;
 	WT_REF *ref;
 	uint32_t i, nindx, prefix;
-	int ret;
 	void *huffman;
 
 	btree = session->btree;
-	current = last = NULL;
 	unpack = &_unpack;
 	dsk = page->dsk;
-	ret = 0;
 	huffman = btree->huffman_key;
 
 	WT_ERR(__wt_scr_alloc(session, 0, &current));
@@ -323,7 +342,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 	 * and location cookie).
 	 */
 	nindx = dsk->u.entries / 2;
-	WT_RET((__wt_calloc_def(session, (size_t)nindx, &page->u.intl.t)));
+	WT_ERR((__wt_calloc_def(session, (size_t)nindx, &page->u.intl.t)));
 	if (inmem_sizep != NULL)
 		*inmem_sizep += nindx * sizeof(*page->u.intl.t);
 
@@ -350,7 +369,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 			ref->addr = cell;
 			++ref;
 			continue;
-		WT_ILLEGAL_VALUE(session);
+		WT_ILLEGAL_VALUE_ERR(session);
 		}
 
 		/*
@@ -361,7 +380,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 		 */
 		prefix = unpack->prefix;
 		if (huffman != NULL || unpack->ovfl) {
-			WT_RET(__wt_cell_unpack_copy(session, unpack, current));
+			WT_ERR(__wt_cell_unpack_copy(session, unpack, current));
 
 			/*
 			 * If there's a prefix, make sure there's enough buffer
@@ -399,7 +418,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 		 */
 		WT_ERR(__wt_row_ikey_alloc(session,
 		    WT_PAGE_DISK_OFFSET(page, cell),
-		    current->data, current->size, (WT_IKEY **)&ref->u.key));
+		    current->data, current->size, &ref->u.key));
 		if (inmem_sizep != NULL)
 			*inmem_sizep += sizeof(WT_IKEY) + current->size;
 
@@ -474,7 +493,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *inmem_sizep)
 		switch (unpack->type) {
 		case WT_CELL_KEY:
 		case WT_CELL_KEY_OVFL:
-			rip->key = cell;
+			WT_ROW_KEY_SET(rip, cell);
 			++rip;
 			break;
 		case WT_CELL_VALUE:
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index a0057e733d4..7b77d60b0af 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -14,19 +14,17 @@
 int
 __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref)
 {
+	WT_DECL_RET;
 	WT_ITEM tmp;
 	WT_PAGE *page;
 	uint32_t size;
 	const uint8_t *addr;
-	size_t inmem_size;
-	int ret;
 
 	/*
 	 * We don't pass in an allocated buffer, force allocation of new memory
 	 * of the appropriate size.
 	 */
 	WT_CLEAR(tmp);
-	ret = 0;
 
 	WT_ASSERT(session, ref->state == WT_REF_READING);
 
@@ -37,15 +35,11 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref)
 	WT_ERR(__wt_bm_read(session, &tmp, addr, size));
 
 	/* Build the in-memory version of the page. */
-	WT_ERR(__wt_page_inmem(
-	    session, parent, ref, tmp.mem, &inmem_size, &page));
+	WT_ERR(__wt_page_inmem(session, parent, ref, tmp.mem, &page));
 
-	__wt_cache_page_read(session, page, inmem_size);
+	WT_VERBOSE_ERR(session, read,
+	    "page %p: %s", page, __wt_page_type_string(page->type));
 
-	WT_VERBOSE(session, read,
-	    "page %p, %s", page, __wt_page_type_string(page->type));
-
-	WT_ASSERT(session, page != NULL);
 	ref->page = page;
 	WT_PUBLISH(ref->state, WT_REF_MEM);
 	return (0);
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index 0206cd1b0a5..c7332768c4f 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -23,6 +23,7 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
 	WT_ROW *rip;
 	WT_UPDATE *upd;
 	uint8_t v;
+	void *ripkey;
 
 	btree = session->btree;
 	unpack = &_unpack;
@@ -74,8 +75,9 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
 		 */
 		if (cbt->ins == NULL) {
 			if (key_ret) {
-				if (__wt_off_page(page, rip->key)) {
-					ikey = rip->key;
+				ripkey = WT_ROW_KEY_COPY(rip);
+				if (__wt_off_page(page, ripkey)) {
+					ikey = ripkey;
 					cursor->key.data = WT_IKEY_DATA(ikey);
 					cursor->key.size = ikey->size;
 				} else
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index d25024bb769..f7559c9d200 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -26,6 +26,8 @@ struct __wt_stuff {
 	uint32_t   ovfl_next;			/* Next empty slot */
 	size_t     ovfl_allocated;		/* Bytes allocated */
 
+	WT_PAGE	  *root_page;			/* Created root page */
+
 	uint8_t    page_type;			/* Page type */
 
 	/* If need to free blocks backing merged page ranges. */
@@ -97,7 +99,7 @@ static int  __slvg_col_build_leaf(
 static int  __slvg_col_merge_ovfl(
 		WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t);
 static int  __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
-static void __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
+static int  __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
 static int  __slvg_col_range_overlap(
 		WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
 static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *);
@@ -130,21 +132,21 @@ static int  __slvg_trk_ovfl(WT_SESSION_IMPL *,
 		WT_PAGE_HEADER *, uint8_t *, uint32_t, uint64_t, WT_STUFF *);
 
 /*
- * __wt_salvage --
+ * __wt_bt_salvage --
  *	Salvage a Btree.
  */
 int
-__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_bt_salvage(
+    WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase, const char *cfg[])
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_STUFF *ss, stuff;
 	uint32_t i, leaf_cnt;
-	int ret, started;
 
 	WT_UNUSED(cfg);
 
 	btree = session->btree;
-	ret = started = 0;
 
 	WT_CLEAR(stuff);
 	ss = &stuff;
@@ -158,25 +160,12 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 
 	/*
 	 * Step 1:
-	 * Clear the salvaged file's root address, we're done with this file
-	 * until it's salvaged.  We do this first because salvage writes a
-	 * root page when it wraps up, and the eviction of that page updates
-	 * the root's address: if the root address were still set, eviction
-	 * would also free the previous root page, which would collide with
-	 * salvage freeing the previous root page when it reads those blocks
-	 * from the file.
-	 */
-	WT_ERR(__wt_btree_set_root(session, btree->filename, NULL, 0));
-
-	/*
-	 * Step 2:
 	 * Inform the underlying block manager that we're salvaging the file.
 	 */
 	WT_ERR(__wt_bm_salvage_start(session));
-	started = 1;
 
 	/*
-	 * Step 3:
+	 * Step 2:
 	 * Read the file and build in-memory structures that reference any leaf
 	 * or overflow page.  Any pages other than leaf or overflow pages are
 	 * added to the free list.
@@ -190,10 +179,10 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 	WT_ERR(ret);
 
 	/*
-	 * Step 4:
+	 * Step 3:
 	 * Review the relationships between the pages and the overflow items.
 	 *
-	 * Step 5:
+	 * Step 4:
 	 * Add unreferenced overflow page blocks to the free list.
 	 */
 	if (ss->ovfl_next != 0) {
@@ -202,7 +191,7 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 	}
 
 	/*
-	 * Step 6:
+	 * Step 5:
 	 * Walk the list of pages looking for overlapping ranges to resolve.
 	 * If we find a range that needs to be resolved, set a global flag
 	 * and a per WT_TRACK flag on the pages requiring modification.
@@ -228,21 +217,21 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 		WT_ERR(__slvg_col_range(session, ss));
 
 	/*
-	 * Step 7:
+	 * Step 6:
 	 * We may have lost key ranges in column-store databases, that is, some
 	 * part of the record number space is gone.   Look for missing ranges.
 	 */
 	switch (ss->page_type) {
 	case WT_PAGE_COL_FIX:
 	case WT_PAGE_COL_VAR:
-		__slvg_col_range_missing(session, ss);
+		WT_ERR(__slvg_col_range_missing(session, ss));
 		break;
 	case WT_PAGE_ROW_LEAF:
 		break;
 	}
 
 	/*
-	 * Step 8:
+	 * Step 7:
 	 * Build an internal page that references all of the leaf pages,
 	 * and write it, as well as any merged pages, to the file.
 	 *
@@ -266,7 +255,7 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 		}
 
 	/*
-	 * Step 9:
+	 * Step 8:
 	 * If we had to merge key ranges, we have to do a final pass through
 	 * the leaf page array and discard file pages used during key merges.
 	 * We can't do it earlier: if we free'd the leaf pages we're merging as
@@ -276,15 +265,29 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
 	 * final key range.  In other words, if the salvage run fails, we don't
 	 * want to overwrite data the next salvage run might need.
 	 */
-	 if (ss->merge_free)
+	if (ss->merge_free)
 		WT_ERR(__slvg_merge_block_free(session, ss));
 
 	/*
-	 * Step 11:
+	 * Step 9:
+	 * Evict the newly created root page, creating a snapshot.
+	 */
+	if (ss->root_page != NULL) {
+		btree->snap = snapbase;
+		ret = __wt_rec_evict(session, ss->root_page, WT_REC_SINGLE);
+		btree->snap = NULL;
+		ss->root_page = NULL;
+	}
+
+	/*
+	 * Step 10:
 	 * Inform the underlying block manager that we're done.
 	 */
-err:	if (started)
-		WT_TRET(__wt_bm_salvage_end(session, ret == 0 ? 1 : 0));
+err:	WT_TRET(__wt_bm_salvage_end(session));
+
+	/* Discard any root page we created. */
+	if (ss->root_page != NULL)
+		__wt_page_out(session, &ss->root_page, 0);
 
 	/* Discard the leaf and overflow page memory. */
 	WT_TRET(__slvg_cleanup(session, ss));
@@ -294,7 +297,7 @@ err:	if (started)
 	__wt_scr_free(&ss->tmp2);
 
 	/* Wrap up reporting. */
-	__wt_progress(session, NULL, ss->fcnt);
+	WT_TRET(__wt_progress(session, NULL, ss->fcnt));
 
 	return (ret);
 }
@@ -306,16 +309,15 @@ err:	if (started)
 static int
 __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
 {
-	WT_ITEM *as, *buf;
+	WT_DECL_ITEM(as);
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
 	WT_PAGE_HEADER *dsk;
 	uint64_t gen;
 	uint32_t addrbuf_size;
-	uint8_t addrbuf[WT_BM_MAX_ADDR_COOKIE];
-	int eof, ret;
+	uint8_t addrbuf[WT_BTREE_MAX_ADDR_COOKIE];
+	int eof;
 
-	ret = 0;
-
-	as = buf = NULL;
 	WT_ERR(__wt_scr_alloc(session, 0, &as));
 	WT_ERR(__wt_scr_alloc(session, 0, &buf));
 
@@ -328,7 +330,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
 
 		/* Report progress every 10 reads. */
 		if (++ss->fcnt % 10 == 0)
-			__wt_progress(session, NULL, ss->fcnt);
+			WT_ERR(__wt_progress(session, NULL, ss->fcnt));
 
 		/* Create a printable version of the address. */
 		WT_ERR(__wt_bm_addr_string(session, as, addrbuf, addrbuf_size));
@@ -343,10 +345,10 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
 		 * calls don't need them either.
 		 */
 		switch (dsk->type) {
+		case WT_PAGE_BLOCK_MANAGER:
 		case WT_PAGE_COL_INT:
-		case WT_PAGE_FREELIST:
 		case WT_PAGE_ROW_INT:
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_ERR(session, salvage,
 			    "%s page ignored %s",
 			    __wt_page_type_string(dsk->type), (char *)as->data);
 			WT_ERR(__wt_bm_free(session, addrbuf, addrbuf_size));
@@ -361,16 +363,15 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
 		 * see in a corrupted file, like overflow references past the
 		 * end of the file, might as well discard these pages now.
 		 */
-		if (__wt_verify_dsk(session,
-		    (char *)as->data, buf->mem, buf->size) != 0) {
-			WT_VERBOSE(session, salvage,
+		if (__wt_verify_dsk(session, (char *)as->data, buf) != 0) {
+			WT_VERBOSE_ERR(session, salvage,
 			    "%s page failed verify %s",
 			    __wt_page_type_string(dsk->type), (char *)as->data);
 			WT_ERR(__wt_bm_free(session, addrbuf, addrbuf_size));
 			continue;
 		}
 
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_ERR(session, salvage,
 		    "tracking %s page, generation %" PRIu64 " %s",
 		    __wt_page_type_string(dsk->type), gen, (char *)as->data);
 
@@ -381,7 +382,7 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
 			if (ss->page_type == WT_PAGE_INVALID)
 				ss->page_type = dsk->type;
 			if (ss->page_type != dsk->type)
-				WT_RET_MSG(session, WT_ERROR,
+				WT_ERR_MSG(session, WT_ERROR,
 				    "file contains multiple file formats (both "
 				    "%s and %s), and cannot be salvaged",
 				    __wt_page_type_string(ss->page_type),
@@ -412,8 +413,8 @@ __slvg_trk_init(WT_SESSION_IMPL *session,
     uint8_t *addr, uint32_t addr_size,
     uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp)
 {
+	WT_DECL_RET;
 	WT_TRACK *trk;
-	int ret;
 
 	WT_RET(__wt_calloc_def(session, 1, &trk));
 	trk->ss = ss;
@@ -444,17 +445,16 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 	WT_BTREE *btree;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_TRACK *trk;
 	uint64_t stop_recno;
 	uint32_t i;
-	int ret;
 
 	btree = session->btree;
 	unpack = &_unpack;
 	page = NULL;
 	trk = NULL;
-	ret = 0;
 
 	/* Re-allocate the array of pages, as necessary. */
 	if (ss->pages_next * sizeof(WT_TRACK *) == ss->pages_allocated)
@@ -474,7 +474,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 		trk->col_start = dsk->recno;
 		trk->col_stop = dsk->recno + (dsk->u.entries - 1);
 
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_ERR(session, salvage,
 		    "%s records %" PRIu64 "-%" PRIu64,
 		    __wt_addr_string(
 		    session, ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -495,7 +495,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 		trk->col_start = dsk->recno;
 		trk->col_stop = stop_recno - 1;
 
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_ERR(session, salvage,
 		    "%s records %" PRIu64 "-%" PRIu64,
 		    __wt_addr_string(
 		    session, ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -514,7 +514,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 		 * on every leaf page, and if you need to speed up the salvage,
 		 * it's probably a great place to start.
 		 */
-		WT_ERR(__wt_page_inmem(session, NULL, NULL, dsk, NULL, &page));
+		WT_ERR(__wt_page_inmem(session, NULL, NULL, dsk, &page));
 		WT_ERR(__wt_row_key(session,
 		    page, &page->u.row.d[0], &trk->row_start));
 		WT_ERR(__wt_row_key(session,
@@ -524,14 +524,14 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 		if (WT_VERBOSE_ISSET(session, salvage)) {
 			WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
 			    trk->row_start.data, trk->row_start.size));
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_ERR(session, salvage,
 			    "%s start key %.*s",
 			    __wt_addr_string(session,
 			    ss->tmp2, trk->addr.addr, trk->addr.size),
 			    (int)ss->tmp1->size, (char *)ss->tmp1->data);
 			WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
 			    trk->row_stop.data, trk->row_stop.size));
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_ERR(session, salvage,
 			    "%s stop key %.*s",
 			    __wt_addr_string(session,
 			    ss->tmp2, trk->addr.addr, trk->addr.size),
@@ -548,7 +548,7 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk,
 err:		__wt_free(session, trk);
 	}
 	if (page != NULL)
-		__wt_page_out(session, page, WT_PAGE_FREE_IGNORE_DISK);
+		__wt_page_out(session, &page, WT_PAGE_FREE_IGNORE_DISK);
 	return (ret);
 }
 
@@ -616,7 +616,7 @@ __slvg_trk_leaf_ovfl(
 			    unpack->size, &trk->ovfl[ovfl_cnt].addr));
 			trk->ovfl[ovfl_cnt].size = unpack->size;
 
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s overflow reference %s",
 			    __wt_addr_string(session,
 			    trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -785,7 +785,7 @@ __slvg_col_range_overlap(
 	a_trk = ss->pages[a_slot];
 	b_trk = ss->pages[b_slot];
 
-	WT_VERBOSE(session, salvage,
+	WT_VERBOSE_RET(session, salvage,
 	    "%s and %s range overlap",
 	    __wt_addr_string(
 	    session, ss->tmp1, a_trk->addr.addr, a_trk->addr.size),
@@ -809,23 +809,21 @@ __slvg_col_range_overlap(
 	 * and:
 	 *
 	 *		BBBBBBBBBBBBBBBBBB
-	 * #7			AAAAAAAAAAAAAAAA	same as #2
-	 * #8	AAAAAAAAAAAAA				same as #3
+	 * #7	AAAAAAAAAAAAA				same as #3
+	 * #8			AAAAAAAAAAAAAAAA	same as #2
 	 * #9		AAAAA				A is a prefix of B
 	 * #10			AAAAAA			A is middle of B
 	 * #11			AAAAAAAAAA		A is a suffix of B
 	 *
 	 * Because the leaf page array was sorted by record number and a_trk
-	 * appears earlier in that array than b_trk, cases #2/7, #10 and #11
+	 * appears earlier in that array than b_trk, cases #2/8, #10 and #11
 	 * are impossible.
 	 *
 	 * Finally, there's one additional complicating factor -- final ranges
 	 * are assigned based on the page's LSN.
 	 */
-	if (a_trk->col_start == b_trk->col_start) {
+	if (a_trk->col_start == b_trk->col_start) {	/* Case #1, #4 and #9 */
 		/*
-		 * Case #1, #4 and #9.
-		 *
 		 * The secondary sort of the leaf page array was the page's LSN,
 		 * in high-to-low order, which means a_trk has a higher LSN, and
 		 * is more desirable, than b_trk.  In cases #1 and #4 and #9,
@@ -851,8 +849,7 @@ __slvg_col_range_overlap(
 		goto merge;
 	}
 
-	if (a_trk->col_stop == b_trk->col_stop) {
-		/* Case #6. */
+	if (a_trk->col_stop == b_trk->col_stop) {	/* Case #6 */
 		if (a_trk->gen > b_trk->gen)
 			/*
 			 * Case #6: a_trk is a superset of b_trk and a_trk is
@@ -869,8 +866,7 @@ __slvg_col_range_overlap(
 		goto merge;
 	}
 
-	if  (a_trk->col_stop < b_trk->col_stop) {
-		/* Case #3/8. */
+	if  (a_trk->col_stop < b_trk->col_stop) {	/* Case #3/7 */
 		if (a_trk->gen > b_trk->gen) {
 			/*
 			 * Case #3/8: a_trk is more desirable, delete a_trk's
@@ -950,7 +946,7 @@ delete:		WT_RET(__slvg_trk_free(session,
 	a_trk->col_stop = b_trk->col_start - 1;
 	F_SET(a_trk, WT_TRACK_MERGE);
 
-merge:	WT_VERBOSE(session, salvage,
+merge:	WT_VERBOSE_RET(session, salvage,
 	    "%s and %s require merge",
 	    __wt_addr_string(
 	    session, ss->tmp1, a_trk->addr.addr, a_trk->addr.size),
@@ -1004,7 +1000,7 @@ __slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss)
  * __slvg_col_range_missing --
  *	Detect missing ranges from column-store files.
  */
-static void
+static int
 __slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
 {
 	WT_TRACK *trk;
@@ -1015,7 +1011,7 @@ __slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
 		if ((trk = ss->pages[i]) == NULL)
 			continue;
 		if (trk->col_start != r + 1) {
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s column-store missing range from %"
 			    PRIu64 " to %" PRIu64 " inclusive",
 			    __wt_addr_string(session,
@@ -1031,6 +1027,7 @@ __slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
 		}
 		r = trk->col_stop;
 	}
+	return (0);
 }
 
 /*
@@ -1043,11 +1040,11 @@ __slvg_col_build_internal(
     WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
 {
 	WT_ADDR *addr;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_REF *ref;
 	WT_TRACK *trk;
 	uint32_t i;
-	int ret;
 
 	/* Allocate a column-store internal page. */
 	WT_RET(__wt_calloc_def(session, 1, &page));
@@ -1060,7 +1057,7 @@ __slvg_col_build_internal(
 	page->u.intl.recno = 1;
 	page->entries = leaf_cnt;
 	page->type = WT_PAGE_COL_INT;
-	WT_RET(__wt_page_modify_init(session, page));
+	WT_ERR(__wt_page_modify_init(session, page));
 	__wt_page_modify_set(page);
 
 	for (ref = page->u.intl.t, i = 0; i < ss->pages_next; ++i) {
@@ -1092,10 +1089,11 @@ __slvg_col_build_internal(
 		++ref;
 	}
 
-	/* Write the internal page to disk. */
-	return (__wt_rec_evict(session, page, WT_REC_SINGLE));
+	ss->root_page = page;
 
-err:	__wt_page_out(session, page, 0);
+	if (0) {
+err:		__wt_page_out(session, &page, 0);
+	}
 	return (ret);
 }
 
@@ -1108,15 +1106,14 @@ __slvg_col_build_leaf(
     WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *parent, WT_REF *ref)
 {
 	WT_COL *save_col_var;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_SALVAGE_COOKIE *cookie, _cookie;
 	uint64_t skip, take;
 	uint32_t save_entries;
-	int ret;
 
 	cookie = &_cookie;
 	WT_CLEAR(*cookie);
-	ret = 0;
 
 	/* Get the original page, including the full in-memory setup. */
 	WT_RET(__wt_page_in(session, parent, ref));
@@ -1131,7 +1128,7 @@ __slvg_col_build_leaf(
 	cookie->skip = skip = trk->col_start - page->u.col_var.recno;
 	cookie->take = take = (trk->col_stop - trk->col_start) + 1;
 
-	WT_VERBOSE(session, salvage,
+	WT_VERBOSE_ERR(session, salvage,
 	    "%s merge discarding first %" PRIu64 " records, "
 	    "then taking %" PRIu64 " records",
 	    __wt_addr_string(
@@ -1156,7 +1153,7 @@ __slvg_col_build_leaf(
 		page->u.col_var.recno = trk->col_missing;
 		cookie->missing = trk->col_start - trk->col_missing;
 
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_ERR(session, salvage,
 		    "%s merge inserting %" PRIu64 " missing records",
 		    __wt_addr_string(
 		    session, trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -1226,7 +1223,7 @@ __slvg_col_merge_ovfl(WT_SESSION_IMPL *session,
 		if (recno >= start && recno <= stop)
 			continue;
 
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_RET(session, salvage,
 		    "%s merge discard freed overflow reference %s",
 		    __wt_addr_string(session,
 			trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -1324,7 +1321,7 @@ __slvg_row_range_overlap(
 	a_trk = ss->pages[a_slot];
 	b_trk = ss->pages[b_slot];
 
-	WT_VERBOSE(session, salvage,
+	WT_VERBOSE_RET(session, salvage,
 	    "%s and %s range overlap",
 	    __wt_addr_string(
 	    session, ss->tmp1, a_trk->addr.addr, a_trk->addr.size),
@@ -1348,14 +1345,14 @@ __slvg_row_range_overlap(
 	 * and:
 	 *
 	 *		BBBBBBBBBBBBBBBBBB
-	 * #7			AAAAAAAAAAAAAAAA	same as #2
-	 * #8	AAAAAAAAAAAAA				same as #3
+	 * #7	AAAAAAAAAAAAA				same as #3
+	 * #8			AAAAAAAAAAAAAAAA	same as #2
 	 * #9		AAAAA				A is a prefix of B
 	 * #10			AAAAAA			A is middle of B
 	 * #11			AAAAAAAAAA		A is a suffix of B
 	 *
 	 * Because the leaf page array was sorted by record number and a_trk
-	 * appears earlier in that array than b_trk, cases #2/7, #10 and #11
+	 * appears earlier in that array than b_trk, cases #2/8, #10 and #11
 	 * are impossible.
 	 *
 	 * Finally, there's one additional complicating factor -- final ranges
@@ -1370,10 +1367,8 @@ __slvg_row_range_overlap(
 	__wt_buf_set(session, dst, (src)->data, (src)->size)
 
 	WT_RET(WT_BTREE_CMP(session, btree, A_TRK_START, B_TRK_START, cmp));
-	if (cmp == 0) {
+	if (cmp == 0) {					/* Case #1, #4, #9 */
 		/*
-		 * Case #1, #4 and #9.
-		 *
 		 * The secondary sort of the leaf page array was the page's LSN,
 		 * in high-to-low order, which means a_trk has a higher LSN, and
 		 * is more desirable, than b_trk.  In cases #1 and #4 and #9,
@@ -1402,8 +1397,7 @@ __slvg_row_range_overlap(
 	}
 
 	WT_RET(WT_BTREE_CMP(session, btree, A_TRK_STOP, B_TRK_STOP, cmp));
-	if (cmp == 0) {
-		/* Case #6. */
+	if (cmp == 0) {					/* Case #6 */
 		if (a_trk->gen > b_trk->gen)
 			/*
 			 * Case #6: a_trk is a superset of b_trk and a_trk is
@@ -1421,8 +1415,7 @@ __slvg_row_range_overlap(
 	}
 
 	WT_RET(WT_BTREE_CMP(session, btree, A_TRK_STOP, B_TRK_STOP, cmp));
-	if (cmp < 0) {
-		/* Case #3/8. */
+	if (cmp < 0) {					/* Case #3/7 */
 		if (a_trk->gen > b_trk->gen) {
 			/*
 			 * Case #3/8: a_trk is more desirable, delete a_trk's
@@ -1504,7 +1497,7 @@ delete:		WT_RET(__slvg_trk_free(session,
 	WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
 	F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
 
-merge:	WT_VERBOSE(session, salvage,
+merge:	WT_VERBOSE_RET(session, salvage,
 	    "%s and %s require merge",
 	    __wt_addr_string(
 	    session, ss->tmp1, a_trk->addr.addr, a_trk->addr.size),
@@ -1522,18 +1515,21 @@ __slvg_row_trk_update_start(
     WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss)
 {
 	WT_BTREE *btree;
+	WT_DECL_ITEM(dsk);
+	WT_DECL_ITEM(key);
+	WT_DECL_RET;
 	WT_IKEY *ikey;
-	WT_ITEM *dsk, *key, *item, _item;
+	WT_ITEM *item, _item;
 	WT_PAGE *page;
 	WT_ROW *rip;
 	WT_TRACK *trk;
 	uint32_t i;
-	int cmp, found, ret;
+	int cmp, found;
+	void *ripkey;
 
 	btree = session->btree;
-	key = dsk = NULL;
 	page = NULL;
-	found = ret = 0;
+	found = 0;
 
 	trk = ss->pages[slot];
 
@@ -1559,16 +1555,17 @@ __slvg_row_trk_update_start(
 	 */
 	WT_RET(__wt_scr_alloc(session, trk->size, &dsk));
 	WT_ERR(__wt_bm_read(session, dsk, trk->addr.addr, trk->addr.size));
-	WT_ERR(__wt_page_inmem(session, NULL, NULL, dsk->mem, NULL, &page));
+	WT_ERR(__wt_page_inmem(session, NULL, NULL, dsk->mem, &page));
 
 	/*
 	 * Walk the page, looking for a key sorting greater than the specified
 	 * stop key -- that's our new start key.
 	 */
-	WT_RET(__wt_scr_alloc(session, 0, &key));
+	WT_ERR(__wt_scr_alloc(session, 0, &key));
 	WT_ROW_FOREACH(page, rip, i) {
-		if (__wt_off_page(page, rip->key)) {
-			ikey = rip->key;
+		ripkey = WT_ROW_KEY_COPY(rip);
+		if (__wt_off_page(page, ripkey)) {
+			ikey = ripkey;
 			_item.data = WT_IKEY_DATA(ikey);
 			_item.size = ikey->size;
 			item = &_item;
@@ -1589,8 +1586,8 @@ __slvg_row_trk_update_start(
 	 * would have discarded it, we wouldn't be here.  Therefore, this test
 	 * is safe.  (But, it never hurts to check.)
 	 */
-	WT_RET_TEST(!found, WT_ERROR);
-	WT_RET(__slvg_key_copy(session, &trk->row_start, item));
+	WT_ERR_TEST(!found, WT_ERROR);
+	WT_ERR(__slvg_key_copy(session, &trk->row_start, item));
 
 	/*
 	 * We may need to re-sort some number of elements in the list.  Walk
@@ -1612,7 +1609,7 @@ __slvg_row_trk_update_start(
 		    sizeof(WT_TRACK *), __slvg_trk_compare_key);
 
 	if (page != NULL)
-		__wt_page_out(session, page, WT_PAGE_FREE_IGNORE_DISK);
+		__wt_page_out(session, &page, WT_PAGE_FREE_IGNORE_DISK);
 
 err:	__wt_scr_free(&dsk);
 	__wt_scr_free(&key);
@@ -1630,11 +1627,11 @@ __slvg_row_build_internal(
     WT_SESSION_IMPL *session, uint32_t leaf_cnt,  WT_STUFF *ss)
 {
 	WT_ADDR *addr;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_REF *ref;
 	WT_TRACK *trk;
 	uint32_t i;
-	int ret;
 
 	/* Allocate a row-store internal page. */
 	WT_RET(__wt_calloc_def(session, 1, &page));
@@ -1679,14 +1676,15 @@ __slvg_row_build_internal(
 			WT_ERR(__wt_row_ikey_alloc(session, 0,
 			    trk->row_start.data,
 			    trk->row_start.size,
-			    (WT_IKEY **)&ref->u.key));
+			    &ref->u.key));
 		++ref;
 	}
 
-	/* Write the internal page to disk. */
-	return (__wt_rec_evict(session, page, WT_REC_SINGLE));
+	ss->root_page = page;
 
-err:	__wt_page_out(session, page, 0);
+	if (0) {
+err:		__wt_page_out(session, &page, 0);
+	}
 	return (ret);
 }
 
@@ -1699,20 +1697,22 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
     WT_TRACK *trk, WT_PAGE *parent, WT_REF *ref, WT_STUFF *ss)
 {
 	WT_BTREE *btree;
+	WT_DECL_ITEM(key);
+	WT_DECL_RET;
 	WT_IKEY *ikey;
-	WT_ITEM *item, _item, *key;
+	WT_ITEM *item, _item;
 	WT_PAGE *page;
 	WT_ROW *rip;
 	WT_SALVAGE_COOKIE *cookie, _cookie;
 	uint32_t i, skip_start, skip_stop;
-	int cmp, ret;
+	int cmp;
+	void *ripkey;
 
 	btree = session->btree;
 	page = NULL;
 
 	cookie = &_cookie;
 	WT_CLEAR(*cookie);
-	ret = 0;
 
 	/* Allocate temporary space in which to instantiate the keys. */
 	WT_RET(__wt_scr_alloc(session, 0, &key));
@@ -1739,8 +1739,9 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 	skip_start = skip_stop = 0;
 	if (F_ISSET(trk, WT_TRACK_CHECK_START))
 		WT_ROW_FOREACH(page, rip, i) {
-			if (__wt_off_page(page, rip->key)) {
-				ikey = rip->key;
+			ripkey = WT_ROW_KEY_COPY(rip);
+			if (__wt_off_page(page, ripkey)) {
+				ikey = ripkey;
 				_item.data = WT_IKEY_DATA(ikey);
 				_item.size = ikey->size;
 				item = &_item;
@@ -1759,7 +1760,7 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 			if (WT_VERBOSE_ISSET(session, salvage)) {
 				WT_ERR(__wt_buf_set_printable(session,
 				    ss->tmp1, item->data, item->size));
-				WT_VERBOSE(session, salvage,
+				WT_VERBOSE_ERR(session, salvage,
 				    "%s merge discarding leading key %.*s",
 				    __wt_addr_string(session,
 				    ss->tmp2, trk->addr.addr, trk->addr.size),
@@ -1770,8 +1771,9 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 		}
 	if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
 		WT_ROW_FOREACH_REVERSE(page, rip, i) {
-			if (__wt_off_page(page, rip->key)) {
-				ikey = rip->key;
+			ripkey = WT_ROW_KEY_COPY(rip);
+			if (__wt_off_page(page, ripkey)) {
+				ikey = ripkey;
 				_item.data = WT_IKEY_DATA(ikey);
 				_item.size = ikey->size;
 				item = &_item;
@@ -1790,7 +1792,7 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 			if (WT_VERBOSE_ISSET(session, salvage)) {
 				WT_ERR(__wt_buf_set_printable(session,
 				    ss->tmp1, item->data, item->size));
-				WT_VERBOSE(session, salvage,
+				WT_VERBOSE_ERR(session, salvage,
 				    "%s merge discarding trailing key %.*s",
 				    __wt_addr_string(session,
 				    ss->tmp2, trk->addr.addr, trk->addr.size),
@@ -1806,7 +1808,7 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 	 * least one cell on the page we want.  This is a change from previous
 	 * behavior, so I'm asserting it.
 	 */
-	WT_ASSERT_RET(session, skip_start + skip_stop < page->entries);
+	WT_ASSERT_ERR(session, skip_start + skip_stop < page->entries);
 
 	/*
 	 * Take a copy of this page's first key to define the start of
@@ -1814,14 +1816,15 @@ __slvg_row_build_leaf(WT_SESSION_IMPL *session,
 	 * a copy from the page.
 	 */
 	rip = page->u.row.d + skip_start;
-	if (__wt_off_page(page, rip->key)) {
-		ikey = rip->key;
+	ripkey = WT_ROW_KEY_COPY(rip);
+	if (__wt_off_page(page, ripkey)) {
+		ikey = ripkey;
 		WT_ERR(__wt_row_ikey_alloc(session, 0,
-		    WT_IKEY_DATA(ikey), ikey->size, (WT_IKEY **)&ref->u.key));
+		    WT_IKEY_DATA(ikey), ikey->size, &ref->u.key));
 	} else {
 		WT_ERR(__wt_row_key(session, page, rip, key));
 		WT_ERR(__wt_row_ikey_alloc(session, 0,
-		    key->data, key->size, (WT_IKEY **)&ref->u.key));
+		    key->data, key->size, &ref->u.key));
 	}
 
 	/*
@@ -1899,18 +1902,20 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session,
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
 	WT_ROW *rip;
+	void *ripkey;
 
 	unpack = &_unpack;
 
 	for (rip = page->u.row.d + start; start < stop; ++start) {
-		if (__wt_off_page(page, rip->key))
+		ripkey = WT_ROW_KEY_COPY(rip);
+		if (__wt_off_page(page, ripkey))
 			cell = WT_PAGE_REF_OFFSET(
-			    page, ((WT_IKEY *)rip->key)->cell_offset);
+			    page, ((WT_IKEY *)ripkey)->cell_offset);
 		else
-			cell = rip->key;
+			cell = ripkey;
 		__wt_cell_unpack(cell, unpack);
 		if (unpack->type == WT_CELL_KEY_OVFL) {
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s merge discard freed overflow reference %s",
 			    __wt_addr_string(session,
 			    trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -1925,7 +1930,7 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session,
 			continue;
 		__wt_cell_unpack(cell, unpack);
 		if (unpack->type == WT_CELL_VALUE_OVFL) {
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s merge discard freed overflow reference %s",
 			    __wt_addr_string(session,
 			    trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -1946,9 +1951,9 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session,
 static int
 __slvg_trk_compare_addr(const void *a, const void *b)
 {
+	WT_DECL_RET;
 	WT_TRACK *a_trk, *b_trk;
 	uint32_t len;
-	int ret;
 
 	a_trk = *(WT_TRACK **)a;
 	b_trk = *(WT_TRACK **)b;
@@ -1973,9 +1978,9 @@ static int
 __slvg_ovfl_compare(const void *a, const void *b)
 {
 	WT_ADDR *addr;
+	WT_DECL_RET;
 	WT_TRACK *trk;
 	uint32_t len;
-	int ret;
 
 	addr = (WT_ADDR *)a;
 	trk = *(WT_TRACK **)b;
@@ -2065,7 +2070,7 @@ __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
 				    sizeof(WT_TRACK *), __slvg_ovfl_compare);
 				F_CLR(*searchp, WT_TRACK_OVFL_REFD);
 			}
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s references unavailable overflow page %s",
 			    __wt_addr_string(session,
 			    ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -2196,7 +2201,7 @@ __slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
 	for (i = 0; i < ss->ovfl_next; ++i) {
 		if (F_ISSET(ss->ovfl[i], WT_TRACK_OVFL_REFD))
 			continue;
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_RET(session, salvage,
 		    "%s unused overflow page",
 		    __wt_addr_string(session,
 		    ss->tmp1, ss->ovfl[i]->addr.addr, ss->ovfl[i]->addr.size));
@@ -2251,7 +2256,7 @@ __slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, uint32_t flags)
 	 * verbose description.
 	 */
 	if (LF_ISSET(WT_TRK_FREE_BLOCKS)) {
-		WT_VERBOSE(session, salvage,
+		WT_VERBOSE_RET(session, salvage,
 		    "%s page discarded: discard freed file bytes %" PRIu32,
 		    __wt_addr_string(
 		    session, trk->ss->tmp1, trk->addr.addr, trk->addr.size),
@@ -2263,13 +2268,12 @@ __slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, uint32_t flags)
 	for (i = 0; i < trk->ovfl_cnt; ++i) {
 		addr = &trk->ovfl[i];
 		if (LF_ISSET(WT_TRK_FREE_OVFL)) {
-			WT_VERBOSE(session, salvage,
+			WT_VERBOSE_RET(session, salvage,
 			    "%s page discarded: discard freed overflow page %s",
 			    __wt_addr_string(session,
 			    trk->ss->tmp1, trk->addr.addr, trk->addr.size),
 			    __wt_addr_string(session,
 			    trk->ss->tmp2, addr->addr, addr->size));
-
 			WT_RET(__wt_bm_free(session, addr->addr, addr->size));
 		}
 		__wt_free(session, addr->addr);
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index eba5d076c59..64c128176c6 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -19,8 +19,8 @@ int
 __wt_btree_stat_init(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_PAGE *page;
-	int ret;
 
 	btree = session->btree;
 
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index bc8137493d8..7d37d30c155 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -8,18 +8,30 @@
 #include "wt_internal.h"
 
 /*
- * __wt_btree_sync --
- *	Sync the tree.
+ * __wt_bt_cache_flush --
+ *	Write dirty pages from the cache, optionally discarding the file.
  */
 int
-__wt_btree_sync(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_bt_cache_flush(
+    WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase, int op, int force)
 {
-	int ret;
+	WT_DECL_RET;
+	WT_BTREE *btree;
 
-	WT_UNUSED(cfg);
+	btree = session->btree;
 
 	/*
-	 * Ask the eviction thread to flush any dirty pages.
+	 * If we need a new snapshot, mark the root page dirty to ensure a
+	 * write.
+	 */
+	if (force) {
+		WT_RET(__wt_page_modify_init(session, btree->root_page));
+		__wt_page_modify_set(btree->root_page);
+	}
+
+	/*
+	 * Ask the eviction thread to flush any dirty pages, and optionally
+	 * discard the file from the cache.
 	 *
 	 * Reconciliation is just another reader of the page, so it's probably
 	 * possible to do this work in the current thread, rather than poking
@@ -39,9 +51,30 @@ __wt_btree_sync(WT_SESSION_IMPL *session, const char *cfg[])
 	 * already works that way.   None of these problems can't be fixed, but
 	 * I don't see a reason to change at this time, either.
 	 */
-	do {
-		ret = __wt_evict_file_serial(session, 0);
-	} while (ret == WT_RESTART);
+	btree->snap = snapbase;
+	ret = __wt_sync_file_serial(session, op);
+	btree->snap = NULL;
+	WT_RET(ret);
+
+	switch (op) {
+	case WT_SYNC:
+		break;
+	case WT_SYNC_DISCARD:
+		/* If discarding the tree, the root page should be gone. */
+		WT_ASSERT(session, btree->root_page == NULL);
+		break;
+	case WT_SYNC_DISCARD_NOWRITE:
+		/*
+		 * XXX
+		 * I'm not sure this is the right place to do this, but it's
+		 * the point in the btree engine where we know the root page
+		 * is gone.  Unlike WT_SYNC_DISCARD, which writes, evicts and
+		 * discards the root page, WT_SYNC_DISCARD_NOWRITE simply
+		 * discards the pages, which means "eviction" never happens.
+		 */
+		btree->root_page = NULL;
+		break;
+	}
 
-	return (ret);
+	return (0);
 }
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 0d78f9d0bbf..929e9269651 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -25,15 +25,16 @@ typedef struct {
 	WT_ITEM *tmp2;				/* Temporary buffer */
 } WT_VSTUFF;
 
-static int __verify_int(WT_SESSION_IMPL *, int);
-static int __verify_overflow(
+static int  __verify_int(WT_SESSION_IMPL *, int);
+static int  __verify_overflow(
 	WT_SESSION_IMPL *, const uint8_t *, uint32_t, WT_VSTUFF *);
-static int __verify_overflow_cell(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *);
-static int __verify_row_int_key_order(
+static int  __verify_overflow_cell(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *);
+static int  __verify_row_int_key_order(
 	WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *);
-static int __verify_row_leaf_key_order(
+static int  __verify_row_leaf_key_order(
 	WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *);
-static int __verify_tree(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, WT_VSTUFF *);
+static void __verify_snapshot_reset(WT_VSTUFF *);
+static int  __verify_tree(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, WT_VSTUFF *);
 
 /*
  * __wt_verify --
@@ -78,47 +79,73 @@ static int
 __verify_int(WT_SESSION_IMPL *session, int dumpfile)
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_ITEM dsk;
+	WT_SNAPSHOT *snapbase, *snap;
 	WT_VSTUFF *vs, _vstuff;
-	int empty, ret;
 
 	btree = session->btree;
-	ret = 0;
-
-	WT_RET(__wt_bm_verify_start(session, &empty));
-	if (empty)
-		return (0);
+	snapbase = NULL;
 
 	WT_CLEAR(_vstuff);
 	vs = &_vstuff;
-
 	vs->dumpfile = dumpfile;
 	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
 	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
 	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
 	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
 
-	/*
-	 * Verify the tree, starting at the root: if there is no root, that's
-	 * still possibly a legal file, but all of the pages must be on the
-	 * free-list.
-	 */
-	WT_ERR(__wt_btree_get_root(session, vs->tmp1));
-	if (vs->tmp1->data != NULL) {
-		WT_ERR(__wt_bm_verify_addr(
-		    session, vs->tmp1->data, vs->tmp1->size));
-		WT_ERR(__wt_btree_root_init(session, vs->tmp1));
-		WT_ERR(
-		    __verify_tree(session, btree->root_page, (uint64_t)1, vs));
-	}
+	/* Get a list of the snapshots for this file. */
+	WT_ERR(__wt_meta_snaplist_get(session, btree->name, &snapbase));
+
+	/* Inform the underlying block manager we're verifying. */
+	WT_ERR(__wt_bm_verify_start(session, snapbase));
+
+	/* Loop through the file's snapshots, verifying each one. */
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		WT_VERBOSE_ERR(session, verify,
+		    "%s: snapshot %s", btree->name, snap->name);
+
+		/* House-keeping between snapshots. */
+		__verify_snapshot_reset(vs);
 
-	if (0) {
-err:		if (ret == 0)
-			ret = WT_ERROR;
+		/*
+		 * Load the snapshot -- if the size of the root page is 0, the
+		 * file is empty.
+		 *
+		 * Clearing the root page reference here is not an error: any
+		 * root page we read will be discarded as part of calling the
+		 * underlying eviction thread to discard the in-cache version
+		 * of the tree.   Since our reference disappears in that call,
+		 * we can't ever use it again.
+		 */
+		WT_CLEAR(dsk);
+		WT_ERR(__wt_bm_snapshot_load(
+		    session, &dsk, snap->raw.data, snap->raw.size, 1));
+		if (dsk.size != 0) {
+			/* Verify, then discard the snapshot from the cache. */
+			if ((ret = __wt_btree_tree_open(session, &dsk)) == 0) {
+				ret = __verify_tree(
+				    session, btree->root_page, (uint64_t)1, vs);
+				WT_TRET(__wt_bt_cache_flush(
+				    session, NULL, WT_SYNC_DISCARD, 0));
+			}
+		}
+
+		/* Unload the snapshot. */
+		WT_TRET(__wt_bm_snapshot_unload(session));
+		WT_ERR(ret);
 	}
 
+	/* Discard the list of snapshots. */
+err:	__wt_meta_snaplist_free(session, snapbase);
+
+	/* Inform the underlying block manager we're done. */
+	WT_TRET(__wt_bm_verify_end(session));
+
 	if (vs != NULL) {
 		/* Wrap up reporting. */
-		__wt_progress(session, NULL, vs->fcnt);
+		WT_TRET(__wt_progress(session, NULL, vs->fcnt));
 
 		/* Free allocated memory. */
 		__wt_scr_free(&vs->max_key);
@@ -127,12 +154,27 @@ err:		if (ret == 0)
 		__wt_scr_free(&vs->tmp2);
 	}
 
-	WT_TRET(__wt_bm_verify_end(session));
-
 	return (ret);
 }
 
 /*
+ * __verify_snapshot_reset --
+ *	Reset anything needing to be reset for each new snapshot verification.
+ */
+static void
+__verify_snapshot_reset(WT_VSTUFF *vs)
+{
+	/*
+	 * Key order is per snapshot, reset the data length that serves as a
+	 * flag value.
+	 */
+	vs->max_addr->size = 0;
+
+	/* Record total is per snapshot, reset the record count. */
+	vs->record_total = 0;
+}
+
+/*
  * __verify_tree --
  *	Verify a tree, recursively descending through it in depth-first fashion.
  * The page argument was physically verified (so we know it's correctly formed),
@@ -146,18 +188,16 @@ __verify_tree(WT_SESSION_IMPL *session,
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
 	WT_COL *cip;
-	WT_ITEM *tmp;
+	WT_DECL_RET;
 	WT_REF *ref;
 	uint64_t recno;
-	const uint8_t *addr;
 	uint32_t entry, i, size;
-	int ret;
+	const uint8_t *addr;
 
-	ret = 0;
 	unpack = &_unpack;
 
-	WT_VERBOSE(session, verify, "%p: %s %s",
-	    page, __wt_page_addr_string(session, vs->tmp1, page),
+	WT_VERBOSE_RET(session, verify, "%s %s",
+	    __wt_page_addr_string(session, vs->tmp1, page),
 	    __wt_page_type_string(page->type));
 
 	/*
@@ -184,7 +224,7 @@ __verify_tree(WT_SESSION_IMPL *session,
 	 * Report progress every 10 pages.
 	 */
 	if (++vs->fcnt % 10 == 0)
-		__wt_progress(session, NULL, vs->fcnt);
+		WT_RET(__wt_progress(session, NULL, vs->fcnt));
 
 #ifdef HAVE_DIAGNOSTIC
 	/* Optionally dump the page in debugging mode. */
@@ -268,6 +308,7 @@ recno_chk:	if (parent_recno != recno)
 			 * reviewed to this point.
 			 */
 			if (ref->u.recno != vs->record_total + 1) {
+				WT_DECL_ITEM(tmp);
 				WT_RET(__wt_scr_alloc(session, 0, &tmp));
 				__wt_cell_unpack(ref->addr, unpack);
 				ret = __wt_bm_addr_string(
@@ -284,12 +325,12 @@ recno_chk:	if (parent_recno != recno)
 
 			/* ref references the subtree containing the record */
 			__wt_get_addr(page, ref, &addr, &size);
-			WT_RET(__wt_bm_verify_addr(session, addr, size));
 			WT_RET(__wt_page_in(session, page, ref));
 			ret =
 			    __verify_tree(session, ref->page, ref->u.recno, vs);
 			__wt_page_release(session, ref->page);
 			WT_RET(ret);
+			WT_RET(__wt_bm_verify_addr(session, addr, size));
 		}
 		break;
 	case WT_PAGE_ROW_INT:
@@ -311,12 +352,12 @@ recno_chk:	if (parent_recno != recno)
 
 			/* ref references the subtree containing the record */
 			__wt_get_addr(page, ref, &addr, &size);
-			WT_RET(__wt_bm_verify_addr(session, addr, size));
 			WT_RET(__wt_page_in(session, page, ref));
 			ret =
 			    __verify_tree(session, ref->page, (uint64_t)0, vs);
 			__wt_page_release(session, ref->page);
 			WT_RET(ret);
+			WT_RET(__wt_bm_verify_addr(session, addr, size));
 		}
 		break;
 	}
@@ -433,13 +474,12 @@ __verify_overflow_cell(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs)
 	WT_BTREE *btree;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
 	WT_PAGE_HEADER *dsk;
 	uint32_t cell_num, i;
-	int ret;
 
 	btree = session->btree;
 	unpack = &_unpack;
-	ret = 0;
 
 	/*
 	 * If a tree is empty (just created), it won't have a disk image;
@@ -477,23 +517,24 @@ err:	WT_RET_MSG(session, ret,
  */
 static int
 __verify_overflow(WT_SESSION_IMPL *session,
-    const uint8_t *addrbuf, uint32_t addrbuf_len, WT_VSTUFF *vs)
+    const uint8_t *addr, uint32_t addr_size, WT_VSTUFF *vs)
 {
 	WT_PAGE_HEADER *dsk;
 
 	/* Read and verify the overflow item. */
-	WT_RET(__wt_bm_verify_addr(session, addrbuf, addrbuf_len));
-	WT_RET(__wt_bm_read(session, vs->tmp1, addrbuf, addrbuf_len));
+	WT_RET(__wt_bm_read(session, vs->tmp1, addr, addr_size));
 
 	/*
-	 * The page has already been verified, but we haven't confirmed that
-	 * it was an overflow page, only that it was a valid page.  Confirm
-	 * it's the type of page we expected.
+	 * The physical page has already been verified, but we haven't confirmed
+	 * it was an overflow page, only that it was a valid page.  Confirm it's
+	 * the type of page we expected.
 	 */
 	dsk = vs->tmp1->mem;
 	if (dsk->type != WT_PAGE_OVFL)
 		WT_RET_MSG(session, WT_ERROR,
 		    "overflow referenced page at %s is not an overflow page",
-		    __wt_addr_string(session, vs->tmp1, addrbuf, addrbuf_len));
+		    __wt_addr_string(session, vs->tmp1, addr, addr_size));
+
+	WT_RET(__wt_bm_verify_addr(session, addr, addr_size));
 	return (0);
 }
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index 610fce614a1..28439b002d4 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -39,18 +39,22 @@ static int __verify_dsk_row(
  *	Verify a single Btree page as read from disk.
  */
 int
-__wt_verify_dsk(WT_SESSION_IMPL *session,
-    const char *addr, WT_PAGE_HEADER *dsk, uint32_t size)
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
 {
-	u_int i;
+	WT_PAGE_HEADER *dsk;
+	uint32_t size;
 	uint8_t *p;
+	u_int i;
+
+	dsk = buf->mem;
+	size = buf->size;
 
 	/* Check the page type. */
 	switch (dsk->type) {
+	case WT_PAGE_BLOCK_MANAGER:
 	case WT_PAGE_COL_FIX:
 	case WT_PAGE_COL_INT:
 	case WT_PAGE_COL_VAR:
-	case WT_PAGE_FREELIST:
 	case WT_PAGE_OVFL:
 	case WT_PAGE_ROW_INT:
 	case WT_PAGE_ROW_LEAF:
@@ -72,7 +76,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session,
 		WT_RET_VRFY(session,
 		    "%s page at %s has a record number of zero",
 		    __wt_page_type_string(dsk->type), addr);
-	case WT_PAGE_FREELIST:
+	case WT_PAGE_BLOCK_MANAGER:
 	case WT_PAGE_OVFL:
 	case WT_PAGE_ROW_INT:
 	case WT_PAGE_ROW_LEAF:
@@ -108,7 +112,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session,
 	case WT_PAGE_ROW_INT:
 	case WT_PAGE_ROW_LEAF:
 		return (__verify_dsk_row(session, addr, dsk));
-	case WT_PAGE_FREELIST:
+	case WT_PAGE_BLOCK_MANAGER:
 	case WT_PAGE_OVFL:
 		return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
 	WT_ILLEGAL_VALUE(session);
@@ -127,19 +131,21 @@ __verify_dsk_row(
 	WT_BTREE *btree;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
-	WT_ITEM *current, *last, *last_pfx, *last_ovfl;
+	WT_DECL_ITEM(current);
+	WT_DECL_ITEM(last_ovfl);
+	WT_DECL_ITEM(last_pfx);
+	WT_DECL_RET;
+	WT_ITEM *last;
 	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
 	void *huffman;
 	uint32_t cell_num, cell_type, i, prefix;
 	uint8_t *end;
-	int cmp, ret;
+	int cmp;
 
 	btree = session->btree;
 	huffman = btree->huffman_key;
 	unpack = &_unpack;
-	ret = 0;
 
-	current = last_pfx = last_ovfl = NULL;
 	WT_ERR(__wt_scr_alloc(session, 0, &current));
 	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
 	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
@@ -162,7 +168,7 @@ __verify_dsk_row(
 		 * Check the raw cell type, then collapse the short key/data
 		 * types.
 		 */
-		WT_RET(__err_cell_type(
+		WT_ERR(__err_cell_type(
 		    session, cell_num, addr, unpack->raw, dsk->type));
 		cell_type = unpack->type;
 
@@ -509,8 +515,8 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
 	end = (uint8_t *)dsk + dsk->size;
 
 	/*
-	 * Fixed-length column-store, overflow and freelist pages are simple
-	 * chunks of data.
+	 * Fixed-length column-store and overflow pages are simple chunks of
+	 * data.
 	 */
 	if (datalen == 0)
 		WT_RET_VRFY(session,
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index e3ce624cbea..675e7281493 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -15,13 +15,12 @@ int
 __wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int eviction, int next)
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_PAGE *page, *t;
 	WT_REF *ref;
 	uint32_t slot;
-	int ret;
 
 	btree = session->btree;
-	ret = 0;
 
 	/*
 	 * Take a copy of any returned page; we have a hazard reference on the
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 9780c9fbc35..ee6ca1e099e 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -18,6 +18,7 @@ int
 __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 {
 	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_INSERT *ins, *ins_copy;
 	WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
 	WT_ITEM *value, _value;
@@ -26,7 +27,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 	size_t ins_size, new_inshead_size, new_inslist_size, upd_size;
 	uint64_t recno;
 	u_int skipdepth;
-	int i, ret;
+	int i;
 
 	btree = cbt->btree;
 	page = cbt->page;
@@ -65,7 +66,6 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 	new_inshead = NULL;
 	new_inslist = NULL;
 	upd = NULL;
-	ret = 0;
 
 	/*
 	 * Delete, insert or update a column-store entry.
@@ -79,6 +79,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 	 * the WT_INSERT structure.
 	 */
 	if (cbt->compare == 0 && cbt->ins != NULL) {
+		WT_ERR(__wt_update_check(session, page, cbt->ins->upd));
 		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
 
 		/* Insert the WT_UPDATE structure. */
@@ -118,7 +119,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 		/* There may be no WT_INSERT list, allocate as necessary. */
 		if (*inshead == NULL) {
 			new_inshead_size = sizeof(WT_INSERT_HEAD);
-			WT_RET(__wt_calloc_def(session, 1, &new_inshead));
+			WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
 			for (i = 0; i < WT_SKIP_MAXDEPTH; i++)
 				cbt->ins_stack[i] = &new_inshead->head[i];
 			cbt->ins_head = new_inshead;
@@ -133,6 +134,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 		 */
 		WT_ERR(__col_insert_alloc(
 		    session, recno, skipdepth, &ins, &ins_size));
+		WT_ERR(__wt_update_check(session, page, NULL));
 		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
 		ins->upd = upd;
 		ins_size += upd_size;
@@ -149,7 +151,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 			ins_copy = ins;
 
 			WT_ERR(__wt_col_append_serial(session,
-			    page, inshead, cbt->ins_stack,
+			    page, cbt->write_gen, inshead, cbt->ins_stack,
 			    &new_inslist, new_inslist_size,
 			    &new_inshead, new_inshead_size,
 			    &ins, ins_size, skipdepth));
@@ -168,8 +170,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
 	if (ret != 0) {
 err:		if (ins != NULL)
 			__wt_free(session, ins);
-		if (upd != NULL)
+		if (upd != NULL) {
+			/*
+			 * Remove the update from the current transaction, so we
+			 * don't try to modify it on rollback.
+			 */
+			__wt_txn_unmodify(session);
 			__wt_free(session, upd);
+		}
 	}
 
 	__wt_free(session, new_inslist);
@@ -180,8 +188,7 @@ err:		if (ins != NULL)
 
 /*
  * __col_insert_alloc --
- *	Column-store insert: allocate a WT_INSERT structure from the session's
- *	buffer and fill it in.
+ *	Column-store insert: allocate a WT_INSERT structure and fill it in.
  */
 static int
 __col_insert_alloc(WT_SESSION_IMPL *session,
@@ -212,36 +219,24 @@ void
 __wt_col_append_serial_func(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
-	WT_PAGE *page;
+	WT_DECL_RET;
 	WT_INSERT *ins, *new_ins, ***ins_stack;
-	WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead;
+	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
+	WT_PAGE *page;
 	uint64_t recno;
+	uint32_t write_gen;
 	u_int i, skipdepth;
-	int ret;
 
 	btree = session->btree;
-	ret = 0;
 
-	__wt_col_append_unpack(session, &page, &inshead, &ins_stack,
-	    &new_inslist, &new_inshead, &new_ins, &skipdepth);
+	__wt_col_append_unpack(session, &page, &write_gen, &insheadp,
+	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);
 
-	/*
-	 * If the page does not yet have an insert array, our caller passed
-	 * us one.
-	 */
-	if (page->modify->append == NULL) {
-		page->modify->append = new_inslist;
-		__wt_col_append_new_inslist_taken(session, page);
-	}
+	/* Check the page's write-generation. */
+	WT_ERR(__wt_page_write_gen_check(session, page, write_gen));
 
-	/*
-	 * If the insert head does not yet have an insert list, our caller
-	 * passed us one.
-	 */
-	if (*inshead == NULL) {
-		*inshead = new_inshead;
-		__wt_col_append_new_inshead_taken(session, page);
-	}
+	if ((inshead = *insheadp) == NULL)
+		inshead = new_inshead;
 
 	/*
 	 * If the application specified a record number, there's a race: the
@@ -253,20 +248,12 @@ __wt_col_append_serial_func(WT_SESSION_IMPL *session)
 	 */
 	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
 		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;
-	ins = __col_insert_search(*inshead, ins_stack, recno);
 
-	/* If we find the record number, there's been a race. */
-	if (ins != NULL && WT_INSERT_RECNO(ins) == recno) {
-		ret = WT_RESTART;
-		goto done;
-	}
+	ins = __col_insert_search(inshead, ins_stack, recno);
 
-	/*
-	 * If we don't find the record, check to see if we extended the file,
-	 * and update the last record number.
-	 */
-	if (recno > btree->last_recno)
-		btree->last_recno = recno;
+	/* If we find the record number, there's been a race. */
+	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
+		WT_ERR(WT_RESTART);
 
 	/*
 	 * Publish: First, point the new WT_INSERT item's skiplist references
@@ -278,13 +265,44 @@ __wt_col_append_serial_func(WT_SESSION_IMPL *session)
 		new_ins->next[i] = *ins_stack[i];
 	WT_WRITE_BARRIER();
 	for (i = 0; i < skipdepth; i++) {
-		if ((*inshead)->tail[i] == NULL ||
-		    ins_stack[i] == &(*inshead)->tail[i]->next[i])
-			(*inshead)->tail[i] = new_ins;
+		if (inshead->tail[i] == NULL ||
+		    ins_stack[i] == &inshead->tail[i]->next[i])
+			inshead->tail[i] = new_ins;
 		*ins_stack[i] = new_ins;
 	}
 
 	__wt_col_append_new_ins_taken(session, page);
 
-done:	__wt_session_serialize_wrapup(session, page, ret);
+	/*
+	 * If the insert head does not yet have an insert list, our caller
+	 * passed us one.
+	 *
+	 * NOTE: it is important to do this after the item has been added to
+	 * the list.  Code can assume that if the list is set, it is non-empty.
+	 */
+	if (*insheadp == NULL) {
+		WT_PUBLISH(*insheadp, new_inshead);
+		__wt_col_append_new_inshead_taken(session, page);
+	}
+
+	/*
+	 * If the page does not yet have an insert array, our caller passed
+	 * us one.
+	 *
+	 * NOTE: it is important to do this after publishing the list entry.
+	 * Code can assume that if the array is set, it is non-empty.
+	 */
+	if (page->modify->append == NULL) {
+		page->modify->append = new_inslist;
+		__wt_col_append_new_inslist_taken(session, page);
+	}
+
+	/*
+	 * If we don't find the record, check to see if we extended the file,
+	 * and update the last record number.
+	 */
+	if (recno > btree->last_recno)
+		btree->last_recno = recno;
+
+err:	__wt_session_serialize_wrapup(session, page, ret);
 }
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 1fb692f2b05..0e6044ac5c7 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -16,13 +16,13 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 {
 	WT_BTREE *btree;
 	WT_COL *cip;
+	WT_DECL_RET;
 	WT_INSERT *ins;
 	WT_INSERT_HEAD *ins_head;
 	WT_PAGE *page;
 	WT_REF *ref;
 	uint64_t recno;
 	uint32_t base, indx, limit;
-	int ret;
 
 	__cursor_search_clear(cbt);
 
@@ -81,7 +81,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 	 */
 	if (is_modify) {
 		/* Initialize the page's modification information */
-		WT_RET(__wt_page_modify_init(session, page));
+		WT_ERR(__wt_page_modify_init(session, page));
 
 		WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
 	}
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index 798a85acbe1..48342f340a9 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -8,15 +8,13 @@
 #include "wt_internal.h"
 
 static int  __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int);
-static int  __rec_discard(WT_SESSION_IMPL *, WT_PAGE *, int);
-static int  __rec_discard_page(WT_SESSION_IMPL *, WT_PAGE *, int);
+static void __rec_discard_page(WT_SESSION_IMPL *, WT_PAGE *, int);
+static void __rec_discard_tree(WT_SESSION_IMPL *, WT_PAGE *, int);
 static void __rec_excl_clear(WT_SESSION_IMPL *);
-static int  __rec_page_clean_update(WT_SESSION_IMPL *, WT_PAGE *, int);
-static int  __rec_page_dirty_update(WT_SESSION_IMPL *, WT_PAGE *, int);
+static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_PAGE *);
+static int  __rec_page_dirty_update(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_review(WT_SESSION_IMPL *, WT_REF *, WT_PAGE *, uint32_t, int);
-static int  __rec_root_addr_update(WT_SESSION_IMPL *, uint8_t *, uint32_t);
-static int  __rec_root_clean_update(WT_SESSION_IMPL *, WT_PAGE *, int);
-static int  __rec_root_dirty_update(WT_SESSION_IMPL *, WT_PAGE *, int);
+static void __rec_root_update(WT_SESSION_IMPL *);
 
 /*
  * __wt_rec_evict --
@@ -26,12 +24,12 @@ int
 __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
 {
 	WT_CONNECTION_IMPL *conn;
-	int ret, single;
+	WT_DECL_RET;
+	int single;
 
 	conn = S2C(session);
-	ret = 0;
 
-	WT_VERBOSE(session, evict,
+	WT_VERBOSE_RET(session, evict,
 	    "page %p (%s)", page, __wt_page_type_string(page->type));
 
 	WT_ASSERT(session, session->excl_next == 0);
@@ -59,20 +57,26 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
 	/* Update the parent and discard the page. */
 	if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) {
 		WT_STAT_INCR(conn->stats, cache_evict_unmodified);
+		WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED);
 
 		if (WT_PAGE_IS_ROOT(page))
-			WT_ERR(__rec_root_clean_update(session, page, single));
+			__rec_root_update(session);
 		else
-			WT_ERR(__rec_page_clean_update(session, page, single));
+			__rec_page_clean_update(session, page);
+
+		/* Discard the page. */
+		__rec_discard_page(session, page, single);
 	} else {
 		WT_STAT_INCR(conn->stats, cache_evict_modified);
 
 		if (WT_PAGE_IS_ROOT(page))
-			WT_ERR(__rec_root_dirty_update(session, page, single));
+			__rec_root_update(session);
 		else
-			WT_ERR(__rec_page_dirty_update(session, page, single));
-	}
+			WT_ERR(__rec_page_dirty_update(session, page));
 
+		/* Discard the tree rooted in this page. */
+		__rec_discard_tree(session, page, single);
+	}
 	if (0) {
 err:		/*
 		 * If unable to evict this page, release exclusive reference(s)
@@ -81,46 +85,40 @@ err:		/*
 		__rec_excl_clear(session);
 	}
 	session->excl_next = 0;
+
 	return (ret);
 }
 
 /*
- * __rec_page_clean_update  --
- *	Update a page's reference for an evicted, clean page.
+ * __rec_root_update  --
+ *	Update a root page's reference on eviction (clean or dirty).
  */
-static int
-__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
+static void
+__rec_root_update(WT_SESSION_IMPL *session)
 {
-	WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED);
-
-	/* Update the relevant WT_REF structure. */
-	page->ref->page = NULL;
-	WT_PUBLISH(page->ref->state, WT_REF_DISK);
-
-	return (__rec_discard_page(session, page, single));
+	session->btree->root_page = NULL;
 }
 
 /*
- * __rec_root_clean_update  --
- *	Update a page's reference for an evicted, clean page.
+ * __rec_page_clean_update  --
+ *	Update a clean page's reference on eviction.
  */
-static int
-__rec_root_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
+static void
+__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
-	WT_BTREE *btree;
-
-	btree = session->btree;
-	btree->root_page = NULL;
+	/* Update the relevant WT_REF structure. */
+	page->ref->page = NULL;
+	WT_PUBLISH(page->ref->state, WT_REF_DISK);
 
-	return (__rec_discard_page(session, page, single));
+	WT_UNUSED(session);
 }
 
 /*
  * __rec_page_dirty_update --
- *	Update a page's reference for an evicted, dirty page.
+ *	Update a dirty page's reference on eviction.
  */
 static int
-__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
+__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_PAGE_MODIFY *mod;
 	WT_REF *parent_ref;
@@ -161,6 +159,7 @@ __rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
 
 		/* Clear the reference else discarding the page will free it. */
 		mod->u.split = NULL;
+		F_CLR(mod, WT_PM_REC_SPLIT);
 		break;
 	case WT_PM_REC_EMPTY:				/* Page is empty */
 		/* We checked if the page was empty when we reviewed it. */
@@ -168,123 +167,16 @@ __rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
 	WT_ILLEGAL_VALUE(session);
 	}
 
-	/*
-	 * Discard pages which were merged into this page during reconciliation,
-	 * then discard the page itself.
-	 */
-	WT_RET(__rec_discard(session, page, single));
-
-	return (0);
-}
-
-/*
- * __rec_root_addr_update --
- *	Update the root page's address.
- */
-static int
-__rec_root_addr_update(WT_SESSION_IMPL *session, uint8_t *addr, uint32_t size)
-{
-	WT_ADDR *root_addr;
-	WT_BTREE *btree;
-
-	btree = session->btree;
-	root_addr = &btree->root_addr;
-
-	/* Free any previously created root addresses. */
-	if (root_addr->addr != NULL) {
-		WT_RET(__wt_bm_free(session, root_addr->addr, root_addr->size));
-		__wt_free(session, root_addr->addr);
-	}
-	btree->root_update = 1;
-
-	root_addr->addr = addr;
-	root_addr->size = size;
-
 	return (0);
 }
 
 /*
- * __rec_root_dirty_update --
- *	Update the reference for an evicted, dirty root page.
- */
-static int
-__rec_root_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
-{
-	WT_BTREE *btree;
-	WT_PAGE *next;
-	WT_PAGE_MODIFY *mod;
-
-	btree = session->btree;
-	mod = page->modify;
-
-	next = NULL;
-	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
-	case WT_PM_REC_EMPTY:				/* Page is empty */
-		WT_VERBOSE(session, evict, "root page empty");
-
-		/* If the root page is empty, clear the root address. */
-		WT_RET(__rec_root_addr_update(session, NULL, 0));
-		btree->root_page = NULL;
-		break;
-	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
-		WT_VERBOSE(session, evict, "root page replaced");
-
-		/* Update the root to its replacement. */
-		WT_RET(__rec_root_addr_update(
-		    session, mod->u.replace.addr, mod->u.replace.size));
-		btree->root_page = NULL;
-		break;
-	case WT_PM_REC_SPLIT:				/* Page split */
-		WT_VERBOSE(session, evict,
-		    "root page split %p -> %p", page, mod->u.split);
-
-		/* Update the root to the split page. */
-		next = mod->u.split;
-
-		/* Clear the reference else discarding the page will free it. */
-		mod->u.split = NULL;
-		break;
-	}
-
-	/*
-	 * Discard pages which were merged into this page during reconciliation,
-	 * then discard the page itself.
-	 */
-	WT_RET(__rec_discard(session, page, single));
-
-	if (next == NULL)
-		return (0);
-
-	/*
-	 * Newly created internal pages are normally merged into their parent
-	 * when the parent is evicted.  Newly split root pages can't be merged,
-	 * they have no parent and the new root page must be written.  We also
-	 * have to write the root page immediately, as the sync or close that
-	 * triggered the split won't see our new root page during its traversal.
-	 *
-	 * Make the new root page look like a normal page that's been modified,
-	 * write it out and discard it.  Keep doing that and eventually we'll
-	 * perform a simple replacement (as opposed to another level of split),
-	 * allowing us to can update the tree's root information and quit.  The
-	 * only time we see multiple splits in here is when we've bulk-loaded
-	 * something huge, and now we're evicting the index page referencing all
-	 * of those leaf pages.
-	 */
-	WT_RET(__wt_page_modify_init(session, next));
-	__wt_page_modify_set(next);
-	F_CLR(next->modify, WT_PM_REC_MASK);
-
-	WT_RET(__wt_rec_write(session, next, NULL));
-
-	return (__rec_root_dirty_update(session, next, single));
-}
-
-/*
- * __rec_discard --
- *	Discard any pages merged into an evicted page, then the page itself.
+ * __rec_discard_tree --
+ *	Discard the tree rooted at a page (that is, any pages merged into it),
+ * then the page itself.
  */
-static int
-__rec_discard(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
+static void
+__rec_discard_tree(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
 {
 	WT_REF *ref;
 	uint32_t i;
@@ -298,58 +190,31 @@ __rec_discard(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
 				continue;
 			WT_ASSERT(session,
 			    single || ref->state == WT_REF_LOCKED);
-			WT_RET(__rec_discard(session, ref->page, single));
+			__rec_discard_tree(session, ref->page, single);
 		}
 		/* FALLTHROUGH */
 	default:
-		WT_RET(__rec_discard_page(session, page, single));
+		__rec_discard_page(session, page, single);
 		break;
 	}
-	return (0);
 }
 
 /*
  * __rec_discard_page --
- *	Process the page's list of tracked objects, and discard it.
+ *	Discard the page.
  */
-static int
+static void
 __rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page, int single)
 {
-	WT_PAGE_MODIFY *mod;
-
-	mod = page->modify;
-
-	/*
-	 * or if the page was split and later merged, discard it.
-	 */
-	if (mod != NULL) {
-		/*
-		 * If the page has been modified and was tracking objects,
-		 * resolve them.
-		 */
-		WT_RET(__wt_rec_track_wrapup(session, page, 1));
-
-		/*
-		 * If the page was split and eventually merged into the parent,
-		 * discard the split page; if the split page was promoted into
-		 * a split-merge page, then the reference must be cleared before
-		 * the page is discarded.
-		 */
-		if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_SPLIT &&
-		    mod->u.split != NULL)
-			__wt_page_out(session, mod->u.split, 0);
-	}
-
 	/* We should never evict the file's current eviction point. */
 	WT_ASSERT(session, session->btree->evict_page != page);
 
+	/* Make sure a page is not in the eviction request list. */
 	if (!single)
-		__wt_evict_clr_page(session, page);
+		__wt_evict_list_clr_page(session, page);
 
-	/* Discard the page itself. */
-	__wt_page_out(session, page, 0);
-
-	return (0);
+	/* Discard the page. */
+	__wt_page_out(session, &page, 0);
 }
 
 /*
@@ -501,10 +366,6 @@ __rec_excl_clear(WT_SESSION_IMPL *session)
 static int
 __hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
 {
-	WT_CONNECTION_IMPL *conn;
-	WT_HAZARD *hp;
-	uint32_t elem, i;
-
 	/*
 	 * Make sure there is space to track exclusive access so we can unlock
 	 * to clean up.
@@ -528,18 +389,14 @@ __hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
 
 	session->excl[session->excl_next++] = ref;
 
-	/* Walk the list of hazard references to search for a match. */
-	conn = S2C(session);
-	elem = conn->session_size * conn->hazard_size;
-	for (i = 0, hp = conn->hazard; i < elem; ++i, ++hp)
-		if (hp->page == ref->page) {
-			WT_BSTAT_INCR(session, rec_hazard);
-			WT_CSTAT_INCR(session, cache_evict_hazard);
-
-			WT_VERBOSE(session,
-			    evict, "page %p hazard request failed", ref->page);
-			return (EBUSY);
-		}
+	/* Check for a matching hazard reference. */
+	if (__wt_page_hazard_check(session, ref->page) == NULL)
+		return (0);
 
-	return (0);
+	WT_BSTAT_INCR(session, rec_hazard);
+	WT_CSTAT_INCR(session, cache_evict_hazard);
+
+	WT_VERBOSE_RET(
+	    session, evict, "page %p hazard request failed", ref->page);
+	return (EBUSY);
 }
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index db9394258ce..51e200c2228 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -6,37 +6,59 @@
  */
 
 #include "wt_internal.h"
-
 /*
- * A page in memory has a list of associated blocks and overflow items.  For
- * example, when an overflow item is modified, the original overflow blocks
- * must be freed at some point.  Or, when a page is split, then written again,
- * the first split must be freed.  This code tracks those objects: they are
- * generally called from the routines in rec_write.c, which update the objects
- * each time a page is reconciled.
+ * An in-memory page has a list of tracked blocks and overflow items we use for
+ * two different tasks.  First, each tracked object has flag information set:
+ *
+ * WT_TRK_DISCARD	The object's backing blocks have been discarded.
+ * WT_TRK_INUSE		The object is in-use.
+ * WT_TRK_ONPAGE	The object is named on the original page, and we might
+ *			encounter it every time we reconcile the page.
+ * The tasks:
+ *
+ * Task #1:
+ *	Free blocks when we're finished with them.  If a page reconciliation
+ * results in a split, and then the page is reconciled again, the split pages
+ * from the first reconciliation should be discarded.  These blocks are added
+ * to the tracking list, and when reconciliation completes, they are discarded.
+ * Normally, the slot is then cleared, but in a few cases, these blocks are
+ * associated with the page, and we might encounter them each time the page
+ * is reconciled.  In that case, the on-page flag is set, and the discard flag
+ * will be set when the backing blocks are discarded, so subsequent page
+ * reconciliations will realize the blocks have already been discarded.
+ *
+ * Task #2:
+ *	Free overflow records when we're finished with them, similarly to the
+ * blocks in task #1.  But, overflow records have additional complications:
+ *
+ *	Complication #1: we want to re-use overflow records whenever possible.
+ * For example, if an overflow record is inserted, and we allocate space and
+ * write it to the backing file, we don't want to do that again every time the
+ * page is reconciled, we want to re-use the overflow record each time we
+ * reconcile the page.  For this we use the in-use flag.  When reconciliation
+ * starts, all of the tracked overflow records have the "track in-use" flag
+ * cleared.  As reconciliation proceeds, every time we create an overflow item,
+ * we check our list of tracked objects for a match.  If we find one we set the
+ * in-use flag and re-use the existing record.  When reconciliation finishes,
+ * any overflow records not marked in-use are discarded.   As above, the
+ * on-page and discard flags may apply, so we know an overflow record has been
+ * discarded (and may not be re-used in future reconciliations).
+ *
+ *	Complication #2: if we discard an overflow key and free its backing
+ * blocks, but then need the key again, we can't get it from disk.  (For
+ * example, the key that references an empty leaf page is discarded when the
+ * reconciliation completes, but the page might not stay empty and we need
+ * the key again for a future reconciliation.)  In this case, the on-page flag
+ * is set for the tracked object, and we can get the key from the object itself.
  */
 
 #ifdef HAVE_VERBOSE
-static void __track_dump(WT_SESSION_IMPL *, WT_PAGE *, const char *);
-static void __track_msg(WT_SESSION_IMPL *, WT_PAGE *, const char *, WT_ADDR *);
-static void __track_print(WT_SESSION_IMPL *, WT_PAGE *, WT_PAGE_TRACK *);
+static int __track_dump(WT_SESSION_IMPL *, WT_PAGE *, const char *);
+static int __track_msg(
+	WT_SESSION_IMPL *, WT_PAGE *, const char *, WT_PAGE_TRACK *);
 #endif
 
 /*
- * __rec_track_clear --
- *	Clear a track entry.
- */
-static inline void
-__rec_track_clear(WT_PAGE_TRACK *track)
-{
-	track->type = WT_PT_EMPTY;
-	track->data = NULL;
-	track->size = 0;
-	track->addr.addr = NULL;
-	track->addr.size = 0;
-}
-
-/*
  * __rec_track_extend --
  *	Extend the list of objects we're tracking
  */
@@ -65,270 +87,345 @@ __rec_track_extend(WT_SESSION_IMPL *session, WT_PAGE *page)
 }
 
 /*
- * __wt_rec_track_block --
- *	Add an addr/size pair to the page's list of tracked objects.
+ * __wt_rec_track --
+ *	Add an object to the page's list of tracked objects.
  */
 int
-__wt_rec_track_block(WT_SESSION_IMPL *session,
-    __wt_pt_type_t type, WT_PAGE *page, const uint8_t *addr, uint32_t size)
+__wt_rec_track(WT_SESSION_IMPL *session, WT_PAGE *page,
+    const uint8_t *addr, uint32_t addr_size,
+    const void *data, uint32_t data_size, uint32_t flags)
 {
 	WT_PAGE_MODIFY *mod;
 	WT_PAGE_TRACK *empty, *track;
+	uint8_t *p;
 	uint32_t i;
 
+	WT_ASSERT(session, addr != NULL);
+
 	mod = page->modify;
 
-	/*
-	 * There may be multiple requests to track a single block. For example,
-	 * an internal page with an overflow key that references a page that's
-	 * split: every time the page is written, we'll figure out the key's
-	 * overflow pages are no longer useful because the underlying page has
-	 * split, but we have no way to know that we've figured that same thing
-	 * out several times already.   Check for duplicates.
-	 */
+	/* Find an empty slot. */
 	empty = NULL;
-	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
-		if (track->type == WT_PT_EMPTY) {
+	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
+		if (!F_ISSET(track, WT_TRK_OBJECT)) {
 			empty = track;
-			continue;
+			break;
 		}
-		if (track->type == type &&
-		    track->addr.size == size &&
-		    memcmp(addr, track->addr.addr, size) == 0)
-			return (0);
-	}
 
 	/* Reallocate space as necessary. */
 	if (empty == NULL) {
 		WT_RET(__rec_track_extend(session, page));
 		empty = &mod->track[mod->track_entries - 1];
 	}
-
 	track = empty;
-	track->type = type;
-	track->data = NULL;
-	track->size = 0;
-	WT_RET(__wt_strndup(session, (char *)addr, size, &track->addr.addr));
-	track->addr.size = size;
-
-	WT_VERBOSE_CALL(
-	    session, reconcile, __track_print(session, page, track));
+
+	/*
+	 * Minor optimization: allocate a single chunk of space instead of two
+	 * separate ones: be careful when it's freed.
+	 */
+	WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));
+
+	track->flags = (uint8_t)flags | WT_TRK_JUST_ADDED | WT_TRK_OBJECT;
+	track->addr.addr = p;
+	track->addr.size = addr_size;
+	memcpy(track->addr.addr, addr, addr_size);
+	if (data_size) {
+		p += addr_size;
+		track->data = p;
+		track->size = data_size;
+		memcpy(track->data, data, data_size);
+	}
+
+	if (WT_VERBOSE_ISSET(session, reconcile))
+		WT_RET(__track_msg(session, page, "add", track));
 	return (0);
 }
 
 /*
- * __wt_rec_track_ovfl --
- *	Add an overflow object to the page's list of tracked objects.
+ * __wt_rec_track_onpage_srch --
+ *	Search for a permanently tracked object and return a copy of any data
+ * associated with it.
  */
 int
-__wt_rec_track_ovfl(WT_SESSION_IMPL *session, WT_PAGE *page,
-    uint8_t *addr, uint32_t addr_size, const void *data, uint32_t data_size)
+__wt_rec_track_onpage_srch(WT_SESSION_IMPL *session, WT_PAGE *page,
+    const uint8_t *addr, uint32_t addr_size, int *foundp, WT_ITEM *copy)
 {
 	WT_PAGE_MODIFY *mod;
-	WT_PAGE_TRACK *empty, *track;
-	uint8_t *p;
+	WT_PAGE_TRACK *track;
 	uint32_t i;
 
-	WT_ASSERT(session, addr != NULL);
+	/* The default is not-found. */
+	*foundp = 0;
 
 	mod = page->modify;
+	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
+		/*
+		 * Searching is always for objects referenced from the original
+		 * page, and is only checking to see if the object's address
+		 * matches the address we saved.
+		 *
+		 * It is possible for the address to appear multiple times in
+		 * the list of tracked objects: if we discard an overflow item,
+		 * for example, it can be re-allocated for use by the same page
+		 * during a subsequent reconciliation, and would appear on the
+		 * list of objects based on both the original slot allocated
+		 * from an on-page review, and subsequently as entered during a
+		 * block or overflow object allocation.  This can repeat, too,
+		 * the only entry that can't be discarded is the original one
+		 * from the page.
+		 *
+		 * We don't care if the object is currently in-use or not, just
+		 * if it's there.
+		 *
+		 * Ignore empty slots and objects not loaded from a page.
+		 */
+		if (!F_ISSET(track, WT_TRK_ONPAGE))
+			continue;
 
-	empty = NULL;
-	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
-		if (track->type == WT_PT_EMPTY) {
-			empty = track;
-			break;
-		}
+		/*
+		 * Check for an address match, and if we find one, return a
+		 * copy of the object's data.
+		 */
+		if (track->addr.size != addr_size ||
+		    memcmp(addr, track->addr.addr, addr_size) != 0)
+			continue;
 
-	/* Reallocate space as necessary. */
-	if (empty == NULL) {
-		WT_RET(__rec_track_extend(session, page));
-		empty = &mod->track[mod->track_entries - 1];
+		/* Optionally return a copy of the object's data. */
+		if (copy != NULL) {
+			WT_ASSERT(session, track->size != 0);
+			WT_RET(__wt_buf_set(
+			    session, copy, track->data, track->size));
+		}
+		*foundp = 1;
+		return (0);
 	}
+	return (0);
+}
+
+/*
+ * __wt_rec_track_onpage_add --
+ *	Search for a permanently tracked object and add it if it doesn't
+ * already appear.
+ */
+int
+__wt_rec_track_onpage_add(WT_SESSION_IMPL *session,
+    WT_PAGE *page, const uint8_t *addr, uint32_t addr_size)
+{
+	int found;
 
 	/*
-	 * Minor optimization: allocate a single chunk of space instead of two
-	 * separate ones: be careful when it's freed.
+	 * This function is short-hand for "search the on-page records, and
+	 * if the address is not already listed as an object, add it".  Note
+	 * there is no possibility of object re-use, the object is discarded
+	 * when reconciliation completes.
 	 */
-	WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));
-
-	track = empty;
-	track->type = WT_PT_OVFL;
-	track->addr.addr = p;
-	track->addr.size = addr_size;
-	memcpy(track->addr.addr, addr, addr_size);
-
-	p += addr_size;
-	track->data = p;
-	track->size = data_size;
-	memcpy(track->data, data, data_size);
-
-	WT_VERBOSE_CALL(
-	    session, reconcile, __track_print(session, page, track));
+	WT_RET(__wt_rec_track_onpage_srch(
+	    session, page, addr, addr_size, &found, NULL));
+	if (!found)
+		WT_RET(__wt_rec_track(
+		    session, page, addr, addr_size, NULL, 0, WT_TRK_ONPAGE));
 	return (0);
 }
 
 /*
  * __wt_rec_track_ovfl_reuse --
- *	Search for an overflow record and reactivate it.
+ *	Search for a matching overflow record and reactivate it.
  */
 int
-__wt_rec_track_ovfl_reuse(WT_SESSION_IMPL *session, WT_PAGE *page,
-    const void *data, uint32_t size, uint8_t **addrp, uint32_t *sizep)
+__wt_rec_track_ovfl_reuse(
+    WT_SESSION_IMPL *session, WT_PAGE *page,
+    const void *data, uint32_t data_size,
+    uint8_t **addrp, uint32_t *addr_sizep, int *foundp)
 {
+	WT_PAGE_MODIFY *mod;
 	WT_PAGE_TRACK *track;
 	uint32_t i;
 
-	WT_PAGE_MODIFY *mod;
+	*foundp = 0;
 
 	mod = page->modify;
 	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
-		/* Check for a match. */
-		if (track->type != WT_PT_OVFL_DISCARD ||
-		    size != track->size || memcmp(data, track->data, size) != 0)
+		/* Ignore empty slots */
+		if (!F_ISSET(track, WT_TRK_OBJECT))
 			continue;
 
-		/* Found a match, return the record to use. */
-		track->type = WT_PT_OVFL;
+		/*
+		 * Ignore discarded objects or objects already in-use.  We don't
+		 * care about whether or not the object came from a page, we can
+		 * re-use objects from the page or objects created in a previous
+		 * reconciliation.
+		 */
+		if (F_ISSET(track, WT_TRK_DISCARD | WT_TRK_INUSE))
+			continue;
 
-		/* Return the block addr/size pair to our caller. */
-		*addrp = track->addr.addr;
-		*sizep = track->addr.size;
+		/*
+		 * Ignore objects without data (must be block objects).  This is
+		 * not really necessary (presumably, our caller is matching on a
+		 * non-zero-length data item), but paranoia is healthy.
+		 */
+		if (track->data == NULL)
+			continue;
 
-		WT_VERBOSE_CALL(session, reconcile, __track_msg(
-		    session, page, "reactivate overflow", &track->addr));
-		return (1);
+		/* Check to see if the data matches. */
+		if (track->size != data_size ||
+		    memcmp(data, track->data, data_size) != 0)
+			continue;
+
+		/*
+		 * Reactivate the record.
+		 * Return the block addr/size pair to our caller.
+		 */
+		F_SET(track, WT_TRK_INUSE);
+		*addrp = track->addr.addr;
+		*addr_sizep = track->addr.size;
+		*foundp = 1;
+		if (WT_VERBOSE_ISSET(session, reconcile))
+			WT_RET(__track_msg(
+			    session, page, "reactivate overflow", track));
+		return (0);
 	}
 	return (0);
 }
 
 /*
  * __wt_rec_track_init --
- *	Initialize/Reset the tracking information when writing a page.
+ *	Initialize the page's list of tracked objects when reconciliation
+ * starts.
  */
 int
 __wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	if (WT_VERBOSE_ISSET(session, reconcile))
+		WT_RET(__track_dump(session, page, "reconcile init"));
+
+	return (0);
+}
+
+/*
+ * __wt_rec_track_wrapup --
+ *	Resolve the page's list of tracked objects after the page is written.
+ */
+int
+__wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
 	WT_PAGE_MODIFY *mod;
 	WT_PAGE_TRACK *track;
 	uint32_t i;
 
-	WT_VERBOSE_CALL(
-	    session, reconcile, __track_dump(session, page, "reconcile init"));
+	if (WT_VERBOSE_ISSET(session, reconcile))
+		WT_RET(__track_dump(session, page, "reconcile wrapup"));
 
+	/*
+	 * After the successful reconciliation of a page, some of the objects
+	 * we're tracking are no longer needed, free what we can free.
+	 */
 	mod = page->modify;
+	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
+		/* Ignore empty slots */
+		if (!F_ISSET(track, WT_TRK_OBJECT))
+			continue;
 
-	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
-		switch (track->type) {
-		case WT_PT_BLOCK_EVICT:
-			/*
-			 * We had a block we would have discarded, had the last
-			 * reconciliation been the final one used to evict the
-			 * page -- it wasn't, and we didn't.  Clear the slot.
-			 */
-			__rec_track_clear(track);
-			break;
-		case WT_PT_OVFL:
-			/*
-			 * An overflow item associated with this page: mark it
-			 * "not in use", we'll reactivate any being re-used as
-			 * we process the page.
-			 */
-			WT_VERBOSE_CALL(session, reconcile, __track_msg(
-			    session, page, "set overflow OFF", &track->addr));
-			track->type = WT_PT_OVFL_DISCARD;
-			break;
-		case WT_PT_EMPTY:
-			break;
-		case WT_PT_BLOCK:
-		case WT_PT_OVFL_DISCARD:
-			/*
-			 * We shouldn't see WT_PT_BLOCK or WT_PT_OVFL_DISCARD,
-			 * those blocks were discarded at the end of the last
-			 * reconciliation of this page.
-			 */
-			/* FALLTHROUGH */
-		WT_ILLEGAL_VALUE(session);
+		/*
+		 * Ignore discarded objects (discarded objects left on the list
+		 * are never just-added, never in-use, and only include objects
+		 * found on a page).
+		 */
+		if (F_ISSET(track, WT_TRK_DISCARD)) {
+			WT_ASSERT(session,
+			    !F_ISSET(track, WT_TRK_JUST_ADDED | WT_TRK_INUSE));
+			WT_ASSERT(session, F_ISSET(track, WT_TRK_ONPAGE));
+			continue;
+		}
+
+		/* Clear the just-added flag, reconciliation succeeded. */
+		F_CLR(track, WT_TRK_JUST_ADDED);
+
+		/*
+		 * Ignore in-use objects, other than to clear the in-use flag
+		 * in preparation for the next reconciliation.
+		 */
+		if (F_ISSET(track, WT_TRK_INUSE)) {
+			F_CLR(track, WT_TRK_INUSE);
+			continue;
 		}
+
+		/*
+		 * The object isn't in-use and hasn't yet been discarded.  We
+		 * no longer need the underlying blocks, discard them.
+		 */
+		if (WT_VERBOSE_ISSET(session, reconcile))
+			WT_RET(__track_msg(session, page, "discard", track));
+		WT_RET(
+		    __wt_bm_free(session, track->addr.addr, track->addr.size));
+
+		/*
+		 * There are page and overflow blocks we track anew as part of
+		 * each page reconciliation, we need to know about them even if
+		 * the underlying blocks are no longer in use.  If the object
+		 * came from a page, keep it around.  Regardless, only discard
+		 * objects once.
+		 */
+		if (F_ISSET(track, WT_TRK_ONPAGE)) {
+			F_SET(track, WT_TRK_DISCARD);
+			continue;
+		}
+
+		__wt_free(session, track->addr.addr);
+		memset(track, 0, sizeof(*track));
+	}
 	return (0);
 }
 
 /*
- * __wt_rec_track_wrapup --
- *	Temporarily/Permanently resolve the page's list of tracked objects.
+ * __wt_rec_track_wrapup_err --
+ *	Resolve the page's list of tracked objects after an error occurs.
  */
 int
-__wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page, int final)
+__wt_rec_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	WT_DECL_RET;
+	WT_PAGE_MODIFY *mod;
 	WT_PAGE_TRACK *track;
 	uint32_t i;
 
-	WT_VERBOSE_CALL(session, reconcile,
-	    __track_dump(session,
-	    page, final ? "eviction wrapup" : "reconcile wrapup"));
-
 	/*
-	 * After a sync of a page, some of the objects we're tracking are no
-	 * longer needed -- free what we can free.
-	 *
-	 * WT_PT_EMPTY:
-	 *	Empty slot.
-	 * WT_PT_BLOCK:
-	 *	A discarded block, free when this reconciliation completes.
-	 * WT_PT_BLOCK_EVICT:
-	 *	A discarded block based on this reconciliation; if the page is
-	 *	evicted based on this reconciliation, discard the block.  (For
-	 *	example, an overflow key that references a deleted item will be
-	 *	discarded, but a subsequent reconciliation might find the key
-	 *	is once more in use because the item is no longer deleted.)
-	 * WT_PT_OVFL:
-	 *	An overflow record that's in-use.  Ignored after any particular
-	 *	reconciliation, because we need to track it for re-use in future
-	 *	reconciliations.   When the page is evicted, discard its memory,
-	 *	leaving the underlying blocks alone.
-	 * WT_PT_OVFL_DISCARD:
-	 *	An overflow record that's no longer in-use.  Discard the memory
-	 *	and free the underlying blocks after reconciliation completes.
+	 * After a failed reconciliation of a page, discard entries added in the
+	 * current reconciliation: their information is incorrect.  Also clear
+	 * the in-use flag in preparation for the next reconciliation.
 	 */
-	for (track = page->modify->track,
-	    i = 0; i < page->modify->track_entries; ++track, ++i) {
-		switch (track->type) {
-		case WT_PT_EMPTY:
-			continue;
-		case WT_PT_BLOCK_EVICT:
-			if (!final)
-				continue;
-			/* FALLTHROUGH */
-		case WT_PT_BLOCK:
-			WT_VERBOSE_CALL(session, reconcile, __track_msg(
-			    session, page, "discard block", &track->addr));
-			WT_RET(__wt_bm_free(
-			    session, track->addr.addr, track->addr.size));
-			__wt_free(session, track->addr.addr);
-			break;
-		case WT_PT_OVFL:
-			WT_VERBOSE_CALL(session, reconcile, __track_msg(
-			    session, page, "retain overflow", &track->addr));
-			if (!final)
-				continue;
+	mod = page->modify;
+	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
+		if (F_ISSET(track, WT_TRK_JUST_ADDED)) {
+			/*
+			 * The in-use flag is used to avoid discarding backing
+			 * blocks: if an object is both just-added and in-use,
+			 * we allocated the blocks on this run, and we want to
+			 * discard them on error.
+			 */
+			if (F_ISSET(track, WT_TRK_INUSE))
+				WT_TRET(__wt_bm_free(session,
+				    track->addr.addr, track->addr.size));
 
-			/* Freeing WT_PAGE_TRACK->addr frees ->data, too. */
 			__wt_free(session, track->addr.addr);
-			break;
-		case WT_PT_OVFL_DISCARD:
-			WT_VERBOSE_CALL(session, reconcile, __track_msg(
-			    session, page, "discard overflow", &track->addr));
-			WT_RET(__wt_bm_free(
-			    session, track->addr.addr, track->addr.size));
+			memset(track, 0, sizeof(*track));
+		} else
+			F_CLR(track, WT_TRK_INUSE);
+	return (ret);
+}
 
-			/* Freeing WT_PAGE_TRACK->addr frees ->data, too. */
-			__wt_free(session, track->addr.addr);
-			break;
-		}
+/*
+ * __wt_rec_track_discard --
+ *	Free the address memory held by each of the page's tracked objects.
+ */
+void
+__wt_rec_track_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_PAGE_TRACK *track;
+	uint32_t i;
 
-		__rec_track_clear(track);
-	}
-	return (0);
+	track = page->modify->track;
+	for (i = 0; i < page->modify->track_entries; ++i, ++track)
+		__wt_free(session, track->addr.addr);
 }
 
 #ifdef HAVE_VERBOSE
@@ -336,7 +433,7 @@ __wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page, int final)
  * __track_dump --
  *	Dump the list of tracked objects.
  */
-static void
+static int
 __track_dump(WT_SESSION_IMPL *session, WT_PAGE *page, const char *tag)
 {
 	WT_PAGE_MODIFY *mod;
@@ -346,54 +443,71 @@ __track_dump(WT_SESSION_IMPL *session, WT_PAGE *page, const char *tag)
 	mod = page->modify;
 
 	if (mod->track_entries == 0)
-		return;
+		return (0);
 
-	WT_VERBOSE(session,
+	WT_VERBOSE_RET(session, reconcile, "\n");
+	WT_VERBOSE_RET(session,
 	    reconcile, "page %p tracking list at %s:", page, tag);
 	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
-		__track_print(session, page, track);
+		if (F_ISSET(track, WT_TRK_OBJECT))
+			WT_RET(__track_msg(session, page, "dump", track));
+	WT_VERBOSE_RET(session, reconcile, "\n");
+	return (0);
 }
 
 /*
- * __track_print --
- *	Display a tracked entry.
+ * __track_msg --
+ *	Output a verbose message and associated page and address pair.
  */
-static void
-__track_print(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_TRACK *track)
+static int
+__track_msg(WT_SESSION_IMPL *session,
+    WT_PAGE *page, const char *msg, WT_PAGE_TRACK *track)
 {
-	switch (track->type) {
-	case WT_PT_BLOCK:
-		__track_msg(session, page, "block", &track->addr);
-		break;
-	case WT_PT_BLOCK_EVICT:
-		__track_msg(session, page, "block-evict", &track->addr);
-		return;
-	case WT_PT_OVFL:
-		__track_msg(session, page, "overflow (on)", &track->addr);
-		break;
-	case WT_PT_OVFL_DISCARD:
-		__track_msg(session, page, "overflow (off)", &track->addr);
-		break;
-	case WT_PT_EMPTY:
-	default:				/* Not possible. */
-		break;
-	}
+	WT_DECL_RET;
+	WT_DECL_ITEM(buf);
+	char f[64];
+
+	WT_RET(__wt_scr_alloc(session, 64, &buf));
+
+	WT_VERBOSE_ERR(
+	    session, reconcile, "page %p %s (%s) %" PRIu32 "B @%s",
+	    page, msg,
+	    __wt_track_string(track, f, sizeof(f)),
+	    track->size,
+	    __wt_addr_string(session, buf, track->addr.addr, track->addr.size));
+
+err:	__wt_scr_free(&buf);
+	return (ret);
 }
 
 /*
- * __track_msg --
- *	Output a verbose message and associated page and address pair.
+ * __wt_track_string --
+ *	Fill in a buffer describing a track object's flags ("" if none set).
  */
-static void
-__track_msg(
-    WT_SESSION_IMPL *session, WT_PAGE *page, const char *msg, WT_ADDR *addr)
+char *
+__wt_track_string(WT_PAGE_TRACK *track, char *buf, size_t len)
 {
-	WT_ITEM *buf;
+	size_t remain, wlen;
+	char *p, *end;
+	const char *sep;
+
+	p = buf;
+	end = buf + len;
+	sep = "";
+	if (len != 0)		/* No flags set: return an empty string. */
+		*buf = '\0';
+#define	WT_APPEND_FLAG(f, name)						\
+	if (F_ISSET(track, f)) {					\
+		remain = WT_PTRDIFF(end, p);				\
+		wlen = (size_t)snprintf(p, remain, "%s%s", sep, name);	\
+		p = wlen >= remain ? end : p + wlen;			\
+		sep = ", ";						\
+	}
+	WT_APPEND_FLAG(WT_TRK_DISCARD, "discard");
+	WT_APPEND_FLAG(WT_TRK_INUSE, "inuse");
+	WT_APPEND_FLAG(WT_TRK_JUST_ADDED, "just-added");
+	WT_APPEND_FLAG(WT_TRK_ONPAGE, "onpage");
 
-	if (__wt_scr_alloc(session, 64, &buf))
-		return;
-	WT_VERBOSE(session, reconcile, "page %p %s %s", page, msg,
-	    __wt_addr_string(session, buf, addr->addr, addr->size));
-	__wt_scr_free(&buf);
+	return (buf);
 }
 #endif
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 90407daff94..22539e1ff73 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -23,6 +23,11 @@ typedef struct {
 
 	WT_ITEM	 dsk;			/* Temporary disk-image buffer */
 
+	/* Track whether all changes to the page are written. */
+	uint32_t orig_write_gen;
+	uint32_t orig_disk_gen;
+	int upd_skipped;
+
 	/*
 	 * Reconciliation gets tricky if we have to split a page, that is, if
 	 * the disk image we create exceeds the maximum size of disk images for
@@ -164,26 +169,16 @@ static int  __rec_row_leaf_insert(WT_SESSION_IMPL *, WT_INSERT *);
 static int  __rec_row_merge(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_split(WT_SESSION_IMPL *session);
 static int  __rec_split_col(WT_SESSION_IMPL *, WT_PAGE *, WT_PAGE **);
+static int  __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_split_finish(WT_SESSION_IMPL *);
 static int  __rec_split_fixup(WT_SESSION_IMPL *);
 static int  __rec_split_init(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, uint32_t);
 static int  __rec_split_row(WT_SESSION_IMPL *, WT_PAGE *, WT_PAGE **);
 static int  __rec_split_row_promote(WT_SESSION_IMPL *, uint8_t);
-static int  __rec_split_write(WT_SESSION_IMPL *, WT_BOUNDARY *, WT_ITEM *);
+static int  __rec_split_write(WT_SESSION_IMPL *, WT_BOUNDARY *, WT_ITEM *, int);
 static int  __rec_write_init(WT_SESSION_IMPL *, WT_PAGE *);
 static int  __rec_write_wrapup(WT_SESSION_IMPL *, WT_PAGE *);
-
-/*
- * __rec_track_cell --
- *	If a cell references an overflow chunk, add it to the page's list.
- */
-static inline int
-__rec_track_cell(
-    WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
-{
-	return (unpack->ovfl ? __wt_rec_track_block(session,
-	    WT_PT_BLOCK_EVICT, page, unpack->data, unpack->size) : 0);
-}
+static int  __rec_write_wrapup_err(WT_SESSION_IMPL *, WT_PAGE *);
 
 /*
  * Helper macro to determine whether the given WT_REF has a page with
@@ -231,7 +226,9 @@ int
 __wt_rec_write(
     WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
 {
-	WT_VERBOSE(session, reconcile,
+	WT_DECL_RET;
+
+	WT_VERBOSE_RET(session, reconcile,
 	    "page %p %s", page, __wt_page_type_string(page->type));
 
 	WT_BSTAT_INCR(session, rec_written);
@@ -248,39 +245,43 @@ __wt_rec_write(
 
 	/* Initialize the reconciliation structures for each new run. */
 	WT_RET(__rec_write_init(session, page));
-
-	/* Initialize the overflow tracking information for each new run. */
 	WT_RET(__wt_rec_track_init(session, page));
 
 	/* Reconcile the page. */
 	switch (page->type) {
 	case WT_PAGE_COL_FIX:
 		if (salvage != NULL)
-			WT_RET(__rec_col_fix_slvg(session, page, salvage));
+			ret = __rec_col_fix_slvg(session, page, salvage);
 		else
-			WT_RET(__rec_col_fix(session, page));
+			ret = __rec_col_fix(session, page);
 		break;
 	case WT_PAGE_COL_INT:
-		WT_RET(__rec_col_int(session, page));
+		ret =__rec_col_int(session, page);
 		break;
 	case WT_PAGE_COL_VAR:
-		WT_RET(__rec_col_var(session, page, salvage));
+		ret =__rec_col_var(session, page, salvage);
 		break;
 	case WT_PAGE_ROW_INT:
-		WT_RET(__rec_row_int(session, page));
+		ret =__rec_row_int(session, page);
 		break;
 	case WT_PAGE_ROW_LEAF:
-		WT_RET(__rec_row_leaf(session, page, salvage));
+		ret =__rec_row_leaf(session, page, salvage);
 		break;
 	WT_ILLEGAL_VALUE(session);
 	}
+	if (ret != 0) {
+		/*
+		 * The underlying wrapup-on-error functions can fail, and they
+		 * are written to return an error value, but we discard it here
+		 * because we already have an error to return.
+		 */
+		(void)__rec_write_wrapup_err(session, page);
+		return (ret);
+	}
 
 	/* Wrap up the page's reconciliation. */
 	WT_RET(__rec_write_wrapup(session, page));
 
-	/* Wrap up overflow tracking, discarding what we can. */
-	WT_RET(__wt_rec_track_wrapup(session, page, 0));
-
 	/*
 	 * If this page has a parent, mark the parent dirty.  Split-merge pages
 	 * are a special case: they are always dirty and never reconciled, they
@@ -288,16 +289,66 @@ __wt_rec_write(
 	 * first non-split-merge parent we find dirty, not the split-merge page
 	 * itself, ensuring the chain of dirty pages up the tree isn't broken.
 	 */
-	if (WT_PAGE_IS_ROOT(page))
+	if (!WT_PAGE_IS_ROOT(page)) {
+		for (;;) {
+			page = page->parent;
+			if (page->modify == NULL ||
+			    !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
+				break;
+		}
+		WT_RET(__wt_page_modify_init(session, page));
+		__wt_page_modify_set(page);
+
+		return (0);
+	}
+
+	/*
+	 * Root pages are trickier.  First, if the page is empty or we performed
+	 * a 1-for-1 page swap, we're done, we've written the root (and done the
+	 * snapshot).
+	 */
+	switch (F_ISSET(page->modify, WT_PM_REC_MASK)) {
+	case WT_PM_REC_EMPTY:				/* Page is empty */
+	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
 		return (0);
-	for (;;) {
-		page = page->parent;
-		if (page->modify == NULL ||
-		    !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
-			break;
+	case WT_PM_REC_SPLIT:				/* Page split */
+		break;
+	WT_ILLEGAL_VALUE(session);
 	}
-	WT_RET(__wt_page_modify_init(session, page));
+
+	/*
+	 * Newly created internal pages are normally merged into their parent
+	 * when the parent is evicted.  Newly split root pages can't be merged,
+	 * they have no parent and the new root page must be written.  We also
+	 * have to write the root page immediately; the alternative would be to
+	 * split the page in memory and continue, but that won't work because
+	 * (1) we'd have to require incoming threads use hazard references to
+	 * read the root page, and (2) the sync or close triggering the split
+	 * won't see the new root page during the current traversal.
+	 *
+	 * Make the new split page look like a normal page that's been modified,
+	 * and write it out.  Keep doing that and eventually we'll perform a
+	 * simple replacement (as opposed to another level of split), and then
+	 * we're done.  Given our support of big pages, the only time we see
+	 * multiple splits is when we've bulk-loaded something huge, and we're
+	 * evicting the index page referencing all of those leaf pages.
+	 *
+	 * This creates a new kind of data structure in the system: an in-memory
+	 * root page, pointing to a chain of pages, each of which is flagged as
+	 * "split" pages, up to a final replacement page.  We don't use those
+	 * pages again, they are discarded in the next root page reconciliation.
+	 * We could discard them immediately (because the snapshot is complete,
+	 * any pages we discard go on the next snapshot's free list, it's safe
+	 * to do), but the code is simpler this way, and this operation should
+	 * not be common.
+	 */
+	WT_VERBOSE_RET(session, reconcile,
+	    "root page split %p -> %p", page, page->modify->u.split);
+	page = page->modify->u.split;
 	__wt_page_modify_set(page);
+	F_CLR(page->modify, WT_PM_REC_SPLIT_MERGE);
+
+	WT_RET(__wt_rec_write(session, page, NULL));
 
 	return (0);
 }
@@ -315,9 +366,6 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 	btree = session->btree;
 
-	/* Update the disk generation before we read anything from the page. */
-	WT_ORDERED_READ(page->modify->disk_gen, page->modify->write_gen);
-
 	/* Allocate a reconciliation structure if we don't already have one. */
 	if ((r = session->reconcile) == NULL) {
 		WT_RET(__wt_calloc_def(session, 1, &r));
@@ -335,9 +383,24 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page)
 		    btree->config, "split_pct", &cval));
 		r->btree_split_pct = (uint32_t)cval.val;
 
-		WT_RET(__wt_config_getones(session,
-		    btree->config, "internal_key_truncate", &cval));
-		r->key_sfx_compress_conf = (cval.val != 0);
+		/*
+		 * Suffix compression is a hack to shorten internal page keys
+		 * by discarding trailing bytes that aren't necessary for tree
+		 * navigation.  We don't do suffix compression if there is a
+		 * custom collator because we don't know what bytes a custom
+		 * collator might use.  Some custom collators (for example, a
+		 * collator implementing reverse ordering of strings), won't
+		 * have any problem with suffix compression: if there's ever a
+		 * reason to implement suffix compression for custom collators,
+		 * we can add a setting to the collator, configured when the
+		 * collator is added, that turns on suffix compression.
+		 */
+		r->key_sfx_compress_conf = 0;
+		if (btree->collator == NULL) {
+			WT_RET(__wt_config_getones(session,
+			    btree->config, "internal_key_truncate", &cval));
+			r->key_sfx_compress_conf = (cval.val != 0);
+		}
 
 		WT_RET(__wt_config_getones(session,
 		    btree->config, "prefix_compression", &cval));
@@ -346,6 +409,17 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 	r->page = page;
 
+	/* Read the disk generation before we read anything from the page. */
+	r->orig_disk_gen = page->modify->disk_gen;
+	WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+	/*
+	 * Pages cannot be evicted if they are only partially written, that is,
+	 * if we skipped an update for transactional reasons.  Clear the flag
+	 * before each reconciliation.
+	 */
+	r->upd_skipped = 0;
+
 	return (0);
 }
 
@@ -706,12 +780,12 @@ __rec_split(WT_SESSION_IMPL *session)
 		 * boundaries, or the split size was the same as the page size,
 		 * so we never bothered with saving split-point information.
 		 *
-		 * Write the current disk image.
+		 * Finalize the header information and write the page.
 		 */
 		dsk->recno = bnd->recno;
 		dsk->u.entries = r->entries;
 		r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
-		WT_RET(__rec_split_write(session, bnd, &r->dsk));
+		WT_RET(__rec_split_write(session, bnd, &r->dsk, 0));
 
 		/*
 		 * Set the starting record number and promotion key for the next
@@ -748,6 +822,7 @@ __rec_split_finish(WT_SESSION_IMPL *session)
 	WT_BOUNDARY *bnd;
 	WT_PAGE_HEADER *dsk;
 	WT_RECONCILE *r;
+	int snapshot;
 
 	r = session->reconcile;
 
@@ -785,12 +860,19 @@ __rec_split_finish(WT_SESSION_IMPL *session)
 		bnd->entries = r->entries;
 	}
 
-	/* Write the remaining information. */
+	/*
+	 * Third, check to see if we're creating a snapshot: any time we write
+	 * the root page of the tree, we tell the underlying block manager so it
+	 * can write and return the additional information a snapshot requires.
+	 */
+	snapshot = r->bnd_next == 1 && WT_PAGE_IS_ROOT(r->page);
+
+	/* Finalize the header information and write the page. */
 	dsk = r->dsk.mem;
 	dsk->recno = bnd->recno;
 	dsk->u.entries = r->entries;
 	r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
-	return (__rec_split_write(session, bnd, &r->dsk));
+	return (__rec_split_write(session, bnd, &r->dsk, snapshot));
 }
 
 /*
@@ -800,14 +882,14 @@ __rec_split_finish(WT_SESSION_IMPL *session)
 static int
 __rec_split_fixup(WT_SESSION_IMPL *session)
 {
-	WT_BTREE *btree;
 	WT_BOUNDARY *bnd;
-	WT_ITEM *tmp;
+	WT_BTREE *btree;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_PAGE_HEADER *dsk;
 	WT_RECONCILE *r;
 	uint32_t i, len;
 	uint8_t *dsk_start;
-	int ret;
 
 	/*
 	 * When we overflow physical limits of the page, we walk the list of
@@ -816,8 +898,6 @@ __rec_split_fixup(WT_SESSION_IMPL *session)
 	 */
 	r = session->reconcile;
 	btree = session->btree;
-	tmp = NULL;
-	ret = 0;
 
 	/*
 	 * The data isn't laid out on a page boundary or nul padded; copy it to
@@ -841,11 +921,11 @@ __rec_split_fixup(WT_SESSION_IMPL *session)
 		len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
 		memcpy(dsk_start, bnd->start, len);
 
-		/* Write the page. */
+		/* Finalize the header information and write the page. */
 		dsk->recno = bnd->recno;
 		dsk->u.entries = bnd->entries;
 		tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
-		WT_ERR(__rec_split_write(session, bnd, tmp));
+		WT_ERR(__rec_split_write(session, bnd, tmp, 0));
 	}
 
 	/*
@@ -858,7 +938,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session)
 	 * Fix up our caller's information.
 	 */
 	len = WT_PTRDIFF32(r->first_free, bnd->start);
-	WT_ASSERT_RET(
+	WT_ASSERT_ERR(
 	    session, len < r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree));
 
 	dsk = r->dsk.mem;
@@ -879,12 +959,16 @@ err:	__wt_scr_free(&tmp);
  *	Write a disk block out for the split helper functions.
  */
 static int
-__rec_split_write(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf)
+__rec_split_write(
+    WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf, int snapshot)
 {
 	WT_CELL *cell;
 	WT_PAGE_HEADER *dsk;
 	uint32_t size;
-	uint8_t addr[WT_BM_MAX_ADDR_COOKIE];
+	uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+
+	dsk = buf->mem;
+	WT_VERBOSE_RET(session, write, "%s", __wt_page_type_string(dsk->type));
 
 	/*
 	 * We always write an additional byte on row-store leaf pages after the
@@ -899,7 +983,6 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf)
 	 * see it.
 	 */
 #define	WT_TRAILING_KEY_CELL	(sizeof(uint8_t))
-	dsk = buf->mem;
 	if (dsk->type == WT_PAGE_ROW_LEAF) {
 		WT_ASSERT_RET(session, buf->size < buf->memsize);
 
@@ -908,11 +991,26 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_ITEM *buf)
 		++buf->size;
 	}
 
-	/* Write the chunk and save the location information. */
-	WT_VERBOSE(session, write, "%s", __wt_page_type_string(dsk->type));
-	WT_RET(__wt_bm_write(session, buf, addr, &size));
-	WT_RET(__wt_strndup(session, (char *)addr, size, &bnd->addr.addr));
-	bnd->addr.size = size;
+	/*
+	 * Write the chunk and save the location information.  There is one big
+	 * question: if this is a snapshot, then we're going to have to wrap up
+	 * our tracking information (freeing blocks we no longer need) before we
+	 * can create the snapshot, because snapshots write extent lists, that
+	 * is, the whole system has to be consistent.   We have to handle empty
+	 * tree snapshots elsewhere (because we don't write anything for empty
+	 * tree snapshots, they don't come through this path).  Given that fact,
+	 * clear the boundary information as a reminder, and do the snapshot at
+	 * a later time, during wrapup.
+	 */
+	if (snapshot) {
+		bnd->addr.addr = NULL;
+		bnd->addr.size = 0;
+	} else {
+		WT_RET(__wt_bm_write(session, buf, addr, &size));
+		WT_RET(
+		    __wt_strndup(session, (char *)addr, size, &bnd->addr.addr));
+		bnd->addr.size = size;
+	}
 
 	return (0);
 }
@@ -951,8 +1049,8 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, uint8_t type)
 		/*
 		 * The cell had better have a zero-length prefix: it's the first
 		 * key on the page.  (If it doesn't have a zero-length prefix,
-		 * __wt_cell_update_copy() won't be sufficient any way, we'd
-		 * only copy the non-prefix-compressed portion of the key.)
+		 * __wt_cell_unpack() won't be sufficient anyway, we'd only copy
+		 * the non-prefix-compressed portion of the key.)
 		 */
 		cell = WT_PAGE_HEADER_BYTE(btree, r->dsk.mem);
 		__wt_cell_unpack(cell, unpack);
@@ -975,11 +1073,14 @@ __rec_split_row_promote(WT_SESSION_IMPL *session, uint8_t type)
 	 * internal pages, you cannot repeat suffix truncation as you split up
 	 * the tree, it loses too much information.
 	 *
+	 * One note: if the last key on the previous page was an overflow key,
+	 * we don't have the in-memory key against which to compare, and don't
+	 * try to do suffix compression.  The code for that case turns suffix
+	 * compression off for the next key.
+	 *
 	 * The r->last key sorts before the r->cur key, so we'll either find a
-	 * larger byte value in r->cur, or r->cur will be the longer key. One
-	 * caveat: if the largest key on the previous page was an overflow key,
-	 * we don't have a key against which to compare, and we can't do suffix
-	 * compression.
+	 * larger byte value in r->cur, or r->cur will be the longer key, and
+	 * the interesting byte is one past the length of the shorter key.
 	 */
 	if (type == WT_PAGE_ROW_LEAF && r->key_sfx_compress) {
 		pa = r->last->data;
@@ -1333,6 +1434,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 	WT_INSERT *ins;
 	WT_INSERT_HEAD *append;
 	WT_RECONCILE *r;
+	WT_UPDATE *upd;
 	uint64_t recno;
 	uint32_t entry, nrecs;
 
@@ -1340,10 +1442,14 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 	btree = session->btree;
 
 	/* Update any changes to the original on-page data items. */
-	WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page))
+	WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+		upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped);
+		if (upd == NULL)
+			continue;
 		__bit_setv_recno(
 		    page, WT_INSERT_RECNO(ins), btree->bitcnt,
-		    ((uint8_t *)WT_UPDATE_DATA(ins->upd))[0]);
+		    ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+	}
 
 	/* Allocate the memory. */
 	WT_RET(__rec_split_init(session,
@@ -1360,7 +1466,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 	/* Walk any append list. */
 	append = WT_COL_APPEND(page);
-	WT_SKIP_FOREACH(ins, append)
+	WT_SKIP_FOREACH(ins, append) {
+		upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped);
+		if (upd == NULL)
+			continue;
 		for (;;) {
 			/*
 			 * The application may have inserted records which left
@@ -1374,7 +1483,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 			if (nrecs > 0) {
 				__bit_setv(r->first_free, entry, btree->bitcnt,
-				    ((uint8_t *)WT_UPDATE_DATA(ins->upd))[0]);
+				    ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
 				--nrecs;
 				++entry;
 				++r->recno;
@@ -1395,6 +1504,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
 			entry = 0;
 			nrecs = r->space_avail / btree->bitcnt;
 		}
+	}
 
 	/* Update the counters. */
 	__rec_incr(session, r, entry, __bitstr_size(entry * btree->bitcnt));
@@ -1479,7 +1589,7 @@ __rec_col_fix_slvg(
 static int
 __rec_col_var_helper(
     WT_SESSION_IMPL *session, WT_SALVAGE_COOKIE *salvage,
-    WT_ITEM *value, int deleted, int raw, uint64_t rle)
+    WT_ITEM *value, int deleted, int ovfl, uint64_t rle)
 {
 	WT_RECONCILE *r;
 	WT_KV *val;
@@ -1520,13 +1630,15 @@ __rec_col_var_helper(
 
 	if (deleted) {
 		val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+		val->buf.data = NULL;
 		val->buf.size = 0;
 		val->len = val->cell_len;
-	} else if (raw) {
+	} else if (ovfl) {
+		val->cell_len = __wt_cell_pack_ovfl(
+		    &val->cell, WT_CELL_VALUE_OVFL, rle, value->size);
 		val->buf.data = value->data;
 		val->buf.size = value->size;
-		val->cell_len = 0;
-		val->len = val->buf.size;
+		val->len = val->cell_len + value->size;
 	} else
 		WT_RET(__rec_cell_build_val(
 		    session, value->data, value->size, rle));
@@ -1545,6 +1657,45 @@ __rec_col_var_helper(
 }
 
 /*
+ * __rec_onpage_ovfl --
+ *	Get the value of an on-page overflow item, entering the item into
+ * the page's tracking system if it isn't already there.
+ */
+static int
+__rec_onpage_ovfl(WT_SESSION_IMPL *session,
+    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *buf)
+{
+	int tracked;
+
+	/*
+	 * On-page overflow cells may be encountered repeatedly, and can be
+	 * re-used unless they've been discarded.  Even after a discard we may
+	 * (in the case of row-store page keys) need the original value in
+	 * order to re-create it; as the value can't be read from disk after
+	 * the blocks are discarded, the tracking system has to have a copy.
+	 *
+	 * Check in with the tracking system first: if found, we're done.
+	 */
+	WT_RET(__wt_rec_track_onpage_srch(
+	    session, page, unpack->data, unpack->size, &tracked, buf));
+	if (!tracked) {
+		/*
+		 * Not found: read the original (possibly Huffman encoded)
+		 * value from disk, and enter it into the tracking system.
+		 *
+		 * There are implications to this call: the overflow item is
+		 * discarded when reconciliation completes, if not subsequently
+		 * marked for re-use.
+		 */
+		WT_RET(__wt_ovfl_in(session, buf, unpack->data, unpack->size));
+		WT_RET(__wt_rec_track(
+		    session, page, unpack->data, unpack->size,
+		    buf->data, buf->size, WT_TRK_ONPAGE));
+	}
+	return (0);
+}
+
+/*
  * __rec_col_var --
  *	Reconcile a variable-width column-store leaf page.
  */
@@ -1552,18 +1703,21 @@ static int
 __rec_col_var(
     WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
 {
+	enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
 	WT_BTREE *btree;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
 	WT_COL *cip;
+	WT_DECL_ITEM(orig);
+	WT_DECL_RET;
 	WT_INSERT *ins;
 	WT_INSERT_HEAD *append;
-	WT_ITEM *last, orig;
+	WT_ITEM *last;
 	WT_RECONCILE *r;
 	WT_UPDATE *upd;
 	uint64_t n, nrepeat, repeat_count, rle, slvg_missing, src_recno;
 	uint32_t i, size;
-	int can_compare, deleted, last_deleted, orig_deleted, ret;
+	int deleted, last_deleted, orig_deleted, update_no_copy;
 	const void *data;
 
 	r = session->reconcile;
@@ -1571,7 +1725,7 @@ __rec_col_var(
 	last = r->last;
 	unpack = &_unpack;
 
-	WT_CLEAR(orig);
+	WT_RET(__wt_scr_alloc(session, 0, &orig));
 	data = NULL;
 	size = 0;
 
@@ -1608,109 +1762,102 @@ __rec_col_var(
 
 	/* For each entry in the in-memory page... */
 	rle = 0;
-	can_compare = deleted = last_deleted = 0;
+	deleted = last_deleted = 0;
 	WT_COL_FOREACH(page, cip, i) {
-		/*
-		 * Review the original cell, and get its repeat count and
-		 * insert list.
-		 */
-		cell = WT_COL_PTR(page, cip);
-		ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
-		if (cell == NULL) {
+		ovfl_state = OVFL_IGNORE;
+		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+			ins = NULL;
 			nrepeat = 1;
 			orig_deleted = 1;
 		} else {
 			__wt_cell_unpack(cell, unpack);
+			nrepeat = __wt_cell_rle(unpack);
+
+			ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+			while (ins != NULL && __wt_txn_read_skip(
+			    session, ins->upd, &r->upd_skipped) == NULL)
+				ins = WT_SKIP_NEXT(ins);
 
 			/*
-			 * The data may be Huffman encoded, which means we have
-			 * to decode it in order to compare it with the last
-			 * item we saw, which may have been an update string.
-			 * This code guarantees we find every single pair of
-			 * objects we can RLE encode, including the application
-			 * inserting an update to an existing record where it
-			 * happened (?) to match a Huffman-encoded value in the
-			 * previous or next record.   However, we try to avoid
-			 * copying in overflow records: if there's a WT_INSERT
-			 * entry inserting a new record into a reference counted
-			 * overflow record, then we have to write copies of the
-			 * overflow record, and we do the comparisons.  But, we
-			 * don't copy in the overflow record just to see if it
-			 * matches records on either side.
+			 * If the original value is "deleted", there's no value
+			 * to compare, we're done.
 			 */
-			if (unpack->ovfl && ins == NULL) {
-				/*
-				 * Write out any record we're tracking and turn
-				 * off comparisons for the next item.
-				 */
-				if (can_compare) {
-					WT_ERR(__rec_col_var_helper(
-					    session, salvage,
-					    last, last_deleted, 0, rle));
-					can_compare = 0;
-				}
+			orig_deleted = unpack->type == WT_CELL_DEL ? 1 : 0;
+			if (orig_deleted)
+				goto record_loop;
 
-				/* Write out the overflow cell as a raw cell. */
-				last->data = cell;
-				last->size = unpack->len;
-				WT_ERR(__rec_col_var_helper(
-				    session, salvage,
-				    last, 0, 1, __wt_cell_rle(unpack)));
-				src_recno += __wt_cell_rle(unpack);
-				continue;
+			/*
+			 * Overflow items are tricky: we don't know until we're
+			 * finished processing the set of values if we need the
+			 * overflow value or not.  If we don't use the overflow
+			 * item at all, we'll have to discard it (that's safe
+			 * because once the original value is unused during any
+			 * page reconciliation, it will never be needed again).
+			 *
+			 * Regardless, we avoid copying in overflow records: if
+			 * there's a WT_INSERT entry that modifies a reference
+			 * counted overflow record, we may have to write copies
+			 * of the overflow record, and in that case we'll do the
+			 * comparisons, but we don't read overflow items just to
+			 * see if they match records on either side.
+			 */
+			if (unpack->ovfl) {
+				ovfl_state = OVFL_UNUSED;
+				goto record_loop;
 			}
 
-			nrepeat = __wt_cell_rle(unpack);
-			orig_deleted = unpack->type == WT_CELL_DEL ? 1 : 0;
-
-			/* Get a copy of the cell. */
-			if (!orig_deleted)
-				WT_ERR(__wt_cell_unpack_copy(
-				    session, unpack, &orig));
+			/*
+			 * Check for the common case where the underlying value
+			 * is simple and avoid a copy.
+			 */
+			if (btree->huffman_value == NULL) {
+				orig->data = unpack->data;
+				orig->size = unpack->size;
+				goto record_loop;
+			}
 
 			/*
-			 * If we're re-writing a cell's reference of an overflow
-			 * value, free the underlying file space.
-			 *
-			 * !!!
-			 * We could optimize here by using the original overflow
-			 * information for some set of the column values.  (For
-			 * example, if column cells #10-17 reference overflow X,
-			 * and cell #12 is updated with a new record: we could
-			 * use the original overflow X for either cells #10-11
-			 * or cells #13-17.)  We don't do that, instead we write
-			 * new overflow records for both groups.  I'm skipping
-			 * that work because I don't want the complexity, and
-			 * overflow records should be rare.
+			 * The data is Huffman encoded, which means we have to
+			 * decode it in order to compare it with the last item
+			 * we saw, which may have been an update string.  This
+			 * guarantees we find every single pair of objects we
+			 * can RLE encode, including applications updating an
+			 * existing record where the new value happens (?) to
+			 * match a Huffman-encoded value in a previous or next
+			 * record.
 			 */
-			WT_ERR(__rec_track_cell(session, page, unpack));
+			WT_ERR(__wt_cell_unpack_copy(session, unpack, orig));
 		}
 
-		/*
+record_loop:	/*
 		 * Generate on-page entries: loop repeat records, looking for
 		 * WT_INSERT entries matching the record number.  The WT_INSERT
 		 * lists are in sorted order, so only need check the next one.
 		 */
 		for (n = 0;
 		    n < nrepeat; n += repeat_count, src_recno += repeat_count) {
-			if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
-				upd = ins->upd;
-				ins = WT_SKIP_NEXT(ins);
+			if (ins != NULL &&
+			    WT_INSERT_RECNO(ins) == src_recno) {
+				upd = __wt_txn_read_skip(
+				    session, ins->upd, &r->upd_skipped);
+				WT_ASSERT(session, upd != NULL);
+				do {
+					ins = WT_SKIP_NEXT(ins);
+				} while (ins != NULL &&
+				    __wt_txn_read_skip(session,
+				    ins->upd, &r->upd_skipped) == NULL);
+
+				update_no_copy = 1;	/* No data copy */
+
+				repeat_count = 1;
 
 				deleted = WT_UPDATE_DELETED_ISSET(upd);
 				if (!deleted) {
 					data = WT_UPDATE_DATA(upd);
 					size = upd->size;
 				}
-
-				repeat_count = 1;
 			} else {
-				upd = NULL;
-				deleted = orig_deleted;
-				if (!deleted) {
-					data = orig.data;
-					size = orig.size;
-				}
+				update_no_copy = 0;	/* Maybe data copy */
 
 				/*
 				 * The repeat count is the number of records up
@@ -1723,14 +1870,68 @@ __rec_col_var(
 				else
 					repeat_count =
 					    WT_INSERT_RECNO(ins) - src_recno;
+
+				deleted = orig_deleted;
+				if (deleted)
+					goto compare;
+
+				/*
+				 * If we are handling overflow items, use the
+				 * overflow item itself exactly once, after
+				 * which we have to copy it into a buffer and
+				 * from then on use a complete copy because we
+				 * are re-creating a new overflow record each
+				 * time.
+				 */
+				switch (ovfl_state) {
+				case OVFL_UNUSED:
+					/*
+					 * Original is an overflow item, as yet
+					 * unused -- use it now.
+					 *
+					 * Write out any record we're tracking.
+					 */
+					if (rle != 0) {
+						WT_ERR(__rec_col_var_helper(
+						    session, salvage, last,
+						    last_deleted, 0, rle));
+						rle = 0;
+					}
+
+					/* Write the overflow item. */
+					last->data = unpack->data;
+					last->size = unpack->size;
+					WT_ERR(__rec_col_var_helper(
+					    session, salvage,
+					    last, 0, 1, repeat_count));
+
+					ovfl_state = OVFL_USED;
+					continue;
+				case OVFL_USED:
+					/*
+					 * Original is an overflow item; we used
+					 * it once and now we need another
+					 * copy; read it into memory.
+					 */
+					WT_ERR(__wt_cell_unpack_copy(
+					    session, unpack, orig));
+
+					ovfl_state = OVFL_IGNORE;
+					/* FALLTHROUGH */
+				case OVFL_IGNORE:
+					/*
+					 * Original is an overflow item and we
+					 * were forced to copy it into memory,
+					 * or the original wasn't an overflow
+					 * item; use the data copied into orig.
+					 */
+					data = orig->data;
+					size = orig->size;
+					break;
+				}
 			}
 
-			/*
-			 * Handle RLE accounting and comparisons.
-			 *
-			 * If we don't have a record against which to compare,
-			 * save this record for the purpose and continue.
-			 *
+compare:		/*
 			 * If we have a record against which to compare, and
 			 * the records compare equal, increment the rle counter
 			 * and continue.  If the records don't compare equal,
@@ -1738,7 +1939,7 @@ __rec_col_var(
 			 * buffers: do NOT update the starting record number,
 			 * we've been doing that all along.
 			 */
-			if (can_compare) {
+			if (rle != 0) {
 				if ((deleted && last_deleted) ||
 				    (!last_deleted && !deleted &&
 				    last->size == size &&
@@ -1746,38 +1947,55 @@ __rec_col_var(
 					rle += repeat_count;
 					continue;
 				}
-
 				WT_ERR(__rec_col_var_helper(session,
 				    salvage, last, last_deleted, 0, rle));
 			}
 
 			/*
-			 * Swap the current/last state.  We can't always assign
-			 * the data values to the buffer because they may come
-			 * from a copy built based on an encoded cell.  Check,
-			 * because encoded cells aren't common and we'd like to
-			 * avoid the copy.
+			 * Swap the current/last state.
+			 *
+			 * Reset RLE counter and turn on comparisons.
 			 */
 			if (!deleted) {
-				if (data == orig.data)
-					WT_ERR(__wt_buf_set(
-					    session, last, data, size));
-				else {
+				/*
+				 * We can't simply assign the data values into
+				 * the last buffer because they may have come
+				 * from a copy built from an encoded/overflow
+				 * cell and creating the next record is going
+				 * to overwrite that memory.  Check, because
+				 * encoded/overflow cells aren't that common
+				 * and we'd like to avoid the copy.  If data
+				 * was taken from the current unpack structure
+				 * (which points into the page), or was taken
+				 * from an update structure, we can just use
+				 * the pointers, they're not moving.
+				 */
+				if (data == unpack->data || update_no_copy) {
 					last->data = data;
 					last->size = size;
-				}
+				} else
+					WT_ERR(__wt_buf_set(
+					    session, last, data, size));
 			}
 			last_deleted = deleted;
-
-			/* Reset RLE counter and turn on comparisons. */
 			rle = repeat_count;
-			can_compare = 1;
 		}
+
+		/*
+		 * If we had a reference to an overflow record we never used,
+		 * discard the underlying blocks, they're no longer useful.
+		 */
+		if (ovfl_state == OVFL_UNUSED)
+			 WT_ERR(__wt_rec_track_onpage_add(
+			     session, page, unpack->data, unpack->size));
 	}
 
 	/* Walk any append list. */
 	append = WT_COL_APPEND(page);
-	WT_SKIP_FOREACH(ins, append)
+	WT_SKIP_FOREACH(ins, append) {
+		upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped);
+		if (upd == NULL)
+			continue;
 		for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
 			/*
 			 * The application may have inserted records which left
@@ -1786,7 +2004,6 @@ __rec_col_var(
 			if (src_recno < n)
 				deleted = 1;
 			else {
-				upd = ins->upd;
 				deleted = WT_UPDATE_DELETED_ISSET(upd);
 				if (!deleted) {
 					data = WT_UPDATE_DATA(upd);
@@ -1798,7 +2015,7 @@ __rec_col_var(
 			 * Handle RLE accounting and comparisons -- see comment
 			 * above, this code fragment does the same thing.
 			 */
-			if (can_compare) {
+			if (rle != 0) {
 				if ((deleted && last_deleted) ||
 				    (!last_deleted && !deleted &&
 				    last->size == size &&
@@ -1806,7 +2023,6 @@ __rec_col_var(
 					++rle;
 					continue;
 				}
-
 				WT_ERR(__rec_col_var_helper(session,
 				    salvage, last, last_deleted, 0, rle));
 			}
@@ -1815,27 +2031,27 @@ __rec_col_var(
 			 * Swap the current/last state.  We always assign the
 			 * data values to the buffer because they can only be
 			 * the data from a WT_UPDATE structure.
+			 *
+			 * Reset RLE counter and turn on comparisons.
 			 */
 			if (!deleted) {
 				last->data = data;
 				last->size = size;
 			}
 			last_deleted = deleted;
-
-			/* Reset RLE counter and turn on comparisons. */
 			rle = 1;
-			can_compare = 1;
 		}
+	}
 
 	/* If we were tracking a record, write it. */
-	if (can_compare)
+	if (rle != 0)
 		WT_ERR(__rec_col_var_helper(
 		    session, salvage, last, last_deleted, 0, rle));
 
 	/* Write the remnant page. */
 	ret = __rec_split_finish(session);
 
-err:	__wt_buf_free(session, &orig);
+err:	__wt_scr_free(&orig);
 	return (ret);
 }
 
@@ -1846,23 +2062,30 @@ err:	__wt_buf_free(session, &orig);
 static int
 __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	WT_BTREE *btree;
 	WT_CELL *cell;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
 	WT_IKEY *ikey;
+	WT_DECL_ITEM(tmpkey);
 	WT_KV *key, *val;
 	WT_PAGE *rp;
 	WT_RECONCILE *r;
 	WT_REF *ref;
 	uint32_t i;
-	int onpage_ovfl, ovfl_key, val_set;
+	int found, onpage_ovfl, ovfl_key, val_set;
 
 	r = session->reconcile;
+	btree = session->btree;
 	unpack = &_unpack;
+
 	key = &r->k;
 	val = &r->v;
 
-	WT_RET(__rec_split_init(session,
-	    page, 0ULL, session->btree->maxintlpage));
+	WT_RET(__rec_split_init(session, page, 0ULL, btree->maxintlpage));
+
+	/* Temporary buffer in which to instantiate any uninstantiated keys. */
+	WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
 
 	/*
 	 * Ideally, we'd never store the 0th key on row-store internal pages
@@ -1890,21 +2113,21 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 		 * references one.
 		 */
 		ikey = ref->u.key;
-		if (ikey->cell_offset == 0) {
+		if (ikey->cell_offset == 0)
 			cell = NULL;
-			/*
-			 * We need to know if we're using on-page overflow cell
-			 * in a few places below, initialize the unpacked cell's
-			 * overflow value so there's an easy test.
-			 */
-			onpage_ovfl = 0;
-		} else {
+		else {
 			cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
 			__wt_cell_unpack(cell, unpack);
-			onpage_ovfl = unpack->ovfl;
 		}
 
 		/*
+		 * We need to know if we're using an on-page overflow key cell
+		 * in a few places below; initialize a flag from the unpacked
+		 * cell's overflow value so there's an easy test.
+		 */
+		onpage_ovfl = cell != NULL && unpack->ovfl == 1 ? 1 : 0;
+
+		/*
 		 * The page may be deleted or internally created during a split.
 		 * Deleted/split pages are merged into the parent and discarded.
 		 *
@@ -1945,11 +2168,14 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 			case WT_PM_REC_EMPTY:
 				/*
 				 * Overflow keys referencing discarded pages are
-				 * no longer useful.
+				 * no longer useful.  We can't just discard them
+				 * though: if the page is re-filled, they may be
+				 * necessary for a subsequent reconciliation,
+				 * enter them into the tracking system.
 				 */
 				if (onpage_ovfl)
-					WT_RET(__rec_track_cell(
-					    session, page, unpack));
+					WT_ERR(__rec_onpage_ovfl(
+					    session, page, unpack, tmpkey));
 				continue;
 			case WT_PM_REC_REPLACE:
 				__rec_cell_build_addr(session,
@@ -1962,14 +2188,18 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 				/*
 				 * Overflow keys referencing split pages are no
 				 * no longer useful (the interesting key is the
-				 * key for the split page).
+				 * key for the split page).  We can't just
+				 * discard them, though: if the page shrinks,
+				 * they may be necessary for a subsequent
+				 * reconciliation, enter them into the tracking
+				 * system.
 				 */
 				if (onpage_ovfl)
-					WT_RET(__rec_track_cell(
-					    session, page, unpack));
+					WT_ERR(__rec_onpage_ovfl(
+					    session, page, unpack, tmpkey));
 
 				r->merge_ref = ref;
-				WT_RET(__rec_row_merge(session,
+				WT_ERR(__rec_row_merge(session,
 				    F_ISSET(rp->modify, WT_PM_REC_MASK) ==
 				    WT_PM_REC_SPLIT_MERGE ?
 				    rp : rp->modify->u.split));
@@ -1980,29 +2210,65 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 		/*
 		 * Build key cell.
 		 *
-		 * If the key is an overflow item, assume prefix compression
-		 * won't make things better, and simply copy it.
+		 * If the key is an overflow item, check to see if it's been
+		 * entered into the tracking system (if an overflow key were
+		 * to reference an empty page during a previous reconciliation,
+		 * its blocks would have been discarded, and the only copy that
+		 * remains is in the tracking system).  If we don't find it in
+		 * the tracking system, assume prefix compression won't make
+		 * things better, and simply copy the key from the disk image.
 		 *
-		 * XXX
 		 * We have the key in-hand (we instantiate all internal page
 		 * keys when the page is brought into memory), so it would be
 		 * easy to check prefix compression, I'm just not bothering.
 		 * If we did gain by prefix compression, we'd have to discard
-		 * the old overflow key and write a new one, and this isn't a
-		 * likely path anyway.
+		 * the old overflow key and write a new one to make it worth
+		 * doing, and this isn't a likely path anyway.
 		 *
 		 * Truncate any 0th key, internal pages don't need 0th keys.
 		 */
 		if (onpage_ovfl) {
-			key->buf.data = cell;
-			key->buf.size = unpack->len;
-			key->cell_len = 0;
-			key->len = key->buf.size;
-			ovfl_key = 1;
+			WT_ERR(__wt_rec_track_onpage_srch(session,
+			    page, unpack->data, unpack->size, &found, tmpkey));
+			if (found) {
+				/*
+				 * If the key is Huffman encoded, decode it and
+				 * build a new key cell, which re-encodes the
+				 * key, wasting some work: this isn't a likely
+				 * path, a deleted key we then re-instantiate,
+				 * it's not worth handling Huffman encoded
+				 * keys separately to avoid the additional work,
+				 * we still have to write the key which is more
+				 * time than anything else.
+				 */
+				if (btree->huffman_key != NULL)
+					WT_ERR(__wt_huffman_decode(session,
+					    btree->huffman_key,
+					    tmpkey->data, tmpkey->size,
+					    tmpkey));
+
+				WT_ERR(__rec_cell_build_key(session,
+				    tmpkey->data,
+				    r->cell_zero ? 1 : tmpkey->size,
+				    1, &ovfl_key));
+
+				/*
+				 * Clear the on-page overflow key flag: we've
+				 * built a real key, we're not copying from a
+				 * page.
+				 */
+				onpage_ovfl = 0;
+			} else {
+				key->buf.data = cell;
+				key->buf.size = unpack->len;
+				key->cell_len = 0;
+				key->len = unpack->len;
+				ovfl_key = 1;
+			}
 		} else
-			WT_RET(__rec_cell_build_key(session,
-			    WT_IKEY_DATA(ikey),
-			    r->cell_zero ? 1 : ikey->size, 1, &ovfl_key));
+			WT_ERR(__rec_cell_build_key(session,
+			    WT_IKEY_DATA(ikey), r->cell_zero ? 1 : ikey->size,
+			    1, &ovfl_key));
 		r->cell_zero = 0;
 
 		/*
@@ -2035,16 +2301,18 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 		 */
 		while (key->len + val->len > r->space_avail) {
 			/*
-			 * We have to have a copy of any overflow key because
-			 * we're about to promote it.
+			 * In one path above, we copied the key from the page
+			 * rather than building the actual key.  In that case,
+			 * we have to build the actual key now because we are
+			 * about to promote it.
 			 */
-			if (ovfl_key && onpage_ovfl)
-				WT_RET(__wt_cell_copy(session, cell, r->cur));
-			WT_RET(__rec_split(session));
+			if (onpage_ovfl)
+				WT_ERR(__wt_cell_copy(session, cell, r->cur));
+			WT_ERR(__rec_split(session));
 
 			r->key_pfx_compress = 0;
 			if (!ovfl_key)
-				WT_RET(__rec_cell_build_key(
+				WT_ERR(__rec_cell_build_key(
 				    session, NULL, 0, 1, &ovfl_key));
 		}
 
@@ -2057,7 +2325,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
 	}
 
 	/* Write the remnant page. */
-	return (__rec_split_finish(session));
+	ret = __rec_split_finish(session);
+
+err:	__wt_scr_free(&tmpkey);
+	return (ret);
 }
 
 /*
@@ -2190,23 +2461,23 @@ __rec_row_leaf(
 	WT_BTREE *btree;
 	WT_CELL *cell, *val_cell;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_ITEM(tmpkey);
+	WT_DECL_RET;
 	WT_IKEY *ikey;
 	WT_INSERT *ins;
-	WT_ITEM *tmpkey;
 	WT_KV *key, *val;
 	WT_RECONCILE *r;
 	WT_ROW *rip;
 	WT_UPDATE *upd;
 	uint64_t slvg_skip;
 	uint32_t i;
-	int ovfl_key, ret;
+	int found, onpage_ovfl, ovfl_key;
+	void *ripkey;
 
 	r = session->reconcile;
 	btree = session->btree;
-	tmpkey = NULL;
 	unpack = &_unpack;
 	slvg_skip = salvage == NULL ? 0 : salvage->skip;
-	ret = 0;
 
 	key = &r->k;
 	val = &r->v;
@@ -2220,11 +2491,7 @@ __rec_row_leaf(
 	if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
 		WT_RET(__rec_row_leaf_insert(session, ins));
 
-	/*
-	 * A temporary buffer in which to instantiate any uninstantiated keys.
-	 * From this point on, we need to jump to the err label on error so the
-	 * buffer is discarded.
-	 */
+	/* Temporary buffer in which to instantiate any uninstantiated keys. */
 	WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
 
 	/* For each entry in the page... */
@@ -2248,26 +2515,30 @@ __rec_row_leaf(
 		 * Set the WT_IKEY reference (if the key was instantiated), and
 		 * the key cell reference.
 		 */
-		if (__wt_off_page(page, rip->key)) {
-			ikey = rip->key;
+		ripkey = WT_ROW_KEY_COPY(rip);
+		if (__wt_off_page(page, ripkey)) {
+			ikey = ripkey;
 			cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
 		} else {
 			ikey = NULL;
-			cell = rip->key;
+			cell = ripkey;
 		}
 
 		/* Build value cell. */
 		if ((val_cell = __wt_row_value(page, rip)) != NULL)
 			__wt_cell_unpack(val_cell, unpack);
-		if ((upd = WT_ROW_UPDATE(page, rip)) == NULL) {
+		upd = __wt_txn_read_skip(
+		    session, WT_ROW_UPDATE(page, rip), &r->upd_skipped);
+		if (upd == NULL) {
 			/*
 			 * Copy the item off the page -- however, when the page
 			 * was read into memory, there may not have been a value
 			 * item, that is, it may have been zero length.
 			 */
-			if (val_cell == NULL)
+			if (val_cell == NULL) {
+				val->buf.data = NULL;
 				val->buf.size = 0;
-			else {
+			} else {
 				val->buf.data = val_cell;
 				val->buf.size = unpack->len;
 			}
@@ -2275,27 +2546,39 @@ __rec_row_leaf(
 			val->len = val->buf.size;
 		} else {
 			/*
-			 * If we updated an overflow value, free the underlying
-			 * file space.
+			 * If the original value was an overflow and we've not
+			 * already done so, discard it.   We don't save a copy
+			 * of the overflow value in case it is re-used -- we'd
+			 * have to read it to get a copy, and that implies disk
+			 * I/O for little reason.
 			 */
-			if (val_cell != NULL)
-				WT_ERR(__rec_track_cell(session, page, unpack));
+			if (val_cell != NULL && unpack->ovfl)
+				WT_ERR(__wt_rec_track_onpage_add(
+				    session, page, unpack->data, unpack->size));
 
-			/*
-			 * If this key/value pair was deleted, we're done.  If
-			 * we deleted an overflow key, free the underlying file
-			 * space.
-			 */
+			/* If this key/value pair was deleted, we're done. */
 			if (WT_UPDATE_DELETED_ISSET(upd)) {
+				/*
+				 * Overflow keys referencing discarded values
+				 * are no longer useful.  We can't just discard
+				 * overflow keys as we did overflow values: if
+				 * the value gets replaced, we'll need the key
+				 * again for a subsequent reconciliation.  Add
+				 * the key to the tracking system.
+				 */
 				__wt_cell_unpack(cell, unpack);
-				WT_ERR(__rec_track_cell(session, page, unpack));
+				if (unpack->ovfl)
+					WT_ERR(__rec_onpage_ovfl(
+					    session, page, unpack, tmpkey));
 
 				/*
-				 * We skip creating the key, don't try to use
-				 * the last valid key in prefix calculations.
+				 * We aren't actually creating the key so we
+				 * can't use bytes from this key to provide
+				 * prefix information for a subsequent key.
 				 */
 				tmpkey->size = 0;
 
+				/* Proceed with appended key/value pairs. */
 				goto leaf_insert;
 			}
 
@@ -2314,28 +2597,66 @@ __rec_row_leaf(
 
 		/*
 		 * Build key cell.
+		 *
+		 * If the key is an overflow item, check to see if it's been
+		 * entered into the tracking system (if an overflow key had
+		 * referenced a deleted value during a previous reconciliation,
+		 * its blocks would have been discarded, and the only copy that
+		 * remains is in the tracking system).  If we don't find it in
+		 * the tracking system, assume prefix compression won't make
+		 * things better, and simply copy the key from the disk image.
 		 */
 		__wt_cell_unpack(cell, unpack);
-		if (unpack->type == WT_CELL_KEY_OVFL) {
-			/*
-			 * If the key is an overflow item, assume prefix
-			 * compression won't make things better, and copy it.
-			 */
-			key->buf.data = cell;
-			key->buf.size = unpack->len;
-			key->cell_len = 0;
-			key->len = key->buf.size;
-			ovfl_key = 1;
-
-			/* Don't try to use a prefix across an overflow key. */
-			tmpkey->size = 0;
+		onpage_ovfl = unpack->ovfl;
+		if (onpage_ovfl) {
+			WT_ERR(__wt_rec_track_onpage_srch(session,
+			    page, unpack->data, unpack->size, &found, tmpkey));
+			if (found) {
+				/*
+				 * If the key is Huffman encoded, decode it and
+				 * build a new key cell, which re-encodes the
+				 * key, wasting some work: this isn't a likely
+				 * path, a deleted key we then re-instantiate,
+				 * it's not worth handling Huffman encoded
+				 * keys separately to avoid the additional work,
+				 * we still have to write the key which is more
+				 * time than anything else.
+				 */
+				if (btree->huffman_key != NULL)
+					WT_ERR(__wt_huffman_decode(session,
+					    btree->huffman_key,
+					    tmpkey->data, tmpkey->size,
+					    tmpkey));
+
+				WT_ERR(__rec_cell_build_key(session,
+				    tmpkey->data, tmpkey->size, 0, &ovfl_key));
+
+				/*
+				 * Clear the on-page overflow key flag: we've
+				 * built a real key, we're not copying from a
+				 * page.
+				 */
+				onpage_ovfl = 0;
+			} else {
+				key->buf.data = cell;
+				key->buf.size = unpack->len;
+				key->cell_len = 0;
+				key->len = unpack->len;
+				ovfl_key = 1;
+
+				/*
+				 * We aren't actually creating the key so we
+				 * can't use bytes from this key to provide
+				 * prefix information for a subsequent key.
+				 */
+				tmpkey->size = 0;
+			}
 		} else {
 			/*
-			 * If the key is already instantiated, use it.
-			 * Else, if the key is available from the page, use it.
-			 * Else, if we can construct the key from a previous
-			 *	key, do so.
-			 * Else, instantiate the key.
+			 * Use an already instantiated key, or
+			 * Use the key from the disk image, or
+			 * Build a key from a previous key, or
+			 * Instantiate the key from scratch.
 			 */
 			if (ikey != NULL) {
 				tmpkey->data = WT_IKEY_DATA(ikey);
@@ -2349,19 +2670,25 @@ __rec_row_leaf(
 			    unpack->type == WT_CELL_KEY &&
 			    tmpkey->size >= unpack->prefix) {
 				/*
+				 * The previous clause checked for a prefix of
+				 * zero, which means the temporary buffer must
+				 * have a non-zero size, and it references a
+				 * valid key.
+				 */
+				WT_ASSERT(session, tmpkey->size != 0);
+
+				/*
 				 * If we previously built a prefix-compressed
 				 * key in the temporary buffer, WT_ITEM->data
 				 * will be the same as WT_ITEM->mem: grow the
-				 * buffer if necessary and copy the suffix into
-				 * place.
+				 * buffer and copy the suffix into place.
 				 *
 				 * If we previously pointed the temporary buffer
-				 * at an on-page key, WT_ITEM->data will not be
-				 * the same as WT_ITEM->mem: grow the buffer if
-				 * necessary, copy the prefix into place, then
-				 * re-point the WT_ITEM->data field to the newly
-				 * constructed memory, and then copy the suffix
-				 * into place.
+				 * at an in-memory or on-page key, WT_ITEM->data
+				 * will not be the same as WT_ITEM->mem: grow
+				 * the buffer, copy the prefix into place, reset
+				 * the data field to point to the buffer memory,
+				 * then copy the suffix into place.
 				 */
 				WT_ERR(__wt_buf_grow(session,
 				    tmpkey, unpack->prefix + unpack->size));
@@ -2393,11 +2720,13 @@ __rec_row_leaf(
 		while (key->len +
 		    val->len + WT_TRAILING_KEY_CELL > r->space_avail) {
 			/*
-			 * We have to have a copy of any overflow key because
-			 * we're about to promote it.
+			 * In one path above, we copied the key from the page
+			 * rather than building the actual key.  In that case,
+			 * we have to build the actual key now because we are
+			 * about to promote it.
 			 */
-			if (ovfl_key && unpack->type == WT_CELL_KEY_OVFL)
-				WT_RET(__wt_cell_unpack_copy(
+			if (onpage_ovfl)
+				WT_ERR(__wt_cell_unpack_copy(
 				    session, unpack, r->cur));
 			WT_ERR(__rec_split(session));
 
@@ -2444,8 +2773,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_INSERT *ins)
 	val = &r->v;
 
 	for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
-		upd = ins->upd;				/* Build value cell. */
-		if (WT_UPDATE_DELETED_ISSET(upd))
+		/* Build value cell. */
+		upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped);
+		if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
 			continue;
 		if (upd->size == 0)
 			val->len = 0;
@@ -2488,23 +2818,82 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_INSERT *ins)
 }
 
 /*
+ * __rec_split_discard --
+ *	Discard the pages resulting from a previous split.
+ */
+static int
+__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_PAGE_MODIFY *mod;
+	WT_REF *ref;
+	uint32_t i;
+
+	/*
+	 * A page that split is being reconciled for the second, or subsequent
+	 * time; discard the underlying block space or overflow items used
+	 * in the previous reconciliation.
+	 *
+	 * This routine would be trivial, and only walk a single page freeing
+	 * any blocks that were written to support the split -- the problem is
+	 * root splits.  In the case of root splits, we potentially have to
+	 * cope with the underlying blocks of multiple pages, but also there
+	 * may be overflow items that we have to resolve.
+	 *
+	 * These pages are discarded -- add them to the object tracking list.
+	 */
+	WT_REF_FOREACH(page, ref, i)
+		WT_RET(__wt_rec_track(session, page,
+		    ((WT_ADDR *)ref->addr)->addr,
+		    ((WT_ADDR *)ref->addr)->size, NULL, 0, 0));
+	WT_RET(__wt_rec_track_wrapup(session, page));
+
+	if ((mod = page->modify) != NULL)
+		switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+		case WT_PM_REC_SPLIT_MERGE:
+			/*
+			 * NOT root page split: this is the split merge page for
+			 * a normal page split, and we don't need to do anything
+			 * further.
+			 */
+			break;
+		case WT_PM_REC_SPLIT:
+			/*
+			 * Root page split: continue walking the list of split
+			 * pages, cleaning up as we go.
+			 */
+			WT_RET(__rec_split_discard(session, mod->u.split));
+			break;
+		case WT_PM_REC_REPLACE:
+			/*
+			 * Root page split: the last entry on the list.  There
+			 * won't be a page to discard because writing the page
+			 * created a snapshot, not a replacement page.
+			 */
+			WT_ASSERT(session, mod->u.replace.addr == NULL);
+			break;
+		WT_ILLEGAL_VALUE(session);
+		}
+	return (0);
+}
+
+/*
  * __rec_write_wrapup  --
  *	Finish the reconciliation.
  */
 static int
 __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	WT_BTREE *btree;
 	WT_BOUNDARY *bnd;
+	WT_DECL_RET;
 	WT_PAGE_MODIFY *mod;
 	WT_RECONCILE *r;
-	WT_REF *ref;
 	uint32_t i, size;
-	int ret;
 	const uint8_t *addr;
 
 	r = session->reconcile;
+	btree = session->btree;
 	mod = page->modify;
-	ret = 0;
 
 	/*
 	 * This page may have previously been reconciled, and that information
@@ -2515,22 +2904,30 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
 	case 0:	/*
 		 * The page has never been reconciled before, track the original
-		 * address blocks (if any).   If we're splitting the root page
-		 * we may schedule the same blocks to be freed repeatedly: that
-		 * is OK, the track function checks for duplicates.
+		 * address blocks (if any).
+		 *
+		 * The exception is root pages are never tracked or free'd, they
+		 * are snapshots, and must be explicitly dropped.
 		 */
 		if (!WT_PAGE_IS_ROOT(page) && page->ref->addr != NULL) {
 			__wt_get_addr(page->parent, page->ref, &addr, &size);
-			WT_RET(__wt_rec_track_block(
-			    session, WT_PT_BLOCK, page, addr, size));
+			WT_RET(__wt_rec_track_onpage_add(
+			    session, page, addr, size));
 		}
 		break;
 	case WT_PM_REC_EMPTY:				/* Page deleted */
 		break;
-	case WT_PM_REC_REPLACE:			/* 1-for-1 page swap */
-		/* Discard the replacement leaf page's blocks. */
-		WT_RET(__wt_rec_track_block(session, WT_PT_BLOCK,
-		    page, mod->u.replace.addr, mod->u.replace.size));
+	case WT_PM_REC_REPLACE:				/* 1-for-1 page swap */
+		/*
+		 * Discard the replacement leaf page's blocks.
+		 *
+		 * The exception is root pages are never tracked or free'd, they
+		 * are snapshots, and must be explicitly dropped.
+		 */
+		if (!WT_PAGE_IS_ROOT(page))
+			WT_RET(__wt_rec_track(session, page,
+			    mod->u.replace.addr, mod->u.replace.size,
+			    NULL, 0, 0));
 
 		/* Discard the replacement page's address. */
 		__wt_free(session, mod->u.replace.addr);
@@ -2538,15 +2935,9 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 		mod->u.replace.size = 0;
 		break;
 	case WT_PM_REC_SPLIT:				/* Page split */
-		/* Discard the split page's leaf-page blocks. */
-		WT_REF_FOREACH(mod->u.split, ref, i)
-			WT_RET(__wt_rec_track_block(
-			    session, WT_PT_BLOCK, page,
-			    ((WT_ADDR *)ref->addr)->addr,
-			    ((WT_ADDR *)ref->addr)->size));
-
-		/* Discard the split page itself. */
-		__wt_page_out(session, mod->u.split, 0);
+		/* Discard the split page. */
+		WT_RET(__rec_split_discard(session, mod->u.split));
+		__wt_page_out(session, &mod->u.split, 0);
 		mod->u.split = NULL;
 		break;
 	case WT_PM_REC_SPLIT_MERGE:			/* Page split */
@@ -2560,12 +2951,24 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 	}
 	F_CLR(mod, WT_PM_REC_MASK);
 
+	/*
+	 * Wrap up discarded block and overflow tracking.  If we are about to
+	 * create a snapshot, the system must be entirely consistent at that
+	 * point: the underlying block manager is presumably going to take some
+	 * action to resolve the list of allocated/free/whatever blocks that
+	 * are associated with the snapshot.
+	 */
+	WT_RET(__wt_rec_track_wrapup(session, page));
+
 	switch (r->bnd_next) {
 	case 0:						/* Page delete */
-		WT_VERBOSE(session, reconcile, "page %p empty", page);
-
+		WT_VERBOSE_RET(session, reconcile, "page %p empty", page);
 		WT_BSTAT_INCR(session, rec_page_delete);
 
+		/* If this is the root page, we need to create a sync point. */
+		if (WT_PAGE_IS_ROOT(page))
+			WT_RET(__wt_bm_snapshot(session, NULL, btree->snap));
+
 		/*
 		 * If the page was empty, we want to discard it from the tree
 		 * by discarding the parent's key when evicting the parent.
@@ -2580,25 +2983,23 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 		 * Because WiredTiger's pages grow without splitting, we're
 		 * replacing a single page with another single page most of
 		 * the time.
+		 *
+		 * If this is a root page, then we don't have an address and we
+		 * have to create a sync point.  The address was cleared when
+		 * we were about to write the buffer so we know what to do here.
 		 */
 		bnd = &r->bnd[0];
-#ifdef HAVE_VERBOSE
-		if (WT_VERBOSE_ISSET(session, reconcile)) {
-			WT_ITEM *buf;
-			WT_RET(__wt_scr_alloc(session, 64, &buf));
-			WT_VERBOSE(session, reconcile, "page %p written to %s",
-			    page, __wt_addr_string(
-			    session, buf, bnd->addr.addr, bnd->addr.size));
-			__wt_scr_free(&buf);
+		if (bnd->addr.addr == NULL)
+			WT_RET(__wt_bm_snapshot(session, &r->dsk, btree->snap));
+		else {
+			mod->u.replace = bnd->addr;
+			bnd->addr.addr = NULL;
 		}
-#endif
-		mod->u.replace = bnd->addr;
-		bnd->addr.addr = NULL;
 
 		F_SET(mod, WT_PM_REC_REPLACE);
 		break;
 	default:					/* Page split */
-		WT_VERBOSE(session, reconcile,
+		WT_VERBOSE_RET(session, reconcile,
 		    "page %p split into %" PRIu32 " pages",
 		    page, r->bnd_next);
 
@@ -2617,7 +3018,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 #ifdef HAVE_VERBOSE
 		if (WT_VERBOSE_ISSET(session, reconcile)) {
-			WT_ITEM *tkey;
+			WT_DECL_ITEM(tkey);
 			if (page->type == WT_PAGE_ROW_INT ||
 			    page->type == WT_PAGE_ROW_LEAF)
 				WT_RET(__wt_scr_alloc(session, 0, &tkey));
@@ -2628,7 +3029,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 					WT_ERR(__wt_buf_set_printable(
 					    session, tkey,
 					    bnd->key.data, bnd->key.size));
-					WT_VERBOSE(session, reconcile,
+					WT_VERBOSE_ERR(session, reconcile,
 					    "split: starting key "
 					    "%.*s",
 					    (int)tkey->size,
@@ -2637,15 +3038,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
 				case WT_PAGE_COL_FIX:
 				case WT_PAGE_COL_INT:
 				case WT_PAGE_COL_VAR:
-					WT_VERBOSE(session, reconcile,
+					WT_VERBOSE_ERR(session, reconcile,
 					    "split: starting recno %" PRIu64,
 					    bnd->recno);
 					break;
-				WT_ILLEGAL_VALUE(session);
+				WT_ILLEGAL_VALUE_ERR(session);
 				}
-err:			if (page->type == WT_PAGE_ROW_INT ||
-			    page->type == WT_PAGE_ROW_LEAF)
-				__wt_scr_free(&tkey);
+err:			__wt_scr_free(&tkey);
+			WT_RET(ret);
 		}
 #endif
 		switch (page->type) {
@@ -2665,6 +3065,42 @@ err:			if (page->type == WT_PAGE_ROW_INT ||
 		break;
 	}
 
+	/*
+	 * If the write succeeded, no updates were skipped and the disk
+	 * generation has not changed in the meantime, set the disk generation
+	 * to the write generation recorded when reconciliation started.
+	 */
+	if (!r->upd_skipped)
+		(void)WT_ATOMIC_CAS(
+		    mod->disk_gen, r->orig_disk_gen, r->orig_write_gen);
+
+	return (0);
+}
+
+/*
+ * __rec_write_wrapup_err --
+ *	Finish the reconciliation on error.
+ */
+static int
+__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_BOUNDARY *bnd;
+	WT_DECL_RET;
+	WT_RECONCILE *r;
+	uint32_t i;
+
+	r = session->reconcile;
+
+	/*
+	 * On error, discard pages we've written: they're unreferenced by the
+	 * tree.  This is not a question of correctness; we're avoiding block
+	 * leaks.
+	 */
+	WT_TRET(__wt_rec_track_wrapup_err(session, page));
+	for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+		if (bnd->addr.addr != NULL)
+			WT_TRET(__wt_bm_free(
+			    session, bnd->addr.addr, bnd->addr.size));
 	return (ret);
 }
 
@@ -2676,14 +3112,13 @@ static int
 __rec_split_row(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_PAGE **splitp)
 {
 	WT_BOUNDARY *bnd;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_RECONCILE *r;
 	WT_REF *ref;
 	uint32_t i;
-	int ret;
 
 	r = session->reconcile;
-	ret = 0;
 
 	/* Allocate a row-store internal page. */
 	WT_RET(__wt_calloc_def(session, 1, &page));
@@ -2722,7 +3157,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_PAGE **splitp)
 	for (ref = page->u.intl.t,
 	    bnd = r->bnd, i = 0; i < r->bnd_next; ++ref, ++bnd, ++i) {
 		WT_ERR(__wt_row_ikey_alloc(session, 0,
-		    bnd->key.data, bnd->key.size, (WT_IKEY **)&ref->u.key));
+		    bnd->key.data, bnd->key.size, &ref->u.key));
 		WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &ref->addr));
 		((WT_ADDR *)ref->addr)->addr = bnd->addr.addr;
 		((WT_ADDR *)ref->addr)->size = bnd->addr.size;
@@ -2735,7 +3170,7 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_PAGE **splitp)
 	*splitp = page;
 	return (0);
 
-err:	__wt_page_out(session, page, 0);
+err:	__wt_page_out(session, &page, 0);
 	return (ret);
 }
 
@@ -2747,14 +3182,13 @@ static int
 __rec_split_col(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_PAGE **splitp)
 {
 	WT_BOUNDARY *bnd;
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_RECONCILE *r;
 	WT_REF *ref;
 	uint32_t i;
-	int ret;
 
 	r = session->reconcile;
-	ret = 0;
 
 	/* Allocate a column-store internal page. */
 	WT_RET(__wt_calloc_def(session, 1, &page));
@@ -2790,7 +3224,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_PAGE **splitp)
 	*splitp = page;
 	return (0);
 
-err:	__wt_page_out(session, page, 0);
+err:	__wt_page_out(session, &page, 0);
 	return (ret);
 }
 
@@ -2863,13 +3297,13 @@ __rec_cell_build_key(WT_SESSION_IMPL *session,
 	/* Create an overflow object if the data won't fit. */
 	if (key->buf.size >
 	    (is_internal ? btree->maxintlitem : btree->maxleafitem)) {
-		WT_BSTAT_INCR(session, rec_ovfl_key);
-
 		/*
 		 * Overflow objects aren't prefix compressed -- rebuild any
 		 * object that was prefix compressed.
 		 */
 		if (pfx == 0) {
+			WT_BSTAT_INCR(session, rec_ovfl_key);
+
 			*is_ovflp = 1;
 			return (__rec_cell_build_ovfl(
 			    session, key, WT_CELL_KEY_OVFL, (uint64_t)0));
@@ -2973,26 +3407,26 @@ __rec_cell_build_ovfl(
     WT_SESSION_IMPL *session, WT_KV *kv, uint8_t type, uint64_t rle)
 {
 	WT_BTREE *btree;
-	WT_ITEM *tmp;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_PAGE_HEADER *dsk;
 	WT_RECONCILE *r;
 	uint32_t size;
-	int ret;
-	uint8_t *addr, buf[WT_BM_MAX_ADDR_COOKIE];
+	int found;
+	uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE];
 
 	r = session->reconcile;
 	btree = session->btree;
 	page = r->page;
-	tmp = NULL;
-	ret = 0;
 
 	/*
 	 * See if this overflow record has already been written and reuse it if
 	 * possible.  Else, write a new overflow record.
 	 */
-	if (!__wt_rec_track_ovfl_reuse(
-	    session, page, kv->buf.data, kv->buf.size, &addr, &size)) {
+	WT_RET(__wt_rec_track_ovfl_reuse(
+	    session, page, kv->buf.data, kv->buf.size, &addr, &size, &found));
+	if (!found) {
 		/* Allocate a buffer big enough to write the overflow record. */
 		size = kv->buf.size;
 		WT_RET(__wt_bm_write_size(session, &size));
@@ -3012,8 +3446,8 @@ __rec_cell_build_ovfl(
 		WT_ERR(__wt_bm_write(session, tmp, addr, &size));
 
 		/* Track the overflow record. */
-		WT_ERR(__wt_rec_track_ovfl(
-		    session, page, addr, size, kv->buf.data, kv->buf.size));
+		WT_ERR(__wt_rec_track(session, page,
+		    addr, size, kv->buf.data, kv->buf.size, WT_TRK_INUSE));
 	}
 
 	/* Set the callers K/V to reference the overflow record's address. */
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index fb4458d083d..776bb309ba3 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -17,13 +17,12 @@ int
 __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
 	WT_BTREE *btree;
-	WT_ITEM *tmp;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_ROW *rip;
 	uint32_t i;
-	int ret;
 
 	btree = session->btree;
-	ret = 0;
 
 	if (page->entries == 0) {			/* Just checking... */
 		F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
@@ -108,22 +107,17 @@ __wt_row_key(
 {
 	enum { FORWARD, BACKWARD } direction;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	WT_IKEY *ikey;
-	WT_ITEM *tmp;
 	WT_ROW *rip;
-	int is_local, ret, slot_offset;
+	int is_local, slot_offset;
 	void *key;
 
 	rip = rip_arg;
-	tmp = NULL;
 	unpack = &_unpack;
 
-	/*
-	 * If the caller didn't pass us a buffer, create one.  We don't use
-	 * an existing buffer because the memory will be attached to a page
-	 * for semi-permanent use, and using an existing buffer might waste
-	 * memory if the one allocated from the pool was larger than needed.
-	 */
+	/* If the caller didn't pass us a buffer, create one. */
 	is_local = 0;
 	if (retb == NULL) {
 		is_local = 1;
@@ -132,13 +126,7 @@ __wt_row_key(
 
 	direction = BACKWARD;
 	for (slot_offset = 0;;) {
-		/*
-		 * Multiple threads of control may be searching this page, which
-		 * means the key may change underfoot, and here's where it gets
-		 * tricky: first, copy the key.  We don't need any barriers, the
-		 * key is updated atomically, and we just need a valid copy.
-		 */
-		key = rip->key;
+		key = WT_ROW_KEY_COPY(rip);
 
 		/*
 		 * Key copied.
@@ -318,9 +306,9 @@ next:		switch (direction) {
 	 * Allocate and initialize a WT_IKEY structure, we're instantiating
 	 * this key.
 	 */
+	key = WT_ROW_KEY_COPY(rip_arg);
 	WT_ERR(__wt_row_ikey_alloc(session,
-	    WT_PAGE_DISK_OFFSET(page, rip_arg->key),
-	    retb->data, retb->size, &ikey));
+	    WT_PAGE_DISK_OFFSET(page, key), retb->data, retb->size, &ikey));
 
 	/* Serialize the swap of the key into place. */
 	ret = __wt_row_key_serial(session, page, rip_arg, ikey);
@@ -329,7 +317,8 @@ next:		switch (direction) {
 	 * Free the WT_IKEY structure if the serialized call didn't use it for
 	 * the key.
 	 */
-	if (rip_arg->key != ikey)
+	key = WT_ROW_KEY_COPY(rip_arg);
+	if (key != ikey)
 		__wt_free(session, ikey);
 
 	__wt_scr_free(&retb);
@@ -355,13 +344,7 @@ __wt_row_value(WT_PAGE *page, WT_ROW *rip)
 
 	unpack = &_unpack;
 
-	/*
-	 * Multiple threads of control may be searching this page, which means
-	 * the key may change underfoot, and here's where it gets tricky: first,
-	 * copy the key.
-	 */
-	cell = rip->key;
-
+	cell = WT_ROW_KEY_COPY(rip);
 	/*
 	 * Key copied.
 	 *
@@ -393,7 +376,7 @@ __wt_row_value(WT_PAGE *page, WT_ROW *rip)
  */
 int
 __wt_row_ikey_alloc(WT_SESSION_IMPL *session,
-    uint32_t cell_offset, const void *key, uint32_t size, WT_IKEY **ikeyp)
+    uint32_t cell_offset, const void *key, uint32_t size, void *ikeyp)
 {
 	WT_IKEY *ikey;
 
@@ -406,7 +389,7 @@ __wt_row_ikey_alloc(WT_SESSION_IMPL *session,
 	ikey->cell_offset = cell_offset;
 	memcpy(WT_IKEY_DATA(ikey), key, size);
 
-	*ikeyp = ikey;
+	*(WT_IKEY **)ikeyp = ikey;
 	return (0);
 }
 
@@ -428,8 +411,8 @@ __wt_row_key_serial_func(WT_SESSION_IMPL *session)
 	 * test, if the key we're interested in still needs to be instantiated,
 	 * because it can only be in one of two states.
 	 */
-	if (!__wt_off_page(page, rip->key)) {
-		rip->key = ikey;
+	if (!__wt_off_page(page, WT_ROW_KEY_COPY(rip))) {
+		WT_ROW_KEY_SET(rip, ikey);
 		__wt_cache_page_inmem_incr(
 		    session, page, sizeof(WT_IKEY) + ikey->size);
 	}
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 44909954952..aa2b7786d50 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -14,6 +14,7 @@
 int
 __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 {
+	WT_DECL_RET;
 	WT_INSERT *ins;
 	WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
 	WT_ITEM *key, *value;
@@ -23,7 +24,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 	size_t new_inshead_size, new_inslist_size, new_upd_size;
 	uint32_t ins_slot;
 	u_int skipdepth;
-	int i, ret;
+	int i;
 
 	key = &cbt->iface.key;
 	value = is_remove ? NULL : &cbt->iface.value;
@@ -35,7 +36,6 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 	new_inslist = NULL;
 	new_upd = NULL;
 	upd = NULL;
-	ret = 0;
 
 	/*
 	 * Modify: allocate an update array as necessary, build a WT_UPDATE
@@ -65,6 +65,9 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 		} else
 			upd_entry = &cbt->ins->upd;
 
+		/* Make sure the update can proceed. */
+		WT_ERR(__wt_update_check(session, page, *upd_entry));
+
 		/* Allocate room for the new value from per-thread memory. */
 		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
 
@@ -121,6 +124,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 		 */
 		WT_ERR(__wt_row_insert_alloc(
 		    session, key, skipdepth, &ins, &ins_size));
+		WT_ERR(__wt_update_check(session, page, NULL));
 		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
 		ins->upd = upd;
 		ins_size += upd_size;
@@ -137,8 +141,14 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
 	if (ret != 0) {
 err:		if (ins != NULL)
 			__wt_free(session, ins);
-		if (upd != NULL)
+		if (upd != NULL) {
+			/*
+			 * Remove the update from the current transaction, so we
+			 * don't try to modify it on rollback.
+			 */
+			__wt_txn_unmodify(session);
 			__wt_free(session, upd);
+		}
 	}
 
 	/* Free any insert, update arrays. */
@@ -186,24 +196,57 @@ __wt_row_insert_alloc(WT_SESSION_IMPL *session,
 void
 __wt_insert_serial_func(WT_SESSION_IMPL *session)
 {
+	WT_DECL_RET;
 	WT_INSERT *new_ins, ***ins_stack;
-	WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead;
+	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
 	WT_PAGE *page;
 	uint32_t write_gen;
 	u_int i, skipdepth;
-	int  ret;
-
-	ret = 0;
 
-	__wt_insert_unpack(session, &page, &write_gen, &inshead,
+	__wt_insert_unpack(session, &page, &write_gen, &insheadp,
 	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);
 
 	/* Check the page's write-generation. */
-	WT_ERR(__wt_page_write_gen_check(page, write_gen));
+	WT_ERR(__wt_page_write_gen_check(session, page, write_gen));
+
+	/*
+	 * Publish: First, point the new WT_INSERT item's skiplist references
+	 * to the next elements in the insert list, then flush memory.  Second,
+	 * update the skiplist elements that reference the new WT_INSERT item,
+	 * this ensures the list is never inconsistent.
+	 */
+	if ((inshead = *insheadp) == NULL)
+		inshead = new_inshead;
+	for (i = 0; i < skipdepth; i++)
+		new_ins->next[i] = *ins_stack[i];
+	WT_WRITE_BARRIER();
+	for (i = 0; i < skipdepth; i++) {
+		if (inshead->tail[i] == NULL ||
+		    ins_stack[i] == &inshead->tail[i]->next[i])
+			inshead->tail[i] = new_ins;
+		*ins_stack[i] = new_ins;
+	}
+
+	__wt_insert_new_ins_taken(session, page);
+
+	/*
+	 * If the insert head does not yet have an insert list, our caller
+	 * passed us one.
+	 *
+	 * NOTE: it is important to do this after the item has been added to
+	 * the list.  Code can assume that if the list is set, it is non-empty.
+	 */
+	if (*insheadp == NULL) {
+		WT_PUBLISH(*insheadp, new_inshead);
+		__wt_insert_new_inshead_taken(session, page);
+	}
 
 	/*
 	 * If the page does not yet have an insert array, our caller passed
 	 * us one.
+	 *
+	 * NOTE: it is important to do this after publishing the list entry.
+	 * Code can assume that if the array is set, it is non-empty.
 	 */
 	if (page->type == WT_PAGE_ROW_LEAF) {
 		if (page->u.row.ins == NULL) {
@@ -216,34 +259,32 @@ __wt_insert_serial_func(WT_SESSION_IMPL *session)
 			__wt_insert_new_inslist_taken(session, page);
 		}
 
-	/*
-	 * If the insert head does not yet have an insert list, our caller
-	 * passed us one.
-	 */
-	if (*inshead == NULL) {
-		*inshead = new_inshead;
-		__wt_insert_new_inshead_taken(session, page);
-	}
+err:	__wt_session_serialize_wrapup(session, page, ret);
+}
+
+/*
+ * __wt_update_check --
+ *	Check whether an update can proceed, and maintain the first txnid in
+ *	the page->modify structure.
+ */
+int
+__wt_update_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *next)
+{
+	WT_TXN *txn;
+
+	/* Before allocating anything, make sure this update is permitted. */
+	WT_RET(__wt_txn_update_check(session, next));
 
 	/*
-	 * Publish: First, point the new WT_INSERT item's skiplist references
-	 * to the next elements in the insert list, then flush memory.  Second,
-	 * update the skiplist elements that reference the new WT_INSERT item,
-	 * this ensures the list is never inconsistent.
+	 * Record the transaction ID for the first update to a page.
+	 * We don't care if this races: there is a buffer built into the
+	 * check for ancient updates.
 	 */
-	for (i = 0; i < skipdepth; i++)
-		new_ins->next[i] = *ins_stack[i];
-	WT_WRITE_BARRIER();
-	for (i = 0; i < skipdepth; i++) {
-		if ((*inshead)->tail[i] == NULL ||
-		    ins_stack[i] == &(*inshead)->tail[i]->next[i])
-			(*inshead)->tail[i] = new_ins;
-		*ins_stack[i] = new_ins;
-	}
-
-	__wt_insert_new_ins_taken(session, page);
+	txn = &session->txn;
+	if (page->modify->first_id == WT_TXN_NONE && txn->id != WT_TXN_NONE)
+		page->modify->first_id = txn->id;
 
-err:	__wt_session_serialize_wrapup(session, page, ret);
+	return (0);
 }
 
 /*
@@ -255,6 +296,7 @@ int
 __wt_update_alloc(WT_SESSION_IMPL *session,
     WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
 {
+	WT_DECL_RET;
 	WT_UPDATE *upd;
 	size_t size;
 
@@ -271,6 +313,16 @@ __wt_update_alloc(WT_SESSION_IMPL *session,
 		memcpy(WT_UPDATE_DATA(upd), value->data, size);
 	}
 
+	/*
+	 * This must come last: after __wt_txn_modify succeeds, we must return
+	 * a non-NULL upd so our callers can call __wt_txn_unmodify on any
+	 * subsequent failure.
+	 */
+	if ((ret = __wt_txn_modify(session, &upd->txnid)) != 0) {
+		__wt_free(session, upd);
+		return (ret);
+	}
+
 	*updp = upd;
 	if (sizep != NULL)
 		*sizep = sizeof(WT_UPDATE) + size;
@@ -284,38 +336,38 @@ __wt_update_alloc(WT_SESSION_IMPL *session,
 void
 __wt_update_serial_func(WT_SESSION_IMPL *session)
 {
+	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_UPDATE **new_upd, *upd, **upd_entry;
 	uint32_t write_gen;
-	int ret;
-
-	ret = 0;
 
 	__wt_update_unpack(
 	    session, &page, &write_gen, &upd_entry, &new_upd, &upd);
 
 	/* Check the page's write-generation. */
-	WT_ERR(__wt_page_write_gen_check(page, write_gen));
+	WT_ERR(__wt_page_write_gen_check(session, page, write_gen));
+
+	upd->next = *upd_entry;
+	/*
+	 * Publish: there must be a barrier to ensure the new entry's next
+	 * pointer is set before we update the linked list.
+	 */
+	WT_PUBLISH(*upd_entry, upd);
+	__wt_update_upd_taken(session, page);
 
 	/*
 	 * If the page needs an update array (column-store pages and inserts on
 	 * row-store pages do not use the update array), our caller passed us
 	 * one of the correct size.   Check the page still needs one (the write
 	 * generation test should have caught that, though).
+	 *
+	 * NOTE: it is important to do this after publishing that the update is
+	 * set.  Code can assume that if the array is set, it is non-empty.
 	 */
 	if (new_upd != NULL && page->u.row.upd == NULL) {
 		page->u.row.upd = new_upd;
 		__wt_update_new_upd_taken(session, page);
 	}
 
-	upd->next = *upd_entry;
-	/*
-	 * Publish: there must be a barrier to ensure the new entry's next
-	 * pointer is set before we update the linked list.
-	 */
-	WT_PUBLISH(*upd_entry, upd);
-
-	__wt_update_upd_taken(session, page);
-
 err:	__wt_session_serialize_wrapup(session, page, ret);
 }
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 5ac5dacdfa3..200048b1b1f 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -88,13 +88,14 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 {
 	WT_BTREE *btree;
 	WT_CELL_UNPACK *unpack, _unpack;
+	WT_DECL_RET;
 	WT_IKEY *ikey;
 	WT_ITEM *item, _item, *srch_key;
 	WT_PAGE *page;
 	WT_REF *ref;
 	WT_ROW *rip;
 	uint32_t base, indx, limit;
-	int cmp, ret;
+	int cmp;
 	void *key;
 
 	__cursor_search_clear(cbt);
@@ -128,7 +129,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 				item->data = WT_IKEY_DATA(ikey);
 				item->size = ikey->size;
 
-				WT_RET(WT_BTREE_CMP(
+				WT_ERR(WT_BTREE_CMP(
 				    session, btree, srch_key, item, cmp));
 				if (cmp == 0)
 					break;
@@ -164,7 +165,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 	 */
 	if (is_modify) {
 		/* Initialize the page's modification information */
-		WT_RET(__wt_page_modify_init(session, page));
+		WT_ERR(__wt_page_modify_init(session, page));
 
 		WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
 	}
@@ -174,14 +175,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_modify)
 		indx = base + (limit >> 1);
 		rip = page->u.row.d + indx;
 
-retry:		/*
-		 * Multiple threads of control may be searching this page, which
-		 * means the key may change underfoot, and here's where it gets
-		 * tricky: first, copy the key.  We don't need any barriers, the
-		 * key is updated atomically, and we just need a valid copy.
-		 */
-		key = rip->key;
-
+retry:		key = WT_ROW_KEY_COPY(rip);
 		/*
 		 * Key copied.
 		 *
diff --git a/src/config/config.c b/src/config/config.c
index 1d5a3d2ab1c..1c49c90d0de 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -403,7 +403,7 @@ __process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
 		if (value->type == ITEM_NUM && errno == ERANGE)
 			return (
 			    __config_err(conf, "Number out of range", ERANGE));
-}
+	}
 
 	return (0);
 }
@@ -555,7 +555,8 @@ __wt_config_getraw(
     WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
 {
 	WT_CONFIG_ITEM k, v;
-	int found, ret;
+	WT_DECL_RET;
+	int found;
 
 	found = 0;
 	while ((ret = __wt_config_next(cparser, &k, &v)) == 0) {
@@ -580,7 +581,8 @@ __wt_config_get(WT_SESSION_IMPL *session,
     const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
 {
 	WT_CONFIG cparser;
-	int found, ret;
+	WT_DECL_RET;
+	int found;
 
 	for (found = 0; *cfg != NULL; cfg++) {
 		WT_RET(__wt_config_init(session, &cparser, *cfg));
diff --git a/src/config/config_check.c b/src/config/config_check.c
index 8285a9444a6..4ec1d033d2b 100644
--- a/src/config/config_check.c
+++ b/src/config/config_check.c
@@ -21,7 +21,8 @@ __wt_config_check(WT_SESSION_IMPL *session,
 {
 	WT_CONFIG parser, cparser, sparser;
 	WT_CONFIG_ITEM k, v, chk, ck, cv, dummy;
-	int found, ret;
+	WT_DECL_RET;
+	int found;
 
 	/* It is always okay to pass NULL. */
 	if (config == NULL)
diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c
index ca0c863ea55..003a857862a 100644
--- a/src/config/config_collapse.c
+++ b/src/config/config_collapse.c
@@ -10,24 +10,24 @@
 /*
  * __wt_config_collapse --
  *	Given a NULL-terminated list of configuration strings, where the first
- *	one contains all the defaults, collapse them into a newly allocated
- *	buffer.
+ *	one contains all the defaults, collapse them into newly allocated
+ *	memory.
  */
 int
-__wt_config_collapse(WT_SESSION_IMPL *session,
-    const char **cfg, const char **config_ret)
+__wt_config_collapse(
+    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
 {
 	WT_CONFIG cparser;
 	WT_CONFIG_ITEM k, v;
-	WT_ITEM buf;
-	int ret;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 
-	WT_CLEAR(buf);
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
 
-	WT_RET(__wt_config_init(session, &cparser, cfg[0]));
+	WT_ERR(__wt_config_init(session, &cparser, cfg[0]));
 	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
 		if (k.type != ITEM_STRING && k.type != ITEM_ID)
-			WT_RET_MSG(session, EINVAL,
+			WT_ERR_MSG(session, EINVAL,
 			    "Invalid configuration key found: '%s'\n", k.str);
 		WT_ERR(__wt_config_get(session, cfg, &k, &v));
 		/* Include the quotes around string keys/values. */
@@ -39,28 +39,23 @@ __wt_config_collapse(WT_SESSION_IMPL *session,
 			--v.str;
 			v.len += 2;
 		}
-		WT_ERR(__wt_buf_catfmt(session, &buf, "%.*s=%.*s,",
+		WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
 		    (int)k.len, k.str, (int)v.len, v.str));
 	}
-
 	if (ret != WT_NOTFOUND)
 		goto err;
 
 	/*
-	 * If the caller passes us no valid configuration strings, we end up
-	 * here with no allocated memory to return.  Check the final buffer
-	 * size: empty configuration strings are possible, and paranoia is
-	 * good.
+	 * If the caller passes us no valid configuration strings, we get here
+	 * with no bytes to copy -- that's OK, the underlying string copy can
+	 * handle empty strings.
+	 *
+	 * Strip any trailing comma.
 	 */
-	if (buf.size == 0)
-		WT_RET(__wt_buf_initsize(session, &buf, 1));
-
-	/* Strip the trailing comma and NUL-terminate */
-	((char *)buf.data)[buf.size - 1] = '\0';
-
-	*config_ret = buf.data;
-	return (0);
+	if (tmp->size != 0)
+		--tmp->size;
+	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
 
-err:	__wt_buf_free(session, &buf);
+err:	__wt_scr_free(&tmp);
 	return (ret);
 }
diff --git a/src/config/config_concat.c b/src/config/config_concat.c
index 6bbee5ee77e..8dcc79c741a 100644
--- a/src/config/config_concat.c
+++ b/src/config/config_concat.c
@@ -10,8 +10,8 @@
 /*
  * __wt_config_concat --
  *	Given a NULL-terminated list of configuration strings, concatenate them
- *	into a newly allocated buffer.  Nothing special is assumed about any
- *	of the config strings, they are simply combined in order.
+ *	into newly allocated memory.  Nothing special is assumed about any of
+ *	the config strings, they are simply combined in order.
  *
  *	This code deals with the case where some of the config strings are
  *	wrapped in brackets but others aren't: the resulting string does not
@@ -23,12 +23,11 @@ __wt_config_concat(
 {
 	WT_CONFIG cparser;
 	WT_CONFIG_ITEM k, v;
-	WT_ITEM buf;
-	int ret;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
 	const char **cp;
 
-	WT_CLEAR(buf);
-	ret = 0;
+	WT_RET(__wt_scr_alloc(session, 0, &tmp));
 
 	for (cp = cfg; *cp != NULL; ++cp) {
 		WT_ERR(__wt_config_init(session, &cparser, *cp));
@@ -46,7 +45,7 @@ __wt_config_concat(
 				--v.str;
 				v.len += 2;
 			}
-			WT_ERR(__wt_buf_catfmt(session, &buf, "%.*s%s%.*s,",
+			WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,",
 			    (int)k.len, k.str,
 			    (v.len > 0) ? "=" : "",
 			    (int)v.len, v.str));
@@ -56,20 +55,16 @@ __wt_config_concat(
 	}
 
 	/*
-	 * If the caller passes us no valid configuration strings, we end up
-	 * here with no allocated memory to return.  Check the final buffer
-	 * size: empty configuration strings are possible, and paranoia is
-	 * good.
+	 * If the caller passes us no valid configuration strings, we get here
+	 * with no bytes to copy -- that's OK, the underlying string copy can
+	 * handle empty strings.
+	 *
+	 * Strip any trailing comma.
 	 */
-	if (buf.size == 0)
-		WT_RET(__wt_buf_initsize(session, &buf, 1));
+	if (tmp->size != 0)
+		--tmp->size;
+	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
 
-	/* Strip the trailing comma and NUL-terminate */
-	((char *)buf.data)[buf.size - 1] = '\0';
-
-	*config_ret = buf.data;
-	return (0);
-
-err:	__wt_buf_free(session, &buf);
+err:	__wt_scr_free(&tmp);
 	return (ret);
 }
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 3a298113381..09566c00360 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -27,11 +27,11 @@ __wt_confchk_connection_add_compressor =
     "";
 
 const char *
-__wt_confdfl_connection_add_cursor_type =
+__wt_confdfl_connection_add_data_source =
     "";
 
 const char *
-__wt_confchk_connection_add_cursor_type =
+__wt_confchk_connection_add_data_source =
     "";
 
 const char *
@@ -79,7 +79,7 @@ __wt_confdfl_file_meta =
     "allocation_size=512B,block_compressor="",checksum=true,collator="","
     "columns=(),huffman_key="",huffman_value="",internal_item_max=0,"
     "internal_key_truncate=true,internal_page_max=2KB,key_format=u,key_gap=10"
-    ",leaf_item_max=0,leaf_page_max=1MB,prefix_compression=true,root="","
+    ",leaf_item_max=0,leaf_page_max=1MB,prefix_compression=true,snapshot="","
     "split_pct=75,type=btree,value_format=u,version=(major=0,minor=0)";
 
 const char *
@@ -90,9 +90,9 @@ __wt_confchk_file_meta =
     "internal_key_truncate=(type=boolean),internal_page_max=(type=int,"
     "min=512B,max=512MB),key_format=(type=format),key_gap=(type=int,min=0),"
     "leaf_item_max=(type=int,min=0),leaf_page_max=(type=int,min=512B,"
-    "max=512MB),prefix_compression=(type=boolean),root=(),split_pct=(type=int"
-    ",min=25,max=100),type=(choices=[\"btree\"]),value_format=(type=format),"
-    "version=()";
+    "max=512MB),prefix_compression=(type=boolean),snapshot=(),"
+    "split_pct=(type=int,min=25,max=100),type=(choices=[\"btree\"]),"
+    "value_format=(type=format),version=()";
 
 const char *
 __wt_confdfl_index_meta =
@@ -104,24 +104,21 @@ __wt_confchk_index_meta =
 
 const char *
 __wt_confdfl_session_begin_transaction =
-    "isolation=read-committed,name="",priority=0,sync=full";
+    "isolation=snapshot,name="",priority=0,sync=full";
 
 const char *
 __wt_confchk_session_begin_transaction =
-    "isolation=(choices=[\"serializable\",\"snapshot\",\"read-committed\","
-    "\"read-uncommitted\"]),name=(),priority=(type=int,min=-100,max=100),"
-    "sync=(choices=[\"full\",\"flush\",\"write\",\"none\"])";
+    "isolation=(choices=[\"read-uncommitted\",\"snapshot\"]),name=(),"
+    "priority=(type=int,min=-100,max=100),sync=(choices=[\"full\",\"flush\","
+    "\"write\",\"none\"])";
 
 const char *
 __wt_confdfl_session_checkpoint =
-    "archive=false,flush_cache=true,flush_log=true,force=false,log_size=0,"
-    "timeout=0";
+    "snapshot=""";
 
 const char *
 __wt_confchk_session_checkpoint =
-    "archive=(type=boolean),flush_cache=(type=boolean),"
-    "flush_log=(type=boolean),force=(type=boolean),log_size=(type=int,min=0),"
-    "timeout=(type=int,min=0)";
+    "snapshot=()";
 
 const char *
 __wt_confdfl_session_close =
@@ -165,11 +162,11 @@ __wt_confchk_session_create =
 
 const char *
 __wt_confdfl_session_drop =
-    "force=false";
+    "force=false,snapshot=""";
 
 const char *
 __wt_confchk_session_drop =
-    "force=(type=boolean)";
+    "force=(type=boolean),snapshot=()";
 
 const char *
 __wt_confdfl_session_dumpfile =
@@ -189,15 +186,15 @@ __wt_confchk_session_log_printf =
 
 const char *
 __wt_confdfl_session_open_cursor =
-    "append=false,bulk=false,clear_on_close=false,dump="","
-    "isolation=read-committed,overwrite=false,raw=false,statistics=false";
+    "append=false,bulk=false,dump="",isolation=read-committed,overwrite=false"
+    ",raw=false,snapshot="",statistics=false,statistics_clear=false";
 
 const char *
 __wt_confchk_session_open_cursor =
-    "append=(type=boolean),bulk=(type=boolean),clear_on_close=(type=boolean),"
-    "dump=(choices=[\"hex\",\"print\"]),isolation=(choices=[\"snapshot\","
-    "\"read-committed\",\"read-uncommitted\"]),overwrite=(type=boolean),"
-    "raw=(type=boolean),statistics=(type=boolean)";
+    "append=(type=boolean),bulk=(type=boolean),dump=(choices=[\"hex\","
+    "\"print\"]),isolation=(choices=[\"snapshot\",\"read-committed\","
+    "\"read-uncommitted\"]),overwrite=(type=boolean),raw=(type=boolean),"
+    "snapshot=(),statistics=(type=boolean),statistics_clear=(type=boolean)";
 
 const char *
 __wt_confdfl_session_rename =
@@ -225,11 +222,11 @@ __wt_confchk_session_salvage =
 
 const char *
 __wt_confdfl_session_sync =
-    "";
+    "snapshot=""";
 
 const char *
 __wt_confchk_session_sync =
-    "";
+    "snapshot=()";
 
 const char *
 __wt_confdfl_session_truncate =
@@ -269,8 +266,8 @@ __wt_confdfl_wiredtiger_open =
     "buffer_alignment=-1,cache_size=100MB,create=false,direct_io=(),"
     "error_prefix="",eviction_target=80,eviction_trigger=95,extensions=(),"
     "hazard_max=30,home_environment=false,home_environment_priv=false,"
-    "logging=false,multiprocess=false,session_max=50,transactional=false,"
-    "verbose=()";
+    "logging=false,multiprocess=false,session_max=50,sync=true,"
+    "transactional=true,verbose=()";
 
 const char *
 __wt_confchk_wiredtiger_open =
@@ -281,6 +278,7 @@ __wt_confchk_wiredtiger_open =
     "hazard_max=(type=int,min=15),home_environment=(type=boolean),"
     "home_environment_priv=(type=boolean),logging=(type=boolean),"
     "multiprocess=(type=boolean),session_max=(type=int,min=1),"
-    "transactional=(type=boolean),verbose=(type=list,choices=[\"block\","
-    "\"evict\",\"evictserver\",\"fileops\",\"hazard\",\"mutex\",\"read\","
-    "\"readserver\",\"reconcile\",\"salvage\",\"verify\",\"write\"])";
+    "sync=(type=boolean),transactional=(type=boolean),verbose=(type=list,"
+    "choices=[\"block\",\"evict\",\"evictserver\",\"fileops\",\"hazard\","
+    "\"mutex\",\"read\",\"readserver\",\"reconcile\",\"salvage\",\"snapshot\""
+    ",\"verify\",\"write\"])";
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 84367f879c0..4d1b2f24d8e 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -7,22 +7,20 @@
 
 #include "wt_internal.h"
 
-static int __conn_config(WT_CONNECTION_IMPL *, const char **, WT_ITEM **);
-static int __conn_home(WT_CONNECTION_IMPL *, const char *, const char **);
-static int __conn_single(WT_CONNECTION_IMPL *, const char **);
-
 /*
  * api_err_printf --
  *	Extension API call to print to the error stream.
  */
-static void
+static int
 __api_err_printf(WT_SESSION *wt_session, const char *fmt, ...)
 {
+	WT_DECL_RET;
 	va_list ap;
 
 	va_start(ap, fmt);
-	__wt_eventv((WT_SESSION_IMPL *)wt_session, 0, 0, NULL, 0, fmt, ap);
+	ret = __wt_verrx((WT_SESSION_IMPL *)wt_session, fmt, ap);
 	va_end(ap);
+	return (ret);
 }
 
 static WT_EXTENSION_API __api = {
@@ -41,10 +39,10 @@ __conn_load_extension(
 {
 	WT_CONFIG_ITEM cval;
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_DLH *dlh;
 	WT_SESSION_IMPL *session;
 	int (*entry)(WT_SESSION *, WT_EXTENSION_API *, const char *);
-	int ret;
 	const char *entry_name;
 
 	dlh = NULL;
@@ -84,29 +82,6 @@ err:		if (dlh != NULL)
 }
 
 /*
- * __conn_add_cursor_type --
- *	WT_CONNECTION->add_cursor_type method.
- */
-static int
-__conn_add_cursor_type(WT_CONNECTION *wt_conn,
-    const char *prefix, WT_CURSOR_TYPE *ctype, const char *config)
-{
-	WT_CONNECTION_IMPL *conn;
-	WT_SESSION_IMPL *session;
-	int ret;
-
-	WT_UNUSED(prefix);
-	WT_UNUSED(ctype);
-	ret = ENOTSUP;
-
-	conn = (WT_CONNECTION_IMPL *)wt_conn;
-	CONNECTION_API_CALL(conn, session, add_cursor_type, config, cfg);
-	WT_UNUSED(cfg);
-
-err:	API_END_NOTFOUND_MAP(session, ret);
-}
-
-/*
  * __conn_add_collator --
  *	WT_CONNECTION->add_collator method.
  */
@@ -115,9 +90,9 @@ __conn_add_collator(WT_CONNECTION *wt_conn,
     const char *name, WT_COLLATOR *collator, const char *config)
 {
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_NAMED_COLLATOR *ncoll;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	conn = (WT_CONNECTION_IMPL *)wt_conn;
 	CONNECTION_API_CALL(conn, session, add_collator, config, cfg);
@@ -137,6 +112,26 @@ err:	__wt_free(session, ncoll);
 }
 
 /*
+ * __conn_remove_collator --
+ *	remove collator added by WT_CONNECTION->add_collator,
+ *	only used internally.
+ */
+static void
+__conn_remove_collator(WT_CONNECTION_IMPL *conn, WT_NAMED_COLLATOR *ncoll)
+{
+	WT_SESSION_IMPL *session;
+
+	session = conn->default_session;
+
+	/* Remove from the connection's list. */
+	TAILQ_REMOVE(&conn->collqh, ncoll, q);
+
+	/* Free associated memory */
+	__wt_free(session, ncoll->name);
+	__wt_free(session, ncoll);
+}
+
+/*
  * __conn_add_compressor --
  *	WT_CONNECTION->add_compressor method.
  */
@@ -145,9 +140,9 @@ __conn_add_compressor(WT_CONNECTION *wt_conn,
     const char *name, WT_COMPRESSOR *compressor, const char *config)
 {
 	WT_CONNECTION_IMPL *conn;
-	WT_SESSION_IMPL *session;
+	WT_DECL_RET;
 	WT_NAMED_COMPRESSOR *ncomp;
-	int ret;
+	WT_SESSION_IMPL *session;
 
 	WT_UNUSED(name);
 	WT_UNUSED(compressor);
@@ -170,23 +165,60 @@ err:	__wt_free(session, ncomp);
 }
 
 /*
- * __conn_remove_collator --
- *	remove collator added by WT_CONNECTION->add_collator,
+ * __conn_remove_compressor --
+ *	remove compressor added by WT_CONNECTION->add_compressor,
  *	only used internally.
  */
 static void
-__conn_remove_collator(WT_CONNECTION_IMPL *conn, WT_NAMED_COLLATOR *ncoll)
+__conn_remove_compressor(WT_CONNECTION_IMPL *conn, WT_NAMED_COMPRESSOR *ncomp)
 {
 	WT_SESSION_IMPL *session;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 	/* Remove from the connection's list. */
-	TAILQ_REMOVE(&conn->collqh, ncoll, q);
+	TAILQ_REMOVE(&conn->compqh, ncomp, q);
 
 	/* Free associated memory */
-	__wt_free(session, ncoll->name);
-	__wt_free(session, ncoll);
+	__wt_free(session, ncomp->name);
+	__wt_free(session, ncomp);
+}
+
+/*
+ * __conn_add_data_source --
+ *	WT_CONNECTION->add_data_source method.
+ */
+static int
+__conn_add_data_source(WT_CONNECTION *wt_conn,
+    const char *prefix, WT_DATA_SOURCE *dsrc, const char *config)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_NAMED_DATA_SOURCE *ndsrc;
+
+	ndsrc = NULL;
+
+	conn = (WT_CONNECTION_IMPL *)wt_conn;
+	CONNECTION_API_CALL(conn, session, add_data_source, config, cfg);
+	WT_UNUSED(cfg);
+
+	WT_ERR(__wt_calloc_def(session, 1, &ndsrc));
+	WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix));
+	ndsrc->dsrc = dsrc;
+
+	/* Link onto the environment's list of data sources. */
+	__wt_spin_lock(session, &conn->spinlock);
+	TAILQ_INSERT_TAIL(&conn->dsrcqh, ndsrc, q);
+	__wt_spin_unlock(session, &conn->spinlock);
+
+	if (0) {
+err:		if (ndsrc != NULL)
+			__wt_free(session, ndsrc->prefix);
+		__wt_free(session, ndsrc);
+	}
+
+	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
@@ -195,18 +227,17 @@ __conn_remove_collator(WT_CONNECTION_IMPL *conn, WT_NAMED_COLLATOR *ncoll)
  *	only used internally.
  */
 static void
-__conn_remove_compressor(WT_CONNECTION_IMPL *conn, WT_NAMED_COMPRESSOR *ncomp)
+__conn_remove_data_source(
+    WT_CONNECTION_IMPL *conn, WT_NAMED_DATA_SOURCE *ndsrc)
 {
 	WT_SESSION_IMPL *session;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 	/* Remove from the connection's list. */
-	TAILQ_REMOVE(&conn->compqh, ncomp, q);
-
-	/* Free associated memory */
-	__wt_free(session, ncomp->name);
-	__wt_free(session, ncomp);
+	TAILQ_REMOVE(&conn->dsrcqh, ndsrc, q);
+	__wt_free(session, ndsrc->prefix);
+	__wt_free(session, ndsrc);
 }
 
 /*
@@ -218,8 +249,8 @@ __conn_add_extractor(WT_CONNECTION *wt_conn,
     const char *name, WT_EXTRACTOR *extractor, const char *config)
 {
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	WT_UNUSED(name);
 	WT_UNUSED(extractor);
@@ -256,35 +287,38 @@ static int
 __conn_close(WT_CONNECTION *wt_conn, const char *config)
 {
 	WT_CONNECTION_IMPL *conn;
-	WT_SESSION_IMPL *s, *session, **tp;
-	WT_SESSION *wt_session;
+	WT_DECL_RET;
 	WT_NAMED_COLLATOR *ncoll;
 	WT_NAMED_COMPRESSOR *ncomp;
-	int ret;
+	WT_NAMED_DATA_SOURCE *ndsrc;
+	WT_SESSION *wt_session;
+	WT_SESSION_IMPL *s, *session;
+	uint32_t i;
 
-	ret = 0;
 	conn = (WT_CONNECTION_IMPL *)wt_conn;
 
 	CONNECTION_API_CALL(conn, session, close, config, cfg);
 	WT_UNUSED(cfg);
 
-	/* Close open sessions. */
-	for (tp = conn->sessions; (s = *tp) != NULL;) {
-		if (!F_ISSET(s, WT_SESSION_INTERNAL)) {
+	/*
+	 * Close open, external sessions.
+	 * Additionally, the session's hazard reference memory isn't discarded
+	 * during normal session close because access to it isn't serialized.
+	 * Discard it now.  Note the loop for the hazard reference memory, it's
+	 * the entire session array, not only the active session count, as the
+	 * active session count may be less than the maximum session count.
+	 */
+	for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+		if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) {
 			wt_session = &s->iface;
 			WT_TRET(wt_session->close(wt_session, config));
-
-			/*
-			 * We closed a session, which has shuffled pointers
-			 * around.  Restart the search.
-			 */
-			tp = conn->sessions;
-		} else
-			++tp;
-	}
+		}
+	for (s = conn->sessions, i = 0; i < conn->session_size; ++s, ++i)
+		if (!F_ISSET(s, WT_SESSION_INTERNAL))
+			__wt_free(session, s->hazard);
 
 	/* Close open btree handles. */
-	WT_TRET(__wt_conn_btree_remove(conn));
+	WT_TRET(__wt_conn_btree_discard(conn));
 
 	/* Free memory for collators */
 	while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL)
@@ -294,6 +328,10 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config)
 	while ((ncomp = TAILQ_FIRST(&conn->compqh)) != NULL)
 		__conn_remove_compressor(conn, ncomp);
 
+	/* Free memory for data sources */
+	while ((ndsrc = TAILQ_FIRST(&conn->dsrcqh)) != NULL)
+		__conn_remove_data_source(conn, ndsrc);
+
 	WT_TRET(__wt_connection_close(conn));
 	/* We no longer have a session, don't try to update it. */
 	session = NULL;
@@ -311,12 +349,11 @@ __conn_open_session(WT_CONNECTION *wt_conn,
     WT_SESSION **wt_sessionp)
 {
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session, *session_ret;
-	int ret;
 
 	conn = (WT_CONNECTION_IMPL *)wt_conn;
 	session_ret = NULL;
-	ret = 0;
 
 	CONNECTION_API_CALL(conn, session, open_session, config, cfg);
 	WT_UNUSED(cfg);
@@ -329,219 +366,154 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
- * wiredtiger_open --
- *	Main library entry point: open a new connection to a WiredTiger
- *	database.
+ * __conn_config --
+ *	Read in any WiredTiger_config file in the home directory.
  */
-int
-wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
-    const char *config, WT_CONNECTION **wt_connp)
+static int
+__conn_config(WT_CONNECTION_IMPL *conn, const char **cfg, WT_ITEM **cbufp)
 {
-	static WT_CONNECTION stdc = {
-		__conn_load_extension,
-		__conn_add_cursor_type,
-		__conn_add_collator,
-		__conn_add_compressor,
-		__conn_add_extractor,
-		__conn_close,
-		__conn_get_home,
-		__conn_is_new,
-		__conn_open_session
-	};
-	static struct {
-		const char *name;
-		uint32_t flag;
-	} *ft, verbtypes[] = {
-		{ "block",	WT_VERB_block },
-		{ "evict",	WT_VERB_evict },
-		{ "evictserver",WT_VERB_evictserver },
-		{ "fileops",	WT_VERB_fileops },
-		{ "hazard",	WT_VERB_hazard },
-		{ "mutex",	WT_VERB_mutex },
-		{ "read",	WT_VERB_read },
-		{ "readserver",	WT_VERB_readserver },
-		{ "reconcile",	WT_VERB_reconcile },
-		{ "salvage",	WT_VERB_salvage },
-		{ "verify",	WT_VERB_verify },
-		{ "write",	WT_VERB_write },
-		{ NULL, 0 }
-	}, directio_types[] = {
-		{ "data",	WT_DIRECTIO_DATA },
-		{ "log",	WT_DIRECTIO_LOG },
-		{ NULL, 0 }
-	};
-	WT_CONFIG subconfig;
-	WT_CONFIG_ITEM cval, skey, sval;
-	WT_CONNECTION_IMPL *conn;
-	WT_ITEM *cbuf, expath, exconfig;
-	WT_SESSION *wt_session;
+	WT_DECL_RET;
+	WT_FH *fh;
+	WT_ITEM *cbuf;
 	WT_SESSION_IMPL *session;
-	int ret;
-	const char *cfg[] =
-	    { __wt_confdfl_wiredtiger_open, config, NULL, NULL };
+	off_t size;
+	uint32_t len;
+	int exist, quoted;
+	uint8_t *p, *t;
+
+	*cbufp = NULL;				/* Returned buffer */
 
-	*wt_connp = NULL;
-	session = NULL;
 	cbuf = NULL;
-	WT_CLEAR(expath);
-	WT_CLEAR(exconfig);
+	fh = NULL;
+	session = conn->default_session;
 
-	WT_RET(__wt_library_init());
+	/* Check for an optional configuration file. */
+#define	WT_CONFIGFILE	"WiredTiger.config"
+	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
+	if (!exist)
+		return (0);
 
-	WT_RET(__wt_calloc_def(NULL, 1, &conn));
-	conn->iface = stdc;
+	/* Open the configuration file. */
+	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
+	WT_ERR(__wt_filesize(session, fh, &size));
+	if (size == 0)
+		goto err;
 
 	/*
-	 * Immediately link the structure into the connection structure list:
-	 * the only thing ever looked at on that list is the database name,
-	 * and a NULL value is fine.
+	 * Sanity test: a 100KB configuration file would be insane.  (There's
+	 * no practical reason to limit the file size, but I can either limit
+	 * the file size to something rational, or I can add code to test if
+	 * the off_t size is larger than a uint32_t, which is more complicated
+	 * and a waste of time.)
 	 */
-	__wt_spin_lock(NULL, &__wt_process.spinlock);
-	TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
-	__wt_spin_unlock(NULL, &__wt_process.spinlock);
-
-	session = &conn->default_session;
-	session->iface.connection = &conn->iface;
-	session->name = "wiredtiger_open";
+	if (size > 100 * 1024)
+		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
+	len = (uint32_t)size;
 
 	/*
-	 * Configure event handling as soon as possible so errors are handled
-	 * correctly.  If the application didn't configure an event handler,
-	 * use the default one, and use default entries for any entries not
-	 * set by the application.
+	 * Copy the configuration file into memory, with a little slop, I'm not
+	 * interested in debugging off-by-ones.
+	 *
+	 * The beginning of a file is the same as if we run into an unquoted
+	 * newline character, simplify the parsing loop by pretending that's
+	 * what we're doing.
 	 */
-	if (event_handler == NULL)
-		event_handler = __wt_event_handler_default;
-	else {
-		if (event_handler->handle_error == NULL)
-			event_handler->handle_error =
-			    __wt_event_handler_default->handle_error;
-		if (event_handler->handle_message == NULL)
-			event_handler->handle_message =
-			    __wt_event_handler_default->handle_message;
-		if (event_handler->handle_progress == NULL)
-			event_handler->handle_progress =
-			    __wt_event_handler_default->handle_progress;
-	}
-	session->event_handler = event_handler;
-
-	/* Remaining basic initialization of the connection structure. */
-	WT_ERR(__wt_connection_init(conn));
-
-	/* Check the configuration strings. */
+	WT_ERR(__wt_scr_alloc(session, len + 10,  &cbuf));
 	WT_ERR(
-	    __wt_config_check(session, __wt_confchk_wiredtiger_open, config));
-
-	/* Get the database home. */
-	WT_ERR(__conn_home(conn, home, cfg));
+	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
+	((uint8_t *)cbuf->mem)[0] = '\n';
+	cbuf->size = len + 1;
 
-	/* Read the database-home configuration file. */
-	WT_ERR(__conn_config(conn, cfg, &cbuf));
+	/*
+	 * Collapse the file's lines into a single string: newline characters
+	 * are replaced with commas unless the newline is quoted or backslash
+	 * escaped.  Comment lines (an unescaped newline where the next non-
+	 * white-space character is a hash), are discarded.
+	 */
+	for (quoted = 0, p = t = cbuf->mem; len > 0;) {
+		/*
+		 * Backslash pairs pass through untouched, unless immediately
+		 * preceding a newline, in which case both the backslash and
+		 * the newline are discarded.  Backslash characters escape
+		 * quoted characters, too, that is, a backslash followed by a
+		 * quote doesn't start or end a quoted string.
+		 */
+		if (*p == '\\' && len > 1) {
+			if (p[1] != '\n') {
+				*t++ = p[0];
+				*t++ = p[1];
+			}
+			p += 2;
+			len -= 2;
+			continue;
+		}
 
-	/* Make sure no other thread of control already owns this database. */
-	WT_ERR(__conn_single(conn, cfg));
+		/*
+		 * If we're in a quoted string, or starting a quoted string,
+		 * take all characters, including white-space and newlines.
+		 */
+		if (quoted || *p == '"') {
+			if (*p == '"')
+				quoted = !quoted;
+			*t++ = *p++;
+			--len;
+			continue;
+		}
 
-	WT_ERR(__wt_config_gets(session, cfg, "cache_size", &cval));
-	conn->cache_size = cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
-	conn->hazard_size = (uint32_t)cval.val;
-	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
-	conn->session_size = (uint32_t)cval.val;
+		/* Everything else gets taken, except for newline characters. */
+		if (*p != '\n') {
+			*t++ = *p++;
+			--len;
+			continue;
+		}
 
-	/* Configure verbose flags. */
-	conn->verbose = 0;
-#ifdef HAVE_VERBOSE
-	WT_ERR(__wt_config_gets(session, cfg, "verbose", &cval));
-	for (ft = verbtypes; ft->name != NULL; ft++) {
-		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
-		if (ret == 0) {
-			if (sval.val)
-				FLD_SET(conn->verbose, ft->flag);
-		} else if (ret != WT_NOTFOUND)
-			goto err;
+		/*
+		 * Replace any newline characters with commas (and strings of
+		 * commas are safe).
+		 *
+		 * After any newline, skip to a non-white-space character; if
+		 * the next character is a hash mark, skip to the next newline.
+		 */
+		for (;;) {
+			for (*t++ = ','; --len > 0 && isspace(*++p);)
+				;
+			if (len == 0)
+				break;
+			if (*p != '#')
+				break;
+			while (--len > 0 && *++p != '\n')
+				;
+			if (len == 0)
+				break;
+		}
 	}
-#endif
-
-	WT_ERR(__wt_config_gets(session, cfg, "logging", &cval));
-	if (cval.val != 0)
-		WT_ERR(__wt_open(
-		   session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh));
+	*t = '\0';
 
-	/* Configure direct I/O and buffer alignment. */
-	WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
-	if (cval.val == -1)
-		conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
-	else
-		conn->buffer_alignment = (size_t)cval.val;
-#ifndef HAVE_POSIX_MEMALIGN
-	if (conn->buffer_alignment != 0)
-		WT_ERR_MSG(session, EINVAL,
-		    "buffer_alignment requires posix_memalign");
+#if 0
+	fprintf(stderr, "file config: {%s}\n", (char *)cbuf->data);
+	exit(0);
 #endif
 
-	WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
-	for (ft = directio_types; ft->name != NULL; ft++) {
-		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
-		if (ret == 0) {
-			if (sval.val)
-				FLD_SET(conn->direct_io, ft->flag);
-		} else if (ret != WT_NOTFOUND)
-			goto err;
-	}
-
-	/* Load any extensions referenced in the config. */
-	WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
-	WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
-	while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
-		WT_ERR(__wt_buf_fmt(
-		    session, &expath, "%.*s", (int)skey.len, skey.str));
-		if (sval.len > 0)
-			WT_ERR(__wt_buf_fmt(session, &exconfig,
-			    "entry=%.*s\n", (int)sval.len, sval.str));
-		WT_ERR(conn->iface.load_extension(&conn->iface,
-		    expath.data, (sval.len > 0) ? exconfig.data : NULL));
-	}
-	if (ret == WT_NOTFOUND)
-		ret = 0;
-	WT_ERR(ret);
-
-	/*
-	 * Open the connection; if that fails, the connection handle has been
-	 * destroyed by the time the open function returns.
-	 */
-	if ((ret = __wt_connection_open(conn, cfg)) != 0) {
-		conn = NULL;
-		WT_ERR(ret);
-	}
-
-	/*
-	 * If this is a new database, create the schema file.  This avoids
-	 * application threads racing to create it later.  We need a real
-	 * session handle for this: open one.
-	 */
-	if (conn->is_new) {
-		WT_ERR(conn->iface.open_session(&conn->iface,
-		    NULL, NULL, &wt_session));
-		WT_TRET(__wt_open_schema_table((WT_SESSION_IMPL *)wt_session));
-		WT_TRET(wt_session->close(wt_session, NULL));
-		WT_ERR(ret);
-	}
-
-	STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
-	*wt_connp = &conn->iface;
+	/* Check the configuration string. */
+	WT_ERR(__wt_config_check(
+	    session, __wt_confchk_wiredtiger_open, cbuf->data));
 
 	/*
-	 * Destroying the connection on error will destroy our session handle,
-	 * cleanup using the session handle first, then discard the connection.
+	 * The configuration file falls between the default configuration and
+	 * the wiredtiger_open() configuration, overriding the defaults but not
+	 * overriding the wiredtiger_open() configuration.
 	 */
-err:	if (cbuf != NULL)
-		__wt_buf_free(session, cbuf);
-	__wt_buf_free(session, &expath);
-	__wt_buf_free(session, &exconfig);
+	cfg[2] = cfg[1];
+	cfg[1] = cbuf->data;
 
-	if (ret != 0 && conn != NULL)
-		__wt_connection_destroy(conn);
+	*cbufp = cbuf;
 
+	if (0) {
+err:		if (cbuf != NULL)
+			__wt_buf_free(session, cbuf);
+	}
+	if (fh != NULL)
+		WT_TRET(__wt_close(session, fh));
 	return (ret);
 }
 
@@ -555,7 +527,7 @@ __conn_home(WT_CONNECTION_IMPL *conn, const char *home, const char **cfg)
 	WT_CONFIG_ITEM cval;
 	WT_SESSION_IMPL *session;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 	/* If the application specifies a home directory, use it. */
 	if (home != NULL)
@@ -604,13 +576,14 @@ __conn_single(WT_CONNECTION_IMPL *conn, const char **cfg)
 {
 	WT_CONFIG_ITEM cval;
 	WT_CONNECTION_IMPL *t;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	off_t size;
 	uint32_t len;
-	int created, ret;
+	int created;
 	char buf[256];
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 #define	WT_FLAGFILE	"WiredTiger"
 	/*
@@ -634,7 +607,6 @@ __conn_single(WT_CONNECTION_IMPL *conn, const char **cfg)
 		    "process");
 
 	/* Check to see if another thread of control has this database open. */
-	ret = 0;
 	__wt_spin_lock(session, &__wt_process.spinlock);
 	TAILQ_FOREACH(t, &__wt_process.connqh, q)
 		if (t->home != NULL &&
@@ -684,153 +656,205 @@ err:	if (conn->lock_fh != NULL) {
 }
 
 /*
- * __conn_config --
- *	Read in any WiredTiger_config file in the home directory.
+ * wiredtiger_open --
+ *	Main library entry point: open a new connection to a WiredTiger
+ *	database.
  */
-static int
-__conn_config(WT_CONNECTION_IMPL *conn, const char **cfg, WT_ITEM **cbufp)
+int
+wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
+    const char *config, WT_CONNECTION **wt_connp)
 {
-	WT_ITEM *cbuf;
-	WT_FH *fh;
+	static WT_CONNECTION stdc = {
+		__conn_load_extension,
+		__conn_add_data_source,
+		__conn_add_collator,
+		__conn_add_compressor,
+		__conn_add_extractor,
+		__conn_close,
+		__conn_get_home,
+		__conn_is_new,
+		__conn_open_session
+	};
+	static struct {
+		const char *name;
+		uint32_t flag;
+	} *ft, verbtypes[] = {
+		{ "block",	WT_VERB_block },
+		{ "evict",	WT_VERB_evict },
+		{ "evictserver",WT_VERB_evictserver },
+		{ "fileops",	WT_VERB_fileops },
+		{ "hazard",	WT_VERB_hazard },
+		{ "mutex",	WT_VERB_mutex },
+		{ "read",	WT_VERB_read },
+		{ "readserver",	WT_VERB_readserver },
+		{ "reconcile",	WT_VERB_reconcile },
+		{ "salvage",	WT_VERB_salvage },
+		{ "verify",	WT_VERB_verify },
+		{ "snapshot",	WT_VERB_snapshot },
+		{ "write",	WT_VERB_write },
+		{ NULL, 0 }
+	}, directio_types[] = {
+		{ "data",	WT_DIRECTIO_DATA },
+		{ "log",	WT_DIRECTIO_LOG },
+		{ NULL, 0 }
+	};
+	WT_CONFIG subconfig;
+	WT_CONFIG_ITEM cval, skey, sval;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_ITEM *cbuf, expath, exconfig;
 	WT_SESSION_IMPL *session;
-	off_t size;
-	uint32_t len;
-	int exist, quoted, ret;
-	uint8_t *p, *t;
-
-	*cbufp = NULL;				/* Returned buffer */
+	const char *cfg[] =
+	    { __wt_confdfl_wiredtiger_open, config, NULL, NULL };
+	int exist;
 
+	*wt_connp = NULL;
+	session = NULL;
 	cbuf = NULL;
-	fh = NULL;
-	session = &conn->default_session;
-	ret = 0;
+	WT_CLEAR(expath);
+	WT_CLEAR(exconfig);
 
-	/* Check for an optional configuration file. */
-#define	WT_CONFIGFILE	"WiredTiger.config"
-	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
-	if (!exist)
-		return (0);
+	WT_RET(__wt_library_init());
 
-	/* Open the configuration file. */
-	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
-	WT_ERR(__wt_filesize(session, fh, &size));
-	if (size == 0)
-		goto err;
+	WT_RET(__wt_calloc_def(NULL, 1, &conn));
+	conn->iface = stdc;
 
 	/*
-	 * Sanity test: a 100KB configuration file would be insane.  (There's
-	 * no practical reason to limit the file size, but I can either limit
-	 * the file size to something rational, or I can add code to test if
-	 * the off_t size is larger than a uint32_t, which is more complicated
-	 * and a waste of time.)
+	 * Immediately link the structure into the connection structure list:
+	 * the only thing ever looked at on that list is the database name,
+	 * and a NULL value is fine.
 	 */
-	if (size > 100 * 1024)
-		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
-	len = (uint32_t)size;
+	__wt_spin_lock(NULL, &__wt_process.spinlock);
+	TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
+	__wt_spin_unlock(NULL, &__wt_process.spinlock);
 
-	/*
-	 * Copy the configuration file into memory, with a little slop, I'm not
-	 * interested in debugging off-by-ones.
-	 *
-	 * The beginning of a file is the same as if we run into an unquoted
-	 * newline character, simplify the parsing loop by pretending that's
-	 * what we're doing.
-	 */
-	WT_ERR(__wt_scr_alloc(session, len + 10,  &cbuf));
+	conn->default_session = session = &conn->dummy_session;
+	session->iface.connection = &conn->iface;
+	session->name = "wiredtiger_open";
+	__wt_event_handler_set(session, event_handler);
+
+	/* Remaining basic initialization of the connection structure. */
+	WT_ERR(__wt_connection_init(conn));
+
+	/* Check the configuration strings. */
 	WT_ERR(
-	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
-	((uint8_t *)cbuf->mem)[0] = '\n';
-	cbuf->size = len + 1;
+	    __wt_config_check(session, __wt_confchk_wiredtiger_open, config));
 
-	/*
-	 * Collapse the file's lines into a single string: newline characters
-	 * are replaced with commas unless the newline is quoted or backslash
-	 * escaped.  Comment lines (an unescaped newline where the next non-
-	 * white-space character is a hash), are discarded.
-	 */
-	for (quoted = 0, p = t = cbuf->mem; len > 0;) {
-		/*
-		 * Backslash pairs pass through untouched, unless immediately
-		 * preceding a newline, in which case both the backslash and
-		 * the newline are discarded.  Backslash characters escape
-		 * quoted characters, too, that is, a backslash followed by a
-		 * quote doesn't start or end a quoted string.
-		 */
-		if (*p == '\\' && len > 1) {
-			if (p[1] != '\n') {
-				*t++ = p[0];
-				*t++ = p[1];
-			}
-			p += 2;
-			len -= 2;
-			continue;
-		}
+	/* Get the database home. */
+	WT_ERR(__conn_home(conn, home, cfg));
 
-		/*
-		 * If we're in a quoted string, or starting a quoted string,
-		 * take all characters, including white-space and newlines.
-		 */
-		if (quoted || *p == '"') {
-			if (*p == '"')
-				quoted = !quoted;
-			*t++ = *p++;
-			--len;
-			continue;
-		}
+	/* Read the database-home configuration file. */
+	WT_ERR(__conn_config(conn, cfg, &cbuf));
 
-		/* Everything else gets taken, except for newline characters. */
-		if (*p != '\n') {
-			*t++ = *p++;
-			--len;
-			continue;
-		}
+	/* Make sure no other thread of control already owns this database. */
+	WT_ERR(__conn_single(conn, cfg));
 
-		/*
-		 * Replace any newline characters with commas (and strings of
-		 * commas are safe).
-		 *
-		 * After any newline, skip to a non-white-space character; if
-		 * the next character is a hash mark, skip to the next newline.
-		 */
-		for (;;) {
-			for (*t++ = ','; --len > 0 && isspace(*++p);)
-				;
-			if (len == 0)
-				break;
-			if (*p != '#')
-				break;
-			while (--len > 0 && *++p != '\n')
-				;
-			if (len == 0)
-				break;
-		}
+	WT_ERR(__wt_config_gets(session, cfg, "cache_size", &cval));
+	conn->cache_size = cval.val;
+	WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
+	conn->hazard_size = (uint32_t)cval.val;
+	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
+	conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+	WT_ERR(__wt_config_gets(session, cfg, "sync", &cval));
+	if (!cval.val)
+		F_SET(conn, WT_CONN_NOSYNC);
+	WT_ERR(__wt_config_gets(session, cfg, "transactional", &cval));
+	if (cval.val)
+		F_SET(conn, WT_CONN_TRANSACTIONAL);
+
+	/* Configure verbose flags. */
+	conn->verbose = 0;
+#ifdef HAVE_VERBOSE
+	WT_ERR(__wt_config_gets(session, cfg, "verbose", &cval));
+	for (ft = verbtypes; ft->name != NULL; ft++) {
+		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+		if (ret == 0) {
+			if (sval.val)
+				FLD_SET(conn->verbose, ft->flag);
+		} else if (ret != WT_NOTFOUND)
+			goto err;
 	}
-	*t = '\0';
+#endif
 
-#if 0
-	fprintf(stderr, "file config: {%s}\n", (char *)cbuf->data);
-	exit(0);
+	WT_ERR(__wt_config_gets(session, cfg, "logging", &cval));
+	if (cval.val != 0)
+		WT_ERR(__wt_open(
+		   session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh));
+
+	/* Configure direct I/O and buffer alignment. */
+	WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
+	if (cval.val == -1)
+		conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
+	else
+		conn->buffer_alignment = (size_t)cval.val;
+#ifndef HAVE_POSIX_MEMALIGN
+	if (conn->buffer_alignment != 0)
+		WT_ERR_MSG(session, EINVAL,
+		    "buffer_alignment requires posix_memalign");
 #endif
 
-	/* Check the configuration string. */
-	WT_ERR(__wt_config_check(
-	    session, __wt_confchk_wiredtiger_open, cbuf->data));
+	WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
+	for (ft = directio_types; ft->name != NULL; ft++) {
+		ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+		if (ret == 0) {
+			if (sval.val)
+				FLD_SET(conn->direct_io, ft->flag);
+		} else if (ret != WT_NOTFOUND)
+			goto err;
+	}
+
+	/* Load any extensions referenced in the config. */
+	WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
+	WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
+	while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
+		WT_ERR(__wt_buf_fmt(
+		    session, &expath, "%.*s", (int)skey.len, skey.str));
+		if (sval.len > 0)
+			WT_ERR(__wt_buf_fmt(session, &exconfig,
+			    "entry=%.*s\n", (int)sval.len, sval.str));
+		WT_ERR(conn->iface.load_extension(&conn->iface,
+		    expath.data, (sval.len > 0) ? exconfig.data : NULL));
+	}
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	WT_ERR(ret);
 
 	/*
-	 * The configuration file falls between the default configuration and
-	 * the wiredtiger_open() configuration, overriding the defaults but not
-	 * overriding the wiredtiger_open() configuration.
+	 * Open the connection; if that fails, the connection handle has been
+	 * destroyed by the time the open function returns.
 	 */
-	cfg[2] = cfg[1];
-	cfg[1] = cbuf->data;
+	if ((ret = __wt_connection_open(conn, cfg)) != 0) {
+		conn = NULL;
+		WT_ERR(ret);
+	}
 
-	*cbufp = cbuf;
+	WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &conn->default_session));
+	session = conn->default_session;
+
+	/*
+	 * Check on the turtle and metadata files, creating them if necessary
+	 * (which avoids application threads racing to create the metadata file
+	 * later).  We need a session handle for this, open one.
+	 */
+	if ((ret = __wt_meta_turtle_init(session, &exist)) == 0 && !exist)
+		ret = __wt_schema_create(session, WT_METADATA_URI, NULL);
+	WT_ERR(ret);
+	WT_ERR(__wt_metadata_open(session));
+
+	STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
+	*wt_connp = &conn->iface;
+
+	/*
+	 * Destroying the connection on error will destroy our session handle,
+	 * cleanup using the session handle first, then discard the connection.
+	 */
+err:	if (cbuf != NULL)
+		__wt_buf_free(session, cbuf);
+	__wt_buf_free(session, &expath);
+	__wt_buf_free(session, &exconfig);
+
+	if (ret != 0 && conn != NULL)
+		__wt_connection_destroy(conn);
 
-	if (0) {
-err:		if (cbuf != NULL)
-			__wt_buf_free(session, cbuf);
-	}
-	if (fh != NULL)
-		WT_TRET(__wt_close(session, fh));
 	return (ret);
 }
diff --git a/src/conn/conn_btree.c b/src/conn/conn_btree.c
index e12dfcc9e18..90331c2adca 100644
--- a/src/conn/conn_btree.c
+++ b/src/conn/conn_btree.c
@@ -8,35 +8,90 @@
 #include "wt_internal.h"
 
 /*
- * __wt_conn_btree_open --
- *	Find an open btree file handle, otherwise create a new one and link it
- * into the connection's list.
+ * __wt_conn_btree_open_lock --
+ *	Spin on the current btree handle until either (a) it is open, read
+ *	locked; or (b) it is closed, write locked.
  */
-int
-__wt_conn_btree_open(WT_SESSION_IMPL *session,
-    const char *name, const char *filename, const char *config,
-    const char *cfg[], uint32_t flags)
+void
+__wt_conn_btree_open_lock(WT_SESSION_IMPL *session, uint32_t flags)
 {
 	WT_BTREE *btree;
-	WT_CONNECTION_IMPL *conn;
-	int matched, ret;
 
-	conn = S2C(session);
-	ret = 0;
-
-	WT_STAT_INCR(conn->stats, file_open);
+	btree = session->btree;
 
 	/*
-	 * The file configuration string must point to allocated memory: it
-	 * is stored in the returned btree handle and freed when the handle
-	 * is closed.
+	 * Check that the handle is open.  We've already incremented
+	 * the reference count, so once the handle is open it won't be
+	 * closed by another thread.
+	 *
+	 * If we can see the WT_BTREE_OPEN flag set while holding a
+	 * lock on the handle, then it's really open and we can start
+	 * using it.  Alternatively, if we can get an exclusive lock
+	 * and WT_BTREE_OPEN is still not set, we need to do the open.
 	 */
+	for (;;) {
+		if (F_ISSET(btree, WT_BTREE_OPEN) &&
+		    !LF_ISSET(WT_BTREE_EXCLUSIVE)) {
+			__wt_readlock(session, btree->rwlock);
+			if (F_ISSET(btree, WT_BTREE_OPEN))
+				break;
+			__wt_rwunlock(session, btree->rwlock);
+		}
+
+		/*
+		 * It isn't open or we want it exclusive: try to get an
+		 * exclusive lock.  There is some subtlety here: if we race
+		 * with another thread that successfully opens the file, we
+		 * don't want to block waiting to get exclusive access.
+		 */
+		if (__wt_try_writelock(session, btree->rwlock) == 0) {
+			/*
+			 * If it was opened while we waited, drop the write
+			 * lock and get a read lock instead.
+			 */
+			if (F_ISSET(btree, WT_BTREE_OPEN) &&
+			    !LF_ISSET(WT_BTREE_EXCLUSIVE)) {
+				__wt_rwunlock(session, btree->rwlock);
+				continue;
+			}
+
+			/* We have an exclusive lock, we're done. */
+			F_SET(btree, WT_BTREE_EXCLUSIVE);
+			break;
+		}
+
+		/* Give other threads a chance to make progress. */
+		__wt_yield();
+	}
+}
+
+/*
+ * __conn_btree_get --
+ *	Find an open btree file handle, otherwise create a new one and link it
+ *	into the connection's list.  If successful, it returns with either
+ *	(a) an open handle, read locked (if WT_BTREE_EXCLUSIVE is not set); or
+ *	(b) an open handle, write locked (if WT_BTREE_EXCLUSIVE is set); or
+ *	(c) a closed handle, write locked.
+ */
+static int
+__conn_btree_get(WT_SESSION_IMPL *session,
+    const char *name, const char *snapshot, uint32_t flags)
+{
+	WT_BTREE *btree;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	int matched;
+
+	conn = S2C(session);
 
 	/* Increment the reference count if we already have the btree open. */
 	matched = 0;
 	__wt_spin_lock(session, &conn->spinlock);
 	TAILQ_FOREACH(btree, &conn->btqh, q) {
-		if (strcmp(filename, btree->filename) == 0) {
+		if (strcmp(name, btree->name) == 0 &&
+		    ((snapshot == NULL && btree->snapshot == NULL) ||
+		    (snapshot != NULL && btree->snapshot != NULL &&
+		    strcmp(snapshot, btree->snapshot) == 0))) {
 			++btree->refcnt;
 			session->btree = btree;
 			matched = 1;
@@ -45,73 +100,24 @@ __wt_conn_btree_open(WT_SESSION_IMPL *session,
 	}
 	if (matched) {
 		__wt_spin_unlock(session, &conn->spinlock);
-
-		/*
-		 * Check that the handle is open.  We've already incremented
-		 * the reference count, so once the handle is open it won't be
-		 * closed by another thread.
-		 *
-		 * If we can see the WT_BTREE_OPEN flag set while holding a
-		 * lock on the handle, then it's really open and we can start
-		 * using it.  Alternatively, if we can get an exclusive lock
-		 * and WT_BTREE_OPEN is still not set, we need to do the open.
-		 */
-		for (;;) {
-			__wt_readlock(session, btree->rwlock);
-			if (F_ISSET(btree, WT_BTREE_OPEN))
-				break;
-
-			/*
-			 * Try to upgrade to an exclusive lock.  There is some
-			 * subtlety here: if we race with another thread that
-			 * successfully opens the file, we don't want to block
-			 * waiting to get exclusive access.
-			 */
-			__wt_rwunlock(session, btree->rwlock);
-			if (__wt_try_writelock(session, btree->rwlock) == 0) {
-				/* Was it opened while we waited? */
-				if (F_ISSET(btree, WT_BTREE_OPEN))
-					break;
-
-				/*
-				 * We've got the exclusive handle lock, it's
-				 * our job to open the file.
-				 */
-				goto conf;
-			}
-
-			/* Give other threads a chance to make progress. */
-			__wt_yield();
-		}
-
-		__wt_rwunlock(session, btree->rwlock);
-
-		/* The config string will not be needed: free it now. */
-		__wt_free(session, config);
-
-		session->btree = btree;
+		__wt_conn_btree_open_lock(session, flags);
 		return (0);
 	}
 
 	/*
 	 * Allocate the WT_BTREE structure, its lock, and set the name so we
 	 * can put the handle into the list.
-	 *
-	 * Because this loop checks for existing btree file handles, the
-	 * connection layer owns:
-	 *	the WT_BTREE structure itself
-	 *	the structure lock
-	 *	the structure names
-	 *	the structure configuration string
 	 */
 	btree = NULL;
 	if ((ret = __wt_calloc_def(session, 1, &btree)) == 0 &&
 	    (ret = __wt_rwlock_alloc(
 		session, "btree handle", &btree->rwlock)) == 0 &&
 	    (ret = __wt_strdup(session, name, &btree->name)) == 0 &&
-	    (ret = __wt_strdup(session, filename, &btree->filename)) == 0) {
+	    (snapshot == NULL ||
+	    (ret = __wt_strdup(session, snapshot, &btree->snapshot)) == 0)) {
 		/* Lock the handle before it is inserted in the list. */
 		__wt_writelock(session, btree->rwlock);
+		F_SET(btree, WT_BTREE_EXCLUSIVE);
 
 		/* Add to the connection list. */
 		btree->refcnt = 1;
@@ -120,32 +126,188 @@ __wt_conn_btree_open(WT_SESSION_IMPL *session,
 	}
 	__wt_spin_unlock(session, &conn->spinlock);
 
-	if (ret != 0) {
-		if (btree != NULL) {
-			if (btree->rwlock != NULL)
-				(void)__wt_rwlock_destroy(
-				    session, btree->rwlock);
-			__wt_free(session, btree->filename);
-			__wt_free(session, btree->name);
-			__wt_free(session, btree);
-			__wt_free(session, config);
-		}
-		return (ret);
+	if (ret == 0)
+		session->btree = btree;
+	else if (btree != NULL) {
+		if (btree->rwlock != NULL)
+			(void)__wt_rwlock_destroy(
+			    session, btree->rwlock);
+		__wt_free(session, btree->name);
+		__wt_free(session, btree->snapshot);
+		__wt_overwrite_and_free(session, btree);
 	}
 
-	/* Open the underlying file. */
-conf:	session->btree = btree;
-	/* Free any old config. */
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_sync_and_close --
+ *	Sync and close the underlying btree handle.
+ */
+int
+__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_BTREE *btree;
+
+	btree = session->btree;
+
+	if (!F_ISSET(btree, WT_BTREE_OPEN))
+		return (0);
+
+	if (!F_ISSET(btree,
+	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+		ret = __wt_snapshot_close(session);
+
+	WT_TRET(__wt_btree_close(session));
+
+	F_CLR(btree,
+	    WT_BTREE_OPEN | WT_BTREE_NO_EVICTION | WT_BTREE_SPECIAL_FLAGS);
+
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_open --
+ *	Open the current btree handle.
+ */
+int
+__wt_conn_btree_open(WT_SESSION_IMPL *session,
+    const char *config, const char *cfg[], uint32_t flags)
+{
+	WT_BTREE *btree;
+	WT_DECL_ITEM(addr);
+	WT_DECL_RET;
+
+	btree = session->btree;
+
+	WT_ASSERT(session, F_ISSET(btree, WT_BTREE_EXCLUSIVE) &&
+	    !LF_ISSET(WT_BTREE_LOCK_ONLY));
+
+	/* Open the underlying file, free any old config. */
 	__wt_free(session, btree->config);
 	btree->config = config;
 
-	ret = __wt_btree_open(session, cfg, flags);
-	if (ret == 0)
+	/*
+	 * If the handle is already open, it has to be closed so it can be
+	 * reopened with a new configuration.  We don't need to check again:
+	 * this function isn't called if the handle is already open in the
+	 * required mode.
+	 */
+	if (F_ISSET(btree, WT_BTREE_OPEN))
+		WT_RET(__wt_conn_btree_sync_and_close(session));
+
+	WT_RET(__wt_scr_alloc(
+	    session, WT_BTREE_MAX_ADDR_COOKIE, &addr));
+
+	/* Set any special flags on the handle. */
+	F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
+
+	/* The metadata file is never evicted. */
+	if (strcmp(btree->name, WT_METADATA_URI) == 0)
+		F_SET(btree, WT_BTREE_NO_EVICTION);
+
+	do {
+		WT_ERR(__wt_meta_snapshot_get(
+		    session, btree->name, btree->snapshot, addr));
+		WT_ERR(__wt_btree_open(session, addr->data, addr->size, cfg,
+		    btree->snapshot == NULL ? 0 : 1));
 		F_SET(btree, WT_BTREE_OPEN);
-	else
-		(void)__wt_conn_btree_close(session, 1);
 
-	__wt_rwunlock(session, btree->rwlock);
+		/* Drop back to a readlock if that is all that was needed. */
+		if (!LF_ISSET(WT_BTREE_EXCLUSIVE)) {
+			F_CLR(btree, WT_BTREE_EXCLUSIVE);
+			__wt_rwunlock(session, btree->rwlock);
+			__wt_conn_btree_open_lock(session, flags);
+		}
+	} while (!F_ISSET(btree, WT_BTREE_OPEN));
+
+	if (0) {
+err:		(void)__wt_conn_btree_close(session, 1);
+	}
+
+	__wt_scr_free(&addr);
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_get --
+ *	Get an open btree file handle, otherwise open a new one.
+ */
+int
+__wt_conn_btree_get(WT_SESSION_IMPL *session,
+    const char *name, const char *snapshot,
+    const char *cfg[], uint32_t flags)
+{
+	WT_BTREE *btree;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	const char *treeconf;
+	int locked;
+
+	conn = S2C(session);
+
+	WT_STAT_INCR(conn->stats, file_open);
+
+	locked = 1;
+	if ((btree = session->btree) != NULL) {
+		if (!F_ISSET(btree, WT_BTREE_EXCLUSIVE))
+			__wt_conn_btree_open_lock(session, flags);
+		else
+			locked = 0;
+	} else {
+		WT_RET(__conn_btree_get(session, name, snapshot, flags));
+		btree = session->btree;
+	}
+
+	if (!LF_ISSET(WT_BTREE_LOCK_ONLY) &&
+	    (!F_ISSET(session->btree, WT_BTREE_OPEN) ||
+	    LF_ISSET(WT_BTREE_SPECIAL_FLAGS))) {
+		if ((ret = __wt_metadata_read(session, name, &treeconf)) != 0) {
+			if (ret == WT_NOTFOUND)
+				ret = ENOENT;
+			goto err;
+		}
+		ret = __wt_conn_btree_open(session, treeconf, cfg, flags);
+	}
+
+err:	if (ret != 0 && locked) {
+		F_CLR(btree, WT_BTREE_EXCLUSIVE);
+		__wt_rwunlock(session, btree->rwlock);
+	}
+
+	WT_ASSERT(session, ret != 0 ||
+	    LF_ISSET(WT_BTREE_EXCLUSIVE) == F_ISSET(btree, WT_BTREE_EXCLUSIVE));
+
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_apply --
+ *	Apply a function to all open, non-snapshot btree handles apart from the
+ *	metadata file.
+ */
+int
+__wt_conn_btree_apply(WT_SESSION_IMPL *session,
+    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+	WT_BTREE *btree, *saved_btree;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+	saved_btree = session->btree;
+
+	__wt_spin_lock(session, &conn->spinlock);
+	TAILQ_FOREACH(btree, &conn->btqh, q)
+		if (btree->snapshot == NULL &&
+		    strcmp(btree->name, WT_METADATA_URI) != 0) {
+			session->btree = btree;
+			WT_ERR(func(session, cfg));
+		}
+
+err:	__wt_spin_unlock(session, &conn->spinlock);
+	session->btree = saved_btree;
 	return (ret);
 }
 
@@ -158,145 +320,187 @@ __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked)
 {
 	WT_BTREE *btree;
 	WT_CONNECTION_IMPL *conn;
-	int inuse, ret;
+	WT_DECL_RET;
+	int inuse;
 
 	btree = session->btree;
 	conn = S2C(session);
-	ret = 0;
 
-	if (F_ISSET(btree, WT_BTREE_OPEN)) {
+	if (F_ISSET(btree, WT_BTREE_OPEN))
 		WT_STAT_DECR(conn->stats, file_open);
 
-		/*
-		 * If it looks like we are the last reference, sync the file.
-		 * This should make the close call fast (while we are holding
-		 * an exclusive lock on the handle).
-		 */
-		if (btree->refcnt == 1)
-			WT_RET(__wt_btree_sync(session, NULL));
-	}
-
 	/*
 	 * Decrement the reference count.  If we really are the last reference,
 	 * get an exclusive lock on the handle so that we can close it.
 	 */
 	__wt_spin_lock(session, &conn->spinlock);
 	inuse = --btree->refcnt > 0;
-	if (!inuse && !locked)
+	if (!inuse && !locked) {
 		__wt_writelock(session, btree->rwlock);
+		F_SET(btree, WT_BTREE_EXCLUSIVE);
+	}
 	__wt_spin_unlock(session, &conn->spinlock);
 
 	if (!inuse) {
+		/*
+		 * We should only close the metadata file when closing the
+		 * last session (i.e., the default session for the connection).
+		 */
+		WT_ASSERT(session,
+		    btree != session->metafile ||
+		    session == conn->default_session);
+
+		if (F_ISSET(btree, WT_BTREE_OPEN))
+			WT_TRET(__wt_conn_btree_sync_and_close(session));
+		if (!locked) {
+			F_CLR(btree, WT_BTREE_EXCLUSIVE);
+			__wt_rwunlock(session, btree->rwlock);
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_conn_btree_close_all --
+ *	Close all btree handles with matching name (including all
+ *	snapshot handles).
+ */
+int
+__wt_conn_btree_close_all(WT_SESSION_IMPL *session, const char *name)
+{
+	WT_BTREE *btree, *saved_btree;
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+	saved_btree = session->btree;
+
+	__wt_spin_lock(session, &conn->spinlock);
+	TAILQ_FOREACH(btree, &conn->btqh, q) {
+		if (strcmp(btree->name, name) != 0)
+			continue;
+
+		/*
+		 * The caller may have this tree locked to prevent
+		 * concurrent schema operations.
+		 */
+		if (btree == saved_btree)
+			WT_ASSERT(session, F_ISSET(btree, WT_BTREE_EXCLUSIVE));
+		else {
+			WT_ERR(__wt_try_writelock(session, btree->rwlock));
+			F_SET(btree, WT_BTREE_EXCLUSIVE);
+		}
+
+		session->btree = btree;
+		if (WT_META_TRACKING(session))
+			WT_ERR(__wt_meta_track_handle_lock(session));
+
+		/*
+		 * We have an exclusive lock, which means there are no
+		 * cursors open at this point.  Close the handle, if
+		 * necessary.
+		 */
 		if (F_ISSET(btree, WT_BTREE_OPEN)) {
-			ret = __wt_btree_close(session);
-			F_CLR(btree, WT_BTREE_OPEN);
+			__wt_spin_unlock(session, &conn->spinlock);
+
+			ret = __wt_meta_track_sub_on(session);
+			if (ret == 0)
+				ret = __wt_conn_btree_sync_and_close(session);
+
+			/*
+			 * If the close succeeded, drop any locks it
+			 * acquired.  If there was a failure, this
+			 * function will fail and the whole transaction
+			 * will be rolled back.
+			 */
+			if (ret == 0)
+				ret = __wt_meta_track_sub_off(session);
+
+			__wt_spin_lock(session, &conn->spinlock);
 		}
-		if (!locked)
-			__wt_rwunlock(session, btree->rwlock);
+
+		if (!WT_META_TRACKING(session))
+			WT_TRET(__wt_session_release_btree(session));
+		session->btree = NULL;
+
+		WT_ERR(ret);
 	}
 
+err:	__wt_spin_unlock(session, &conn->spinlock);
 	return (ret);
 }
 
 /*
- * __conn_btree_remove --
+ * __conn_btree_discard --
  *	Discard a single btree file handle structure.
  */
 static int
-__conn_btree_remove(WT_SESSION_IMPL *session, WT_BTREE *btree)
+__conn_btree_discard(WT_SESSION_IMPL *session, WT_BTREE *btree)
 {
-	int ret;
+	WT_DECL_RET;
 
-	ret = 0;
-
-	WT_SET_BTREE_IN_SESSION(session, btree);
-	WT_TRET(__wt_btree_close(session));
+	if (F_ISSET(btree, WT_BTREE_OPEN)) {
+		WT_SET_BTREE_IN_SESSION(session, btree);
+		WT_TRET(__wt_conn_btree_sync_and_close(session));
+		WT_CLEAR_BTREE_IN_SESSION(session);
+	}
 	WT_TRET(__wt_rwlock_destroy(session, btree->rwlock));
-	__wt_free(session, btree->filename);
-	__wt_free(session, btree->name);
 	__wt_free(session, btree->config);
-	__wt_free(session, btree);
-	WT_CLEAR_BTREE_IN_SESSION(session);
+	__wt_free(session, btree->name);
+	__wt_free(session, btree->snapshot);
+	__wt_overwrite_and_free(session, btree);
 
 	return (ret);
 }
 
 /*
- * __wt_conn_btree_remove --
+ * __wt_conn_btree_discard --
  *	Discard the btree file handle structures.
  */
 int
-__wt_conn_btree_remove(WT_CONNECTION_IMPL *conn)
+__wt_conn_btree_discard(WT_CONNECTION_IMPL *conn)
 {
 	WT_BTREE *btree;
 	WT_BTREE_SESSION *btree_session;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	ret = 0;
-
-	/*
-	 * We need a session handle because we're potentially reading/writing
-	 * pages.
-	 */
-	WT_RET(__wt_open_session(conn, 1, NULL, NULL, &session));
+	session = conn->default_session;
 
 	/*
-	 * Close open btree handles: first, everything but the schema file (as
-	 * closing a normal file may open and write the schema file), then the
-	 * schema file.  This function isn't called often, and I don't want to
-	 * "know" anything about the schema file's position on the list, so we
-	 * do it the hard way.
+	 * Close open btree handles: first, everything but the metadata file
+	 * (as closing a normal file may open and write the metadata file),
+	 * then the metadata file.  This function isn't called often, and I
+	 * don't want to "know" anything about the metadata file's position on
+	 * the list, so we do it the hard way.
 	 */
 restart:
 	TAILQ_FOREACH(btree, &conn->btqh, q) {
-		if (strcmp(btree->filename, WT_SCHEMA_FILENAME) == 0)
+		if (strcmp(btree->name, WT_METADATA_URI) == 0)
 			continue;
 
 		TAILQ_REMOVE(&conn->btqh, btree, q);
 		--conn->btqcnt;
-		WT_TRET(__conn_btree_remove(session, btree));
+		WT_TRET(__conn_btree_discard(session, btree));
 		goto restart;
 	}
 
 	/*
 	 * Closing the files may have resulted in entries on our session's list
-	 * of open btree handles, specifically, we added the schema file if any
-	 * of the files were dirty.  Clean up that list before we shut down the
-	 * schema file entry, for good.
+	 * of open btree handles, specifically, we added the metadata file if
+	 * any of the files were dirty.  Clean up that list before we shut down
+	 * the metadata entry, for good.
 	 */
 	while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL)
-		WT_TRET(__wt_session_remove_btree(session, btree_session, 0));
+		WT_TRET(__wt_session_discard_btree(session, btree_session));
 
-	/* Close the schema file handle. */
+	/* Close the metadata file handle. */
 	while ((btree = TAILQ_FIRST(&conn->btqh)) != NULL) {
 		TAILQ_REMOVE(&conn->btqh, btree, q);
 		--conn->btqcnt;
-		WT_TRET(__conn_btree_remove(session, btree));
+		WT_TRET(__conn_btree_discard(session, btree));
 	}
 
-	/* Discard our session. */
-	WT_TRET(session->iface.close(&session->iface, NULL));
-
 	return (ret);
 }
-
-/*
- * __wt_conn_btree_reopen --
- *	Reset an open btree handle back to its initial state.
- */
-int
-__wt_conn_btree_reopen(
-    WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
-{
-	WT_BTREE *btree;
-
-	btree = session->btree;
-
-	WT_RET(__wt_btree_close(session));
-	F_CLR(btree, WT_BTREE_OPEN);
-	WT_RET(__wt_btree_open(session, cfg, flags));
-	F_SET(btree, WT_BTREE_OPEN);
-
-	return (0);
-}
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 10fe5542543..b8523261dfe 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -16,10 +16,11 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
 {
 	WT_SESSION_IMPL *session;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 	TAILQ_INIT(&conn->btqh);		/* WT_BTREE list */
 	TAILQ_INIT(&conn->dlhqh);		/* Library list */
+	TAILQ_INIT(&conn->dsrcqh);		/* Data source list */
 	TAILQ_INIT(&conn->fhqh);		/* File list */
 	TAILQ_INIT(&conn->collqh);		/* Collator list */
 	TAILQ_INIT(&conn->compqh);		/* Compressor list */
@@ -30,12 +31,18 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
 	/* File handle spinlock. */
 	__wt_spin_init(session, &conn->fh_lock);
 
+	/* Schema operation spinlock. */
+	__wt_spin_init(session, &conn->schema_lock);
+
 	/* Serialized function call spinlock. */
 	__wt_spin_init(session, &conn->serial_lock);
 
 	/* General purpose spinlock. */
 	__wt_spin_init(session, &conn->spinlock);
 
+	/* Checkpoint lock. */
+	WT_RET(__wt_rwlock_alloc(session, "checkpoint", &conn->ckpt_rwlock));
+
 	return (0);
 }
 
@@ -48,7 +55,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
 {
 	WT_SESSION_IMPL *session;
 
-	session = &conn->default_session;
+	session = conn->default_session;
 
 	/* Check there's something to destroy. */
 	if (conn == NULL)
@@ -72,13 +79,15 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
 
 	__wt_spin_destroy(session, &conn->fh_lock);
 	__wt_spin_destroy(session, &conn->serial_lock);
+	__wt_spin_destroy(session, &conn->schema_lock);
 	__wt_spin_destroy(session, &conn->spinlock);
 
+	if (conn->ckpt_rwlock != NULL)
+		(void)__wt_rwlock_destroy(session, conn->ckpt_rwlock);
+
 	/* Free allocated memory. */
 	__wt_free(session, conn->home);
 	__wt_free(session, conn->sessions);
-	__wt_free(session, conn->session_array);
-	__wt_free(session, conn->hazard);
 	__wt_free(session, conn->stats);
 
 	__wt_free(NULL, conn);
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 58263909cfc..7ad0933faea 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -14,28 +14,23 @@
 int
 __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	/* Default session. */
-	conn->default_session.iface.connection = &conn->iface;
+	session = conn->default_session;
+	session->iface.connection = &conn->iface;
 
-	session = &conn->default_session;
-	ret = 0;
-
-	/* WT_SESSION_IMPL and hazard arrays. */
-	WT_ERR(__wt_calloc(session,
-	    conn->session_size, sizeof(WT_SESSION_IMPL *), &conn->sessions));
-	WT_ERR(__wt_calloc(session,
-	    conn->session_size, sizeof(WT_SESSION_IMPL),
-	    &conn->session_array));
+	/* WT_SESSION_IMPL array. */
 	WT_ERR(__wt_calloc(session,
-	   conn->session_size * conn->hazard_size, sizeof(WT_HAZARD),
-	   &conn->hazard));
+	    conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
 
 	/* Create the cache. */
 	WT_ERR(__wt_cache_create(conn, cfg));
 
+	/* Initialize transaction support. */
+	WT_ERR(__wt_txn_global_init(conn, cfg));
+
 	/*
 	 * Publish: there must be a barrier to ensure the connection structure
 	 * fields are set before other threads read from the pointer.
@@ -61,12 +56,11 @@ int
 __wt_connection_close(WT_CONNECTION_IMPL *conn)
 {
 	WT_SESSION_IMPL *session;
+	WT_DECL_RET;
 	WT_DLH *dlh;
 	WT_FH *fh;
-	int ret;
 
-	session = &conn->default_session;
-	ret = 0;
+	session = conn->default_session;
 
 	/*
 	 * Complain if files weren't closed (ignoring the lock and logging
@@ -92,12 +86,31 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
 	/* Discard the cache. */
 	__wt_cache_destroy(conn);
 
+	/* Discard transaction state. */
+	__wt_txn_global_destroy(conn);
+
 	/* Close extensions. */
 	while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
 		TAILQ_REMOVE(&conn->dlhqh, dlh, q);
 		WT_TRET(__wt_dlclose(session, dlh));
 	}
 
+	/*
+	 * Close the internal (default) session, and switch back to the dummy
+	 * session in case of any error messages from the remaining operations
+	 * while destroying the connection handle.
+	 *
+	 * Additionally, the session's hazard reference memory isn't discarded
+	 * during normal session close because access to it isn't serialized.
+	 * Discard it now.
+	 */
+	if (session != &conn->dummy_session) {
+		WT_TRET(session->iface.close(&session->iface, NULL));
+		__wt_free(&conn->dummy_session, session->hazard);
+
+		conn->default_session = &conn->dummy_session;
+	}
+
 	/* Destroy the handle. */
 	__wt_connection_destroy(conn);
 
diff --git a/src/cursor/cur_bulk.c b/src/cursor/cur_bulk.c
index 474a7e0b413..c68417a18c1 100644
--- a/src/cursor/cur_bulk.c
+++ b/src/cursor/cur_bulk.c
@@ -16,8 +16,8 @@ __curbulk_insert(WT_CURSOR *cursor)
 {
 	WT_BTREE *btree;
 	WT_CURSOR_BULK *cbulk;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbulk = (WT_CURSOR_BULK *)cursor;
 	btree = cbulk->cbt.btree;
@@ -40,8 +40,8 @@ __curbulk_close(WT_CURSOR *cursor)
 {
 	WT_BTREE *btree;
 	WT_CURSOR_BULK *cbulk;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbulk = (WT_CURSOR_BULK *)cursor;
 	btree = cbulk->cbt.btree;
diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c
index 1563af56417..985543528f4 100644
--- a/src/cursor/cur_config.c
+++ b/src/cursor/cur_config.c
@@ -55,7 +55,7 @@ __wt_curconfig_open(WT_SESSION_IMPL *session,
 	};
 	WT_CURSOR_CONFIG *cconfig;
 	WT_CURSOR *cursor;
-	int ret;
+	WT_DECL_RET;
 
 	WT_UNUSED(uri);
 
diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c
index 4a04732910f..31c2aaec382 100644
--- a/src/cursor/cur_dump.c
+++ b/src/cursor/cur_dump.c
@@ -51,9 +51,9 @@ __curdump_get_key(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR *child;
 	WT_CURSOR_DUMP *cdump;
+	WT_DECL_RET;
 	WT_ITEM item, *itemp;
 	WT_SESSION_IMPL *session;
-	int ret;
 	uint64_t recno;
 	va_list ap;
 
@@ -123,11 +123,11 @@ __curdump_set_key(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_DUMP *cdump;
 	WT_CURSOR *child;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	uint64_t recno;
 	va_list ap;
 	const char *p;
-	int ret;
 
 	cdump = (WT_CURSOR_DUMP *)cursor;
 	child = cdump->child;
@@ -168,10 +168,10 @@ __curdump_get_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_DUMP *cdump;
 	WT_CURSOR *child;
+	WT_DECL_RET;
 	WT_ITEM item, *itemp;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int ret;
 
 	cdump = (WT_CURSOR_DUMP *)cursor;
 	child = cdump->child;
@@ -204,9 +204,9 @@ __curdump_set_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_DUMP *cdump;
 	WT_CURSOR *child;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int ret;
 	const char *p;
 
 	cdump = (WT_CURSOR_DUMP *)cursor;
@@ -267,12 +267,11 @@ __curdump_close(WT_CURSOR *cursor)
 {
 	WT_CURSOR_DUMP *cdump;
 	WT_CURSOR *child;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cdump = (WT_CURSOR_DUMP *)cursor;
 	child = cdump->child;
-	ret = 0;
 
 	CURSOR_API_CALL(cursor, session, get_key, NULL);
 	if (child != NULL)
@@ -338,6 +337,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
 	F_SET(cursor,
 	    F_ISSET(child, WT_CURSTD_DUMP_PRINT | WT_CURSTD_DUMP_HEX));
 
+	STATIC_ASSERT(offsetof(WT_CURSOR_DUMP, iface) == 0);
 	WT_RET(__wt_cursor_init(cursor, NULL, owner, cfg, cursorp));
 	return (0);
 }
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index f9e9b35096f..cc70d0c3ef6 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -15,8 +15,8 @@ static int
 __curfile_next(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, next, cbt->btree);
@@ -34,8 +34,8 @@ static int
 __curfile_prev(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, prev, cbt->btree);
@@ -53,8 +53,8 @@ static int
 __curfile_reset(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, reset, cbt->btree);
@@ -72,8 +72,8 @@ static int
 __curfile_search(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, search, cbt->btree);
@@ -92,8 +92,8 @@ static int
 __curfile_search_near(WT_CURSOR *cursor, int *exact)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, search_near, cbt->btree);
@@ -112,8 +112,8 @@ static int
 __curfile_insert(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, insert, cbt->btree);
@@ -121,7 +121,7 @@ __curfile_insert(WT_CURSOR *cursor)
 		WT_CURSOR_NEEDKEY(cursor);
 	WT_CURSOR_NEEDVALUE(cursor);
 	ret = __wt_btcur_insert((WT_CURSOR_BTREE *)cursor);
-err:	API_END(session);
+err:	API_END_TXN_ERROR(session, ret);
 
 	return (ret);
 }
@@ -134,15 +134,15 @@ static int
 __curfile_update(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, update, cbt->btree);
 	WT_CURSOR_NEEDKEY(cursor);
 	WT_CURSOR_NEEDVALUE(cursor);
 	ret = __wt_btcur_update((WT_CURSOR_BTREE *)cursor);
-err:	API_END(session);
+err:	API_END_TXN_ERROR(session, ret);
 
 	return (ret);
 }
@@ -155,14 +155,14 @@ static int
 __curfile_remove(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, remove, cbt->btree);
 	WT_CURSOR_NEEDKEY(cursor);
 	ret = __wt_btcur_remove((WT_CURSOR_BTREE *)cursor);
-err:	API_END(session);
+err:	API_END_TXN_ERROR(session, ret);
 
 	return (ret);
 }
@@ -175,8 +175,8 @@ static int
 __curfile_close(WT_CURSOR *cursor)
 {
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cbt = (WT_CURSOR_BTREE *)cursor;
 	CURSOR_API_CALL(cursor, session, close, cbt->btree);
@@ -186,7 +186,7 @@ __curfile_close(WT_CURSOR *cursor)
 	/* The URI is owned by the btree handle. */
 	cursor->uri = NULL;
 	WT_TRET(__wt_cursor_close(cursor));
-	API_END(session);
+	API_END_TXN_ERROR(session, ret);
 
 	return (ret);
 }
@@ -230,11 +230,11 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
 	WT_CONFIG_ITEM cval;
 	WT_CURSOR *cursor;
 	WT_CURSOR_BTREE *cbt;
+	WT_DECL_RET;
 	size_t csize;
-	int bulk, ret;
+	int bulk;
 
 	cbt = NULL;
-	ret = 0;
 
 	btree = session->btree;
 	WT_ASSERT(session, btree != NULL);
@@ -242,10 +242,6 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
 	WT_RET(__wt_config_gets(session, cfg, "bulk", &cval));
 	bulk = (cval.val != 0);
 
-	/* Lock the handle while the cursor is using it. */
-	WT_RET(__wt_session_lock_btree(session,
-	    NULL, bulk ? WT_BTREE_EXCLUSIVE | WT_BTREE_BULK : 0));
-
 	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
 	WT_RET(__wt_calloc(session, 1, csize, &cbt));
 
@@ -275,19 +271,26 @@ err:		__wt_free(session, cbt);
  *	WT_SESSION->open_cursor method for the btree cursor type.
  */
 int
-__wt_curfile_open(WT_SESSION_IMPL *session,
-    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
+    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
 {
+	WT_CONFIG_ITEM cval;
+	int bulk;
+
+	WT_RET(__wt_config_gets(session, cfg, "bulk", &cval));
+	bulk = (cval.val != 0);
+
 	/* TODO: handle projections. */
 
-	if (WT_PREFIX_MATCH(uri, "colgroup:"))
+	/* Get the handle and lock it while the cursor is using it. */
+	if (WT_PREFIX_MATCH(uri, "colgroup:") || WT_PREFIX_MATCH(uri, "index:"))
 		WT_RET(__wt_schema_get_btree(session,
-		    uri, strlen(uri), NULL, WT_BTREE_NO_LOCK));
+		    uri, strlen(uri), cfg, bulk ? WT_BTREE_EXCLUSIVE : 0));
 	else if (WT_PREFIX_MATCH(uri, "file:"))
 		WT_RET(__wt_session_get_btree(session,
-		     uri, uri, NULL, NULL, WT_BTREE_NO_LOCK));
+		     uri, cfg, bulk ? WT_BTREE_EXCLUSIVE : 0));
 	else
-		return (EINVAL);
+		WT_RET_MSG(session, EINVAL, "Unexpected object type");
 
-	return (__wt_curfile_create(session, NULL, cfg, cursorp));
+	return (__wt_curfile_create(session, owner, cfg, cursorp));
 }
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index efaa8ac48b4..35b45ab733c 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -15,10 +15,10 @@ static int
 __curindex_get_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int ret;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, get_value, NULL);
@@ -50,8 +50,8 @@ err:	API_END(session);
 static void
 __curindex_set_value(WT_CURSOR *cursor, ...)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, set_value, NULL);
 	WT_UNUSED(ret);
@@ -111,8 +111,8 @@ static int
 __curindex_next(WT_CURSOR *cursor)
 {
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, next, cindex->cbt.btree);
@@ -131,8 +131,8 @@ static int
 __curindex_prev(WT_CURSOR *cursor)
 {
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, prev, cindex->cbt.btree);
@@ -152,8 +152,9 @@ __curindex_reset(WT_CURSOR *cursor)
 {
 	WT_CURSOR **cp;
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int i, ret;
+	int i;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, reset, cindex->cbt.btree);
@@ -179,9 +180,10 @@ static int
 __curindex_search(WT_CURSOR *cursor)
 {
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_ITEM *oldkeyp;
 	WT_SESSION_IMPL *session;
-	int exact, ret;
+	int exact;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, search, cindex->cbt.btree);
@@ -244,8 +246,8 @@ static int
 __curindex_search_near(WT_CURSOR *cursor, int *exact)
 {
 	WT_CURSOR_INDEX *cindex;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	CURSOR_API_CALL(cursor, session, search_near, cindex->cbt.btree);
@@ -266,8 +268,9 @@ __curindex_close(WT_CURSOR *cursor)
 	WT_BTREE *btree;
 	WT_CURSOR_INDEX *cindex;
 	WT_CURSOR **cp;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int i, ret;
+	int i;
 
 	cindex = (WT_CURSOR_INDEX *)cursor;
 	btree = cindex->cbt.btree;
@@ -318,9 +321,8 @@ __curindex_open_colgroups(
 		if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
 		    cp[arg] != NULL)
 			continue;
-		session->btree = table->colgroup[arg];
-		WT_RET(__wt_curfile_create(
-		    session, &cindex->cbt.iface, cfg, &cp[arg]));
+		WT_RET(__wt_curfile_open(session,
+		    table->cg_name[arg], &cindex->cbt.iface, cfg, &cp[arg]));
 	}
 
 	return (0);
@@ -365,12 +367,10 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
 	WT_CURSOR_INDEX *cindex;
 	WT_CURSOR_BTREE *cbt;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_TABLE *table;
 	const char *columns, *idxname, *tablename;
 	size_t namesize;
-	int ret;
-
-	ret = 0;
 
 	tablename = uri;
 	if (!WT_PREFIX_SKIP(tablename, "index:") ||
@@ -394,7 +394,9 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
 		namesize = (size_t)(columns - idxname);
 
 	WT_RET(__wt_schema_open_index(session, table, idxname, namesize));
-	WT_RET(__wt_session_lock_btree(session, NULL, 0));
+	WT_RET(__wt_schema_get_btree(session,
+	    uri, (columns == NULL) ? strlen(uri) : WT_PTRDIFF(columns, uri),
+	    NULL, 0));
 	WT_RET(__wt_calloc_def(session, 1, &cindex));
 
 	cbt = &cindex->cbt;
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index dbc9d114265..9aeda0c85ec 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -37,13 +37,12 @@ static int
 __curstat_get_key(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	size_t size;
 	va_list ap;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, get_key, cst->btree);
 	va_start(ap, cursor);
@@ -51,7 +50,8 @@ __curstat_get_key(WT_CURSOR *cursor, ...)
 	WT_CURSOR_NEEDKEY(cursor);
 
 	if (F_ISSET(cursor, WT_CURSTD_RAW)) {
-		size = __wt_struct_size(session, cursor->key_format, cst->key);
+		WT_ERR(__wt_struct_size(
+		    session, &size, cursor->key_format, cst->key));
 		WT_ERR(__wt_buf_initsize(session, &cursor->key, size));
 		WT_ERR(__wt_struct_pack(session, cursor->key.mem, size,
 		    cursor->key_format, cst->key));
@@ -75,13 +75,12 @@ static int
 __curstat_get_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	va_list ap;
 	size_t size;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, get_value, cst->btree);
 	va_start(ap, cursor);
@@ -89,8 +88,8 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
 	WT_CURSOR_NEEDVALUE(cursor);
 
 	if (F_ISSET(cursor, WT_CURSTD_RAW)) {
-		size = __wt_struct_size(session, cursor->value_format,
-		    cst->stats_first[cst->key].desc, cst->pv.data, cst->v);
+		WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
+		    cst->stats_first[cst->key].desc, cst->pv.data, cst->v));
 		WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
 		WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
 		    cursor->value_format,
@@ -118,12 +117,11 @@ static void
 __curstat_set_key(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, set_key, cst->btree);
 
@@ -163,10 +161,9 @@ static int
 __curstat_next(WT_CURSOR *cursor)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, next, cst->btree);
 
@@ -196,10 +193,9 @@ static int
 __curstat_prev(WT_CURSOR *cursor)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, prev, cst->btree);
 
@@ -244,10 +240,9 @@ static int
 __curstat_search(WT_CURSOR *cursor)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, search, cst->btree);
 
@@ -273,10 +268,9 @@ static int
 __curstat_close(WT_CURSOR *cursor)
 {
 	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
-	ret = 0;
 	cst = (WT_CURSOR_STAT *)cursor;
 	CURSOR_API_CALL(cursor, session, close, cst->btree);
 
@@ -333,37 +327,36 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
 		0			/* uint32_t flags */
 	};
 	WT_BTREE *btree;
-	WT_CURSOR_STAT *cst;
 	WT_CONFIG_ITEM cval;
 	WT_CURSOR *cursor;
+	WT_CURSOR_STAT *cst;
+	WT_DECL_RET;
 	WT_STATS *stats_first;
 	void (*clear_func)(WT_STATS *);
-	int clear_on_close, ret, stats_count;
+	int statistics_clear, stats_count;
 
 	btree = NULL;
 	clear_func = NULL;
 	cst = NULL;
-	ret = 0;
 
-	WT_RET(__wt_config_gets(session, cfg, "clear_on_close", &cval));
-	clear_on_close = (cval.val != 0);
+	WT_RET(__wt_config_gets(session, cfg, "statistics_clear", &cval));
+	statistics_clear = (cval.val != 0);
 
 	if (!WT_PREFIX_SKIP(uri, "statistics:"))
 		return (EINVAL);
 	if (WT_PREFIX_MATCH(uri, "file:")) {
-		WT_ERR(
-		    __wt_session_get_btree(session, uri, uri, NULL, NULL, 0));
+		WT_ERR(__wt_session_get_btree(session, uri, NULL, 0));
 		btree = session->btree;
 		WT_ERR(__wt_btree_stat_init(session));
 		stats_first = (WT_STATS *)session->btree->stats;
 		stats_count = sizeof(WT_BTREE_STATS) / sizeof(WT_STATS);
-		if (clear_on_close)
+		if (statistics_clear)
 			clear_func = __wt_stat_clear_btree_stats;
 	} else {
 		__wt_conn_stat_init(session);
 		stats_first = (WT_STATS *)S2C(session)->stats;
 		stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
-		if (clear_on_close)
+		if (statistics_clear)
 			clear_func = __wt_stat_clear_connection_stats;
 	}
 
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 27659070d15..c2bc71dae73 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -26,8 +26,8 @@ __wt_cursor_notsup(WT_CURSOR *cursor)
 int
 __wt_cursor_get_key(WT_CURSOR *cursor, ...)
 {
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
 
 	va_start(ap, cursor);
 	ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap);
@@ -42,10 +42,11 @@ __wt_cursor_get_key(WT_CURSOR *cursor, ...)
 int
 __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
 {
+	WT_DECL_RET;
 	WT_ITEM *key;
 	WT_SESSION_IMPL *session;
+	size_t size;
 	const char *fmt;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, get_key, NULL);
 	WT_CURSOR_NEEDKEY(cursor);
@@ -54,8 +55,9 @@ __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
 		if (LF_ISSET(WT_CURSTD_RAW)) {
 			key = va_arg(ap, WT_ITEM *);
 			key->data = cursor->raw_recno_buf;
-			key->size = (uint32_t)
-			    __wt_struct_size(session, "q", cursor->recno);
+			WT_ERR(__wt_struct_size(
+			    session, &size, "q", cursor->recno));
+			key->size = (uint32_t)size;
 			ret = __wt_struct_pack(session, cursor->raw_recno_buf,
 			    sizeof(cursor->raw_recno_buf), "q", cursor->recno);
 		} else
@@ -80,10 +82,10 @@ err:	API_END(session);
 int
 __wt_cursor_get_value(WT_CURSOR *cursor, ...)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	const char *fmt;
 	va_list ap;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, get_value, NULL);
 	WT_CURSOR_NEEDVALUE(cursor);
@@ -121,12 +123,12 @@ __wt_cursor_set_key(WT_CURSOR *cursor, ...)
 void
 __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
 	WT_ITEM *buf, *item;
+	size_t sz;
 	va_list ap_copy;
 	const char *fmt, *str;
-	size_t sz;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, set_key, NULL);
 
@@ -160,9 +162,10 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
 			buf = &cursor->key;
 
 			va_copy(ap_copy, ap);
-			sz = __wt_struct_sizev(
-			    session, cursor->key_format, ap_copy);
+			ret = __wt_struct_sizev(
+			    session, &sz, cursor->key_format, ap_copy);
 			va_end(ap_copy);
+			WT_ERR(ret);
 
 			WT_ERR(__wt_buf_initsize(session, buf, sz));
 			WT_ERR(__wt_struct_packv(
@@ -192,12 +195,12 @@ err:		cursor->saved_err = ret;
 void
 __wt_cursor_set_value(WT_CURSOR *cursor, ...)
 {
-	WT_SESSION_IMPL *session;
+	WT_DECL_RET;
 	WT_ITEM *buf, *item;
+	WT_SESSION_IMPL *session;
 	const char *fmt, *str;
 	size_t sz;
 	va_list ap;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, set_value, NULL);
 
@@ -216,8 +219,9 @@ __wt_cursor_set_value(WT_CURSOR *cursor, ...)
 		cursor->value.data = item->data;
 	} else {
 		buf = &cursor->value;
-		sz = __wt_struct_sizev(session, cursor->value_format, ap);
+		ret = __wt_struct_sizev(session, &sz, cursor->value_format, ap);
 		va_end(ap);
+		WT_ERR(ret);
 		va_start(ap, cursor);
 		if ((ret = __wt_buf_initsize(session, buf, sz)) != 0 ||
 		    (ret = __wt_struct_packv(session, buf->mem, sz,
@@ -255,8 +259,8 @@ __cursor_search(WT_CURSOR *cursor)
 static int
 __cursor_equals(WT_CURSOR *cursor, WT_CURSOR *other)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, equals, NULL);
 
@@ -285,8 +289,8 @@ done:	API_END(session);
 int
 __wt_cursor_close(WT_CURSOR *cursor)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	CURSOR_API_CALL(cursor, session, close, NULL);
 
@@ -312,10 +316,10 @@ __wt_cursor_dup(WT_SESSION_IMPL *session,
     WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
 {
 	WT_CURSOR *cursor;
-	WT_SESSION *wt_session;
+	WT_DECL_RET;
 	WT_ITEM key;
+	WT_SESSION *wt_session;
 	uint32_t saved_flags;
-	int ret;
 
 	wt_session = &session->iface;
 
@@ -414,13 +418,21 @@ __wt_cursor_init(WT_CURSOR *cursor,
 	} else
 		cdump = NULL;
 
+	WT_RET(__wt_config_gets(session, cfg, "overwrite", &cval));
+	if (cval.val != 0)
+		F_SET(cursor, WT_CURSTD_OVERWRITE);
+
 	WT_RET(__wt_config_gets(session, cfg, "raw", &cval));
 	if (cval.val != 0)
 		F_SET(cursor, WT_CURSTD_RAW);
 
-	WT_RET(__wt_config_gets(session, cfg, "overwrite", &cval));
-	if (cval.val != 0)
-		F_SET(cursor, WT_CURSTD_OVERWRITE);
+	/* Snapshot cursors are read-only. */
+	WT_RET(__wt_config_gets(session, cfg, "snapshot", &cval));
+	if (cval.len != 0) {
+		cursor->insert = (int (*)(WT_CURSOR *))__wt_cursor_notsup;
+		cursor->update = (int (*)(WT_CURSOR *))__wt_cursor_notsup;
+		cursor->remove = (int (*)(WT_CURSOR *))__wt_cursor_notsup;
+	}
 
 	/*
 	 * Cursors that are internal to some other cursor (such as file cursors
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index f87edd92f34..153eda5c93d 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -44,8 +44,8 @@ __wt_curtable_get_key(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR *primary;
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	primary = *ctable->cg_cursors;
@@ -66,10 +66,10 @@ __wt_curtable_get_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR *primary;
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	primary = *ctable->cg_cursors;
@@ -137,10 +137,11 @@ __wt_curtable_set_value(WT_CURSOR *cursor, ...)
 {
 	WT_CURSOR **cp;
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_ITEM *item;
 	WT_SESSION_IMPL *session;
 	va_list ap;
-	int i, ret;
+	int i;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, set_value, NULL);
@@ -180,8 +181,8 @@ static int
 __curtable_next(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, next, NULL);
@@ -199,8 +200,8 @@ static int
 __curtable_prev(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, prev, NULL);
@@ -218,8 +219,8 @@ static int
 __curtable_reset(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, reset, NULL);
@@ -237,8 +238,8 @@ static int
 __curtable_search(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, search, NULL);
@@ -257,8 +258,9 @@ __curtable_search_near(WT_CURSOR *cursor, int *exact)
 {
 	WT_CURSOR_TABLE *ctable;
 	WT_CURSOR *primary, **cp;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int i, ret;
+	int i;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, search_near, NULL);
@@ -284,10 +286,11 @@ err:	API_END(session);
 static int
 __curtable_insert(WT_CURSOR *cursor)
 {
-	WT_CURSOR_TABLE *ctable;
 	WT_CURSOR *primary, **cp;
+	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int i, ret;
+	int i;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, insert, NULL);
@@ -332,8 +335,8 @@ static int
 __curtable_update(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, update, NULL);
@@ -372,8 +375,8 @@ static int
 __curtable_remove(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, remove, NULL);
@@ -401,8 +404,9 @@ __curtable_close(WT_CURSOR *cursor)
 {
 	WT_CURSOR_TABLE *ctable;
 	WT_CURSOR **cp;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int i, ret;
+	int i;
 
 	ctable = (WT_CURSOR_TABLE *)cursor;
 	CURSOR_API_CALL(cursor, session, close, NULL);
@@ -462,11 +466,9 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg[])
 
 	for (i = 0, cp = ctable->cg_cursors;
 	    i < WT_COLGROUPS(table);
-	    i++, cp++) {
-		session->btree = table->colgroup[i];
-		WT_RET(__wt_curfile_create(
-		    session, &ctable->iface, cfg_no_overwrite, cp));
-	}
+	    i++, cp++)
+		WT_RET(__wt_curfile_open(session, table->cg_name[i],
+		    &ctable->iface, cfg_no_overwrite, cp));
 	return (0);
 }
 
@@ -492,10 +494,10 @@ __curtable_open_indices(WT_CURSOR_TABLE *ctable)
 		WT_RET_MSG(session, ENOTSUP,
 		    "Bulk load is not supported for tables with indices");
 	WT_RET(__wt_calloc_def(session, table->nindices, &ctable->idx_cursors));
-	for (i = 0, cp = ctable->idx_cursors; i < table->nindices; i++, cp++) {
-		session->btree = table->index[i];
-		WT_RET(__wt_curfile_create(session, &ctable->iface, cfg, cp));
-	}
+
+	for (i = 0, cp = ctable->idx_cursors; i < table->nindices; i++, cp++)
+		WT_RET(__wt_curfile_open(session, table->idx_name[i],
+		    &ctable->iface, cfg, cp));
 	return (0);
 }
 
@@ -536,10 +538,10 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
 	};
 	WT_CURSOR *cursor;
 	WT_CURSOR_TABLE *ctable;
+	WT_DECL_RET;
 	WT_ITEM fmt, plan;
 	WT_TABLE *table;
 	size_t size;
-	int ret;
 	const char *tablename, *columns;
 
 	WT_CLEAR(fmt);
@@ -553,9 +555,9 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
 	if (columns == NULL)
 		size = strlen(tablename);
 	else
-		size = columns - tablename;
-	if ((ret = __wt_schema_get_table(session,
-	    tablename, size, &table)) != 0) {
+		size = WT_PTRDIFF(columns, tablename);
+	if ((ret =
+	    __wt_schema_get_table(session, tablename, size, &table)) != 0) {
 		if (ret == WT_NOTFOUND)
 			WT_RET_MSG(session, EINVAL,
 			    "Cannot open cursor '%s' on unknown table", uri);
@@ -565,14 +567,13 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
 	if (!table->cg_complete)
 		WT_RET_MSG(session, EINVAL,
 		    "Cannot open cursor '%s' on incomplete table", uri);
-	if (table->is_simple) {
+	if (table->is_simple)
 		/*
 		 * The returned cursor should be public: it is not part of a
 		 * table cursor.
 		 */
-		session->btree = table->colgroup[0];
-		return (__wt_curfile_create(session, NULL, cfg, cursorp));
-	}
+		return (__wt_curfile_open(
+		    session, table->cg_name[0], NULL, cfg, cursorp));
 
 	WT_RET(__wt_calloc_def(session, 1, &ctable));
 
diff --git a/docs/Doxyfile b/src/docs/Doxyfile
index eae877063a7..47ebbab6e60 100644
--- a/docs/Doxyfile
+++ b/src/docs/Doxyfile
@@ -51,7 +51,7 @@ PROJECT_LOGO           = images/LogoFinal-header.png
 # If a relative path is entered, it will be relative to the location 
 # where doxygen was started. If left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = 
+OUTPUT_DIRECTORY       = ../../docs
 
 # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
 # 4096 sub-directories (in 2 levels) under the output directory of each output 
@@ -146,7 +146,7 @@ STRIP_FROM_PATH        =
 # definition is used. Otherwise one should specify the include paths that 
 # are normally passed to the compiler using the -I flag.
 
-STRIP_FROM_INC_PATH    = ../src/include/
+STRIP_FROM_INC_PATH    = ../include/
 
 # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
 # (but less readable) file names. This can be useful if your file system 
@@ -204,7 +204,7 @@ TAB_SIZE               = 8
 
 ALIASES                = "notyet{1}=<b>Not yet supported in WiredTiger.</b>\n@todo fix when \1 supported\n\n" \
 			 "errors=@returns zero on success and a non-zero error code on failure.  See @ref error_returns \"Error Returns\" for details." \
-                         "ex_ref{1}=@ref \1 \"\1\", available in the source tree as \c examples/c/\1" \
+                         "ex_ref{1}=@ref \1 \"\1\"" \
                          "hrow{1}=<tr><th>\1</th></tr>" \
                          "hrow{2}=<tr><th>\1</th><th>\2</th></tr>" \
                          "hrow{3}=<tr><th>\1</th><th>\2</th><th>\3</th></tr>" \
@@ -635,9 +635,9 @@ WARN_LOGFILE           = doxygen.log
 # directories like "/usr/src/myproject". Separate the files or directories 
 # with spaces.
 
-INPUT                  = ../src/include/wiredtiger.in \
-			 ../src/include/wiredtiger_ext.h \
-                         src
+INPUT                  = ../include/wiredtiger.in \
+			 ../include/wiredtiger_ext.h \
+                         .
 
 # This tag can be used to specify the character encoding of the source files 
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
@@ -698,10 +698,12 @@ RECURSIVE              = YES
 # subdirectory from a directory tree whose root is specified with the INPUT tag.
 
 EXCLUDE                = \
-                         src/bdb-map.dox \
-                         src/design.dox \
-                         src/processes.dox \
-                         src/sql-map.dox
+                         bdb-map.dox \
+                         design.dox \
+                         processes.dox \
+                         sql-map.dox \
+			 tools \
+			 top
 
 # The EXCLUDE_SYMLINKS tag can be used select whether or not files or 
 # directories that are symbolic links (a Unix file system feature) are excluded 
@@ -729,7 +731,7 @@ EXCLUDE_SYMBOLS        = __F
 # directories that contain example code fragments that are included (see 
 # the \include command).
 
-EXAMPLE_PATH           = ../examples/c
+EXAMPLE_PATH           = ../../examples/c
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the 
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
@@ -1508,7 +1510,7 @@ PREDEFINED             = DOXYGEN \
                          __wt_compressor:=WT_COMPRESSOR \
                          __wt_connection:=WT_CONNECTION \
                          __wt_cursor:=WT_CURSOR \
-                         __wt_cursor_type:=WT_CURSOR_TYPE \
+                         __wt_data_source:=WT_DATA_SOURCE \
                          __wt_event_handler:=WT_EVENT_HANDLER \
                          __wt_extension_api:=WT_EXTENSION_API \
                          __wt_extractor:=WT_EXTRACTOR \
diff --git a/src/docs/Makefile b/src/docs/Makefile
new file mode 100644
index 00000000000..71173523a0a
--- /dev/null
+++ b/src/docs/Makefile
@@ -0,0 +1,7 @@
+all:
+	@(cd ../../dist && sh s_docs -t)
+
+clean:
+	@(cd ../../dist && sh s_docs -a)
+
+.PHONY: all clean
diff --git a/src/docs/admin.dox b/src/docs/admin.dox
new file mode 100644
index 00000000000..3fe634c6749
--- /dev/null
+++ b/src/docs/admin.dox
@@ -0,0 +1,8 @@
+/*! @page admin Managing a WiredTiger Database
+
+- @subpage home
+- @subpage security
+- @subpage file_formats
+- @subpage tuning
+
+ */
diff --git a/docs/src/architecture.dox b/src/docs/architecture.dox
index 221432bd248..ef09656094d 100644
--- a/docs/src/architecture.dox
+++ b/src/docs/architecture.dox
@@ -1,8 +1,8 @@
 /*! @page architecture WiredTiger Architecture
 
-The WiredTiger data engine is a high performance, scalable, production
-quality, open source, NoSQL data engine, created to maximize the value
-of each computer you buy:
+The WiredTiger data engine is a high performance, scalable, transactional,
+production quality, open source, NoSQL data engine, created to maximize the
+value of each computer you buy:
 
 - WiredTiger offers both low latency and high throughput (in-cache reads
 require no latching, writes typically require a single latch),
@@ -13,6 +13,8 @@ or resource degradation,
 - WiredTiger has predictable behavior under heavy access and large
 volumes of data,
 
+- WiredTiger offers transactional semantics without blocking,
+
 - WiredTiger stores are not corrupted by torn writes, reverting to the
 last snapshot after system failure,
 
@@ -23,10 +25,15 @@ WiredTiger's design is focused on a few core principles:
 
 @section multi_core Multi-core scaling
 
-WiredTiger scales on modern, multi-CPU architectures.  Using a variety
-of programming techniques such as hazard references, lock-free
-algorithms, fast latching and message passing, WiredTiger performs more
-work per CPU core than alternative engines.
+WiredTiger scales on modern, multi-CPU architectures.  Using a variety of
+programming techniques such as hazard references, lock-free algorithms, fast
+latching and message passing, WiredTiger performs more work per CPU core than
+alternative engines.
+
+WiredTiger's transactions use optimistic concurrency control algorithms that
+avoid the bottleneck of a centralized lock manager.  Transactional operations
+in one thread do not block operations in other threads, but strong isolation is
+provided and update conflicts are detected to preserve data consistency.
 
 @section cache Hot caches
 
diff --git a/docs/src/basic-api.dox b/src/docs/basic-api.dox
index d7098e9ab61..d7098e9ab61 100644
--- a/docs/src/basic-api.dox
+++ b/src/docs/basic-api.dox
diff --git a/docs/build-javadoc.sh b/src/docs/build-javadoc.sh
index c413db28d33..c413db28d33 100755
--- a/docs/build-javadoc.sh
+++ b/src/docs/build-javadoc.sh
diff --git a/docs/build-pydoc.sh b/src/docs/build-pydoc.sh
index 5e6e3635be5..5e6e3635be5 100755
--- a/docs/build-pydoc.sh
+++ b/src/docs/build-pydoc.sh
diff --git a/docs/src/command-line.dox b/src/docs/command-line.dox
index 22d134c1bcb..d2180440922 100644
--- a/docs/src/command-line.dox
+++ b/src/docs/command-line.dox
@@ -43,7 +43,6 @@ The following are command-specific options for the \c create command:
 @par <code>-c</code>
 Include a configuration string to be passed to WT_SESSION::create.
 
-
 <hr>
 @section utility_drop wt drop
 Drop a table or file.
@@ -52,11 +51,18 @@ The \c drop command drops the specified \c uri.  It is equivalent to a
 call to WT_SESSION::drop with the "force" configuration argument.
 
 @subsection utility_drop_synopsis Synopsis
-<code>wt [-Vv] [-C config] [-h directory] drop uri</code>
+<code>wt [-Vv] [-C config] [-h directory] drop [-s snapshot] uri</code>
 
 @subsection utility_drop_options Options
-There are no command-specific options for the \c drop command.
+The following are command-specific options for the \c drop command:
 
+@par <code>-s</code>
+Specify one or more snapshots to drop; the argument must be either the
+name of a single snapshot to drop (a string), or a list containing one
+of the following keys: "all" to drop all snapshots, "from=<snapshot>"
+to drop all snapshots after and including the named snapshot, or
+"to=<snapshot>" to drop all snapshots before and including the named
+snapshot.
 
 <hr>
 @section utility_dump wt dump
@@ -68,7 +74,7 @@ which can be re-loaded into a new table using the \c load command.
 See @subpage dump_formats for details of the dump file formats.
 
 @subsection utility_dump_synopsis Synopsis
-<code>wt [-Vv] [-C config] [-h directory] dump [-rx] [-f output] uri</code>
+<code>wt [-Vv] [-C config] [-h directory] dump [-rx] [-f output] [-s snapshot] uri</code>
 
 @subsection utility_dump_options Options
 The following are command-specific options for the \c dump command:
@@ -80,11 +86,14 @@ the \c -f option re-directs the output to the specified file.
 @par <code>-r</code>
 Dump in reverse order, from largest to smallest.
 
+@par <code>-s</code>
+By default, the \c dump command opens the most recent snapshot of the object;
+the \c -s option changes the \c dump command to open the named snapshot.
+
 @par <code>-x</code>
 Dump all characters in a hexadecimal encoding (the default is to leave
 printable characters unencoded).
 
-
 <hr>
 @section utility_dumpfile wt dumpfile
 Dump a file in a debugging format.
@@ -104,20 +113,27 @@ By default, the \c dumpfile command output is written to the standard
 output; the \c -f option re-directs the output to the specified
 file.
 
-
 <hr>
 @section utility_read wt list
 List the tables and files in the database.
 
-The \c list command prints out the URIs for tables and files stored in
-the database.
+By default, the \c list command prints out the tables and files stored
+in the database.  If an object name is specified as an argument, only
+information about that object is printed.
 
 @subsection utility_list_synopsis Synopsis
-<code>wt [-Vv] [-C config] [-h directory] list</code>
+<code>wt [-Vv] [-C config] [-h directory] list [-sv] [uri]</code>
 
 @subsection utility_list_options Options
-The \c list command has no command-specific options.
+The following are command-specific options for the \c list command:
 
+@par <code>-s</code>
+If the \c -s option is specified, the object's snapshots are printed
+in a human-readable format.
+
+@par <code>-v</code>
+If the \c -v option is specified, the object's complete schema table
+value is printed.
 
 <hr>
 @section utility_rename wt rename
@@ -131,8 +147,6 @@ The \c rename command renames the specified table or file.
 @subsection utility_rename_options Options
 The \c rename command has no command-specific options.
 
-
-
 <hr>
 @section utility_load wt load
 Load a table or file from dump output.
@@ -178,7 +192,6 @@ configuration of an object in the table or file.  For each of the pairs,
 the configuration string will be appended to the WT_SESSION::create call
 for the object matching the uri.
 
-
 <hr>
 @section utility_loadtext wt loadtext
 Load text into a table or file.
@@ -207,7 +220,6 @@ The following are command-specific options for the \c loadtext command:
 By default, the \c loadtext command reads from the standard input; the
 \c -f option reads the input from the specified file.
 
-
 <hr>
 @section utility_printlog wt printlog
 Display the database log.
@@ -227,7 +239,6 @@ output; the \c -f option re-directs the output to the specified file.
 @par <code>-p</code>
 Display the log in a printable format.
 
-
 <hr>
 @section utility_read wt read
 Read records from a table or file.
@@ -246,7 +257,6 @@ The \c read command exits non-zero if a specified record is not found.
 @subsection utility_read_options Options
 The \c read command has no command-specific options.
 
-
 <hr>
 @section utility_salvage wt salvage
 Recover data from a corrupted file.
@@ -266,7 +276,6 @@ By default, salvage will refuse to salvage files that fail basic tests
 (for example, files that don't appear to be in a WiredTiger format).
 The \c -F option forces the salvage of the file, regardless.
 
-
 <hr>
 @section utility_stat wt stat
 Display database or object statistics.
@@ -280,7 +289,6 @@ engine, or, if specified, for the command-line object.
 @subsection utility_stat_options Options
 The \c stat command has no command-specific options.
 
-
 <hr>
 @section utility_upgrade wt upgrade
 Upgrade a table or file.
@@ -295,7 +303,6 @@ upgraded.
 @subsection utility_upgrade_options Options
 The \c upgrade command has no command-specific options.
 
-
 <hr>
 @section utility_verify wt verify
 Check the structural integrity of a table or file.
@@ -309,7 +316,6 @@ success if the object is correct, and failure if the object is corrupted.
 @subsection utility_verify_options Options
 The \c verify command has no command-specific options.
 
-
 <hr>
 @section utility_write wt write
 Write records to a table or file.
diff --git a/docs/src/compression.dox b/src/docs/compression.dox
index c2223f0a404..c2223f0a404 100644
--- a/docs/src/compression.dox
+++ b/src/docs/compression.dox
diff --git a/docs/src/config-file.dox b/src/docs/config-file.dox
index 6490aca20e6..59a92901c68 100644
--- a/docs/src/config-file.dox
+++ b/src/docs/config-file.dox
@@ -1,4 +1,4 @@
-/*! @page config_file WiredTiger Home Directory Configuration File
+/*! @page config_file WiredTiger Configuration File
 
 If a file named \c WiredTiger.config appears in the WiredTiger home
 directory, it is read as a configuration string.  Configuration values
diff --git a/docs/src/config-strings.dox b/src/docs/config-strings.dox
index 65f78c19946..65f78c19946 100644
--- a/docs/src/config-strings.dox
+++ b/src/docs/config-strings.dox
diff --git a/docs/src/cursor-ops.dox b/src/docs/cursor-ops.dox
index cd77621d43d..ad5825f100b 100644
--- a/docs/src/cursor-ops.dox
+++ b/src/docs/cursor-ops.dox
@@ -28,7 +28,7 @@ See @ref cursors for more information on available cursor types.
 
 Cursors may be positioned at the beginning of the data source, the end of
 the data source, at an exact key within the data source, and near a key
-within the data source.  
+within the data source.
 
 To invalidate the position of a cursor so that subsequent iterations start
 from the beginning or end of the data source, use the WT_CURSOR::reset method:
diff --git a/src/docs/cursors.dox b/src/docs/cursors.dox
new file mode 100644
index 00000000000..53b025a6845
--- /dev/null
+++ b/src/docs/cursors.dox
@@ -0,0 +1,61 @@
+/*! @page cursors Cursors
+
+Common operations in WiredTiger are performed using WT_CURSOR handles.
+A cursor includes:
+
+- a position within a data source
+- getter/setters for key and value fields
+- encoding of fields to store in the data source
+- methods to navigate within and iterate through the data
+
+See @subpage cursor_ops for a description of how to use cursors.
+
+@section cursor_types Cursor types
+
+The following are some of the common builtin cursor types:
+
+<table>
+  @hrow{URI, Type}
+   @row{<tt>colgroup:\<tablename\>.\<columnset\></tt>,
+column group cursor}
+  @row{<tt>table:\<tablename\></tt>,
+table cursor (key=table key\, value=table value)}
+  @row{<tt>file:\<filename\></tt>,
+file cursor (key=file key\, value=file value)}
+  @row{<tt>index:\<tablename\>.\<indexname\></tt>,
+index cursor (key=index key\, value=table value)}
+  @row{<tt>statistics:[file</tt><tt>:\<filename\>]</tt>,
+  database or file statistics (key=(int)\,
+  value=(string)description\, (string)value\, (uint64_t)value)}
+</table>
+
+See @subpage data_sources for the full list.
+
+@section cursor_projections Projections
+
+Cursors on tables, column groups and indices can return a subset of
+columns.  This is done by listing the column names in parentheses in the
+<code>uri</code> parameter to WT_SESSION::open_cursor.  Only the fields
+from the listed columns are returned by WT_CURSOR::get_value.
+
+This is particularly useful with index cursors, because if all columns in
+the projection are available in the index (including primary key columns,
+which are the values of the index), there is no need to access any column
+groups.
+
+@section cursor_raw Raw mode
+
+Cursors can be configured for raw mode by specifying the \c "raw" config
+keyword to WT_SESSION::open_cursor.  In this mode, the methods
+WT_CURSOR::get_key, WT_CURSOR::get_value, WT_CURSOR::set_key and
+WT_CURSOR::set_value all take a single WT_ITEM in the variable-length
+argument list instead of a separate argument for each column.
+
+For WT_CURSOR::get_key and WT_CURSOR::get_value in raw mode, the WT_ITEM
+can be split into columns by calling ::wiredtiger_struct_unpack with the
+cursor's \c key_format or \c value_format, respectively.  For
+WT_CURSOR::set_key and WT_CURSOR::set_value in raw mode, the WT_ITEM
+should be equivalent to calling ::wiredtiger_struct_pack for the
+cursor's \c key_format or \c value_format, respectively.
+
+*/
diff --git a/docs/src/cursors.dox b/src/docs/data_sources.dox
index 735be7478a9..2eaf2a99fe4 100644
--- a/docs/src/cursors.dox
+++ b/src/docs/data_sources.dox
@@ -1,16 +1,17 @@
-/*! @page cursors Cursors
+/*! @page data_sources Data Sources
 
-Common operations in WiredTiger are performed using WT_CURSOR handles.
-A cursor includes:
+WiredTiger provides access to data from a variety of sources.  At the
+lowest level, data may be stored in a file using a tree structure.  A
+relational schema supporting tables, indices and column groups is
+layered on top of files.  Additional sources include LSM trees and
+statistics, and applications can further extend the supported types by
+implementing the ::WT_DATA_SOURCE interface.
 
-- a position within a data source
-- getter/setters for key and value fields
-- encoding of fields to store in the data source
-- methods to navigate within and iterate through the data
+Common operations on all data sources are performed using WT_CURSOR
+handles.  See @subpage cursor_ops for a description of how to use
+cursors.
 
-See @subpage cursor_ops for a description of how to use cursors.
-
-@section cursor_types Cursor types
+@section data_builtin Builtin data sources
 
 The following are the builtin cursor types:
 
@@ -34,7 +35,18 @@ join cursor @notyet{join cursors}}
   value=(string)description\, (string)value\, (uint64_t)value)}
 </table>
 
-@subsection cursor_index Index cursors
+@subsection data_files Raw Files
+
+WiredTiger's schema layer can be bypassed by opening cursors with a \c
+"file:" URI, using the name of the underlying file.  This can be useful for
+seeing the contents of a column group or index without reading all of the
+columns from the table.
+
+For example, if an index becomes inconsistent with its primary, a file
+cursor can read from the index without errors (even though some of the keys
+that are returned may not exist in the primary).
+
+@subsection data_indices Table Index Data
 
 When an index is created for a table, records are inserted into the index
 whenever the table is updated.  These records use a different key to the
@@ -47,18 +59,7 @@ default to returning the value columns from the table, but this can be
 overridden by configuring a projection cursor (see @ref cursor_projections),
 which can access the table key columns or a subset of the value columns.
 
-@subsection cursor_file File cursors
-
-WiredTiger's schema layer can be bypassed by opening cursors with a \c
-"file:" URI, using the name of the underlying file.  This can be useful for
-seeing the contents of a column group or index without reading all of the
-columns from the table.
-
-For example, if an index becomes inconsistent with its primary, a file
-cursor can read from the index without errors (even though some of the keys
-that are returned may not exist in the primary).
-
-@subsection cursor_statistics Statistics cursors
+@subsection data_statistics Statistics Data
 
 Cursors can return run-time statistics about the WiredTiger engine as
 well as statistics for the underlying row- and column-store files.  Each
@@ -83,32 +84,4 @@ Both examples can use a common display routine that iterates through the
 statistics until the cursor returns the end of the list.
 
 @snippet ex_stat.c statistics display function
-
-@section cursor_projections Projections
-
-Cursors on tables, column groups and indices can return a subset of
-columns.  This is done by listing the column names in parenthesis in the
-<code>uri</code> parameter to WT_SESSION::open_cursor.  Only the fields
-from the listed columns are returned by WT_CURSOR::get_value.
-
-This is particularly useful with index cursors, because if all columns in
-the projection are available in the index (including primary key columns,
-which are the values of the index), there is no need to access any column
-groups.
-
-@section cursor_raw Raw mode
-
-Cursors can be configured for raw mode by specifying the \c "raw" config
-keyword to WT_SESSION::open_cursor.  In this mode, the methods
-WT_CURSOR::get_key, WT_CURSOR::get_value, WT_CURSOR::set_key and
-WT_CURSOR::set_value all take a single WT_ITEM in the variable-length
-argument list instead of a separate argument for each column.
-
-For WT_CURSOR::get_key and WT_CURSOR::get_value in raw mode, the WT_ITEM
-can be split into columns by calling ::wiredtiger_struct_unpack with the
-cursor's \c key_format or \c value_format, respectively.  For
-WT_CURSOR::set_key and WT_CURSOR::set_value in raw mode, the WT_ITEM
-should be equivalent to calling ::wiredtiger_struct_pack for the
-cursor's \c key_format or \c value_format, respectively.
-
 */
diff --git a/docs/src/dump-formats.dox b/src/docs/dump-formats.dox
index 0c021e3a29b..0c021e3a29b 100644
--- a/docs/src/dump-formats.dox
+++ b/src/docs/dump-formats.dox
diff --git a/docs/src/examples.dox b/src/docs/examples.dox
index d0a249aecb6..d0a249aecb6 100644
--- a/docs/src/examples.dox
+++ b/src/docs/examples.dox
diff --git a/docs/src/file-formats.dox b/src/docs/file-formats.dox
index 538709673d4..538709673d4 100644
--- a/docs/src/file-formats.dox
+++ b/src/docs/file-formats.dox
diff --git a/docs/src/home.dox b/src/docs/home.dox
index 222e0400f75..222e0400f75 100644
--- a/docs/src/home.dox
+++ b/src/docs/home.dox
diff --git a/docs/src/huffman.dox b/src/docs/huffman.dox
index e4693873d66..e4693873d66 100644
--- a/docs/src/huffman.dox
+++ b/src/docs/huffman.dox
diff --git a/docs/images/LogoFace-watermark.png b/src/docs/images/LogoFace-watermark.png
index 7af37f88a7c..7af37f88a7c 100644
--- a/docs/images/LogoFace-watermark.png
+++ b/src/docs/images/LogoFace-watermark.png
diff --git a/docs/images/LogoFinal-header.png b/src/docs/images/LogoFinal-header.png
index fcbdb27ae81..fcbdb27ae81 100644
--- a/docs/images/LogoFinal-header.png
+++ b/src/docs/images/LogoFinal-header.png
diff --git a/docs/images/architecture.pdf b/src/docs/images/architecture.pdf
index ff97ddff2b4..ff97ddff2b4 100644
--- a/docs/images/architecture.pdf
+++ b/src/docs/images/architecture.pdf
diff --git a/docs/images/architecture.png b/src/docs/images/architecture.png
index c5b72bc05e0..c5b72bc05e0 100644
--- a/docs/images/architecture.png
+++ b/src/docs/images/architecture.png
diff --git a/docs/src/install.dox b/src/docs/install.dox
index c6be21060e7..dbf4712ba1d 100644
--- a/docs/src/install.dox
+++ b/src/docs/install.dox
@@ -37,6 +37,7 @@ To build the WiredTiger software on a POSIX-like system, change directory to
 the top-level directory, then configure and build the software:
 
 @code
+cd wiredtiger
 ./configure && make
 @endcode
 
diff --git a/docs/src/introduction.dox b/src/docs/introduction.dox
index 055bf09c40d..84ddd0865c2 100644
--- a/docs/src/introduction.dox
+++ b/src/docs/introduction.dox
@@ -29,12 +29,16 @@ For more information about using WiredTiger, see:
 
 - @subpage install\n
 
-- @subpage using\n
+- @subpage programming\n
 
 - @ref wt "WiredTiger API reference manual"
 
 - @subpage command_line\n
 
+- @subpage admin\n
+
+- @subpage license\n
+
 To browse or download the WiredTiger source code, visit our
 
 - <a href="https://github.com/wiredtiger/wiredtiger"><b>Project page</b></a>
diff --git a/docs/src/keyvalue.dox b/src/docs/keyvalue.dox
index f375886b030..f375886b030 100644
--- a/docs/src/keyvalue.dox
+++ b/src/docs/keyvalue.dox
diff --git a/docs/src/license.dox b/src/docs/license.dox
index 282b1da5ff6..409bedea512 100644
--- a/docs/src/license.dox
+++ b/src/docs/license.dox
@@ -1,12 +1,14 @@
 /*! @page license WiredTiger license
 
-The WiredTiger software is Open Source software: you can redistribute
-it and/or modify it under the superset of the terms of version 3 of the
+The WiredTiger software is Open Source software: you may redistribute
+it and modify it under the terms of version 3 of the
 <a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">
 <b>GNU General Public License</b></a>
-as published by the Free Software Foundation, and the
+as published by the Free Software Foundation.  The WiredTiger library
+binary also includes software copyrighted under the terms of the
 <a href="http://www.opensource.org/licenses/BSD-3-Clause">
 <b>University of California, Berkeley (BSD) 3-Clause License</b></a>.
+Any redistribution should comply with both copyrights.
 
 This program is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -19,13 +21,13 @@ those described above, or for technical support for this software, please
 contact WiredTiger, Inc. at
 <a mailto="info@wiredtiger.com">info@wiredtiger.com</a>.
 
-@section library 3rd party software included in the WiredTiger library
+@section library 3rd party software included in the WiredTiger library binary
 
-The WiredTiger binary library build includes the following 3rd party
-software, distributed under the following licenses:
+The WiredTiger library binary includes the following 3rd party software,
+distributed under the following licenses:
 
 <table>
-@hrow{File, License}
+@hrow{Distribution File, License}
 @row{src/include/bitstring.i, University of California\, Berkeley (BSD) 3-Clause License}
 @row{src/include/queue.h, University of California\, Berkeley (BSD) 3-Clause License}
 @row{src/utilities/util_getopt.c, University of California\, Berkeley (BSD) 3-Clause License}
diff --git a/docs/src/namespace.dox b/src/docs/namespace.dox
index 3ac7bfd6ec0..1b2402dd4f2 100644
--- a/docs/src/namespace.dox
+++ b/src/docs/namespace.dox
@@ -9,9 +9,14 @@ begin with the string "WT_".
 
 WiredTiger's private function names begin with the string "__wt__".
 
-@section file File system name space
+@section filename File system name space
 
 WiredTiger's files begin with the string "WiredTiger"; applications
 should not create files in the WiredTiger file system name space.
 
+@section error Error return name space
+
+WiredTiger reserves all values from -31,800 to -31,999 as possible error
+return values.
+
 */
diff --git a/docs/src/packing.dox b/src/docs/packing.dox
index 7723e21bc79..7723e21bc79 100644
--- a/docs/src/packing.dox
+++ b/src/docs/packing.dox
diff --git a/docs/src/processes.dox b/src/docs/processes.dox
index 87f715792f9..87f715792f9 100644
--- a/docs/src/processes.dox
+++ b/src/docs/processes.dox
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
new file mode 100644
index 00000000000..f4041679dbb
--- /dev/null
+++ b/src/docs/programming.dox
@@ -0,0 +1,16 @@
+/*! @page programming Writing WiredTiger applications
+
+This section explains how to write applications that use WiredTiger:
+
+- @subpage basic_api
+- @subpage config_strings
+- @subpage schema
+- @subpage cursors
+- @subpage threads
+- @subpage snapshots
+- @subpage transactions
+- @subpage compression
+- @subpage name_space
+- @subpage signals
+
+ */
diff --git a/docs/src/schema.dox b/src/docs/schema.dox
index 01b9572c0ff..01b9572c0ff 100644
--- a/docs/src/schema.dox
+++ b/src/docs/schema.dox
diff --git a/docs/src/security.dox b/src/docs/security.dox
index 004a2661f54..004a2661f54 100644
--- a/docs/src/security.dox
+++ b/src/docs/security.dox
diff --git a/docs/src/signals.dox b/src/docs/signals.dox
index 1d05f0f2019..1d05f0f2019 100644
--- a/docs/src/signals.dox
+++ b/src/docs/signals.dox
diff --git a/src/docs/snapshots.dox b/src/docs/snapshots.dox
new file mode 100644
index 00000000000..5367b86375b
--- /dev/null
+++ b/src/docs/snapshots.dox
@@ -0,0 +1,38 @@
+/*! @page snapshots Snapshots
+
+WiredTiger supports snapshots, a read-only, static view of a data source.
+
+Snapshots offer basic operation durability without transactional logging,
+across application or system failure.  (Transactional logging offers
+fine-grained durability, but requires a recovery step when files are first
+opened, and impacts the performance of every operation; snapshots offer
+durability without recovery or impacting operational performance, but the
+creation of a snapshot is a relatively heavy-weight operation.  WiredTiger
+does not currently support transactional logging.)
+
+Snapshots may optionally be named: WiredTiger creates and manages unnamed
+snapshots automatically.  Snapshots are created using the WT_SESSION::sync
+method, read using the WT_SESSION::open_cursor method, and discarded using the
+WT_SESSION::drop method.  Additionally, the \c -s option to the \c wt command
+line utility will list a data source's snapshots in a human-readable format.
+
+When WiredTiger data sources are first opened, they are opened in the state of
+the most recent snapshot taken on the file; in other words, updates after the
+most recent snapshot will not appear in the data source.  If no snapshot is
+found when the data source is opened, the data source will appear empty.
+
+Cursors are opened in the current working version of a data source unless a
+snapshot name is provided to WT_SESSION::open_cursor.  Cursors opened in the
+current working version support write operations; cursors opened in
+a snapshot are read-only.
+
+Named snapshots persist until they are explicitly dropped.  Snapshots share
+pages, and deleting a snapshot may or may not make pages available for re-use,
+depending on whether the dropped snapshot contained the last reference to a
+block of data.  Creating a named snapshot drops all previous snapshots with
+the same name.  Snapshots cannot be dropped if they are currently open
+in a cursor.
+
+Unnamed snapshots managed by WiredTiger are given the name
+"WiredTigerInternal".
+ */
diff --git a/docs/src/spell.ok b/src/docs/spell.ok
index 05071b53215..027c6623ecd 100644
--- a/docs/src/spell.ok
+++ b/src/docs/spell.ok
@@ -18,6 +18,7 @@ IEC
 LDFLAGS
 LIBS
 LSB
+LSM
 MERCHANTABILITY
 MVCC's
 Makefiles
@@ -25,11 +26,11 @@ Mewhort
 NoSQL
 RepMgr
 Rrx
-TimesTen's
 URIs
 Vv
 WiredTiger
 WiredTiger's
+WiredTigerInternal
 aR
 ack'ed
 alloc
@@ -57,6 +58,7 @@ callbk
 cd
 cdb
 cds
+checksum
 checksums
 ckp
 colgroup
@@ -255,6 +257,7 @@ structs
 subdatabases
 subpage
 superset
+sv
 tablename
 tcl
 tcmalloc
diff --git a/docs/src/sql-map.dox b/src/docs/sql-map.dox
index 8f288565436..8f288565436 100644
--- a/docs/src/sql-map.dox
+++ b/src/docs/sql-map.dox
diff --git a/docs/style/DoxygenLayout.xml b/src/docs/style/DoxygenLayout.xml
index 3c186e36f89..3c186e36f89 100644
--- a/docs/style/DoxygenLayout.xml
+++ b/src/docs/style/DoxygenLayout.xml
diff --git a/docs/style/background_navigation.png b/src/docs/style/background_navigation.png
index d59e96ddef6..d59e96ddef6 100644
--- a/docs/style/background_navigation.png
+++ b/src/docs/style/background_navigation.png
diff --git a/docs/style/doxygen.png b/src/docs/style/doxygen.png
index f0a274bbaff..f0a274bbaff 100644
--- a/docs/style/doxygen.png
+++ b/src/docs/style/doxygen.png
diff --git a/docs/style/footer.html b/src/docs/style/footer.html
index cf0bb962e6f..cf0bb962e6f 100644
--- a/docs/style/footer.html
+++ b/src/docs/style/footer.html
diff --git a/docs/style/header.html b/src/docs/style/header.html
index c6302f28692..39b1d1b9372 100644
--- a/docs/style/header.html
+++ b/src/docs/style/header.html
@@ -20,7 +20,7 @@ $mathjax
  <tr style="height: 56px;">
   <!--BEGIN PROJECT_LOGO-->
   <td id="projectlogo">
-    <div class="logo"><a href="http://wiredtiger.com/"><img src="$relpath$images/LogoFinal-header.png" alt="WiredTiger" /></a></div>
+    <div class="logo"><a href="http://wiredtiger.com/"><img src="$relpath$LogoFinal-header.png" alt="WiredTiger" /></a></div>
   </td>
   <!--END PROJECT_LOGO-->
   <!--BEGIN PROJECT_NAME-->
diff --git a/docs/style/img_downArrow.png b/src/docs/style/img_downArrow.png
index 024a03c2654..024a03c2654 100644
--- a/docs/style/img_downArrow.png
+++ b/src/docs/style/img_downArrow.png
diff --git a/docs/style/javadoc.css b/src/docs/style/javadoc.css
index f345b05b158..f345b05b158 100644
--- a/docs/style/javadoc.css
+++ b/src/docs/style/javadoc.css
diff --git a/docs/style/tabs.css b/src/docs/style/tabs.css
index c543eb00429..c543eb00429 100644
--- a/docs/style/tabs.css
+++ b/src/docs/style/tabs.css
diff --git a/docs/style/wiredtiger.css b/src/docs/style/wiredtiger.css
index f3d3dcc6bee..f3d3dcc6bee 100644
--- a/docs/style/wiredtiger.css
+++ b/src/docs/style/wiredtiger.css
diff --git a/src/docs/threads.dox b/src/docs/threads.dox
new file mode 100644
index 00000000000..eb830de6b01
--- /dev/null
+++ b/src/docs/threads.dox
@@ -0,0 +1,36 @@
+/*! @page threads Multithreading
+
+All WT_CONNECTION methods are thread safe, and WT_CONNECTION handles can
+be shared between threads.  Applications typically open a single
+connection to each database, per process.
+
+WT_SESSION and WT_CURSOR methods are not thread safe and WT_SESSION and
+WT_CURSOR handles cannot be accessed concurrently by multiple threads.
+Applications typically open one WT_SESSION handle for each thread
+accessing a database, and then one or more WT_CURSOR handles within the
+session.
+
+WT_SESSION and WT_CURSOR methods may be accessed by different threads
+serially (for example, a pool of threads managed by the application with
+a set of shared session or cursor handles).  There is no thread-local
+state in WiredTiger, but no built-in synchronization of session or
+cursor handles, either, so if multiple threads access a session or
+cursor handle, access must be serialized by the application.
+
+@section threads_example Code samples
+
+The code below is taken from the complete example program
+@ex_ref{ex_thread.c}.
+
+This is an example of a thread entry point.  A new session is opened for
+the thread and used for all operations within that thread.
+
+@snippet ex_thread.c thread scan
+
+Here is the main function that starts the threads.  It opens a single
+connection, shared between the threads, and closes the connection after
+waiting for all of the threads to exit.
+
+@snippet ex_thread.c thread main
+
+ */
diff --git a/docs/tools/doxypy.py b/src/docs/tools/doxypy.py
index 54fef5f03a5..54fef5f03a5 100755
--- a/docs/tools/doxypy.py
+++ b/src/docs/tools/doxypy.py
diff --git a/docs/tools/fixlinks.py b/src/docs/tools/fixlinks.py
index b249307f3c3..b249307f3c3 100755
--- a/docs/tools/fixlinks.py
+++ b/src/docs/tools/fixlinks.py
diff --git a/docs/tools/pyfilter b/src/docs/tools/pyfilter
index 67ee7ca4544..67ee7ca4544 100755
--- a/docs/tools/pyfilter
+++ b/src/docs/tools/pyfilter
diff --git a/src/docs/top/Doxyfile b/src/docs/top/Doxyfile
new file mode 100644
index 00000000000..468dd31aee4
--- /dev/null
+++ b/src/docs/top/Doxyfile
@@ -0,0 +1,9 @@
+# Override settings in the normal documentation build for the landing page.
+
+PROJECT_NUMBER		= "Developer Site"
+OUTPUT_DIRECTORY	= ../../docs/top
+INPUT			= top license.dox
+EXCLUDE			=
+
+GENERATE_TREEVIEW	= NO
+USE_INLINE_TREES	= NO
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox
new file mode 100644
index 00000000000..d7294dc0bd9
--- /dev/null
+++ b/src/docs/top/main.dox
@@ -0,0 +1,18 @@
+/*! @mainpage WiredTiger Developer Site
+
+WiredTiger is a high performance, scalable, production quality, NoSQL,
+@subpage license "Open Source" extensible platform for data management.
+
+To browse or download the WiredTiger source code, visit our
+
+- <a href="https://github.com/wiredtiger/wiredtiger"><b>Project page</b></a>
+
+To ask questions or discuss issues related to using WiredTiger, visit our
+
+- <a href="http://groups.google.com/group/wiredtiger-users"><b>User Group</b></a>
+
+View the documentation online:
+
+- <a href="1.2.0/index.html"><b>WiredTiger 1.2.0 documentation (current)</b></a>
+- <a href="1.1.5/index.html"><b>WiredTiger 1.1.5 documentation</b></a>
+*/
diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox
new file mode 100644
index 00000000000..b94515c007c
--- /dev/null
+++ b/src/docs/transactions.dox
@@ -0,0 +1,86 @@
+/*! @page transactions Transactions
+
+@section transactions_acid ACID properties
+
+Transactions provide a powerful abstraction for multiple threads to operate
+on data concurrently because they have the following properties:
+
+- Atomicity: all or none of a transaction is completed.
+- Consistency: if each transaction maintains some property when considered
+  separately, then the combined effect of executing the transactions
+  concurrently will maintain the same property.
+- Isolation: developers can reason about transactions as if they run
+  single-threaded.
+- Durability: once a transaction commits, its updates cannot be lost.
+
+In WiredTiger release 1.2, support for transactions was added with the
+following caveats to the ACID properties:
+
+- the maximum level of isolation supported is snapshot isolation.
+  See @ref transaction_isolation for more details.
+- only coarse-grained durability is supported: updates become durable when they
+  are part of a checkpoint, not at commit time.  If there is a crash, commits
+  since the last checkpoint will be lost.
+
+@section transactions_api Transactional API
+
+In WiredTiger, the transactional context is managed by the WT_SESSION
+class.  Applications call WT_SESSION::begin_transaction to start a new
+transaction, which is only permitted when no cursors are open.  Operations
+performed with that WT_SESSION handle are then part of the transaction, and
+their effects can be committed by calling WT_SESSION::commit_transaction or
+discarded by calling WT_SESSION::rollback_transaction, both of which
+implicitly close any open cursors.
+
+When transactions are used, concurrent update operations will fail with the
+::WT_DEADLOCK error if they conflict with a concurrent transaction.
+Transactions should be rolled back with WT_SESSION::rollback_transaction and
+retried if this error occurs.
+
+@section transactions_cc Concurrency control
+
+WiredTiger uses optimistic concurrency control algorithms.  This avoids the
+bottleneck of a centralized lock manager, and means that transactional
+operations do not block: reads do not block writes, and vice versa.
+
+Further, writes do not block writes, but as mentioned above, concurrent
+transactions updating the same value will fail with ::WT_DEADLOCK.  Some
+applications may benefit from application-level synchronization to avoid
+repeated attempts to roll back and update the same value.
+
+@section transaction_isolation Isolation levels
+
+The default isolation level is <code>snapshot</code>, which means that each
+transaction reads the versions of records that were committed before the
+transaction started.
+
+Snapshot isolation is a strong guarantee, but not equivalent to a
+single-threaded execution of the transactions, known as serializable isolation.
+Concurrent transactions T1 and T2 running under snapshot isolation may both
+commit and produce a state that neither (T1 followed by T2) nor (T2 followed by
+T1) could have produced, if there is overlap between T1's reads and T2's
+writes, and between T1's writes and T2's reads.
+
+Weaker isolation levels are also provided, including
+<code>read-uncommitted</code>, which always reads the most recent version of
+data, regardless of whether it is committed.
+
+@section transaction_recovery Checkpoints and Recovery
+
+Recovery is run automatically when a data source is opened.  Any changes since
+the last checkpoint are discarded, and the application restarts from a
+consistent point in the transaction history.
+
+This suggests the importance of regular checkpoints: they limit the volume of
+commits that may be lost in a crash.  Checkpoints create a snapshot in every
+data source in the database.  See WT_SESSION::checkpoint for information about
+checkpoints, and @ref snapshots for information about snapshots.
+
+@section transaction_example Code samples
+
+The code below is taken from the complete example program
+@ex_ref{ex_transaction.c}.
+
+@snippet ex_transaction.c transaction
+
+ */
diff --git a/docs/src/tuning.dox b/src/docs/tuning.dox
index e179e362bcc..83de3584985 100644
--- a/docs/src/tuning.dox
+++ b/src/docs/tuning.dox
@@ -101,6 +101,14 @@ better threaded performance (for example, Google's
 or <a href="http://www.canonware.com/jemalloc">jemalloc</a>),
 can dramatically improve throughput.
 
+@section checksums Checksums
+
+WiredTiger configures checksums on file reads and writes, by default.
+In read-only applications, or when file compression provides any
+necessary checksum functionality, or when using backing storage systems
+where blocks require no validation, performance can be increased by
+turning off checksum support when calling the WT_SESSION::create method.
+
 @section statistics Performance monitoring with statistics
 
 WiredTiger maintains a variety of statistics that can be read with a
@@ -120,7 +128,7 @@ file:
 @snippet ex_stat.c statistics file function
 
 Both examples can use a common display routine that iterates through the
-statistics until the cursor returns the end of the list. 
+statistics until the cursor returns the end of the list.
 
 @snippet ex_stat.c statistics display function
 
diff --git a/src/include/api.h b/src/include/api.h
index 8e18d56339c..3e2483961ce 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -58,19 +58,30 @@ typedef	enum {
 struct __wt_session_impl {
 	WT_SESSION iface;
 
+	u_int active;			/* Non-zero if the session is in-use */
+
 	WT_CONDVAR *cond;		/* Condition variable */
 
 	const char *name;		/* Name */
-	WT_EVENT_HANDLER *event_handler;
+
+	WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
 
 	WT_BTREE *btree;		/* Current file */
 	TAILQ_HEAD(__btrees, __wt_btree_session) btrees;
 
+	WT_BTREE *created_btree;	/* File being created */
+
 	WT_CURSOR *cursor;		/* Current cursor */
 					/* Cursors closed with the session */
 	TAILQ_HEAD(__cursors, __wt_cursor) cursors;
 
-	WT_BTREE *schematab;		/* Schema tables */
+	WT_BTREE *metafile;		/* Metadata file */
+	void	*meta_track;		/* Metadata operation tracking */
+	void	*meta_track_next;	/* Current position */
+	void	*meta_track_sub;	/* Child transaction / save point */
+	size_t	 meta_track_alloc;	/* Currently allocated */
+#define	WT_META_TRACKING(session)	(session->meta_track_next != NULL)
+
 	TAILQ_HEAD(__tables, __wt_table) tables;
 
 	WT_ITEM	logrec_buf;		/* Buffer for log records */
@@ -78,13 +89,24 @@ struct __wt_session_impl {
 
 	WT_ITEM	**scratch;		/* Temporary memory for any function */
 	u_int	scratch_alloc;		/* Currently allocated */
-
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * It's hard to figure out from where a buffer was allocated after it's
+	 * leaked, so in diagnostic mode we track them; DIAGNOSTIC can't simply
+	 * add additional fields to WT_ITEM structures because they are visible
+	 * to applications, create a parallel structure instead.
+	 */
+	struct __wt_scratch_track {
+		const char *file;	/* Allocating file, line */
+		int line;
+	} *scratch_track;
+#endif
 					/* Serialized operation state */
 	void	*wq_args;		/* Operation arguments */
 	int	wq_sleeping;		/* Thread is blocked */
 	int	wq_ret;			/* Return value */
 
-	WT_HAZARD *hazard;		/* Hazard reference array */
+	WT_TXN	txn;			/* Transaction state */
 
 	void	*reconcile;		/* Reconciliation information */
 
@@ -92,10 +114,24 @@ struct __wt_session_impl {
 	u_int	 excl_next;		/* Next empty slot */
 	size_t	 excl_allocated;	/* Bytes allocated */
 
-	void	*schema_track;		/* Tracking schema operations */
-	u_int	 schema_track_entries;	/* Currently allocated */
+#define	WT_SYNC			1	/* Sync the file */
+#define	WT_SYNC_DISCARD		2	/* Sync the file, discard pages */
+#define	WT_SYNC_DISCARD_NOWRITE	3	/* Discard the file */
+	int syncop;			/* File operation */
+
+	uint32_t id;			/* Offset in conn->session_array */
 
 	uint32_t flags;
+
+	/*
+	 * The hazard reference must be placed at the end of the structure: the
+	 * structure is cleared when closed, all except the hazard reference.
+	 * Putting the hazard reference at the end of the structure allows us to
+	 * easily call a function to clear memory up to, but not including, the
+	 * hazard reference.
+	 */
+#define	WT_SESSION_CLEAR(s)	memset(s, 0, WT_PTRDIFF(&(s)->hazard, s))
+	WT_HAZARD *hazard;		/* Hazard reference array */
 };
 
 /*******************************************
@@ -122,19 +158,39 @@ struct __wt_named_compressor {
 };
 
 /*
+ * WT_NAMED_DATA_SOURCE --
+ *	A data source list entry
+ */
+struct __wt_named_data_source {
+	const char *prefix;		/* Data source name prefix */
+	WT_DATA_SOURCE *dsrc;		/* User supplied callbacks */
+	TAILQ_ENTRY(__wt_named_data_source) q;	/* Linked list of data sources */
+};
+
+/*
+ * Allocate some additional slots for internal sessions.  There is a default
+ * session for each connection, plus a session for the eviction thread.
+ */
+#define	WT_NUM_INTERNAL_SESSIONS	2
+
+/*
  * WT_CONNECTION_IMPL --
  *	Implementation of WT_CONNECTION
  */
 struct __wt_connection_impl {
 	WT_CONNECTION iface;
 
-	WT_SESSION_IMPL default_session;/* For operations without an
-					   application-supplied session */
+	/* For operations without an application-supplied session */
+	WT_SESSION_IMPL *default_session;
+	WT_SESSION_IMPL  dummy_session;
 
 	WT_SPINLOCK fh_lock;		/* File handle queue spinlock */
+	WT_SPINLOCK schema_lock;	/* Schema operation spinlock */
 	WT_SPINLOCK serial_lock;	/* Serial function call spinlock */
 	WT_SPINLOCK spinlock;		/* General purpose spinlock */
 
+	WT_RWLOCK *ckpt_rwlock;		/* Checkpoint lock */
+
 					/* Connection queue */
 	TAILQ_ENTRY(__wt_connection_impl) q;
 
@@ -148,7 +204,6 @@ struct __wt_connection_impl {
 
 					/* Locked: btree list */
 	TAILQ_HEAD(__wt_btree_qh, __wt_btree) btqh;
-
 					/* Locked: file list */
 	TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
 
@@ -169,26 +224,23 @@ struct __wt_connection_impl {
 	 * the server thread code to avoid walking the entire array when only a
 	 * few threads are running.
 	 */
-	WT_SESSION_IMPL	**sessions;		/* Session reference */
-	void		 *session_array;	/* Session array */
-	uint32_t	  session_cnt;		/* Session count */
+	WT_SESSION_IMPL	*sessions;	/* Session reference */
+	uint32_t	 session_size;	/* Session array size */
+	uint32_t	 session_cnt;	/* Session count */
 
 	/*
 	 * WiredTiger allocates space for 15 hazard references in each thread of
 	 * control, by default.  There's no code path that requires more than 15
 	 * pages at a time (and if we find one, the right change is to increase
 	 * the default).
-	 *
-	 * The hazard array is separate from the WT_SESSION_IMPL array because
-	 * we need to easily copy and search it when evicting pages from memory.
 	 */
-	WT_HAZARD *hazard;		/* Hazard references array */
-	uint32_t   hazard_size;
-	uint32_t   session_size;
+	uint32_t   hazard_size;		/* Hazard array size */
 
 	WT_CACHE  *cache;		/* Page cache */
 	uint64_t   cache_size;
 
+	WT_TXN_GLOBAL txn_global;	/* Global transaction state. */
+
 	WT_CONNECTION_STATS *stats;	/* Connection statistics */
 
 	WT_FH	   *log_fh;		/* Logging file handle */
@@ -199,6 +251,9 @@ struct __wt_connection_impl {
 					/* Locked: compressor list */
 	TAILQ_HEAD(__wt_comp_qh, __wt_named_compressor) compqh;
 
+					/* Locked: data source list */
+	TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh;
+
 	FILE *msgfile;
 	void (*msgcall)(const WT_CONNECTION_IMPL *, const char *);
 
@@ -220,8 +275,7 @@ struct __wt_connection_impl {
 	const char *__oldname = (s)->name;				\
 	(s)->cursor = (cur);						\
 	(s)->btree = (bt);						\
-	(s)->name = #h "." #n;						\
-	ret = 0;
+	(s)->name = #h "." #n;
 
 #define	API_CALL_NOCONF(s, h, n, cur, bt) do {				\
 	API_SESSION_INIT(s, h, n, cur, bt);
@@ -239,6 +293,12 @@ struct __wt_connection_impl {
 	}								\
 } while (0)
 
+/* If an error is returned, mark that the transaction requires abort. */
+#define	API_END_TXN_ERROR(s, ret)					\
+	API_END(s);							\
+	if ((ret) != 0 && (ret) != WT_NOTFOUND && (ret) != WT_DUPLICATE_KEY) \
+		F_SET(&(s)->txn, TXN_ERROR)
+
 /*
  * If a session or connection method is about to return WT_NOTFOUND (some
  * underlying object was not found), map it to ENOENT, only cursor methods
@@ -252,7 +312,7 @@ struct __wt_connection_impl {
 	API_CALL(s, session, n, NULL, NULL, cfg, cfgvar);
 
 #define	CONNECTION_API_CALL(conn, s, n, cfg, cfgvar)			\
-	s = &conn->default_session;					\
+	s = conn->default_session;					\
 	API_CALL(s, connection, n, NULL, NULL, cfg, cfgvar);		\
 
 #define	CURSOR_API_CALL(cur, s, n, bt)					\
@@ -270,6 +330,8 @@ extern WT_PROCESS __wt_process;
  * DO NOT EDIT: automatically built by dist/api_flags.py.
  * API flags section: BEGIN
  */
+#define	WT_CONN_NOSYNC					0x00000004
+#define	WT_CONN_TRANSACTIONAL				0x00000002
 #define	WT_DIRECTIO_DATA				0x00000002
 #define	WT_DIRECTIO_LOG					0x00000001
 #define	WT_PAGE_FREE_IGNORE_DISK			0x00000001
@@ -277,16 +339,17 @@ extern WT_PROCESS __wt_process;
 #define	WT_SERVER_RUN					0x00000001
 #define	WT_SESSION_INTERNAL				0x00000002
 #define	WT_SESSION_SALVAGE_QUIET_ERR			0x00000001
-#define	WT_VERB_block					0x00000800
-#define	WT_VERB_evict					0x00000400
-#define	WT_VERB_evictserver				0x00000200
-#define	WT_VERB_fileops					0x00000100
-#define	WT_VERB_hazard					0x00000080
-#define	WT_VERB_mutex					0x00000040
-#define	WT_VERB_read					0x00000020
-#define	WT_VERB_readserver				0x00000010
-#define	WT_VERB_reconcile				0x00000008
-#define	WT_VERB_salvage					0x00000004
+#define	WT_VERB_block					0x00001000
+#define	WT_VERB_evict					0x00000800
+#define	WT_VERB_evictserver				0x00000400
+#define	WT_VERB_fileops					0x00000200
+#define	WT_VERB_hazard					0x00000100
+#define	WT_VERB_mutex					0x00000080
+#define	WT_VERB_read					0x00000040
+#define	WT_VERB_readserver				0x00000020
+#define	WT_VERB_reconcile				0x00000010
+#define	WT_VERB_salvage					0x00000008
+#define	WT_VERB_snapshot				0x00000004
 #define	WT_VERB_verify					0x00000002
 #define	WT_VERB_write					0x00000001
 /*
diff --git a/src/include/bitstring.i b/src/include/bitstring.i
index 4f788a625b5..b0506075c93 100644
--- a/src/include/bitstring.i
+++ b/src/include/bitstring.i
@@ -116,7 +116,6 @@ __bit_clear(uint8_t *bitf, uint32_t bit)
 	bitf[__bit_byte(bit)] &= ~__bit_mask(bit);
 }
 
-#ifdef __NOT_CURRENTLY_USED
 /*
  * __bit_nclr --
  *	Clear bits start-to-stop in name.
@@ -140,7 +139,6 @@ __bit_nclr(uint8_t *bitf, uint32_t start, uint32_t stop)
 		bitf[stopbyte] &= 0xff << ((stop & 0x7) + 1);
 	}
 }
-#endif
 
 /*
  * __bit_nset --
@@ -195,7 +193,6 @@ __bit_ffc(uint8_t *bitf, uint32_t nbits, uint32_t *retp)
 	return (0);
 }
 
-#ifdef __NOT_CURRENTLY_USED
 /*
  * __bit_ffs --
  *	Find first set bit in name, return 0 on success, -1 on no bit set.
@@ -206,6 +203,7 @@ __bit_ffs(uint8_t *bitf, uint32_t nbits, uint32_t *retp)
 	uint8_t lb;
 	uint32_t byte, stopbyte, value;
 
+	value = 0;
 	if (nbits == 0)
 		return (-1);
 
@@ -224,7 +222,6 @@ __bit_ffs(uint8_t *bitf, uint32_t nbits, uint32_t *retp)
 	*retp = value;
 	return (0);
 }
-#endif
 
 /*
  * __bit_getv --
diff --git a/src/include/block.h b/src/include/block.h
index 6b4b8c88830..26dde46b715 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -8,7 +8,6 @@
 /*
  * WiredTiger's block manager interface.
  */
-#define	WT_BM_MAX_ADDR_COOKIE		255	/* Maximum address cookie */
 
 /*
  * The file's description is written into the first 512B of the file, which
@@ -18,21 +17,27 @@
 #define	WT_BLOCK_INVALID_OFFSET		0
 
 /*
- * The block allocator maintains two primary skiplists: first, the by-offset
- * list linking WT_EXT elements and sorted by file offset (low-to-high):
- * this list has an entry for every free chunk in the file.  The second primary
- * skiplist is the by-size list linking WT_SIZE elements and sorted by chunk
- * size (low-to-high).  This list has an entry for every free chunk size seen
- * since the list was created.
- *	Additionally, each WT_SIZE element has a skiplist of its own, linking
- * WT_EXT elements and sorted by file offset (low-to-high).  This list has an
- * entry for every free chunk in the file of a particular size.
+ * The block manager maintains three per-snapshot extent lists:
+ *	alloc:	 the extents allocated in this snapshot
+ *	avail:	 the extents available for allocation
+ *	discard: the extents freed in this snapshot
+ * Each of the extent lists is based on two skiplists: first, a by-offset list
+ * linking WT_EXT elements and sorted by file offset (low-to-high), second, a
+ * by-size list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *	Additionally, each WT_SIZE element on the by-size list has a skiplist
+ * of its own, linking WT_EXT elements and sorted by file offset (low-to-high).
+ * This list has an entry for extents of a particular size.
  *	The trickiness is that each individual WT_EXT element appears on two
  * skiplists.  In order to minimize allocation calls, we allocate a single
  * array of WT_EXT pointers at the end of the WT_EXT structure, for both
  * skiplists, and store the depth of the skiplist in the WT_EXT structure.
  * The skiplist entries for the offset skiplist start at WT_EXT.next[0] and
  * the entries for the size skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * XXX
+ * We maintain the per-size skiplists for the alloc and discard extent lists,
+ * but there's no reason for that, the avail list is the only list we search
+ * by size.
  */
 
 /*
@@ -45,8 +50,11 @@ struct __wt_extlist {
 	uint64_t bytes;				/* Byte count */
 	uint32_t entries;			/* Entry count */
 
+	off_t	 offset;			/* Written extent offset */
+	uint32_t cksum, size;			/* Written extent cksum, size */
+
 	WT_EXT	*off[WT_SKIP_MAXDEPTH];		/* Size/offset skiplists */
-	WT_SIZE *size[WT_SKIP_MAXDEPTH];
+	WT_SIZE *sz[WT_SKIP_MAXDEPTH];
 };
 
 /*
@@ -98,6 +106,33 @@ struct __wt_size {
 	    (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
 
 /*
+ * Snapshot cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager snapshot format change.
+ *
+ * Version #1 snapshot cookie format:
+ *	[1] [root addr] [alloc addr] [avail addr] [discard addr]
+ *	    [file size] [snapshot size] [write generation]
+ */
+#define	WT_BM_SNAPSHOT_VERSION		1	/* Snapshot format version */
+#define	WT_BLOCK_EXTLIST_MAGIC		71002	/* Identify a list */
+struct __wt_block_snapshot {
+	uint8_t	 version;			/* Version */
+
+	off_t	 root_offset;			/* The root */
+	uint32_t root_cksum, root_size;
+
+	WT_EXTLIST alloc;			/* Extents allocated */
+	WT_EXTLIST avail;			/* Extents available */
+	WT_EXTLIST discard;			/* Extents discarded */
+
+	off_t	   file_size;			/* Snapshot file size */
+	uint64_t   snapshot_size;		/* Snapshot byte count */
+	WT_EXTLIST snapshot_avail;		/* Snapshot free'd extents */
+
+	uint64_t write_gen;			/* Write generation */
+};
+
+/*
  * WT_BLOCK --
  *	Encapsulation of the standard WiredTiger block manager.
  */
@@ -106,31 +141,26 @@ struct __wt_block {
 
 	WT_FH	*fh;			/* Backing file handle */
 
-	uint64_t write_gen;		/* Write generation */
-
 	uint32_t allocsize;		/* Allocation size */
 	int	 checksum;		/* If checksums configured */
 
-	WT_COMPRESSOR *compressor;	/* Page compressor */
-
-					/* Freelist support */
-	WT_SPINLOCK freelist_lock;	/* Lock to protect the freelist. */
-
-	WT_EXTLIST free;		/* Freelist offset/size skiplists */
+	WT_SPINLOCK	  live_lock;	/* Lock to protect the live snapshot */
+	WT_BLOCK_SNAPSHOT live;		/* Live snapshot */
+	int		  live_load;	/* Live snapshot loaded */
 
-	off_t	 free_offset;		/* Freelist file location */
-	uint32_t free_size;
-	uint32_t free_cksum;
-
-					/* Salvage support */
-	off_t	 slvg_off;		/* Salvage file offset */
-
-					/* Verification support */
-	uint32_t frags;			/* Total frags */
-	uint8_t *fragbits;		/* Frag tracking bit list */
+	WT_COMPRESSOR *compressor;	/* Page compressor */
 
-#define	WT_BLOCK_OK	0x01		/* File successfully opened */
-	uint32_t flags;
+				/* Salvage support */
+	int	slvg;			/* If performing salvage */
+	off_t	slvg_off;		/* Salvage file offset */
+
+				/* Verification support */
+	int	   verify;		/* If performing verification */
+	off_t	   verify_size;		/* Snapshot's file size */
+	WT_EXTLIST verify_alloc;	/* Verification allocation list */
+	uint32_t   frags;		/* Maximum frags in the file */
+	uint8_t   *fragfile;		/* Per-file frag tracking list */
+	uint8_t   *fragsnap;		/* Per-snapshot frag tracking list */
 };
 
 /*
@@ -148,19 +178,6 @@ struct __wt_block_desc {
 	uint32_t cksum;			/* 08-11: Description block checksum */
 
 	uint32_t unused;		/* 12-15: Padding */
-#define	WT_BLOCK_EXTLIST_MAGIC	071002
-	uint64_t free_offset;		/* 16-23: Free list page offset */
-	uint32_t free_size;		/* 24-27: Free list page length */
-	uint32_t free_cksum;		/* 28-31: Free list page checksum */
-
-	/*
-	 * We maintain page write-generations in the non-transactional case
-	 * (where, instead of a transactional LSN, the value is a counter),
-	 * as that's how salvage can determine the most recent page between
-	 * pages overlapping the same key range.  The value has to persist,
-	 * so it's included in the file's metadata.
-	 */
-	uint64_t write_gen;		/* 32-39: Write generation */
 };
 /*
  * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
@@ -168,7 +185,7 @@ struct __wt_block_desc {
  * since we reserve the first sector of the file for this information, but it
  * would be worth investigation, regardless).
  */
-#define	WT_BLOCK_DESC_SIZE		40
+#define	WT_BLOCK_DESC_SIZE		16
 
 /*
  * WT_BLOCK_HEADER --
@@ -185,7 +202,7 @@ struct __wt_block_header {
 	 * !!!
 	 * The write-generation is "owned" by the btree layer, but it's easier
 	 * to set it (when physically writing blocks), to persist it (in the
-	 * WT_BLOCK_DESC structure, rather than the schema file), and restore
+	 * WT_BLOCK_DESC structure, rather than the metadata file), and restore
 	 * it during salvage, in the block-manager layer.
 	 */
 	uint64_t write_gen;		/* 00-07: write generation */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index a6ea0e53225..5f6a25d4d0f 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -61,7 +61,6 @@ struct __wt_page_header {
  * WT_ADDR --
  *	A block location.
  */
-#define	WT_NOADDR	""		/* No address */
 struct __wt_addr {
 	uint8_t *addr;			/* Cookie */
 	uint32_t size;			/* Cookie length */
@@ -72,14 +71,6 @@ struct __wt_addr {
  *	When a page is modified, there's additional information maintained as it
  * is written to disk.
  */
-typedef enum {
-	WT_PT_EMPTY=0,			/* Unused slot */
-	WT_PT_BLOCK,			/* Block: inactive */
-	WT_PT_BLOCK_EVICT,		/* Block: inactive on eviction */
-	WT_PT_OVFL,			/* Overflow: active */
-	WT_PT_OVFL_DISCARD		/* Overflow: inactive */
-} __wt_pt_type_t;
-
 struct __wt_page_modify {
 	/*
 	 * The write generation is incremented after a page is modified.  That
@@ -169,20 +160,29 @@ struct __wt_page_modify {
 		uint8_t *data;		/* Overflow data reference */
 		uint32_t size;		/* Overflow data length */
 
-		__wt_pt_type_t type;	/* Type */
+#define	WT_TRK_DISCARD		0x001	/* Object was discarded */
+#define	WT_TRK_INUSE		0x002	/* Object is currently in-use */
+#define	WT_TRK_JUST_ADDED	0x004	/* Object added this reconciliation */
+#define	WT_TRK_OBJECT		0x008	/* Slot set (not empty) */
+#define	WT_TRK_ONPAGE		0x010	/* Object was referenced from a page */
+		uint8_t  flags;
 	} *track;			/* Array of tracked objects */
 	uint32_t track_entries;		/* Total track slots */
 
+	wt_txnid_t first_id;		/* Earliest transactional update, used
+					 * to avoid errors from transaction ID
+					 * wraparound.
+					 */
+
 #define	WT_PM_REC_EMPTY		0x01	/* Reconciliation: page empty */
 #define	WT_PM_REC_REPLACE	0x02	/* Reconciliation: page replaced */
 #define	WT_PM_REC_SPLIT		0x04	/* Reconciliation: page split */
 #define	WT_PM_REC_SPLIT_MERGE	0x08	/* Reconciliation: page split merge */
-	uint8_t flags;			/* Page flags */
-};
-
 #define	WT_PM_REC_MASK							\
 	(WT_PM_REC_EMPTY |						\
 	    WT_PM_REC_REPLACE | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)
+	uint8_t flags;			/* Page flags */
+};
 
 /*
  * WT_PAGE --
@@ -284,13 +284,13 @@ struct __wt_page {
 	uint32_t memory_footprint;
 
 #define	WT_PAGE_INVALID		0	/* Invalid page */
-#define	WT_PAGE_COL_FIX		1	/* Col-store fixed-len leaf */
-#define	WT_PAGE_COL_INT		2	/* Col-store internal page */
-#define	WT_PAGE_COL_VAR		3	/* Col-store var-length leaf page */
-#define	WT_PAGE_OVFL		4	/* Overflow page */
-#define	WT_PAGE_ROW_INT		5	/* Row-store internal page */
-#define	WT_PAGE_ROW_LEAF	6	/* Row-store leaf page */
-#define	WT_PAGE_FREELIST	7	/* Free-list page */
+#define	WT_PAGE_BLOCK_MANAGER	1	/* Block-manager page */
+#define	WT_PAGE_COL_FIX		2	/* Col-store fixed-len leaf */
+#define	WT_PAGE_COL_INT		3	/* Col-store internal page */
+#define	WT_PAGE_COL_VAR		4	/* Col-store var-length leaf page */
+#define	WT_PAGE_OVFL		5	/* Overflow page */
+#define	WT_PAGE_ROW_INT		6	/* Row-store internal page */
+#define	WT_PAGE_ROW_LEAF	7	/* Row-store leaf page */
 	uint8_t type;			/* Page type */
 
 #define	WT_PAGE_BUILD_KEYS	0x01	/* Keys have been built in memory */
@@ -402,10 +402,23 @@ struct __wt_ref {
  * sorted by key, fixed in size, and references data on the page.
  */
 struct __wt_row {
-	void	*key;			/* On-page cell or off-page WT_IKEY */
+	void	*__key;			/* On-page cell or off-page WT_IKEY */
 };
 
 /*
+ * Multiple threads of control may be searching the in-memory row-store pages,
+ * and the key may be instantiated at any time.  Code must be able to handle
+ * both when the key has not been instantiated (the key field points into the
+ * page's disk image), and when the key has been instantiated (the key field
+ * points outside the page's disk image).  We don't need barriers because the
+ * key is updated atomically, but code that reads the key field multiple times
+ * is a very, very bad idea.  We obscure the field name and use a copy macro in
+ * all references to the field to make sure we don't introduce this bug (again).
+ */
+#define	WT_ROW_KEY_COPY(rip)	((rip)->__key)
+#define	WT_ROW_KEY_SET(rip, v)	((rip)->__key) = (v)
+
+/*
  * WT_ROW_FOREACH --
  *	Walk the entries of an in-memory row-store leaf page.
  */
@@ -518,8 +531,6 @@ struct __wt_ikey {
  * list.
  */
 struct __wt_update {
-	WT_UPDATE *next;		/* forward-linked list */
-
 	/*
 	 * We use the maximum size as an is-deleted flag, which means we can't
 	 * store 4GB objects; I'd rather do that than increase the size of this
@@ -528,6 +539,9 @@ struct __wt_update {
 #define	WT_UPDATE_DELETED_ISSET(upd)	((upd)->size == UINT32_MAX)
 #define	WT_UPDATE_DELETED_SET(upd)	((upd)->size = UINT32_MAX)
 	uint32_t size;			/* update length */
+	wt_txnid_t txnid;		/* update transaction */
+
+	WT_UPDATE *next;		/* forward-linked list */
 
 	/* The untyped value immediately follows the WT_UPDATE structure. */
 #define	WT_UPDATE_DATA(upd)						\
diff --git a/src/include/btree.h b/src/include/btree.h
index f46783ab479..5a7e6665e13 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -32,6 +32,14 @@
 #define	WT_BTREE_MAX_OBJECT_SIZE	(UINT32_MAX - 512)
 
 /*
+ * A location in a file is a variable-length cookie, but it has a maximum size
+ * so it's easy to create temporary space in which to store them.  (Locations
+ * can't be much larger than this anyway, they must fit onto the minimum size
+ * page because a reference to an overflow page is itself a location.)
+ */
+#define	WT_BTREE_MAX_ADDR_COOKIE	255	/* Maximum address cookie */
+
+/*
  * Split page size calculation -- we don't want to repeatedly split every time
  * a new entry is added, so we split to a smaller-than-maximum page size.
  */
@@ -62,9 +70,12 @@ struct __wt_btree {
 
 	volatile uint32_t lru_count;	/* Count of threads in LRU eviction. */
 
-	const char *name;		/* Logical name */
-	const char *filename;		/* File name */
+	const char *name;		/* Object name as a URI */
 	const char *config;		/* Configuration string */
+	const char *snapshot;		/* Snapshot name (or NULL) */
+
+	/* XXX Should move into the session-level handle information. */
+	WT_RWLOCK   *snaplock;		/* Lock for snapshot creation */
 
 	enum {	BTREE_COL_FIX=1,	/* Fixed-length column store */
 		BTREE_COL_VAR=2,	/* Variable-length column store */
@@ -95,10 +106,8 @@ struct __wt_btree {
 	uint64_t last_recno;		/* Column-store last record number */
 
 	WT_PAGE *root_page;		/* Root page */
-	WT_ADDR  root_addr;		/* Replacement root address */
-	int	 root_update;		/* 0: free original root blocks
-					   1: free saved root blocks and
-					      update on close */
+
+	WT_SNAPSHOT *snap;		/* Snapshot information */
 
 	void *block;			/* Block manager */
 	u_int block_header;		/* Block manager header length */
@@ -107,27 +116,21 @@ struct __wt_btree {
 
 	WT_BTREE_STATS *stats;		/* Btree statistics */
 
-#define	WT_BTREE_BULK		0x01	/* Bulk-load handle */
-#define	WT_BTREE_EXCLUSIVE	0x02	/* Need exclusive access to handle */
-#define	WT_BTREE_NO_LOCK	0x04	/* Do not lock the handle */
-#define	WT_BTREE_OPEN		0x08	/* Handle is open */
-#define	WT_BTREE_SALVAGE	0x10	/* Handle is for salvage */
-#define	WT_BTREE_UPGRADE	0x20	/* Handle is for upgrade */
-#define	WT_BTREE_VERIFY		0x40	/* Handle is for verify */
+#define	WT_BTREE_BULK		0x0001	/* Bulk-load handle */
+#define	WT_BTREE_EXCLUSIVE	0x0002	/* Need exclusive access to handle */
+#define	WT_BTREE_LOCK_ONLY	0x0004	/* Handle is only needed for locking */
+#define	WT_BTREE_NO_EVICTION	0x0008	/* The file isn't evicted */
+#define	WT_BTREE_OPEN		0x0010	/* Handle is open */
+#define	WT_BTREE_SALVAGE	0x0020	/* Handle is for salvage */
+#define	WT_BTREE_SNAPSHOT_OP	0x0040	/* Handle is for a snapshot operation */
+#define	WT_BTREE_UPGRADE	0x0080	/* Handle is for upgrade */
+#define	WT_BTREE_VERIFY		0x0100	/* Handle is for verify */
 	uint32_t flags;
 };
 
-/*
- * In diagnostic mode we track the locations from which hazard references
- * were acquired.
- */
-#ifdef HAVE_DIAGNOSTIC
-#define	__wt_page_in(a, b, c)						\
-	__wt_page_in_func(a, b, c, __FILE__, __LINE__)
-#else
-#define	__wt_page_in(a, b, c)						\
-	__wt_page_in_func(a, b, c)
-#endif
+/* Flags that make a btree handle special (not for normal use). */
+#define	WT_BTREE_SPECIAL_FLAGS	 					\
+	(WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
 
 /*
  * WT_SALVAGE_COOKIE --
diff --git a/src/include/btree.i b/src/include/btree.i
index 64105167e5c..e076375e5ad 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -162,9 +162,14 @@ __wt_page_is_modified(WT_PAGE *page)
  *	Confirm the page's write generation number is correct.
  */
 static inline int
-__wt_page_write_gen_check(WT_PAGE *page, uint32_t write_gen)
+__wt_page_write_gen_check(
+    WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t write_gen)
 {
-	return (page->modify->write_gen == write_gen ? 0 : WT_RESTART);
+	if (page->modify->write_gen == write_gen)
+		return (0);
+
+	WT_BSTAT_INCR(session, file_write_conflicts);
+	return (WT_RESTART);
 }
 
 /*
@@ -220,17 +225,50 @@ __wt_get_addr(
 
 /*
  * __wt_page_release --
- *	Release a reference to a page, unless it's pinned into memory, in which
- * case we never acquired a hazard reference.
+ *	Release a reference to a page.
  */
 static inline void
 __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page)
 {
+	/* We never acquired a hazard reference on the root page. */
 	if (page != NULL && !WT_PAGE_IS_ROOT(page))
 		__wt_hazard_clear(session, page);
 }
 
 /*
+ * __wt_page_hazard_check --
+ *	Return if there's a hazard reference to the page in the system.
+ */
+static inline WT_HAZARD *
+__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_HAZARD *hp;
+	WT_SESSION_IMPL *s;
+	uint32_t i, session_cnt;
+
+	conn = S2C(session);
+
+	/*
+	 * No lock is required because the session array is fixed size, but it
+	 * may contain inactive entries.  We must review any active session
+	 * that might contain a hazard reference, so insert a barrier before
+	 * reading the active session count.  That way, no matter what sessions
+	 * come or go, we'll check the slots for all of the sessions that could
+	 * have been active when we started our check.
+	 */
+	WT_ORDERED_READ(session_cnt, conn->session_cnt);
+	for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
+		if (!s->active)
+			continue;
+		for (hp = s->hazard; hp < s->hazard + conn->hazard_size; ++hp)
+			if (hp->page == page)
+				return (hp);
+	}
+	return (NULL);
+}
+
+/*
  * __wt_skip_choose_depth --
  *      Randomly choose a depth for a skiplist insert.
  */
diff --git a/src/include/cache.h b/src/include/cache.h
index 63a545e5dc3..2fc7e0fedd3 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -6,26 +6,12 @@
  */
 
 /*
- * WT_EVICT_LIST --
+ * WT_EVICT_ENTRY --
  *	Encapsulation of an eviction candidate.
  */
-struct __wt_evict_list {
-	WT_BTREE *btree;			/* File object */
-	WT_PAGE	 *page;				/* Page */
-};
-
-/*
- * WT_EVICT_REQ --
- *	Encapsulation of a eviction request.
- */
-struct __wt_evict_req {
-	WT_SESSION_IMPL *session;		/* Requesting thread */
-	WT_BTREE *btree;			/* Btree */
-	WT_PAGE *page;                          /* Single page to flush */
-
-#define	WT_EVICT_REQ_CLOSE      0x1		/* Discard pages */
-#define	WT_EVICT_REQ_PAGE       0x2		/* Force out a page */
-	uint32_t flags;
+struct __wt_evict_entry {
+	WT_BTREE *btree;			/* Enclosing btree object */
+	WT_PAGE	 *page;				/* Page to flush/evict */
 };
 
 /*
@@ -54,18 +40,28 @@ struct __wt_cache {
 	 * Eviction thread information.
 	 */
 	WT_CONDVAR *evict_cond;		/* Cache eviction server mutex */
+	WT_SPINLOCK evict_lock;		/* Eviction serialization */
 
-	WT_SPINLOCK lru_lock;		/* Manage the eviction list. */
+	u_int eviction_trigger;		/* Percent to trigger eviction. */
+	u_int eviction_target;		/* Percent to end eviction */
 
-	WT_EVICT_LIST *evict;		/* Pages being tracked for eviction */
-	WT_EVICT_LIST *evict_current;	/* Current page to be evicted */
-	size_t   evict_allocated;	/* Bytes allocated */
-	uint32_t evict_entries;		/* Total evict slots */
+	/*
+	 * LRU eviction list information.
+	 */
+	WT_EVICT_ENTRY *evict;		/* LRU pages being tracked */
+	WT_EVICT_ENTRY *evict_current;	/* LRU current page to be evicted */
+	size_t   evict_allocated;	/* LRU list bytes allocated */
+	uint32_t evict_entries;		/* LRU list eviction slots */
 
-	u_int eviction_trigger;		/* Percent to trigger eviction. */
-	u_int eviction_target;		/* Percent to end eviction. */
+	/*
+	 * Forced-page eviction request information.
+	 */
+	WT_EVICT_ENTRY *evict_request;	/* Forced page eviction request list */
+	uint32_t max_evict_request;	/* Size of the eviction request array */
 
-	WT_EVICT_REQ *evict_request;	/* Eviction requests:
-					   slot available if session is NULL */
-	uint32_t max_evict_request;	/* Size of the evict request array */
+	/*
+	 * Sync/flush request information.
+	 */
+	volatile uint64_t sync_request;	/* File sync requests */
+	volatile uint64_t sync_complete;/* File sync requests completed */
 };
diff --git a/src/include/cache.i b/src/include/cache.i
index 5ac61f6cb69..f7991760898 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -37,7 +37,7 @@ __wt_eviction_check(WT_SESSION_IMPL *session, int *read_lockoutp, int wake)
 
 /*
  * __wt_eviction_page_check --
- *	Check if a page is too big and wake the eviction server if necessary.
+ *	Return if a page should be forcibly evicted.
  */
 static inline int
 __wt_eviction_page_check(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -46,21 +46,7 @@ __wt_eviction_page_check(WT_SESSION_IMPL *session, WT_PAGE *page)
 
 	conn = S2C(session);
 
-	/*
-	 * If the page is pathologically large, force eviction.
-	 * Otherwise, if the cache is more than 95% full, wake up the eviction
-	 * thread.
-	 */
-	if (page != NULL && !WT_PAGE_IS_ROOT(page) &&
+	return (!WT_PAGE_IS_ROOT(page) && __wt_page_is_modified(page) &&
 	    (((int64_t)page->memory_footprint > conn->cache_size / 2) ||
-	    (page->memory_footprint > 20 * session->btree->maxleafpage))) {
-		/*
-		 * We're already inside a serialized function, so we need to
-		 * take some care.
-		 */
-		WT_RET(__wt_evict_page_request(session, page));
-	} else
-		__wt_eviction_check(session, NULL, 1);
-
-	return (0);
+	    (page->memory_footprint > 20 * session->btree->maxleafpage)));
 }
diff --git a/src/include/cursor.i b/src/include/cursor.i
index cea5ac359f9..11a3458c4b2 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -104,14 +104,9 @@ __cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip)
 
 	/*
 	 * Return the WT_ROW slot's K/V pair.
-	 *
-	 * Multiple threads of control may be searching this page, which
-	 * means the key may change underfoot, and here's where it gets
-	 * tricky: first, copy the key.  We don't need any barriers, the
-	 * key is updated atomically, and we just need a valid copy.
 	 */
-	key = rip->key;
 
+	key = WT_ROW_KEY_COPY(rip);
 	/*
 	 * Key copied.
 	 *
diff --git a/src/include/error.h b/src/include/error.h
index ddbf7966bf3..4d8f18ae74e 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -11,10 +11,10 @@
 /* Return and branch-to-err-label cases for switch statements. */
 #define	WT_ILLEGAL_VALUE(session)					\
 	default:							\
-		return (__wt_illegal_value(session))
+		return (__wt_illegal_value(session, NULL))
 #define	WT_ILLEGAL_VALUE_ERR(session)					\
 	default:							\
-		ret = __wt_illegal_value(session);			\
+		ret = __wt_illegal_value(session, NULL);		\
 		goto err
 
 /* Set "ret" and branch-to-err-label tests. */
@@ -58,17 +58,25 @@
 } while (0)
 
 /*
- * WT_ASSERT, WT_ASSERT_RET --
+ * WT_ASSERT, WT_ASSERT_ERR, WT_ASSERT_RET --
  *	Assert an expression, abort in diagnostic mode, otherwise, optionally
- * return an error.
+ *	return an error.
  */
 #define	WT_ASSERT(session, exp) do {					\
 	if (!(exp))							\
-		(void)__wt_assert(					\
-		    session, 0, __FILE__, __LINE__, "%s", #exp);	\
+		__wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
+} while (0)
+#define	WT_ASSERT_ERR(session, exp) do {				\
+	if (!(exp)) {							\
+		__wt_assert(						\
+		    session, WT_ERROR, __FILE__, __LINE__, "%s", #exp);	\
+		WT_ERR(WT_ERROR);					\
+	}								\
 } while (0)
 #define	WT_ASSERT_RET(session, exp) do {				\
-	if (!(exp))							\
-		return (__wt_assert(					\
-		    session, WT_ERROR, __FILE__, __LINE__, "%s", #exp));\
+	if (!(exp)) {							\
+		__wt_assert(						\
+		    session, WT_ERROR, __FILE__, __LINE__, "%s", #exp);	\
+		return (WT_ERROR);					\
+	}								\
 } while (0)
diff --git a/src/include/extern.h b/src/include/extern.h
index 2e832d21cfa..cd972b3edb0 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -1,15 +1,15 @@
 /* DO NOT EDIT: automatically built by dist/s_prototypes. */
 
+extern int __wt_block_addr_to_buffer(WT_BLOCK *block,
+    uint8_t **pp,
+    off_t offset,
+    uint32_t size,
+    uint32_t cksum);
 extern int __wt_block_buffer_to_addr(WT_BLOCK *block,
     const uint8_t *p,
     off_t *offsetp,
     uint32_t *sizep,
     uint32_t *cksump);
-extern int __wt_block_addr_to_buffer(WT_BLOCK *block,
-    uint8_t **p,
-    off_t offset,
-    uint32_t size,
-    uint32_t cksum);
 extern int __wt_block_addr_valid(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     const uint8_t *addr,
@@ -19,36 +19,63 @@ extern int __wt_block_addr_string(WT_SESSION_IMPL *session,
     WT_ITEM *buf,
     const uint8_t *addr,
     uint32_t addr_size);
+extern int __wt_block_buffer_to_snapshot(WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    const uint8_t *p,
+    WT_BLOCK_SNAPSHOT *si);
+extern int __wt_block_snapshot_to_buffer(WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    uint8_t **pp,
+    WT_BLOCK_SNAPSHOT *si);
 extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session,
+    WT_EXTLIST *el,
+    off_t off,
+    off_t size);
 extern int __wt_block_alloc( WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     off_t *offp,
     off_t size);
-extern int __wt_block_free_buf(WT_SESSION_IMPL *session,
+extern int __wt_block_extend( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    off_t *offp,
+    off_t size);
+extern int __wt_block_free(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     const uint8_t *addr,
     uint32_t addr_size);
-extern int __wt_block_free( WT_SESSION_IMPL *session,
+extern int __wt_block_off_free( WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     off_t off,
     off_t size);
-extern int __wt_block_extlist_read(WT_SESSION_IMPL *session,
+extern int __wt_block_extlist_check( WT_SESSION_IMPL *session,
+    WT_EXTLIST *al,
+    WT_EXTLIST *bl);
+extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session,
     WT_BLOCK *block,
+    WT_BLOCK_SNAPSHOT *si);
+extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session,
+    WT_EXTLIST *a,
+    WT_EXTLIST *b);
+extern int __wt_block_insert_ext( WT_SESSION_IMPL *session,
     WT_EXTLIST *el,
     off_t off,
-    uint32_t size,
-    uint32_t cksum);
-extern void __wt_block_freelist_open(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern void __wt_block_freelist_close(WT_SESSION_IMPL *session,
-    WT_BLOCK *block);
+    off_t size);
+extern int __wt_block_extlist_read( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_EXTLIST *el);
 extern int __wt_block_extlist_write(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_EXTLIST *el,
-    off_t *offp,
-    uint32_t *sizep,
-    uint32_t *cksump);
-extern void __wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block);
+    WT_EXTLIST *additional);
+extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_EXTLIST *el);
+extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_extlist_dump( WT_SESSION_IMPL *session,
+    const char *tag,
+    WT_EXTLIST *el,
+    int show_size);
 extern int __wt_bm_addr_valid( WT_SESSION_IMPL *session,
     const uint8_t *addr,
     uint32_t addr_size);
@@ -64,8 +91,19 @@ extern int __wt_bm_open(WT_SESSION_IMPL *session,
     const char *filename,
     const char *config,
     const char *cfg[],
-    int salvage);
+    int forced_salvage);
 extern int __wt_bm_close(WT_SESSION_IMPL *session);
+extern int __wt_bm_snapshot(WT_SESSION_IMPL *session,
+    WT_ITEM *buf,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_bm_snapshot_resolve(WT_SESSION_IMPL *session,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_bm_snapshot_load(WT_SESSION_IMPL *session,
+    WT_ITEM *buf,
+    const uint8_t *addr,
+    uint32_t addr_size,
+    int readonly);
+extern int __wt_bm_snapshot_unload(WT_SESSION_IMPL *session);
 extern int __wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename);
 extern int __wt_bm_free(WT_SESSION_IMPL *session,
     const uint8_t *addr,
@@ -74,7 +112,6 @@ extern int __wt_bm_read(WT_SESSION_IMPL *session,
     WT_ITEM *buf,
     const uint8_t *addr,
     uint32_t addr_size);
-extern int __wt_bm_block_header(WT_SESSION_IMPL *session, uint32_t *headerp);
 extern int __wt_bm_write_size(WT_SESSION_IMPL *session, uint32_t *sizep);
 extern int __wt_bm_write( WT_SESSION_IMPL *session,
     WT_ITEM *buf,
@@ -88,8 +125,9 @@ extern int __wt_bm_salvage_next(WT_SESSION_IMPL *session,
     uint32_t *addr_sizep,
     uint64_t *write_genp,
     int *eofp);
-extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session, int success);
-extern int __wt_bm_verify_start(WT_SESSION_IMPL *session, int *emptyp);
+extern int __wt_bm_salvage_end(WT_SESSION_IMPL *session);
+extern int __wt_bm_verify_start(WT_SESSION_IMPL *session,
+    WT_SNAPSHOT *snapbase);
 extern int __wt_bm_verify_end(WT_SESSION_IMPL *session);
 extern int __wt_bm_verify_addr(WT_SESSION_IMPL *session,
     const uint8_t *addr,
@@ -100,25 +138,24 @@ extern int __wt_block_open(WT_SESSION_IMPL *session,
     const char *filename,
     const char *config,
     const char *cfg[],
-    int salvage,
-    void *retp);
+    int forced_salvage,
+    void *blockp);
 extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
 extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh);
-extern int __wt_block_read_buf(WT_SESSION_IMPL *session,
+extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_read(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_ITEM *buf,
     const uint8_t *addr,
     uint32_t addr_size);
-extern int __wt_block_read(WT_SESSION_IMPL *session,
+extern int __wt_block_read_off(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_ITEM *buf,
     off_t offset,
     uint32_t size,
     uint32_t cksum);
 extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_salvage_end(WT_SESSION_IMPL *session,
-    WT_BLOCK *block,
-    int success);
+extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
 extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_ITEM *buf,
@@ -126,31 +163,62 @@ extern int __wt_block_salvage_next( WT_SESSION_IMPL *session,
     uint32_t *addr_sizep,
     uint64_t *write_genp,
     int *eofp);
-extern int __wt_block_verify_start(WT_SESSION_IMPL *session,
+extern int __wt_block_snap_init(WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_BLOCK_SNAPSHOT *si,
+    int is_live);
+extern int __wt_block_snapshot_load(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
-    int *emptyp);
+    WT_ITEM *dsk,
+    const uint8_t *addr,
+    uint32_t addr_size,
+    int readonly);
+extern int __wt_block_snapshot_unload(WT_SESSION_IMPL *session,
+    WT_BLOCK *block);
+extern int __wt_block_snapshot(WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_ITEM *buf,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_block_snapshot_resolve( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_block_verify_start( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_SNAPSHOT *snapbase);
 extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_verify_snap_load( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_BLOCK_SNAPSHOT *si);
+extern int __wt_verify_snap_unload( WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_BLOCK_SNAPSHOT *si);
+extern int __wt_block_verify(WT_SESSION_IMPL *session,
+    WT_BLOCK *block,
+    WT_ITEM *buf,
+    const uint8_t *addr,
+    uint32_t addr_size,
+    off_t offset,
+    uint32_t size);
 extern int __wt_block_verify_addr(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     const uint8_t *addr,
     uint32_t addr_size);
-extern int __wt_block_header(WT_SESSION_IMPL *session,
-    WT_BLOCK *block,
-    uint32_t *headerp);
+extern u_int __wt_block_header(WT_SESSION_IMPL *session);
 extern int __wt_block_write_size( WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     uint32_t *sizep);
-extern int __wt_block_write_buf(WT_SESSION_IMPL *session,
+extern int __wt_block_write(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_ITEM *buf,
     uint8_t *addr,
     uint32_t *addr_size);
-extern int __wt_block_write(WT_SESSION_IMPL *session,
+extern int __wt_block_write_off(WT_SESSION_IMPL *session,
     WT_BLOCK *block,
     WT_ITEM *buf,
     off_t *offsetp,
     uint32_t *sizep,
-    uint32_t *cksump);
+    uint32_t *cksump,
+    int force_extend);
 extern int __wt_bulk_init(WT_CURSOR_BULK *cbulk);
 extern int __wt_bulk_insert(WT_CURSOR_BULK *cbulk);
 extern int __wt_bulk_end(WT_CURSOR_BULK *cbulk);
@@ -173,8 +241,12 @@ extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt);
 extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt);
 extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt);
 extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
-extern int __wt_debug_addr( WT_SESSION_IMPL *session,
-    uint32_t addr,
+extern int __wt_debug_addr(WT_SESSION_IMPL *session,
+    const uint8_t *addr,
+    uint32_t addr_size,
+    const char *ofile);
+extern int __wt_debug_off( WT_SESSION_IMPL *session,
+    uint32_t offset,
     uint32_t size,
     const char *ofile);
 extern int __wt_debug_disk( WT_SESSION_IMPL *session,
@@ -190,21 +262,23 @@ extern int __wt_debug_page(WT_SESSION_IMPL *session,
     WT_PAGE *page,
     const char *ofile);
 extern void __wt_page_out(WT_SESSION_IMPL *session,
-    WT_PAGE *page,
+    WT_PAGE **pagep,
     uint32_t flags);
-extern void __wt_evict_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_evict_list_clr_page(WT_SESSION_IMPL *session, WT_PAGE *page);
 extern void __wt_evict_server_wake(WT_SESSION_IMPL *session);
-extern void __wt_evict_file_serial_func(WT_SESSION_IMPL *session);
-extern int __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_sync_file_serial_func(WT_SESSION_IMPL *session);
+extern void __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page);
 extern void *__wt_cache_evict_server(void *arg);
 extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
 extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename);
 extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename);
 extern int __wt_btree_open(WT_SESSION_IMPL *session,
+    const uint8_t *addr,
+    uint32_t addr_size,
     const char *cfg[],
-    uint32_t flags);
+    int readonly);
 extern int __wt_btree_close(WT_SESSION_IMPL *session);
-extern int __wt_btree_root_init(WT_SESSION_IMPL *session, WT_ITEM *addr);
+extern int __wt_btree_tree_open(WT_SESSION_IMPL *session, WT_ITEM *dsk);
 extern int __wt_btree_root_empty(WT_SESSION_IMPL *session, WT_PAGE **leafp);
 extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session,
     const char *config);
@@ -233,7 +307,6 @@ extern int __wt_page_inmem(WT_SESSION_IMPL *session,
     WT_PAGE *parent,
     WT_REF *parent_ref,
     WT_PAGE_HEADER *dsk,
-    size_t *inmem_sizep,
     WT_PAGE **pagep);
 extern int __wt_cache_read(WT_SESSION_IMPL *session,
     WT_PAGE *parent,
@@ -241,16 +314,20 @@ extern int __wt_cache_read(WT_SESSION_IMPL *session,
 extern int __wt_kv_return(WT_SESSION_IMPL *session,
     WT_CURSOR_BTREE *cbt,
     int key_ret);
-extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_bt_salvage( WT_SESSION_IMPL *session,
+    WT_SNAPSHOT *snapbase,
+    const char *cfg[]);
 extern int __wt_btree_stat_init(WT_SESSION_IMPL *session);
-extern int __wt_btree_sync(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_bt_cache_flush( WT_SESSION_IMPL *session,
+    WT_SNAPSHOT *snapbase,
+    int op,
+    int force);
 extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]);
 extern int __wt_verify_dsk(WT_SESSION_IMPL *session,
     const char *addr,
-    WT_PAGE_HEADER *dsk,
-    uint32_t size);
+    WT_ITEM *buf);
 extern int __wt_tree_np(WT_SESSION_IMPL *session,
     WT_PAGE **pagep,
     int eviction,
@@ -265,27 +342,35 @@ extern int __wt_col_search(WT_SESSION_IMPL *session,
 extern int __wt_rec_evict(WT_SESSION_IMPL *session,
     WT_PAGE *page,
     uint32_t flags);
-extern int __wt_rec_track_block(WT_SESSION_IMPL *session,
-    __wt_pt_type_t type,
+extern int __wt_rec_track(WT_SESSION_IMPL *session,
     WT_PAGE *page,
     const uint8_t *addr,
-    uint32_t size);
-extern int __wt_rec_track_ovfl(WT_SESSION_IMPL *session,
-    WT_PAGE *page,
-    uint8_t *addr,
     uint32_t addr_size,
     const void *data,
-    uint32_t data_size);
-extern int __wt_rec_track_ovfl_reuse(WT_SESSION_IMPL *session,
+    uint32_t data_size,
+    uint32_t flags);
+extern int __wt_rec_track_onpage_srch(WT_SESSION_IMPL *session,
+    WT_PAGE *page,
+    const uint8_t *addr,
+    uint32_t addr_size,
+    int *foundp,
+    WT_ITEM *copy);
+extern int __wt_rec_track_onpage_add(WT_SESSION_IMPL *session,
+    WT_PAGE *page,
+    const uint8_t *addr,
+    uint32_t addr_size);
+extern int __wt_rec_track_ovfl_reuse( WT_SESSION_IMPL *session,
     WT_PAGE *page,
     const void *data,
-    uint32_t size,
+    uint32_t data_size,
     uint8_t **addrp,
-    uint32_t *sizep);
+    uint32_t *addr_sizep,
+    int *foundp);
 extern int __wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_rec_track_wrapup(WT_SESSION_IMPL *session,
-    WT_PAGE *page,
-    int final);
+extern int __wt_rec_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_rec_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_rec_track_discard(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern char *__wt_track_string(WT_PAGE_TRACK *track, char *buf, size_t len);
 extern int __wt_rec_write( WT_SESSION_IMPL *session,
     WT_PAGE *page,
     WT_SALVAGE_COOKIE *salvage);
@@ -305,7 +390,7 @@ extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session,
     uint32_t cell_offset,
     const void *key,
     uint32_t size,
-    WT_IKEY **ikeyp);
+    void *ikeyp);
 extern void __wt_row_key_serial_func(WT_SESSION_IMPL *session);
 extern int __wt_row_modify(WT_SESSION_IMPL *session,
     WT_CURSOR_BTREE *cbt,
@@ -316,6 +401,9 @@ extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session,
     WT_INSERT **insp,
     size_t *ins_sizep);
 extern void __wt_insert_serial_func(WT_SESSION_IMPL *session);
+extern int __wt_update_check(WT_SESSION_IMPL *session,
+    WT_PAGE *page,
+    WT_UPDATE *next);
 extern int __wt_update_alloc(WT_SESSION_IMPL *session,
     WT_ITEM *value,
     WT_UPDATE **updp,
@@ -371,7 +459,7 @@ extern  int __wt_config_subgets(WT_SESSION_IMPL *session,
 extern int __wt_config_check(WT_SESSION_IMPL *session,
     const char *checks,
     const char *config);
-extern int __wt_config_collapse(WT_SESSION_IMPL *session,
+extern int __wt_config_collapse( WT_SESSION_IMPL *session,
     const char **cfg,
     const char **config_ret);
 extern int __wt_config_concat( WT_SESSION_IMPL *session,
@@ -383,8 +471,8 @@ extern const char *__wt_confdfl_connection_add_collator;
 extern const char *__wt_confchk_connection_add_collator;
 extern const char *__wt_confdfl_connection_add_compressor;
 extern const char *__wt_confchk_connection_add_compressor;
-extern const char *__wt_confdfl_connection_add_cursor_type;
-extern const char *__wt_confchk_connection_add_cursor_type;
+extern const char *__wt_confdfl_connection_add_data_source;
+extern const char *__wt_confchk_connection_add_data_source;
 extern const char *__wt_confdfl_connection_add_extractor;
 extern const char *__wt_confchk_connection_add_extractor;
 extern const char *__wt_confdfl_connection_close;
@@ -435,17 +523,25 @@ extern const char *__wt_confdfl_table_meta;
 extern const char *__wt_confchk_table_meta;
 extern const char *__wt_confdfl_wiredtiger_open;
 extern const char *__wt_confchk_wiredtiger_open;
+extern void __wt_conn_btree_open_lock(WT_SESSION_IMPL *session, uint32_t flags);
+extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session);
 extern int __wt_conn_btree_open(WT_SESSION_IMPL *session,
-    const char *name,
-    const char *filename,
     const char *config,
     const char *cfg[],
     uint32_t flags);
-extern int __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked);
-extern int __wt_conn_btree_remove(WT_CONNECTION_IMPL *conn);
-extern int __wt_conn_btree_reopen( WT_SESSION_IMPL *session,
+extern int __wt_conn_btree_get(WT_SESSION_IMPL *session,
+    const char *name,
+    const char *snapshot,
     const char *cfg[],
     uint32_t flags);
+extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session,
+    int (*func)(WT_SESSION_IMPL *,
+    const char *[]),
+    const char *cfg[]);
+extern int __wt_conn_btree_close(WT_SESSION_IMPL *session, int locked);
+extern int __wt_conn_btree_close_all(WT_SESSION_IMPL *session,
+    const char *name);
+extern int __wt_conn_btree_discard(WT_CONNECTION_IMPL *conn);
 extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
 extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
 extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]);
@@ -465,6 +561,7 @@ extern int __wt_curfile_create(WT_SESSION_IMPL *session,
     WT_CURSOR **cursorp);
 extern int __wt_curfile_open(WT_SESSION_IMPL *session,
     const char *uri,
+    WT_CURSOR *owner,
     const char *cfg[],
     WT_CURSOR **cursorp);
 extern int __wt_curindex_open(WT_SESSION_IMPL *session,
@@ -511,7 +608,65 @@ extern int __wt_log_printf(WT_SESSION_IMPL *session,
     2,
     3)));
 extern WT_LOGREC_DESC __wt_logdesc_debug;
-extern void __wt_abort(WT_SESSION_IMPL *session);
+extern int __wt_metadata_get(WT_SESSION *session,
+    const char *uri,
+    const char **valuep);
+extern int __wt_metadata_get_snaplist( WT_SESSION *session,
+    const char *name,
+    WT_SNAPSHOT **snapbasep);
+extern void __wt_metadata_free_snaplist(WT_SESSION *session,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session,
+    int (*func)(WT_SESSION_IMPL *,
+    const char *[]),
+    const char *cfg[],
+    uint32_t flags);
+extern int __wt_meta_snapshot_get(WT_SESSION_IMPL *session,
+    const char *name,
+    const char *snapshot,
+    WT_ITEM *addr);
+extern int __wt_meta_snapshot_clear(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_meta_snaplist_get( WT_SESSION_IMPL *session,
+    const char *name,
+    WT_SNAPSHOT **snapbasep);
+extern int __wt_meta_snaplist_set( WT_SESSION_IMPL *session,
+    const char *name,
+    WT_SNAPSHOT *snapbase);
+extern void __wt_meta_snaplist_free(WT_SESSION_IMPL *session,
+    WT_SNAPSHOT *snapbase);
+extern int __wt_metadata_open(WT_SESSION_IMPL *session);
+extern int __wt_metadata_cursor( WT_SESSION_IMPL *session,
+    const char *config,
+    WT_CURSOR **cursorp);
+extern int __wt_metadata_insert( WT_SESSION_IMPL *session,
+    const char *key,
+    const char *value);
+extern int __wt_metadata_update( WT_SESSION_IMPL *session,
+    const char *key,
+    const char *value);
+extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_metadata_read( WT_SESSION_IMPL *session,
+    const char *key,
+    const char **valuep);
+extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_off(WT_SESSION_IMPL *session, int unroll);
+extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session,
+    const char *olduri,
+    const char *newuri);
+extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session);
+extern int __wt_meta_turtle_init(WT_SESSION_IMPL *session, int *existp);
+extern int __wt_meta_turtle_read( WT_SESSION_IMPL *session,
+    const char *key,
+    const char **valuep);
+extern int __wt_meta_turtle_update( WT_SESSION_IMPL *session,
+    const char *key,
+    const char *value);
+extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_ATTRIBUTE((noreturn));
 extern int __wt_calloc(WT_SESSION_IMPL *session,
     size_t number,
     size_t size,
@@ -588,17 +743,23 @@ extern int __wt_thread_create(pthread_t *tidret,
     void *(*func)(void *),
     void *arg);
 extern int __wt_thread_join(pthread_t tid);
-extern int __wt_epoch(WT_SESSION_IMPL *session, time_t *secp, long *nsecp);
+extern int __wt_epoch(WT_SESSION_IMPL *session,
+    uintmax_t *secp,
+    uintmax_t *nsecp);
 extern void __wt_yield(void);
 extern int __wt_struct_check(WT_SESSION_IMPL *session,
     const char *fmt,
     size_t len,
     int *fixedp,
     uint32_t *fixed_lenp);
-extern size_t __wt_struct_sizev(WT_SESSION_IMPL *session,
+extern int __wt_struct_sizev( WT_SESSION_IMPL *session,
+    size_t *sizep,
     const char *fmt,
     va_list ap);
-extern size_t __wt_struct_size(WT_SESSION_IMPL *session, const char *fmt, ...);
+extern int __wt_struct_size(WT_SESSION_IMPL *session,
+    size_t *sizep,
+    const char *fmt,
+    ...);
 extern int __wt_struct_packv(WT_SESSION_IMPL *session,
     void *buffer,
     size_t size,
@@ -619,11 +780,6 @@ extern int __wt_struct_unpack(WT_SESSION_IMPL *session,
     size_t size,
     const char *fmt,
     ...);
-extern int __wt_create_file(WT_SESSION_IMPL *session,
-    const char *name,
-    const char *fileuri,
-    int exclusive,
-    const char *config);
 extern int __wt_schema_create( WT_SESSION_IMPL *session,
     const char *name,
     const char *config);
@@ -643,12 +799,11 @@ extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session,
     WT_TABLE *table);
 extern int __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table);
 extern int __wt_schema_close_tables(WT_SESSION_IMPL *session);
-extern void __wt_schema_detach_tree(WT_SESSION_IMPL *session, WT_BTREE *btree);
 extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session,
     WT_TABLE *table,
     const char *cgname,
     size_t len,
-    char **namebufp);
+    WT_ITEM *namebuf);
 extern int __wt_schema_get_btree(WT_SESSION_IMPL *session,
     const char *objname,
     size_t len,
@@ -711,32 +866,12 @@ extern int __wt_schema_rename(WT_SESSION_IMPL *session,
     const char *uri,
     const char *newuri,
     const char *cfg[]);
-extern int __wt_open_schema_table(WT_SESSION_IMPL *session);
-extern int __wt_schema_table_cursor( WT_SESSION_IMPL *session,
-    const char *config,
-    WT_CURSOR **cursorp);
-extern int __wt_schema_table_insert( WT_SESSION_IMPL *session,
-    const char *key,
-    const char *value);
-extern int __wt_schema_table_update( WT_SESSION_IMPL *session,
-    const char *key,
-    const char *value);
-extern int __wt_schema_table_remove(WT_SESSION_IMPL *session, const char *key);
-extern int __wt_schema_table_read( WT_SESSION_IMPL *session,
-    const char *key,
-    const char **valuep);
-extern int __wt_schema_table_track_on(WT_SESSION_IMPL *session);
-extern int __wt_schema_table_track_off(WT_SESSION_IMPL *session, int unroll);
-extern int __wt_schema_table_track_insert(WT_SESSION_IMPL *session,
-    const char *key);
-extern int __wt_schema_table_track_update(WT_SESSION_IMPL *session,
-    const char *key);
-extern int __wt_schema_table_track_fileop( WT_SESSION_IMPL *session,
-    const char *oldname,
-    const char *newname);
 extern int __wt_schema_truncate( WT_SESSION_IMPL *session,
     const char *uri,
     const char *cfg[]);
+extern int __wt_schema_get_source( WT_SESSION_IMPL *session,
+    const char *name,
+    WT_DATA_SOURCE **dsrcp);
 extern int __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri);
 extern int __wt_schema_worker(WT_SESSION_IMPL *session,
     const char *uri,
@@ -755,40 +890,23 @@ extern int __wt_open_session(WT_CONNECTION_IMPL *conn,
     WT_SESSION_IMPL **sessionp);
 extern int __wt_session_add_btree( WT_SESSION_IMPL *session,
     WT_BTREE_SESSION **btree_sessionp);
-extern int __wt_session_lock_btree( WT_SESSION_IMPL *session,
-    const char *cfg[],
-    uint32_t flags);
+extern int __wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags);
 extern int __wt_session_release_btree(WT_SESSION_IMPL *session);
-extern int __wt_session_find_btree(WT_SESSION_IMPL *session,
-    const char *filename,
-    size_t namelen,
-    const char *cfg[],
-    uint32_t flags,
-    WT_BTREE_SESSION **btree_sessionp);
 extern int __wt_session_get_btree(WT_SESSION_IMPL *session,
-    const char *name,
-    const char *fileuri,
-    const char *tconfig,
+    const char *uri,
     const char *cfg[],
     uint32_t flags);
-extern int __wt_session_remove_btree( WT_SESSION_IMPL *session,
-    WT_BTREE_SESSION *btree_session,
-    int locked);
-extern int __wt_session_close_any_open_btree(WT_SESSION_IMPL *session,
-    const char *name);
-extern int __wt_btree_get_root(WT_SESSION_IMPL *session, WT_ITEM *addr);
-extern int __wt_btree_free_root(WT_SESSION_IMPL *session);
-extern int __wt_btree_set_root(WT_SESSION_IMPL *session,
-    const char *filename,
-    const uint8_t *addr,
-    uint32_t size);
-extern void __wt_eventv(WT_SESSION_IMPL *session,
-    int msg_event,
-    int error,
-    const char *file_name,
-    int line_number,
-    const char *fmt,
-    va_list ap);
+extern int __wt_session_lock_snapshot( WT_SESSION_IMPL *session,
+    const char *snapshot,
+    uint32_t flags);
+extern int __wt_session_discard_btree( WT_SESSION_IMPL *session,
+    WT_BTREE_SESSION *btree_session);
+extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_snapshot(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_snapshot_close(WT_SESSION_IMPL *session);
+extern int __wt_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_event_handler_set(WT_SESSION_IMPL *session,
+    WT_EVENT_HANDLER *handler);
 extern void __wt_err(WT_SESSION_IMPL *session,
     int error,
     const char *fmt,
@@ -800,18 +918,20 @@ extern void __wt_errx(WT_SESSION_IMPL *session,
     ...) WT_GCC_ATTRIBUTE((format (printf,
     2,
     3)));
-extern void __wt_msgv(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
-extern void __wt_verbose(WT_SESSION_IMPL *session,
+extern int __wt_verrx(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
+extern int __wt_msg(WT_SESSION_IMPL *session,
     const char *fmt,
     ...) WT_GCC_ATTRIBUTE((format (printf,
     2,
     3)));
-extern void __wt_msg(WT_SESSION_IMPL *session,
+extern int __wt_vmsg(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
+extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v);
+extern int __wt_verbose(WT_SESSION_IMPL *session,
     const char *fmt,
     ...) WT_GCC_ATTRIBUTE((format (printf,
     2,
     3)));
-extern int __wt_assert(WT_SESSION_IMPL *session,
+extern void __wt_assert(WT_SESSION_IMPL *session,
     int error,
     const char *file_name,
     int line_number,
@@ -819,7 +939,7 @@ extern int __wt_assert(WT_SESSION_IMPL *session,
     ...) WT_GCC_ATTRIBUTE((format (printf,
     5,
     6)));
-extern int __wt_illegal_value(WT_SESSION_IMPL *session);
+extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name);
 extern int __wt_unknown_object_type(WT_SESSION_IMPL *session, const char *uri);
 extern int __wt_filename(WT_SESSION_IMPL *session,
     const char *name,
@@ -828,25 +948,28 @@ extern int __wt_library_init(void);
 extern int __wt_breakpoint(void);
 extern void __wt_attach(WT_SESSION_IMPL *session);
 extern int
-__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
 #ifdef HAVE_DIAGNOSTIC
  , const char *file, int line
 #endif
  );
 extern void __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
 extern void __wt_hazard_empty(WT_SESSION_IMPL *session);
-extern void __wt_hazard_validate(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_raw_to_hex(WT_SESSION_IMPL *session,
+extern int __wt_raw_to_hex( WT_SESSION_IMPL *session,
     const uint8_t *from,
     uint32_t size,
     WT_ITEM *to);
-extern int __wt_raw_to_esc_hex(WT_SESSION_IMPL *session,
+extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session,
     const uint8_t *from,
     size_t size,
     WT_ITEM *to);
 extern int __wt_hex_to_raw(WT_SESSION_IMPL *session,
     const char *from,
     WT_ITEM *to);
+extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session,
+    const char *from,
+    size_t size,
+    WT_ITEM *to);
 extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session,
     const char *from,
     WT_ITEM *to);
@@ -900,9 +1023,13 @@ extern int __wt_buf_catfmt(WT_SESSION_IMPL *session,
     ...) WT_GCC_ATTRIBUTE((format (printf,
     3,
     4)));
-extern int __wt_scr_alloc(WT_SESSION_IMPL *session,
-    uint32_t size,
-    WT_ITEM **scratchp);
+extern int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session,
+ uint32_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
 extern void __wt_scr_free(WT_ITEM **bufp);
 extern void __wt_scr_discard(WT_SESSION_IMPL *session);
 extern void *__wt_scr_alloc_ext(WT_SESSION *wt_session, size_t size);
@@ -915,3 +1042,13 @@ extern void __wt_stat_clear_btree_stats(WT_STATS *stats_arg);
 extern int __wt_stat_alloc_connection_stats(WT_SESSION_IMPL *session,
     WT_CONNECTION_STATS **statsp);
 extern void __wt_stat_clear_connection_stats(WT_STATS *stats_arg);
+extern int __wt_txnid_cmp(const void *v1, const void *v2);
+extern int __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id);
+extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_init(WT_SESSION_IMPL *session);
+extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
+extern int __wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern void __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn);
diff --git a/src/include/meta.h b/src/include/meta.h
new file mode 100644
index 00000000000..7e696cf0aef
--- /dev/null
+++ b/src/include/meta.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define	WT_METADATA_URI		"file:WiredTiger.wt"
+
+#define	WT_METADATA_TURTLE	"WiredTiger.turtle"	/* Metadata metadata */
+#define	WT_METADATA_TURTLE_SET	"WiredTiger.turtle.set"	/* Turtle temp file */
+
+#define	WT_METADATA_VERSION	"WiredTiger version"	/* Version keys */
+#define	WT_METADATA_VERSION_STR	"WiredTiger version string"
+
+/*
+ * WT_SNAPSHOT --
+ *	Encapsulation of snapshot information, shared by the metadata, the
+ * btree engine, and the block manager.
+ */
+#define	WT_INTERNAL_SNAPSHOT	"WiredTigerInternalSnapshot"
+#define	WT_SNAPSHOT_FOREACH(snapbase, snap)				\
+	for ((snap) = (snapbase); (snap)->name != NULL; ++(snap))
+
+struct __wt_snapshot {
+	char	*name;				/* Name or NULL */
+
+	WT_ITEM  addr;				/* Snapshot cookie string */
+	WT_ITEM  raw;				/* Snapshot cookie raw */
+
+	int64_t	 order;				/* Snapshot order */
+
+	uintmax_t sec;				/* Timestamp */
+
+	uint64_t snapshot_size;			/* Snapshot size */
+
+	void	*bpriv;				/* Block manager private */
+
+#define	WT_SNAP_ADD	0x01			/* Snapshot to be added */
+#define	WT_SNAP_DELETE	0x02			/* Snapshot to be deleted */
+#define	WT_SNAP_UPDATE	0x04			/* Snapshot requires update */
+	uint32_t flags;
+};
diff --git a/src/include/misc.h b/src/include/misc.h
index 850e38a874e..53bc389a60c 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -6,9 +6,14 @@
  */
 
 /* Basic constants. */
+#define	WT_MILLION	(1000000)
 #define	WT_BILLION	(1000000000)
+
+#define	WT_KILOBYTE	(1024)
 #define	WT_MEGABYTE	(1048576)
-#define	WT_MILLION	(1000000)
+#define	WT_GIGABYTE	(1073741824)
+#define	WT_TERABYTE	(1099511627776)
+#define	WT_PETABYTE	(1125899906842624)
 
 /*
  * Sizes that cannot be larger than 2**32 are stored in uint32_t fields in
@@ -70,8 +75,8 @@
  * __wt_calloc_def --
  *	Simple calls don't need separate sizeof arguments.
  */
-#define	__wt_calloc_def(a, b, c)					\
-	__wt_calloc(a, (size_t)(b), sizeof(**(c)), c)
+#define	__wt_calloc_def(session, number, addr)				\
+	__wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr)
 /*
  * Our internal free function clears the underlying address atomically so there
  * is a smaller chance of racing threads seeing intermediate results while a
@@ -80,14 +85,22 @@
  * resulting bug is a mother to find -- make sure we get it right, don't make
  * the caller remember to put the & operator on the pointer.
  */
-#define	__wt_free(a, b)			__wt_free_int(a, &(b))
+#define	__wt_free(session, p)		__wt_free_int(session, &(p))
+#ifdef HAVE_DIAGNOSTIC
+#define	__wt_overwrite_and_free(session, p) do {			\
+	memset(p, WT_DEBUG_BYTE, sizeof(*(p)));				\
+	__wt_free(session, p);						\
+} while (0)
+#else
+#define	__wt_overwrite_and_free(session, p)	__wt_free(session, p)
+#endif
 
 /*
  * Flag set, clear and test.
  *
  * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure
  * referenced by its argument), LF_XXX (handles a local variable named "flags"),
- * and FLD_XXX (handles any variable, anywhere.
+ * and FLD_XXX (handles any variable, anywhere).
  *
  * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the
  * hex constant might be a negative integer), and to ensure the hex constant is
@@ -109,18 +122,28 @@
 #ifdef HAVE_VERBOSE
 #define	WT_VERBOSE_ISSET(session, f)					\
 	(FLD_ISSET(S2C(session)->verbose, WT_VERB_##f))
-#define	WT_VERBOSE(session, f, ...) do {				\
+#define	WT_VERBOSE_ERR(session, f, ...) do {				\
+	if (WT_VERBOSE_ISSET(session, f))				\
+		WT_ERR(__wt_verbose(session, #f ": " __VA_ARGS__));	\
+} while (0)
+#define	WT_VERBOSE_RET(session, f, ...) do {				\
 	if (WT_VERBOSE_ISSET(session, f))				\
-		__wt_verbose(session, #f ": " __VA_ARGS__);		\
+		WT_RET(__wt_verbose(session, #f ": " __VA_ARGS__));	\
 } while (0)
-#define	WT_VERBOSE_CALL(session, f, func) do {				\
+#define	WT_VERBOSE_RETVAL(session, f, ret, ...) do {			\
 	if (WT_VERBOSE_ISSET(session, f))				\
-		func;							\
+		(ret) = __wt_verbose(session, #f ": " __VA_ARGS__);	\
+} while (0)
+#define	WT_VERBOSE_VOID(session, f, ...) do {				\
+	if (WT_VERBOSE_ISSET(session, f))				\
+		(void)__wt_verbose(session, #f ": " __VA_ARGS__);	\
 } while (0)
 #else
 #define	WT_VERBOSE_ISSET(session, f)	0
-#define	WT_VERBOSE(session, f, ...)
-#define	WT_VERBOSE_CALL(session, f, func)
+#define	WT_VERBOSE_ERR(session, f, ...)
+#define	WT_VERBOSE_RET(session, f, ...)
+#define	WT_VERBOSE_RETVAL(session, f, ret, ...)
+#define	WT_VERBOSE_VOID(session, f, ...)
 #endif
 
 /* Clear a structure. */
@@ -135,3 +158,23 @@
 #define	WT_PREFIX_SKIP(str, pre)					\
 	((strncmp((str), (pre), strlen(pre)) == 0) ?			\
 	    ((str) += strlen(pre), 1) : 0)
+
+/* Function return value and scratch buffer declaration and initialization. */
+#define	WT_DECL_ITEM(i)	WT_ITEM *i = NULL
+#define	WT_DECL_RET	int ret = 0
+
+/*
+ * In diagnostic mode we track the locations from which hazard references and
+ * scratch buffers were acquired.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define	__wt_scr_alloc(session, size, scratchp)				\
+	__wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__)
+#define	__wt_page_in(session, parent, ref)				\
+	__wt_page_in_func(session, parent, ref, __FILE__, __LINE__)
+#else
+#define	__wt_scr_alloc(session, size, scratchp)				\
+	__wt_scr_alloc_func(session, size, scratchp)
+#define	__wt_page_in(session, parent, ref)				\
+	__wt_page_in_func(session, parent, ref)
+#endif
diff --git a/src/include/progress.i b/src/include/progress.i
deleted file mode 100644
index 315eeabc6cd..00000000000
--- a/src/include/progress.i
+++ /dev/null
@@ -1,23 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- *	All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-/*
- * __wt_progress --
- *	Send a progress message to stdout.
- */
-static inline void
-__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
-{
-	WT_EVENT_HANDLER *handler;
-
-	if (s == NULL)
-		s = session->name;
-
-	handler = session->event_handler;
-	if (handler->handle_progress != NULL)
-		(void)handler->handle_progress(handler, s, v);
-}
diff --git a/src/include/schema.h b/src/include/schema.h
index b7467fb18bb..58618b5bdc0 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -5,12 +5,6 @@
  * See the file LICENSE for redistribution information.
  */
 
-#define	WT_SCHEMA_FILENAME	"WiredTiger.wt"		/* Schema file name */
-#define	WT_SCHEMA_URI		"file:WiredTiger.wt"	/* Schema file URI */
-
-#define	WT_SCHEMA_VERSION	"WiredTiger version"	/* Version keys */
-#define	WT_SCHEMA_VERSION_STR	"WiredTiger version string"
-
 /* Character constants for projection plans. */
 #define	WT_PROJ_KEY	'k' /* Go to key in cursor <arg>. */
 #define	WT_PROJ_NEXT	'n' /* Process the next item (<arg> repeats). */
@@ -31,9 +25,8 @@ struct __wt_table {
 
 	WT_CONFIG_ITEM cgconf, colconf;
 
-	WT_BTREE **colgroup;
-	WT_BTREE **index;
-	size_t index_alloc;
+	const char **cg_name, **idx_name;
+	size_t idx_name_alloc;
 
 	TAILQ_ENTRY(__wt_table) q;
 
diff --git a/src/include/serial.i b/src/include/serial.i
index c74d683e333..d15e27f4102 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -62,15 +62,9 @@ __wt_session_serialize_func(WT_SESSION_IMPL *session,
 static inline void
 __wt_session_serialize_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page, int ret)
 {
-	if (ret == 0 && page != NULL) {
-		/*
-		 * If passed a page and the return value is OK, we modified the
-		 * page.  Wake the eviction server as necessary if the page
-		 * has become too large.
-		 */
+	/* If passed a page and the return value is OK, we modified the page. */
+	if (page != NULL && ret == 0)
 		__wt_page_modify_set(page);
-		(void)__wt_eviction_page_check(session, page);
-	}
 
 	/*
 	 * Publish: there must be a barrier to ensure the return value is set
diff --git a/src/include/serial_funcs.i b/src/include/serial_funcs.i
index be4a6c66231..96f6d613e45 100644
--- a/src/include/serial_funcs.i
+++ b/src/include/serial_funcs.i
@@ -2,7 +2,8 @@
 
 typedef struct {
 	WT_PAGE *page;
-	WT_INSERT_HEAD **inshead;
+	uint32_t write_gen;
+	WT_INSERT_HEAD **insheadp;
 	WT_INSERT ***ins_stack;
 	WT_INSERT_HEAD **new_inslist;
 	size_t new_inslist_size;
@@ -18,18 +19,20 @@ typedef struct {
 
 static inline int
 __wt_col_append_serial(
-	WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD **inshead,
-	WT_INSERT ***ins_stack, WT_INSERT_HEAD ***new_inslistp, size_t
-	new_inslist_size, WT_INSERT_HEAD **new_insheadp, size_t
-	new_inshead_size, WT_INSERT **new_insp, size_t new_ins_size, u_int
-	skipdepth)
+	WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t write_gen,
+	WT_INSERT_HEAD **insheadp, WT_INSERT ***ins_stack, WT_INSERT_HEAD
+	***new_inslistp, size_t new_inslist_size, WT_INSERT_HEAD
+	**new_insheadp, size_t new_inshead_size, WT_INSERT **new_insp, size_t
+	new_ins_size, u_int skipdepth)
 {
 	__wt_col_append_args _args, *args = &_args;
-	int ret;
+	WT_DECL_RET;
 
 	args->page = page;
 
-	args->inshead = inshead;
+	args->write_gen = write_gen;
+
+	args->insheadp = insheadp;
 
 	args->ins_stack = ins_stack;
 
@@ -76,16 +79,17 @@ __wt_col_append_serial(
 
 static inline void
 __wt_col_append_unpack(
-	WT_SESSION_IMPL *session, WT_PAGE **pagep, WT_INSERT_HEAD ***insheadp,
-	WT_INSERT ****ins_stackp, WT_INSERT_HEAD ***new_inslistp,
-	WT_INSERT_HEAD **new_insheadp, WT_INSERT **new_insp, u_int
-	*skipdepthp)
+	WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t *write_genp,
+	WT_INSERT_HEAD ***insheadpp, WT_INSERT ****ins_stackp, WT_INSERT_HEAD
+	***new_inslistp, WT_INSERT_HEAD **new_insheadp, WT_INSERT **new_insp,
+	u_int *skipdepthp)
 {
 	__wt_col_append_args *args =
 	    (__wt_col_append_args *)session->wq_args;
 
 	*pagep = args->page;
-	*insheadp = args->inshead;
+	*write_genp = args->write_gen;
+	*insheadpp = args->insheadp;
 	*ins_stackp = args->ins_stack;
 	*new_inslistp = args->new_inslist;
 	*new_insheadp = args->new_inshead;
@@ -130,35 +134,6 @@ __wt_col_append_new_ins_taken(WT_SESSION_IMPL *session, WT_PAGE *page)
 }
 
 typedef struct {
-	int discard;
-} __wt_evict_file_args;
-
-static inline int
-__wt_evict_file_serial(
-	WT_SESSION_IMPL *session, int discard)
-{
-	__wt_evict_file_args _args, *args = &_args;
-	int ret;
-
-	args->discard = discard;
-
-	ret = __wt_session_serialize_func(session,
-	    WT_SERIAL_EVICT, __wt_evict_file_serial_func, args);
-
-	return (ret);
-}
-
-static inline void
-__wt_evict_file_unpack(
-	WT_SESSION_IMPL *session, int *discardp)
-{
-	__wt_evict_file_args *args =
-	    (__wt_evict_file_args *)session->wq_args;
-
-	*discardp = args->discard;
-}
-
-typedef struct {
 	WT_PAGE *page;
 	uint32_t write_gen;
 	WT_INSERT_HEAD **inshead;
@@ -184,7 +159,7 @@ __wt_insert_serial(
 	new_ins_size, u_int skipdepth)
 {
 	__wt_insert_args _args, *args = &_args;
-	int ret;
+	WT_DECL_RET;
 
 	args->page = page;
 
@@ -303,7 +278,7 @@ __wt_row_key_serial(
 	*ikey)
 {
 	__wt_row_key_args _args, *args = &_args;
-	int ret;
+	WT_DECL_RET;
 
 	args->page = page;
 
@@ -331,6 +306,35 @@ __wt_row_key_unpack(
 }
 
 typedef struct {
+	int syncop;
+} __wt_sync_file_args;
+
+static inline int
+__wt_sync_file_serial(
+	WT_SESSION_IMPL *session, int syncop)
+{
+	__wt_sync_file_args _args, *args = &_args;
+	WT_DECL_RET;
+
+	args->syncop = syncop;
+
+	ret = __wt_session_serialize_func(session,
+	    WT_SERIAL_EVICT, __wt_sync_file_serial_func, args);
+
+	return (ret);
+}
+
+static inline void
+__wt_sync_file_unpack(
+	WT_SESSION_IMPL *session, int *syncopp)
+{
+	__wt_sync_file_args *args =
+	    (__wt_sync_file_args *)session->wq_args;
+
+	*syncopp = args->syncop;
+}
+
+typedef struct {
 	WT_PAGE *page;
 	uint32_t write_gen;
 	WT_UPDATE **srch_upd;
@@ -349,7 +353,7 @@ __wt_update_serial(
 	**updp, size_t upd_size)
 {
 	__wt_update_args _args, *args = &_args;
-	int ret;
+	WT_DECL_RET;
 
 	args->page = page;
 
diff --git a/src/include/stat.h b/src/include/stat.h
index 31ead1a84a7..06e8c00ec1f 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -77,8 +77,6 @@ struct __wt_btree_stats {
 	WT_STATS file_maxleafitem;
 	WT_STATS file_maxleafpage;
 	WT_STATS file_minor;
-	WT_STATS file_freelist_bytes;
-	WT_STATS file_freelist_entries;
 	WT_STATS file_overflow;
 	WT_STATS file_allocsize;
 	WT_STATS rec_page_merge;
@@ -92,6 +90,7 @@ struct __wt_btree_stats {
 	WT_STATS file_row_int_pages;
 	WT_STATS file_row_leaf_pages;
 	WT_STATS file_entries;
+	WT_STATS file_write_conflicts;
 };
 
 /*
diff --git a/src/include/txn.h b/src/include/txn.h
new file mode 100644
index 00000000000..5d3f8e910fd
--- /dev/null
+++ b/src/include/txn.h
@@ -0,0 +1,76 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Transaction ID type: transaction IDs are 32-bit integers that wrap after
+ * 4 billion transactions are executed.
+ */
+typedef uint32_t wt_txnid_t;
+
+#define	WT_TXN_NONE	0		/* No txn running in a session. */
+#define	WT_TXN_ABORTED	UINT32_MAX	/* Update rolled back, ignore. */
+
+/*
+ * Transaction ID comparison dealing with edge cases and wrapping.
+ *
+ * WT_TXN_ABORTED is the largest possible ID (never visible to a running
+ * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all
+ * running transactions).
+ *
+ * Otherwise, we deal with 32-bit wrapping by looking at the difference between
+ * the two IDs.  In what follows, "small" means less than 2^31 and "large"
+ * means greater than 2^31.
+ *
+ * If t2 > t1 and neither has wrapped, then (t2 - t1) is small.  It is
+ * certainly smaller than t2, and we assume that we never compare IDs that
+ * differ by more than 2^31.  If t2 has wrapped (so it is small) and t1 is
+ * large, then (t2 - t1) = (t2 + (-t1)) will be small.
+ *
+ * In effect, we have a 31-bit window of active transaction IDs: if an update
+ * remains in the system after 2 billion transactions it can no longer be
+ * compared with the current transaction ID.
+ */
+#define	TXNID_LT(t1, t2)						\
+	(((t1) == (t2) ||						\
+	 (t1) == WT_TXN_ABORTED || (t2) == WT_TXN_NONE) ? 0 :	\
+	 ((t1) == WT_TXN_NONE || (t2) == WT_TXN_ABORTED) ? 1 :	\
+	 (t2) - (t1) < (UINT32_MAX / 2))
+
+struct __wt_txn_global {
+	volatile wt_txnid_t current;	/* Current transaction ID. */
+	wt_txnid_t *ids;		/* Per-session transaction IDs */
+	wt_txnid_t ckpt_txnid;		/* ID of checkpoint, or WT_TXN_NONE */
+};
+
+struct __wt_txn {
+	wt_txnid_t id;
+
+	/*
+	 * Snapshot data:
+	 *     everything < snap_min is visible,
+	 *     everything > snap_max is invisible,
+	 *     everything in between is visible unless it is in snapshot.
+	 */
+	wt_txnid_t snap_min, snap_max;
+	wt_txnid_t *snapshot;
+	uint32_t snapshot_count;
+
+	/* Array of txn IDs in items created or modified by this txn. */
+	wt_txnid_t **mod;
+	size_t mod_alloc;
+	u_int mod_count;
+
+	enum {
+		TXN_ISO_READ_UNCOMMITTED,
+		TXN_ISO_READ_COMMITTED,
+		TXN_ISO_SNAPSHOT
+	} isolation;
+
+#define	TXN_ERROR	0x01
+#define	TXN_RUNNING	0x02
+	uint32_t flags;
+};
diff --git a/src/include/txn.i b/src/include/txn.i
new file mode 100644
index 00000000000..37395341750
--- /dev/null
+++ b/src/include/txn.i
@@ -0,0 +1,161 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_txn_modify --
+ *      Mark an object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify(WT_SESSION_IMPL *session, wt_txnid_t *id)
+{
+	WT_TXN *txn;
+
+	txn = &session->txn;
+	if (F_ISSET(txn, TXN_RUNNING)) {
+		*id = txn->id;
+		if (txn->mod_count * sizeof(wt_txnid_t *) == txn->mod_alloc)
+			WT_RET(__wt_realloc(session, &txn->mod_alloc,
+			    WT_MAX(10, 2 * txn->mod_count) *
+			    sizeof(wt_txnid_t *), &txn->mod));
+		txn->mod[txn->mod_count++] = id;
+	} else
+		*id = WT_TXN_NONE;
+
+	return (0);
+}
+
+/*
+ * __wt_txn_unmodify --
+ *	If threads race making updates, they may discard the last referenced
+ *	WT_UPDATE item while the transaction is still active.  This function
+ *	removes the last update item from the "log".
+ */
+static inline void
+__wt_txn_unmodify(WT_SESSION_IMPL *session)
+{
+	WT_TXN *txn;
+
+	txn = &session->txn;
+	if (F_ISSET(txn, TXN_RUNNING)) {
+		WT_ASSERT(session, txn->mod_count > 0);
+		txn->mod_count--;
+	}
+}
+
+/*
+ * __wt_txn_visible --
+ *	Return non-zero if the current transaction can see the given ID.
+ */
+static inline int
+__wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id)
+{
+	WT_TXN *txn;
+
+	/* Nobody sees the results of aborted transactions. */
+	if (id == WT_TXN_ABORTED)
+		return (0);
+
+	/*
+	 * Changes with no associated transaction and our own changes are always
+	 * visible, and non-snapshot transactions see all other changes.
+	 */
+	txn = &session->txn;
+	if (id == WT_TXN_NONE || id == txn->id ||
+	    txn->isolation != TXN_ISO_SNAPSHOT)
+		return (1);
+
+	/*
+	 * Snapshot test: IDs < snap_min are visible, IDs > snap_max are not.
+	 */
+	if (TXNID_LT(id, txn->snap_min))
+		return (1);
+	if (TXNID_LT(txn->snap_max, id))
+		return (0);
+
+	/*
+	 * Otherwise, the ID is visible if it is not the result of a concurrent
+	 * transaction.  That is, if it is not in the snapshot list.  Fast path
+	 * the single-threaded case where there are no concurrent transactions.
+	 */
+	return (txn->snapshot_count == 0 || bsearch(&id, txn->snapshot,
+	    txn->snapshot_count, sizeof(wt_txnid_t), __wt_txnid_cmp) == NULL);
+}
+
+/*
+ * __wt_txn_read_skip --
+ *	Get the first visible update in a list (or NULL if none are visible),
+ *	and report whether uncommitted changes were skipped.
+ */
+static inline WT_UPDATE *
+__wt_txn_read_skip(WT_SESSION_IMPL *session, WT_UPDATE *upd, int *skipp)
+{
+	while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) {
+		if (upd->txnid != WT_TXN_ABORTED)
+			*skipp = 1;
+		upd = upd->next;
+	}
+
+	return (upd);
+}
+
+/*
+ * __wt_txn_read --
+ *	Get the first visible update in a list (or NULL if none are visible).
+ */
+static inline WT_UPDATE *
+__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	while (upd != NULL && !__wt_txn_visible(session, upd->txnid))
+		upd = upd->next;
+
+	return (upd);
+}
+
+/*
+ * __wt_txn_update_check --
+ *	Check if the current transaction can update an item.
+ */
+static inline int
+__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+	WT_TXN *txn;
+
+	txn = &session->txn;
+	if (txn->isolation == TXN_ISO_SNAPSHOT)
+		while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) {
+			if (upd->txnid != WT_TXN_ABORTED)
+				return (WT_DEADLOCK);
+			upd = upd->next;
+		}
+
+	return (0);
+}
+
+/*
+ * __wt_txn_ancient --
+ *	Check if a given transaction ID is "ancient".
+ *      That is, if it is so far behind the current transaction that it could
+ *      soon become invisible.  If so, eviction will be forced on the page.
+ */
+static inline int
+__wt_txn_ancient(WT_SESSION_IMPL *session, wt_txnid_t id)
+{
+	WT_TXN_GLOBAL *txn_global;
+	wt_txnid_t current;
+
+	txn_global = &S2C(session)->txn_global;
+	current = txn_global->current;
+
+	/*
+	 * Call an update "ancient" if it will wrap around in under 1 million
+	 * transactions, to give eviction time to write it.
+	 */
+#define	TXN_WRAP_BUFFER	1000000
+#define	TXN_WINDOW	((UINT32_MAX / 2) - TXN_WRAP_BUFFER)
+
+	return (id != WT_TXN_NONE && TXNID_LT(id, current - TXN_WINDOW));
+}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 6c4704fa199..89cd98f7cea 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -72,7 +72,7 @@ struct __wt_collator;	    typedef struct __wt_collator WT_COLLATOR;
 struct __wt_compressor;	    typedef struct __wt_compressor WT_COMPRESSOR;
 struct __wt_connection;	    typedef struct __wt_connection WT_CONNECTION;
 struct __wt_cursor;	    typedef struct __wt_cursor WT_CURSOR;
-struct __wt_cursor_type;    typedef struct __wt_cursor_type WT_CURSOR_TYPE;
+struct __wt_data_source;    typedef struct __wt_data_source WT_DATA_SOURCE;
 struct __wt_event_handler;  typedef struct __wt_event_handler WT_EVENT_HANDLER;
 struct __wt_extension_api;  typedef struct __wt_extension_api WT_EXTENSION_API;
 struct __wt_extractor;	    typedef struct __wt_extractor WT_EXTRACTOR;
@@ -142,8 +142,8 @@ struct __wt_item {
  * statistics, configuration data or application-specific data sources.  See
  * WT_SESSION::open_cursor for more information.
  *
- * <b>Thread safety:</b> A WT_CURSOR handle cannot be shared between threads:
- * it may only be used within the same thread as the encapsulating WT_SESSION.
+ * <b>Thread safety:</b> A WT_CURSOR handle is not usually shared between
+ * threads, see @ref threads for more information.
  */
 struct __wt_cursor {
 	WT_SESSION *session;	/*!< The session handle for this cursor. */
@@ -408,9 +408,8 @@ struct __wt_cursor {
  * All data operations are performed in the context of a WT_SESSION.  This
  * encapsulates the thread and transactional context of the operation.
  *
- * <b>Thread safety:</b> A WT_SESSION handle cannot be shared between threads:
- * it may only be used within a single thread.  Each thread accessing a
- * database should open a separate WT_SESSION handle.
+ * <b>Thread safety:</b> A WT_SESSION handle is not usually shared between
+ * threads, see @ref threads for more information.
  */
 struct __wt_session {
 	/*! The connection for this session. */
@@ -419,8 +418,8 @@ struct __wt_session {
 	/*! Close the session handle.
 	 *
 	 * This will release the resources associated with the session handle,
-	 * including rolling any active transactions and closing any cursors
-	 * that remain open in the session.
+	 * including rolling back any active transactions and closing any
+	 * cursors that remain open in the session.
 	 *
 	 * @snippet ex_all.c session close
 	 *
@@ -462,7 +461,7 @@ struct __wt_session {
 	 *  opened on any data source, regardless of whether it is ultimately
 	 *  stored in a table.  Some cursor types may have limited
 	 *  functionality (for example, be read-only or not support
-	 *  transactional updates).  See @ref cursor_types for more
+	 *  transactional updates).  See @ref data_sources for more
 	 *  information.
 	 *  <br>
 	 *  The following are the builtin cursor types:
@@ -484,7 +483,7 @@ struct __wt_session {
 	 *	database or file statistics cursor,
 	 *	key=<code>int id</code>\, value=(<code>string description\,
 	 *	string value\, uint64_t value</code>)\,
-	 *	see @ref cursor_statistics for details}
+	 *	see @ref data_statistics for details}
 	 *  </table>
 	 * @param to_dup a cursor to duplicate
 	 * @param session the session handle
@@ -495,9 +494,6 @@ struct __wt_session {
 	 * @config{bulk, configure the cursor for bulk loads; bulk-load is a
 	 * fast load path for empty objects\, only empty objects may be
 	 * bulk-loaded.,a boolean flag; default \c false.}
-	 * @config{clear_on_close, for statistics cursors\, reset statistics
-	 * counters when the cursor is closed.,a boolean flag; default \c
-	 * false.}
 	 * @config{dump, configure the cursor for dump format inputs and
 	 * outputs: "hex" selects a simple hexadecimal format\, "print" selects
 	 * a format where only non-printing characters are hexadecimal encoded.
@@ -514,8 +510,13 @@ struct __wt_session {
 	 * @config{raw, ignore the encodings for the key and value\, manage data
 	 * as if the formats were \c "u".  See @ref cursor_raw for details.,a
 	 * boolean flag; default \c false.}
+	 * @config{snapshot, the name of a snapshot to open.,a string; default
+	 * empty.}
 	 * @config{statistics, configure the cursor for statistics.,a boolean
 	 * flag; default \c false.}
+	 * @config{statistics_clear, statistics cursors only; reset statistics
+	 * counters when the cursor is closed.,a boolean flag; default \c
+	 * false.}
 	 * @configend
 	 * @param cursorp a pointer to the newly opened cursor
 	 * @errors
@@ -584,8 +585,8 @@ struct __wt_session {
 	 * least 8 keys per internal page.,an integer greater than or equal to
 	 * 0; default \c 0.}
 	 * @config{internal_key_truncate, configure internal key truncation\,
-	 * discarding unnecessary trailing bytes on internal keys.,a boolean
-	 * flag; default \c true.}
+	 * discarding unnecessary trailing bytes on internal keys (ignored for
+	 * custom collators).,a boolean flag; default \c true.}
 	 * @config{internal_page_max, the maximum page size for internal nodes\,
 	 * in bytes; the size must be a multiple of the allocation size and is
 	 * significant for applications wanting to avoid excessive L2 cache
@@ -641,6 +642,13 @@ struct __wt_session {
 	 * @configstart{session.drop, see dist/api_data.py}
 	 * @config{force, return success if the object does not exist.,a boolean
 	 * flag; default \c false.}
+	 * @config{snapshot, specify one or more snapshots to drop.  The value
+	 * must be either the name of a single snapshot to drop (a string)\, or
+	 * a list containing one of the following keys: \c "all" to drop all
+	 * snapshots\, \c "from=<snapshot>" to drop all snapshots after and
+	 * including the named snapshot\, or \c "to=<snapshot>" to drop all
+	 * snapshots before and including the named snapshot.,a string; default
+	 * empty.}
 	 * @configend
 	 * @errors
 	 */
@@ -687,18 +695,19 @@ struct __wt_session {
 	int __F(salvage)(WT_SESSION *session,
 	    const char *name, const char *config);
 
-	/*! Sync a file or table.
+	/*! Snapshot a table or file.
 	 *
-	 * Flush dirty pages from a table to stable storage.  Note that not
-	 * all pages are necessarily flushed (pages pinned in memory, or in
-	 * use by other threads of control may not be written until all open
-	 * session handles for the table are closed).
+	 * Flush dirty pages from a table or file to stable storage, creating a
+	 * snapshot (see @ref snapshots for more information).
 	 *
 	 * @snippet ex_all.c session sync
 	 *
 	 * @param session the session handle
 	 * @param name the URI of the file or table to sync
-	 * @configempty{session.sync, see dist/api_data.py}
+	 * @configstart{session.sync, see dist/api_data.py}
+	 * @config{snapshot, if non-empty\, create a named snapshot.,a string;
+	 * default empty.}
+	 * @configend
 	 * @errors
 	 */
 	int __F(sync)(WT_SESSION *session,
@@ -759,25 +768,23 @@ struct __wt_session {
 	/*! @name Transactions
 	 * @{
 	 */
-	/*! Start a transaction in this session. @notyet{transactions}
+	/*! Start a transaction in this session.
 	 *
 	 * All cursors opened in this session that support transactional
 	 * semantics will operate in the context of the transaction.  The
 	 * transaction remains active until ended with
 	 * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction.
 	 *
-	 * Ignored if a transaction is in progress.
-	 *
-	 * @todo describe nested transactions / savepoints
+	 * Not permitted if a transaction is in progress or cursors are open in
+	 * the session.
 	 *
 	 * @snippet ex_all.c session begin transaction
 	 *
 	 * @param session the session handle
 	 * @configstart{session.begin_transaction, see dist/api_data.py}
 	 * @config{isolation, the isolation level for this transaction.,a
-	 * string\, chosen from the following options: \c "serializable"\, \c
-	 * "snapshot"\, \c "read-committed"\, \c "read-uncommitted"; default \c
-	 * read-committed.}
+	 * string\, chosen from the following options: \c "read-uncommitted"\,
+	 * \c "snapshot"; default \c snapshot.}
 	 * @config{name, name of the transaction for tracing and debugging.,a
 	 * string; default empty.}
 	 * @config{priority, priority of the transaction for resolving
@@ -791,12 +798,11 @@ struct __wt_session {
 	 */
 	int __F(begin_transaction)(WT_SESSION *session, const char *config);
 
-	/*! Commit the current transaction. @notyet{transactions}
-	 *
-	 * Any cursors opened during the transaction will be closed before
-	 * the commit is processed.
+	/*! Commit the current transaction.
 	 *
-	 * Ignored if no transaction is in progress.
+	 * A transaction must be in progress when this method is called.  Any
+	 * cursors opened during the transaction will be closed before the
+	 * commit is processed.
 	 *
 	 * @snippet ex_all.c session commit transaction
 	 *
@@ -806,12 +812,11 @@ struct __wt_session {
 	 */
 	int __F(commit_transaction)(WT_SESSION *session, const char *config);
 
-	/*! Roll back the current transaction. @notyet{transactions}
+	/*! Roll back the current transaction.
 	 *
-	 * Any cursors opened during the transaction will be closed before
-	 * the rollback is processed.
-	 *
-	 * Ignored if no transaction is in progress.
+	 * A transaction must be in progress when this method is called.  Any
+	 * cursors opened during the transaction will be closed before the
+	 * rollback is processed.
 	 *
 	 * @snippet ex_all.c session rollback transaction
 	 *
@@ -821,27 +826,17 @@ struct __wt_session {
 	 */
 	int __F(rollback_transaction)(WT_SESSION *session, const char *config);
 
-	/*! Flush the cache and/or the log and optionally archive log files.
-	 * @notyet{checkpoint}
+	/*! Write a transactionally consistent snapshot of a database.
+	 *
+	 * All data files in the database are updated with snapshots that
+	 * reflect the transactions committed before the checkpoint starts.
 	 *
 	 * @snippet ex_all.c session checkpoint
 	 *
 	 * @param session the session handle
 	 * @configstart{session.checkpoint, see dist/api_data.py}
-	 * @config{archive, remove log files no longer required for
-	 * transactional durability.,a boolean flag; default \c false.}
-	 * @config{flush_cache, flush the cache.,a boolean flag; default \c
-	 * true.}
-	 * @config{flush_log, flush the log to disk.,a boolean flag; default \c
-	 * true.}
-	 * @config{force, write a new checkpoint even if nothing has changed
-	 * since the last one.,a boolean flag; default \c false.}
-	 * @config{log_size, only proceed if more than the specified number of
-	 * bytes of log records have been written since the last checkpoint.,an
-	 * integer greater than or equal to 0; default \c 0.}
-	 * @config{timeout, only proceed if more than the specified number of
-	 * milliseconds have elapsed since the last checkpoint.,an integer
-	 * greater than or equal to 0; default \c 0.}
+	 * @config{snapshot, if non-empty\, create named snapshots in files.,a
+	 * string; default empty.}
 	 * @configend
 	 * @errors
 	 */
@@ -889,6 +884,9 @@ struct __wt_session {
  * the database in its own address space.  Subsequent connections (if allowed)
  * will communicate with the first process over a socket connection to perform
  * their operations.
+ *
+ * <b>Thread safety:</b> A WT_CONNECTION handle may be shared between threads,
+ * see @ref threads for more information.
  */
 struct __wt_connection {
 	/*! Load an extension.
@@ -909,25 +907,24 @@ struct __wt_connection {
 	int __F(load_extension)(WT_CONNECTION *connection,
 	    const char *path, const char *config);
 
-	/*! Add a new type of cursor. @notyet{custom cursors}
+	/*! Add a custom data source. @notyet{custom data sources}
 	 *
-	 * The application must first implement the WT_CURSOR_TYPE interface
+	 * The application must first implement the WT_DATA_SOURCE interface
 	 * and then register the implementation with WiredTiger:
 	 *
-	 * @snippet ex_all.c WT_CURSOR_TYPE register
+	 * @snippet ex_all.c WT_DATA_SOURCE register
 	 *
 	 * @param connection the connection handle
-	 * @param prefix the prefix for location strings passed to
-	 * WT_SESSION::open_cursor
-	 * @param ctype the application-supplied code to manage cursors of
-	 * this type
-	 * @configempty{connection.add_cursor_type, see dist/api_data.py}
+	 * @param prefix the URI prefix for this data source, e.g., "file:"
+	 * @param data_source the application-supplied implementation of
+	 *	WT_DATA_SOURCE to manage this data source.
+	 * @configempty{connection.add_data_source, see dist/api_data.py}
 	 * @errors
 	 */
-	int __F(add_cursor_type)(WT_CONNECTION *connection,
-	    const char *prefix, WT_CURSOR_TYPE *ctype, const char *config);
+	int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix,
+	    WT_DATA_SOURCE *data_source, const char *config);
 
-	/*! Add a custom collation function. @notyet{custom collation}
+	/*! Add a custom collation function.
 	 *
 	 * The application must first implement the WT_COLLATOR interface and
 	 * then register the implementation with WiredTiger:
@@ -1075,14 +1072,16 @@ struct __wt_connection {
  * \c false.}
  * @config{session_max, maximum expected number of sessions (including server
  * threads).,an integer greater than or equal to 1; default \c 50.}
+ * @config{sync, sync files when closing or writing snapshots.,a boolean flag;
+ * default \c true.}
  * @config{transactional, support transactional semantics.,a boolean flag;
- * default \c false.}
+ * default \c true.}
  * @config{verbose, enable messages for various events.  Options are given as a
  * list\, such as <code>"verbose=[evictserver\,read]"</code>.,a list\, with
  * values chosen from the following options: \c "block"\, \c "evict"\, \c
  * "evictserver"\, \c "fileops"\, \c "hazard"\, \c "mutex"\, \c "read"\, \c
- * "readserver"\, \c "reconcile"\, \c "salvage"\, \c "verify"\, \c "write";
- * default empty.}
+ * "readserver"\, \c "reconcile"\, \c "salvage"\, \c "snapshot"\, \c "verify"\,
+ * \c "write"; default empty.}
  * @configend
  * Additionally, if a file named \c WiredTiger.config appears in the WiredTiger
  * home directory, it is read for configuration values (see @ref config_file
@@ -1107,27 +1106,39 @@ int wiredtiger_open(const char *home,
 const char *wiredtiger_strerror(int err);
 
 /*!
- * The interface implemented by applications in order to handle error messages,
- * information messages and progress.  Entries set to NULL are ignored and will
- * continue to use the default handlers.
+ * The interface implemented by applications to handle error, informational and
+ * progress messages.  Entries set to NULL are ignored and the default handlers
+ * will continue to be used.
  */
 struct __wt_event_handler {
 	/*!
 	 * Callback to handle error messages; by default, error messages are
-	 * written to the stderr stream.
+	 * written to the stderr stream.  If the handler returns non-zero,
+	 * the application's current operation will return an error.
+	 *
+	 * @param error a WiredTiger, C99 or POSIX error code, which can
+	 * be converted to a string using ::wiredtiger_strerror
+	 * @param message an error string
 	 */
-	void (*handle_error)(WT_EVENT_HANDLER *handler,
-	    int error, const char *errmsg);
+	int (*handle_error)(WT_EVENT_HANDLER *handler,
+	    int error, const char *message);
 
 	/*!
 	 * Callback to handle informational messages; by default, informational
-	 * messages are written to the stdout stream.
+	 * messages are written to the stdout stream.  If the handler returns
+	 * non-zero, the application's current operation will return an error.
+	 *
+	 * @param message an informational string
 	 */
 	int (*handle_message)(WT_EVENT_HANDLER *handler, const char *message);
 
 	/*!
-	 * Callback to handle progress messages; by default, progress messages
-	 * are ignored.
+	 * Callback to handle progress messages; by default, no progress
+	 * messages are written.  If the handler returns non-zero, the
+	 * application's current operation will return an error.
+	 *
+	 * @param operation a string representation of the operation
+	 * @param progress a counter
 	 */
 	int (*handle_progress)(WT_EVENT_HANDLER *handler,
 	    const char *operation, uint64_t progress);
@@ -1150,12 +1161,14 @@ struct __wt_event_handler {
  *
  * @snippet ex_all.c Unpack fields from a buffer
  *
+ * @param session the session handle
  * @param buffer a pointer to a packed byte array
  * @param size the number of valid bytes in the buffer
- * @param format the data format, see ::wiredtiger_struct_pack
+ * @param format the data format, see @ref packing
  * @errors
  */
-int wiredtiger_struct_pack(void *buffer, size_t size, const char *format, ...);
+int wiredtiger_struct_pack(
+    WT_SESSION *session, void *buffer, size_t size, const char *format, ...);
 
 /*! Calculate the size required to pack a structure.
  *
@@ -1165,11 +1178,14 @@ int wiredtiger_struct_pack(void *buffer, size_t size, const char *format, ...);
  *
  * @snippet ex_all.c Get the packed size
  *
- * @param format the data format, see ::wiredtiger_struct_pack
- * @returns the number of bytes needed for the matching call to
- * ::wiredtiger_struct_pack
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to ::wiredtiger_struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
  */
-size_t wiredtiger_struct_size(const char *format, ...);
+int wiredtiger_struct_size(
+    WT_SESSION *session, size_t *sizep, const char *format, ...);
 
 /*! Unpack a structure from a buffer.
  *
@@ -1177,13 +1193,14 @@ size_t wiredtiger_struct_size(const char *format, ...);
  *
  * @snippet ex_all.c Unpack fields from a buffer
  *
+ * @param session the session handle
  * @param buffer a pointer to a packed byte array
  * @param size the number of valid bytes in the buffer
- * @param format the data format, see ::wiredtiger_struct_pack
+ * @param format the data format, see @ref packing
  * @errors
  */
-int wiredtiger_struct_unpack(const void *buffer, size_t size,
-    const char *format, ...);
+int wiredtiger_struct_unpack(WT_SESSION *session,
+    const void *buffer, size_t size, const char *format, ...);
 
 /*! Get version information.
  *
@@ -1207,9 +1224,9 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
  * whether the operation succeeded or failed.  A return of zero indicates
  * success, all non-zero return values indicate some kind of failure.
  *
- * WiredTiger reserves all values from -31,800 to -31,999 to itself as possible
- * error values.  In addition, C99/POSIX error codes such as \c ENOMEM,
- * \c EINVAL and \c ENOTSUP may also be returned, with the usual meanings.
+ * WiredTiger reserves all values from -31,800 to -31,999 as possible error
+ * return values.  WiredTiger may also return C99/POSIX error codes such as
+ * \c ENOMEM, \c EINVAL and \c ENOTSUP, with the usual meanings.
  *
  * The following are all of the WiredTiger-specific error returns:
  * @{
@@ -1421,37 +1438,62 @@ struct __wt_compressor {
 
 /*!
  * Applications can extend WiredTiger by providing new implementations of the
- * WT_CURSOR class.
+ * WT_DATA_SOURCE class.  Each data source supports a different URI scheme for
+ * data sources to WT_SESSION::create, WT_SESSION::open_cursor and related
+ * methods.
  *
- * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_CURSOR_TYPE
+ * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_DATA_SOURCE
  * interface from multiple threads concurrently.  It is the responsibility of
  * the implementation to protect any shared data.
  *
  * Applications register their implementation with WiredTiger by calling
- * WT_CONNECTION::add_cursor_type.
+ * WT_CONNECTION::add_data_source.
  *
- * @snippet ex_all.c WT_CURSOR_TYPE register
+ * @snippet ex_all.c WT_DATA_SOURCE register
  */
-struct __wt_cursor_type {
-	/*! Callback to determine how much space to allocate for a cursor.
-	 *
-	 * If the callback is NULL, no additional space is allocated in the
-	 * WT_CURSOR implementation.
+struct __wt_data_source {
+	/*! Callback to create a new object.
 	 *
-	 * @errors
+	 * @snippet ex_all.c WT_DATA_SOURCE create
+	 */
+	int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *name, const char *config);
+
+	/*! Callback to drop an object.
 	 *
-	 * @snippet ex_all.c WT_CURSOR_TYPE size
+	 * @snippet ex_all.c WT_DATA_SOURCE drop
 	 */
-	int (*cursor_size)(WT_CURSOR_TYPE *ctype,
-	    const char *obj, size_t *sizep);
+	int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *name, const char *config);
 
 	/*! Callback to initialize a cursor.
 	 *
-	 * @snippet ex_all.c WT_CURSOR_TYPE init
+	 * @snippet ex_all.c WT_DATA_SOURCE open_cursor
+	 */
+	int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *obj, WT_CURSOR *old_cursor,
+	    const char *config, WT_CURSOR **new_cursor);
+
+	/*! Callback to rename an object.
+	 *
+	 * @snippet ex_all.c WT_DATA_SOURCE rename
 	 */
-	int (*init_cursor)(WT_CURSOR_TYPE *ctype,
-	    WT_SESSION *session, const char *obj, WT_CURSOR *old_cursor,
-	    const char *config, WT_CURSOR *new_cursor);
+	int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *oldname, const char *newname, const char *config);
+
+	/*! Callback to sync an object.
+	 *
+	 * @snippet ex_all.c WT_DATA_SOURCE sync
+	 */
+	int (*sync)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *name, const char *config);
+
+	/*! Callback to truncate an object.
+	 *
+	 * @snippet ex_all.c WT_DATA_SOURCE truncate
+	 */
+	int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+	    const char *name, const char *config);
 };
 
 /*!
@@ -1607,36 +1649,34 @@ extern int wiredtiger_extension_init(WT_SESSION *session,
 #define	WT_STAT_file_maxleafpage			26
 /*! minor version number */
 #define	WT_STAT_file_minor				27
-/*! number of bytes in the freelist */
-#define	WT_STAT_file_freelist_bytes			28
-/*! number of entries in the freelist */
-#define	WT_STAT_file_freelist_entries			29
 /*! overflow pages */
-#define	WT_STAT_file_overflow				30
+#define	WT_STAT_file_overflow				28
 /*! page size allocation unit */
-#define	WT_STAT_file_allocsize				31
+#define	WT_STAT_file_allocsize				29
 /*! reconcile: deleted or temporary pages merged */
-#define	WT_STAT_rec_page_merge				32
+#define	WT_STAT_rec_page_merge				30
 /*! reconcile: internal pages split */
-#define	WT_STAT_rec_split_intl				33
+#define	WT_STAT_rec_split_intl				31
 /*! reconcile: leaf pages split */
-#define	WT_STAT_rec_split_leaf				34
+#define	WT_STAT_rec_split_leaf				32
 /*! reconcile: overflow key */
-#define	WT_STAT_rec_ovfl_key				35
+#define	WT_STAT_rec_ovfl_key				33
 /*! reconcile: overflow value */
-#define	WT_STAT_rec_ovfl_value				36
+#define	WT_STAT_rec_ovfl_value				34
 /*! reconcile: pages deleted */
-#define	WT_STAT_rec_page_delete				37
+#define	WT_STAT_rec_page_delete				35
 /*! reconcile: pages written */
-#define	WT_STAT_rec_written				38
+#define	WT_STAT_rec_written				36
 /*! reconcile: unable to acquire hazard reference */
-#define	WT_STAT_rec_hazard				39
+#define	WT_STAT_rec_hazard				37
 /*! row-store internal pages */
-#define	WT_STAT_file_row_int_pages			40
+#define	WT_STAT_file_row_int_pages			38
 /*! row-store leaf pages */
-#define	WT_STAT_file_row_leaf_pages			41
+#define	WT_STAT_file_row_leaf_pages			39
 /*! total entries */
-#define	WT_STAT_file_entries				42
+#define	WT_STAT_file_entries				40
+/*! write generation conflicts */
+#define	WT_STAT_file_write_conflicts			41
 /*! @} */
 /*
  * Statistics section: END
diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h
index 88ccd8385b0..856c5b84483 100644
--- a/src/include/wiredtiger_ext.h
+++ b/src/include/wiredtiger_ext.h
@@ -25,16 +25,30 @@ extern "C" {
  */
 struct __wt_extension_api {
 /* !!! To maintain backwards compatibility, this structure is append-only. */
-	/*! Put an error message on the WiredTiger error stream.  */
-	void (*err_printf)(WT_SESSION *, const char *fmt, ...);
+	/*! Insert an error message into the WiredTiger error stream.
+	 *
+	 * @param session the session handle
+	 * @param fmt a printf-like format specification
+	 * @errors
+	 */
+	int (*err_printf)(WT_SESSION *, const char *fmt, ...);
 #define	wiredtiger_err_printf	wt_api->err_printf
 
-	/*! Allocate short-term use scratch memory. */
-	void *(*scr_alloc)(WT_SESSION *, size_t);
+	/*! Allocate short-term use scratch memory.
+	 *
+	 * @param session the session handle
+	 * @param bytes the number of bytes of memory needed
+	 * @returns A valid memory reference on success or NULL on error
+	 */
+	void *(*scr_alloc)(WT_SESSION *, size_t bytes);
 #define	wiredtiger_scr_alloc	wt_api->scr_alloc
 
-	/*! Free short-term use scratch memory. */
-	void (*scr_free)(WT_SESSION *, void *);
+	/*! Free short-term use scratch memory.
+	 *
+	 * @param session the session handle
+	 * @param ref a memory reference returned by WT_EXTENSION_API::scr_alloc
+	 */
+	void (*scr_free)(WT_SESSION *, void *ref);
 #define	wiredtiger_scr_free	wt_api->scr_free
 };
 
diff --git a/src/include/wt_internal.in b/src/include/wt_internal.h
index 803f9cf704b..81c6fe7b053 100644
--- a/src/include/wt_internal.in
+++ b/src/include/wt_internal.h
@@ -54,6 +54,8 @@ struct __wt_block_desc;
     typedef struct __wt_block_desc WT_BLOCK_DESC;
 struct __wt_block_header;
     typedef struct __wt_block_header WT_BLOCK_HEADER;
+struct __wt_block_snapshot;
+    typedef struct __wt_block_snapshot WT_BLOCK_SNAPSHOT;
 struct __wt_btree;
     typedef struct __wt_btree WT_BTREE;
 struct __wt_btree_session;
@@ -96,10 +98,8 @@ struct __wt_cursor_table;
     typedef struct __wt_cursor_table WT_CURSOR_TABLE;
 struct __wt_dlh;
     typedef struct __wt_dlh WT_DLH;
-struct __wt_evict_list;
-    typedef struct __wt_evict_list WT_EVICT_LIST;
-struct __wt_evict_req;
-    typedef struct __wt_evict_req WT_EVICT_REQ;
+struct __wt_evict_entry;
+    typedef struct __wt_evict_entry WT_EVICT_ENTRY;
 struct __wt_ext;
     typedef struct __wt_ext WT_EXT;
 struct __wt_extlist;
@@ -118,6 +118,8 @@ struct __wt_named_collator;
     typedef struct __wt_named_collator WT_NAMED_COLLATOR;
 struct __wt_named_compressor;
     typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR;
+struct __wt_named_data_source;
+    typedef struct __wt_named_data_source WT_NAMED_DATA_SOURCE;
 struct __wt_page;
     typedef struct __wt_page WT_PAGE;
 struct __wt_page_header;
@@ -136,14 +138,22 @@ struct __wt_rwlock;
     typedef struct __wt_rwlock WT_RWLOCK;
 struct __wt_salvage_cookie;
     typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_scratch_track;
+    typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
 struct __wt_session_impl;
     typedef struct __wt_session_impl WT_SESSION_IMPL;
 struct __wt_size;
     typedef struct __wt_size WT_SIZE;
+struct __wt_snapshot;
+    typedef struct __wt_snapshot WT_SNAPSHOT;
 struct __wt_stats;
     typedef struct __wt_stats WT_STATS;
 struct __wt_table;
     typedef struct __wt_table WT_TABLE;
+struct __wt_txn;
+    typedef struct __wt_txn WT_TXN;
+struct __wt_txn_global;
+    typedef struct __wt_txn_global WT_TXN_GLOBAL;
 struct __wt_update;
     typedef struct __wt_update WT_UPDATE;
 /*
@@ -157,6 +167,7 @@ struct __wt_update;
 #include "posix.h"
 #include "misc.h"
 #include "mutex.h"
+#include "txn.h"
 
 #include "block.h"
 #include "btmem.h"
@@ -171,6 +182,7 @@ struct __wt_update;
 
 #include "api.h"
 #include "cursor.h"
+#include "meta.h"
 #include "schema.h"
 
 #include "extern.h"
@@ -188,9 +200,9 @@ struct __wt_update;
 #include "log.i"
 #include "mutex.i"
 #include "packing.i"
-#include "progress.i"
 #include "serial.i"
 #include "serial_funcs.i"
+#include "txn.i"
 
 #if defined(__cplusplus)
 }
diff --git a/src/log/log.c b/src/log/log.c
index 45de4d7beaa..0f9adb9610a 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -9,34 +9,31 @@
 
 static int
 __log_record_size(WT_SESSION_IMPL *session,
-    WT_LOGREC_DESC *recdesc, va_list ap, size_t *sizep)
+    size_t *sizep, WT_LOGREC_DESC *recdesc, va_list ap)
 {
-	WT_UNUSED(session);
-
-	*sizep = wiredtiger_struct_size(recdesc->fmt, ap);
-	return (0);
+	return (__wt_struct_sizev(session, sizep, recdesc->fmt, ap));
 }
 
 int
 __wt_log_put(WT_SESSION_IMPL *session, WT_LOGREC_DESC *recdesc, ...)
 {
+	WT_DECL_RET;
 	WT_ITEM *buf;
 	va_list ap;
 	size_t size;
-	int ret;
 
 	buf = &session->logrec_buf;
 
 	va_start(ap, recdesc);
-	WT_ERR(__log_record_size(session, recdesc, ap, &size));
+	ret = __log_record_size(session, &size, recdesc, ap);
 	va_end(ap);
+	WT_RET(ret);
 
 	WT_RET(__wt_buf_initsize(session, buf, size));
 
 	va_start(ap, recdesc);
-	WT_ERR(__wt_struct_packv(session, buf->mem, size, recdesc->fmt, ap));
-err:	va_end(ap);
-
+	ret = __wt_struct_packv(session, buf->mem, size, recdesc->fmt, ap);
+	va_end(ap);
 	return (ret);
 }
 
@@ -80,8 +77,8 @@ int
 __wt_log_printf(WT_SESSION_IMPL *session, const char *fmt, ...)
     WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
 {
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
 
 	va_start(ap, fmt);
 	ret = __wt_log_vprintf(session, fmt, ap);
diff --git a/src/meta/meta_api.c b/src/meta/meta_api.c
new file mode 100644
index 00000000000..74ea9ea91e5
--- /dev/null
+++ b/src/meta/meta_api.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_metadata_get --
+ *	Public entry point to __wt_metadata_read (for wt dump and list).
+ */
+int
+__wt_metadata_get(WT_SESSION *session, const char *uri, const char **valuep)
+{
+	return (__wt_metadata_read((WT_SESSION_IMPL *)session, uri, valuep));
+}
+
+/*
+ * __wt_metadata_get_snaplist --
+ *	Public entry point to __wt_meta_snaplist_get (for wt list).
+ */
+int
+__wt_metadata_get_snaplist(
+    WT_SESSION *session, const char *name, WT_SNAPSHOT **snapbasep)
+{
+	return (__wt_meta_snaplist_get(
+	    (WT_SESSION_IMPL *)session, name, snapbasep));
+}
+
+/*
+ * __wt_metadata_free_snaplist --
+ *	Public entry point to __wt_meta_snaplist_free (for wt list).
+ */
+void
+__wt_metadata_free_snaplist(WT_SESSION *session, WT_SNAPSHOT *snapbase)
+{
+	__wt_meta_snaplist_free((WT_SESSION_IMPL *)session, snapbase);
+}
diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c
new file mode 100644
index 00000000000..793582261e4
--- /dev/null
+++ b/src/meta/meta_apply.c
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_btree_apply --
+ *	Apply a function to all files listed in the metadata.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+    int (*func)(WT_SESSION_IMPL *, const char *[]),
+    const char *cfg[], uint32_t flags)
+{
+	WT_BTREE *saved_btree;
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	const char *uri;
+	int cmp, tret;
+
+	saved_btree = session->btree;
+	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+	cursor->set_key(cursor, "file:");
+	if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+		tret = cursor->next(cursor);
+	for (; tret == 0; tret = cursor->next(cursor)) {
+		WT_ERR(cursor->get_key(cursor, &uri));
+		if (!WT_PREFIX_MATCH(uri, "file:"))
+			break;
+		else if (strcmp(uri, WT_METADATA_URI) == 0)
+			continue;
+		WT_ERR(__wt_session_get_btree(session, uri, NULL, flags));
+		ret = func(session, cfg);
+		WT_TRET(__wt_session_release_btree(session));
+		WT_ERR(ret);
+	}
+
+	if (tret != WT_NOTFOUND)
+		WT_TRET(tret);
+err:	WT_TRET(cursor->close(cursor));
+	session->btree = saved_btree;
+	return (ret);
+}
diff --git a/src/meta/meta_snapshot.c b/src/meta/meta_snapshot.c
new file mode 100644
index 00000000000..60642e7f53c
--- /dev/null
+++ b/src/meta/meta_snapshot.c
@@ -0,0 +1,410 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __snap_get(
+	WT_SESSION_IMPL *, const char *, const char *,  WT_ITEM *);
+static int __snap_get_last(WT_SESSION_IMPL *, const char *, WT_ITEM *);
+static int __snap_get_name(
+	WT_SESSION_IMPL *, const char *, const char *, WT_ITEM *);
+static int __snap_set(WT_SESSION_IMPL *, const char *, const char *);
+static int __snap_version_chk(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __wt_meta_snapshot_get --
+ *	Get the address of a file's named snapshot, or its last snapshot.
+ */
+int
+__wt_meta_snapshot_get(WT_SESSION_IMPL *session,
+    const char *name, const char *snapshot, WT_ITEM *addr)
+{
+	WT_DECL_RET;
+
+	/* Get the snapshot address. */
+	ret = __snap_get(session, name, snapshot, addr);
+
+	/*
+	 * If we find a snapshot, the address is set and we return success.
+	 * If we don't find a named snapshot, we're done, they're read-only.
+	 * If we don't find a default snapshot, it's creation, return "no
+	 * data" and let our caller handle it.  Any other error is returned.
+	 */
+	if (ret == WT_NOTFOUND) {
+		/*
+		 * If the caller didn't give us a specific snapshot name, we
+		 * assume it's a creation and there isn't a snapshot to find.
+		 * Let the caller deal with the failure.
+		 */
+		if (snapshot != NULL)
+			WT_RET_MSG(session, WT_NOTFOUND,
+			    "no \"%s\" snapshot found in %s", snapshot, name);
+
+		addr->data = NULL;
+		addr->size = 0;
+		ret = 0;
+	}
+	return (ret);
+}
+
+/*
+ * __wt_meta_snapshot_clear --
+ *	Clear a file's snapshot.
+ */
+int
+__wt_meta_snapshot_clear(WT_SESSION_IMPL *session, const char *name)
+{
+	WT_DECL_RET;
+
+	ret = __snap_set(session, name, NULL);
+
+	/*
+	 * If we are unrolling a failed create, we may have already removed the
+	 * metadata entry.  If no entry is found to update and we're trying to
+	 * clear the snapshot, just ignore it.
+	 */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	return (ret);
+}
+
+/*
+ * __snap_get --
+ *	Get a file's snapshot.
+ */
+static int
+__snap_get(WT_SESSION_IMPL *session,
+    const char *name, const char *snapshot, WT_ITEM *addr)
+{
+	WT_DECL_RET;
+	const char *config;
+
+	config = NULL;
+
+	/* Retrieve the metadata entry for the file. */
+	WT_ERR(__wt_metadata_read(session, name, &config));
+
+	/* Check the major/minor version numbers. */
+	WT_ERR(__snap_version_chk(session, name, config));
+
+	/* Retrieve the named snapshot or the last snapshot. */
+	if (snapshot == NULL)
+		WT_ERR(__snap_get_last(session, config, addr));
+	else
+		WT_ERR(__snap_get_name(session, snapshot, config, addr));
+
+err:	__wt_free(session, config);
+	return (ret);
+}
+
+/*
+ * __snap_set --
+ *	Set a file's snapshot.
+ */
+static int
+__snap_set(WT_SESSION_IMPL *session, const char *name, const char *v)
+{
+	WT_DECL_RET;
+	const char *config, *cfg[3], *newcfg;
+
+	config = newcfg = NULL;
+
+	/* Retrieve the metadata for this file. */
+	WT_ERR(__wt_metadata_read(session, name, &config));
+
+	/* Replace the snapshot entry. */
+	cfg[0] = config;
+	cfg[1] = v == NULL ? "snapshot=()" : v;
+	cfg[2] = NULL;
+	WT_ERR(__wt_config_collapse(session, cfg, &newcfg));
+	WT_ERR(__wt_metadata_update(session, name, newcfg));
+
+err:	__wt_free(session, config);
+	__wt_free(session, newcfg);
+	return (ret);
+}
+
+/*
+ * __snap_get_name --
+ *	Return the cookie associated with a file's named snapshot.
+ */
+static int
+__snap_get_name(WT_SESSION_IMPL *session,
+    const char *name, const char *config, WT_ITEM *addr)
+{
+	WT_CONFIG snapconf;
+	WT_CONFIG_ITEM a, k, v;
+
+	WT_RET(__wt_config_getones(session, config, "snapshot", &v));
+	WT_RET(__wt_config_subinit(session, &snapconf, &v));
+	while (__wt_config_next(&snapconf, &k, &v) == 0)
+		if (strlen(name) == k.len && strncmp(name, k.str, k.len) == 0) {
+			WT_RET(__wt_config_subgets(session, &v, "addr", &a));
+			WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr));
+			return (0);
+		}
+	return (WT_NOTFOUND);
+}
+
+/*
+ * __snap_get_last --
+ *	Return the cookie associated with the file's last snapshot.
+ */
+static int
+__snap_get_last(
+    WT_SESSION_IMPL *session, const char *config, WT_ITEM *addr)
+{
+	WT_CONFIG snapconf;
+	WT_CONFIG_ITEM a, k, v;
+	int64_t found;
+
+	WT_RET(__wt_config_getones(session, config, "snapshot", &v));
+	WT_RET(__wt_config_subinit(session, &snapconf, &v));
+	for (found = 0; __wt_config_next(&snapconf, &k, &v) == 0;) {
+		if (found) {
+			WT_RET(__wt_config_subgets(session, &v, "order", &a));
+			if (a.val < found)
+				continue;
+		}
+
+		WT_RET(__wt_config_subgets(session, &v, "addr", &a));
+		if (a.len == 0)
+			WT_RET(EINVAL);
+
+		/* Our caller wants the raw cookie, not the hex. */
+		WT_RET(__wt_nhex_to_raw(session, a.str, a.len, addr));
+		WT_RET(__wt_config_subgets(session, &v, "order", &a));
+		found = a.val;
+	}
+
+	return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __snap_compare_order --
+ *	Qsort comparison routine for the snapshot list, by ascending order.
+ */
+static int
+__snap_compare_order(const void *a, const void *b)
+{
+	const WT_SNAPSHOT *asnap, *bsnap;
+
+	asnap = (const WT_SNAPSHOT *)a;
+	bsnap = (const WT_SNAPSHOT *)b;
+	/* Return 0 on equal orders: qsort needs a consistent comparison. */
+	return ((asnap->order > bsnap->order) - (asnap->order < bsnap->order));
+}
+
+/*
+ * __wt_meta_snaplist_get --
+ *	Load all available snapshot information for a file.
+ */
+int
+__wt_meta_snaplist_get(
+    WT_SESSION_IMPL *session, const char *name, WT_SNAPSHOT **snapbasep)
+{
+	WT_CONFIG snapconf;
+	WT_CONFIG_ITEM a, k, v;
+	WT_DECL_RET;
+	WT_ITEM *buf;
+	WT_SNAPSHOT *snap, *snapbase;
+	size_t allocated, slot;
+	const char *config;
+	char timebuf[64];
+
+	*snapbasep = NULL;
+
+	buf = NULL;
+	snapbase = NULL;
+	allocated = slot = 0;
+	config = NULL;
+
+	/* Retrieve the metadata information for the file. */
+	WT_RET(__wt_metadata_read(session, name, &config));
+
+	/* Load any existing snapshots into the array. */
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	if (__wt_config_getones(session, config, "snapshot", &v) == 0 &&
+	    __wt_config_subinit(session, &snapconf, &v) == 0)
+		for (; __wt_config_next(&snapconf, &k, &v) == 0; ++slot) {
+			if (slot * sizeof(WT_SNAPSHOT) == allocated)
+				WT_ERR(__wt_realloc(session, &allocated,
+				    (slot + 50) * sizeof(WT_SNAPSHOT),
+				    &snapbase));
+			snap = &snapbase[slot];
+
+			/*
+			 * Copy the name, address (raw and hex), order and time
+			 * into the slot.
+			 */
+			WT_ERR(
+			    __wt_strndup(session, k.str, k.len, &snap->name));
+
+			WT_ERR(__wt_config_subgets(session, &v, "addr", &a));
+			if (a.len == 0)
+				goto format;
+			WT_ERR(__wt_buf_set(
+			    session, &snap->addr, a.str, a.len));
+			WT_ERR(__wt_nhex_to_raw(
+			    session, a.str, a.len, &snap->raw));
+
+			WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+			if (a.val == 0)
+				goto format;
+			snap->order = a.val;
+
+			WT_ERR(__wt_config_subgets(session, &v, "time", &a));
+			if (a.len == 0)
+				goto format;
+			if (a.len > sizeof(timebuf) - 1)
+				goto format;
+			memcpy(timebuf, a.str, a.len);
+			timebuf[a.len] = '\0';
+			if (sscanf(timebuf, "%" SCNuMAX, &snap->sec) != 1)
+				goto format;
+
+			WT_ERR(__wt_config_subgets(session, &v, "size", &a));
+			snap->snapshot_size = (uint64_t)a.val;
+		}
+
+	/*
+	 * Allocate an extra slot for a new value, plus a slot to mark the end.
+	 *
+	 * This isn't very clean, but there's necessary cooperation between the
+	 * schema layer (that maintains the list of snapshots), the btree layer
+	 * (that knows when the root page is written, creating a new snapshot),
+	 * and the block manager (which actually creates the snapshot).  All of
+	 * that cooperation is handled in the WT_SNAPSHOT structure referenced
+	 * from the WT_BTREE structure.
+	 */
+	if ((slot + 2) * sizeof(WT_SNAPSHOT) >= allocated)
+		WT_ERR(__wt_realloc(session, &allocated,
+		    (slot + 2) * sizeof(WT_SNAPSHOT), &snapbase));
+
+	/* Sort in creation-order. */
+	qsort(snapbase, slot, sizeof(WT_SNAPSHOT), __snap_compare_order);
+
+	/* Return the array to our caller. */
+	*snapbasep = snapbase;
+
+	if (0) {
+format:		WT_ERR_MSG(session, WT_ERROR, "corrupted snapshot list");
+err:		__wt_meta_snaplist_free(session, snapbase);
+	}
+	__wt_free(session, config);
+	__wt_scr_free(&buf);
+
+	return (ret);
+}
+
+/*
+ * __wt_meta_snaplist_set --
+ *	Set a file's snapshot value from the WT_SNAPSHOT list.
+ */
+int
+__wt_meta_snaplist_set(
+    WT_SESSION_IMPL *session, const char *name, WT_SNAPSHOT *snapbase)
+{
+	WT_DECL_RET;
+	WT_ITEM *buf;
+	WT_SNAPSHOT *snap;
+	int64_t order;
+	const char *sep;
+
+	buf = NULL;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	order = 0;
+	sep = "";
+	WT_ERR(__wt_buf_fmt(session, buf, "snapshot=("));
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		/* Skip deleted snapshots. */
+		if (F_ISSET(snap, WT_SNAP_DELETE))
+			continue;
+
+		/*
+		 * Track the largest active snapshot counter: it's not really
+		 * a generational number or an ID because we reset it to 1 if
+		 * the snapshot we're writing is the only snapshot the file has.
+		 * The problem we're solving is when two snapshots are taken
+		 * quickly, the timer may not be unique and/or we can even see
+		 * time travel on the second snapshot if we read the time
+		 * in-between nanoseconds rolling over.  All we need to know
+		 * is the real snapshot order so we don't accidentally take the
+		 * wrong "last" snapshot.
+		 */
+		if (snap->order > order)
+			order = snap->order;
+
+		if (F_ISSET(snap, WT_SNAP_ADD | WT_SNAP_UPDATE)) {
+			/* Convert the raw cookie to a hex string. */
+			WT_ERR(__wt_raw_to_hex(session,
+			    snap->raw.data, snap->raw.size, &snap->addr));
+
+			if (F_ISSET(snap, WT_SNAP_ADD))
+				snap->order = order + 1;
+		}
+		WT_ERR(__wt_buf_catfmt(session, buf,
+		    "%s%s=(addr=\"%.*s\",order=%" PRIu64
+		    ",time=%" PRIuMAX ",size=%" PRIu64 ")",
+		    sep, snap->name,
+		    (int)snap->addr.size, (char *)snap->addr.data,
+		    snap->order, snap->sec, snap->snapshot_size));
+		sep = ",";
+	}
+	WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+	WT_ERR(__snap_set(session, name, buf->mem));
+
+err:	__wt_scr_free(&buf);
+
+	return (ret);
+}
+
+/*
+ * __wt_meta_snaplist_free --
+ *	Discard the snapshot array.
+ */
+void
+__wt_meta_snaplist_free(WT_SESSION_IMPL *session, WT_SNAPSHOT *snapbase)
+{
+	WT_SNAPSHOT *snap;
+	if (snapbase == NULL)
+		return;
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		__wt_free(session, snap->name);
+		__wt_buf_free(session, &snap->addr);
+		__wt_buf_free(session, &snap->raw);
+		__wt_free(session, snap->bpriv);
+	}
+	__wt_free(session, snapbase);
+}
+
+/*
+ * __snap_version_chk --
+ *	Check the version major/minor numbers.
+ */
+static int
+__snap_version_chk(
+    WT_SESSION_IMPL *session, const char *name, const char *config)
+{
+	WT_CONFIG_ITEM a, v;
+	int majorv, minorv;
+
+	WT_RET(__wt_config_getones(session, config, "version", &v));
+	WT_RET(__wt_config_subgets(session, &v, "major", &a));
+	majorv = (int)a.val;
+	WT_RET(__wt_config_subgets(session, &v, "minor", &a));
+	minorv = (int)a.val;
+
+	if (majorv > WT_BTREE_MAJOR_VERSION ||
+	    (majorv == WT_BTREE_MAJOR_VERSION &&
+	    minorv > WT_BTREE_MINOR_VERSION))
+		WT_RET_MSG(session, EACCES,
+		    "%s is an unsupported version of a WiredTiger file",
+		    name);
+	return (0);
+}
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
new file mode 100644
index 00000000000..62260bfac48
--- /dev/null
+++ b/src/meta/meta_table.c
@@ -0,0 +1,179 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_turtle --
+ *	Return if a key's value should be taken from the turtle file.
+ */
+static int
+__metadata_turtle(const char *key)
+{
+	switch (key[0]) {
+	case 'f':
+		if (strcmp(key, WT_METADATA_URI) == 0)
+			return (1);
+		break;
+	case 'W':
+		if (strcmp(key, "WiredTiger version") == 0)
+			return (1);
+		if (strcmp(key, "WiredTiger version string") == 0)
+			return (1);
+		break;
+	}
+	return (0);
+}
+
+/*
+ * __wt_metadata_open --
+ *	Opens the metadata file, sets session->metafile.
+ */
+int
+__wt_metadata_open(WT_SESSION_IMPL *session)
+{
+	if (session->metafile != NULL)
+		return (0);
+
+	WT_RET(__wt_session_get_btree(
+	    session, WT_METADATA_URI, NULL, 0));
+
+	session->metafile = session->btree;
+	WT_ASSERT(session, session->metafile != NULL);
+
+	/* The metafile doesn't need to stay locked -- release it. */
+	return (__wt_session_release_btree(session));
+}
+
+/*
+ * __wt_metadata_cursor --
+ *	Opens a cursor on the metadata.
+ */
+int
+__wt_metadata_cursor(
+    WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
+{
+	WT_BTREE *saved_btree;
+	WT_DECL_RET;
+	const char *cfg[] = API_CONF_DEFAULTS(session, open_cursor, config);
+
+	saved_btree = session->btree;
+	WT_ERR(__wt_metadata_open(session));
+
+	session->btree = session->metafile;
+	WT_ERR(__wt_session_lock_btree(session, 0));
+	ret = __wt_curfile_create(session, NULL, cfg, cursorp);
+
+	/* Restore the caller's btree. */
+err:	session->btree = saved_btree;
+	return (ret);
+}
+
+/*
+ * __wt_metadata_insert --
+ *	Insert a row into the metadata
+ */
+int
+__wt_metadata_insert(
+    WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+
+	if (__metadata_turtle(key))
+		WT_RET_MSG(session, EINVAL,
+		    "%s: insert not supported on the turtle file", key);
+
+	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+	cursor->set_key(cursor, key);
+	cursor->set_value(cursor, value);
+	WT_TRET(cursor->insert(cursor));
+	if (ret == 0 && WT_META_TRACKING(session))
+		ret = __wt_meta_track_insert(session, key);
+	WT_TRET(cursor->close(cursor));
+
+err:	return (ret);
+}
+
+/*
+ * __wt_metadata_update --
+ *	Update a row in the metadata.
+ */
+int
+__wt_metadata_update(
+    WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+
+	if (__metadata_turtle(key))
+		return (__wt_meta_turtle_update(session, key, value));
+
+	if (WT_META_TRACKING(session))
+		WT_RET(__wt_meta_track_update(session, key));
+
+	WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
+	cursor->set_key(cursor, key);
+	cursor->set_value(cursor, value);
+	WT_TRET(cursor->insert(cursor));
+	WT_TRET(cursor->close(cursor));
+
+	return (ret);
+}
+
+/*
+ * __wt_metadata_remove --
+ *	Removes a row from the metadata.
+ */
+int
+__wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+
+	if (__metadata_turtle(key))
+		WT_RET_MSG(session, EINVAL,
+		    "%s: remove not supported on the turtle file", key);
+
+	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+	cursor->set_key(cursor, key);
+	WT_TRET(cursor->search(cursor));
+	if (ret == 0) {
+		if (WT_META_TRACKING(session))
+			WT_TRET(__wt_meta_track_update(session, key));
+		WT_TRET(cursor->remove(cursor));
+	}
+	WT_TRET(cursor->close(cursor));
+
+	return (ret);
+}
+
+/*
+ * __wt_metadata_read --
+ *	Reads and copies a row from the metadata.
+ *	The caller is responsible for freeing the allocated memory.
+ */
+int
+__wt_metadata_read(
+    WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	const char *value;
+
+	if (__metadata_turtle(key))
+		return (__wt_meta_turtle_read(session, key, valuep));
+
+	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+	cursor->set_key(cursor, key);
+	WT_ERR(cursor->search(cursor));
+	WT_ERR(cursor->get_value(cursor, &value));
+	WT_ERR(__wt_strdup(session, value, valuep));
+
+err:	WT_TRET(cursor->close(cursor));
+	return (ret);
+}
diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c
new file mode 100644
index 00000000000..b7d5c77bc35
--- /dev/null
+++ b/src/meta/meta_track.c
@@ -0,0 +1,315 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
+ * maintained to make it easy to unroll simple metadata and filesystem
+ * operations.
+ */
+typedef struct __wt_meta_track {
+	enum {
+		WT_ST_EMPTY,		/* Unused slot */
+		WT_ST_FILEOP,		/* File operation */
+		WT_ST_LOCK,		/* Lock a handle */
+		WT_ST_REMOVE,		/* Remove a metadata entry */
+		WT_ST_SET		/* Reset a metadata entry */
+	} op;
+	/*
+	 * The meaning of the strings depends on the operation:
+	 *	WT_ST_FILEOP: a is the old URI and b is the new URI, either
+	 *	    may be NULL
+	 *	WT_ST_REMOVE: a is the metadata key to remove on unroll
+	 *	WT_ST_SET: a is the metadata key and b is the value to restore
+	 *	    on unroll
+	 * Both strings are freed when the entry is applied.
+	 */
+	const char *a, *b;		/* Strings */
+	WT_BTREE *btree;		/* Locked handle */
+} WT_META_TRACK;
+
+/*
+ * __meta_track_next --
+ *	Extend the list of operations we're tracking, as necessary, and
+ *	optionally return the next slot.
+ *
+ *	If trkp is NULL no slot is consumed: the call only makes sure the
+ *	list has room for another entry and that the next-slot pointer is set.
+ */
+static int
+__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
+{
+	size_t offset, sub_off;
+
+	/* Not currently tracking: start from the first slot in the list. */
+	if (!WT_META_TRACKING(session))
+		session->meta_track_next = session->meta_track;
+
+	/*
+	 * Save the current positions as offsets from the start of the list,
+	 * the list may move if it is reallocated.
+	 */
+	offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
+	sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
+	if (offset == session->meta_track_alloc) {
+		/* The list is full: double it, with a 20 entry minimum. */
+		WT_RET(__wt_realloc(session, &session->meta_track_alloc,
+		    WT_MAX(2 * session->meta_track_alloc,
+		    20 * sizeof(WT_META_TRACK)), &session->meta_track));
+
+		/* Maintain positions in the new chunk of memory. */
+		session->meta_track_next =
+		    (uint8_t *)session->meta_track + offset;
+		if (session->meta_track_sub != NULL)
+			session->meta_track_sub =
+			    (uint8_t *)session->meta_track + sub_off;
+	}
+
+	WT_ASSERT(session, session->meta_track_next != NULL);
+
+	/* Hand out the slot and step past it. */
+	if (trkp != NULL) {
+		*trkp = session->meta_track_next;
+		session->meta_track_next = *trkp + 1;
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_meta_track_discard --
+ *	Cleanup metadata tracking when closing a session.
+ *
+ *	Frees the list of tracked operations and resets the next-slot pointer
+ *	and the allocated size.
+ */
+void
+__wt_meta_track_discard(WT_SESSION_IMPL *session)
+{
+	/*
+	 * NOTE(review): session->meta_track_sub is not reset here -- confirm
+	 * it can never be set when a session is discarded.
+	 */
+	__wt_free(session, session->meta_track);
+	session->meta_track_next = NULL;
+	session->meta_track_alloc = 0;
+}
+
+/*
+ * __wt_meta_track_on --
+ *	Turn on metadata operation tracking.
+ *
+ *	No slot is requested: the call only makes sure the list of operations
+ *	is allocated and the next-slot pointer is set.
+ */
+int
+__wt_meta_track_on(WT_SESSION_IMPL *session)
+{
+	return (__meta_track_next(session, NULL));
+}
+
+/*
+ * __meta_track_apply --
+ *	Apply a single tracked operation and clear its slot.
+ *
+ *	If unroll is set, the operation is undone.  Handle locks are released
+ *	whether or not we are unrolling.  In all cases the entry's strings are
+ *	freed and the slot is marked empty.  The first error seen is returned,
+ *	later steps are still attempted.
+ */
+static int
+__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
+{
+	WT_BTREE *saved_btree;
+	WT_DECL_RET;
+	int tret;
+
+	/* Unlock handles regardless of whether we are unrolling. */
+	if (!unroll && trk->op != WT_ST_LOCK)
+		goto free;
+
+	switch (trk->op) {
+	case WT_ST_EMPTY:	/* Unused slot */
+		break;
+	case WT_ST_LOCK:	/* Handle lock, see above */
+		saved_btree = session->btree;
+		session->btree = trk->btree;
+		if (session->created_btree == trk->btree)
+			session->created_btree = NULL;
+		WT_TRET(__wt_session_release_btree(session));
+		session->btree = saved_btree;
+		break;
+	case WT_ST_FILEOP:	/* File operation */
+		/*
+		 * For renames, both a and b are set.
+		 * For creates, a is NULL.
+		 * For removes, b is NULL.
+		 */
+		if (trk->a != NULL && trk->b != NULL &&
+		    (tret = __wt_rename(session,
+		    trk->b + strlen("file:"),
+		    trk->a + strlen("file:"))) != 0) {
+			__wt_err(session, tret,
+			    "metadata unroll rename %s to %s",
+			    trk->b, trk->a);
+			WT_TRET(tret);
+		} else if (trk->a == NULL) {
+			saved_btree = session->btree;
+			if ((session->btree = session->created_btree) != NULL)
+				WT_TRET(
+				    __wt_conn_btree_sync_and_close(session));
+			session->btree = saved_btree;
+			if ((tret = __wt_remove(session,
+			    trk->b + strlen("file:"))) != 0) {
+				__wt_err(session, tret,
+				    "metadata unroll create %s",
+				    trk->b);
+				WT_TRET(tret);
+			}
+		}
+		/*
+		 * We can't undo removes yet: that would imply
+		 * some kind of temporary rename and remove in
+		 * roll forward.
+		 */
+		break;
+	case WT_ST_REMOVE:	/* Remove trk.a */
+		/* Report the error from this call, not the running total. */
+		if ((tret = __wt_metadata_remove(
+		    session, trk->a)) != 0) {
+			__wt_err(session, tret,
+			    "metadata unroll remove: %s",
+			    trk->a);
+			WT_TRET(tret);
+		}
+		break;
+	case WT_ST_SET:		/* Set trk.a to trk.b */
+		/* Report the error from this call, not the running total. */
+		if ((tret = __wt_metadata_update(
+		    session, trk->a, trk->b)) != 0) {
+			__wt_err(session, tret,
+			    "metadata unroll update %s to %s",
+			    trk->a, trk->b);
+			WT_TRET(tret);
+		}
+		break;
+	WT_ILLEGAL_VALUE(session);
+	}
+
+free:	trk->op = WT_ST_EMPTY;
+	__wt_free(session, trk->a);
+	__wt_free(session, trk->b);
+	trk->btree = NULL;
+
+	return (ret);
+}
+
+/*
+ * __wt_meta_track_off --
+ *	Turn off metadata operation tracking, unrolling on error.
+ *
+ *	Tracked operations are applied in reverse order, newest first.  If
+ *	unroll is set each operation is undone, otherwise only the cleanup is
+ *	done.  Returns 0 immediately if tracking is not on, else the first
+ *	error seen.
+ */
+int
+__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
+{
+	WT_DECL_RET;
+	WT_META_TRACK *trk, *trk_orig;
+
+	if (!WT_META_TRACKING(session))
+		return (0);
+
+	trk_orig = session->meta_track;
+	trk = session->meta_track_next;
+
+	/* Turn off tracking for unroll. */
+	session->meta_track_next = session->meta_track_sub = NULL;
+
+	/*
+	 * Test before stepping back: the pointer must never be moved in front
+	 * of the first entry, that's undefined even if it isn't dereferenced.
+	 */
+	while (trk > trk_orig) {
+		--trk;
+		WT_TRET(__meta_track_apply(session, trk, unroll));
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_meta_track_sub_on --
+ *	Start a group of operations that can be committed independent of the
+ *	main transaction.
+ *
+ *	The current next-slot position is saved as the start of the group.
+ *	Groups do not nest: it is asserted that no group is already active.
+ */
+int
+__wt_meta_track_sub_on(WT_SESSION_IMPL *session)
+{
+	WT_ASSERT(session, session->meta_track_sub == NULL);
+	session->meta_track_sub = session->meta_track_next;
+	return (0);
+}
+
+/*
+ * __wt_meta_track_sub_off --
+ *	Commit a group of operations independent of the main transaction.
+ *
+ *	The operations tracked since the group was started are applied without
+ *	unrolling, newest first, and their slots are made available again.
+ *	Returns 0 immediately if tracking is not on or no group is active,
+ *	else the first error seen.
+ */
+int
+__wt_meta_track_sub_off(WT_SESSION_IMPL *session)
+{
+	WT_DECL_RET;
+	WT_META_TRACK *trk, *trk_orig;
+
+	if (!WT_META_TRACKING(session) || session->meta_track_sub == NULL)
+		return (0);
+
+	trk_orig = session->meta_track_sub;
+	trk = session->meta_track_next;
+
+	/* Turn off tracking while the group's operations are applied. */
+	session->meta_track_next = session->meta_track_sub = NULL;
+
+	/*
+	 * Test before stepping back: the group may start at the first entry
+	 * and the pointer must never be moved in front of it.
+	 */
+	while (trk > trk_orig) {
+		--trk;
+		WT_TRET(__meta_track_apply(session, trk, 0));
+	}
+
+	/* Tracking continues from where the group started. */
+	session->meta_track_next = trk_orig;
+	return (ret);
+}
+
+/*
+ * __wt_meta_track_insert --
+ *	Track an insert operation.
+ *
+ *	The key is copied into the tracking entry.
+ */
+int
+__wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key)
+{
+	WT_META_TRACK *trk;
+
+	WT_RET(__meta_track_next(session, &trk));
+
+	/* An insert is undone by removing the key. */
+	trk->op = WT_ST_REMOVE;
+	WT_RET(__wt_strdup(session, key, &trk->a));
+
+	return (0);
+}
+
+/*
+ * __wt_meta_track_update --
+ *	Log a metadata update so that it can be undone later.
+ */
+int
+__wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
+{
+	WT_DECL_RET;
+	WT_META_TRACK *trk;
+
+	WT_RET(__meta_track_next(session, &trk));
+
+	/* Start out assuming the key exists and its value must be reset. */
+	trk->op = WT_ST_SET;
+	WT_RET(__wt_strdup(session, key, &trk->a));
+
+	/* Save the current value, any failure but not-found is returned. */
+	ret = __wt_metadata_read(session, key, &trk->b);
+	if (ret != WT_NOTFOUND)
+		return (ret);
+
+	/* No previous value: the "update" is an insert, undo by removing. */
+	trk->op = WT_ST_REMOVE;
+	return (0);
+}
+
+/*
+ * __wt_meta_track_fileop --
+ *	Track a filesystem operation.
+ *
+ *	For renames, both olduri and newuri are set.  For creates, olduri is
+ *	NULL.  For removes, newuri is NULL.  The URIs are copied into the
+ *	tracking entry.
+ */
+int
+__wt_meta_track_fileop(
+    WT_SESSION_IMPL *session, const char *olduri, const char *newuri)
+{
+	WT_META_TRACK *trk;
+
+	WT_RET(__meta_track_next(session, &trk));
+
+	trk->op = WT_ST_FILEOP;
+	if (olduri != NULL)
+		WT_RET(__wt_strdup(session, olduri, &trk->a));
+	if (newuri != NULL)
+		WT_RET(__wt_strdup(session, newuri, &trk->b));
+	return (0);
+}
+
+/*
+ * __wt_meta_track_handle_lock --
+ *	Track a locked handle.
+ *
+ *	The handle tracked is the session's current btree handle, which must
+ *	be set.  The handle is released when the entry is applied.
+ */
+int
+__wt_meta_track_handle_lock(WT_SESSION_IMPL *session)
+{
+	WT_META_TRACK *trk;
+
+	WT_ASSERT(session, session->btree != NULL);
+
+	WT_RET(__meta_track_next(session, &trk));
+
+	trk->op = WT_ST_LOCK;
+	trk->btree = session->btree;
+	return (0);
+}
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
new file mode 100644
index 00000000000..862f7b11ab5
--- /dev/null
+++ b/src/meta/meta_turtle.c
@@ -0,0 +1,158 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_turtle_init --
+ *	Check the turtle file and create if necessary.
+ *
+ *	On return, *existp is 1 if a turtle file was already there and 0 if
+ *	this call had to create one.
+ */
+int
+__wt_meta_turtle_init(WT_SESSION_IMPL *session, int *existp)
+{
+	WT_DECL_RET;
+	WT_ITEM *buf;
+	int exist;
+	const char *metaconf;
+	const char *cfg[] = API_CONF_DEFAULTS(file, meta, NULL);
+
+	buf = NULL;
+	metaconf = NULL;
+	*existp = 0;
+
+	/* Discard any turtle setup file left-over from previous runs. */
+	WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &exist));
+	if (exist)
+		WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+
+	/* If there's already a turtle file, we're done. */
+	WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist));
+	if (exist) {
+		*existp = 1;
+		return (0);
+	}
+
+	/*
+	 * Create a turtle file with default values: the metadata file's
+	 * configuration is the file defaults plus string key and value
+	 * formats and the current btree version numbers.
+	 */
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	WT_ERR(__wt_buf_fmt(session, buf,
+	    "key_format=S,value_format=S,version=(major=%d,minor=%d)",
+	    WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
+	cfg[1] = buf->data;
+	WT_ERR(__wt_config_collapse(session, cfg, &metaconf));
+	WT_ERR(__wt_meta_turtle_update(session, WT_METADATA_URI, metaconf));
+
+err:	__wt_free(session, metaconf);
+	__wt_scr_free(&buf);
+
+	return (ret);
+}
+
+/*
+ * __wt_meta_turtle_read --
+ *	Read the turtle file.
+ *
+ *	The file is a list of key/value pairs, one string per line.  On
+ *	success a copy of the value stored for the key is returned in valuep,
+ *	the caller is responsible for freeing it.  WT_NOTFOUND is returned if
+ *	the turtle file can't be opened or doesn't have the key; a line
+ *	without a newline or a key without a value line is a format error.
+ */
+int
+__wt_meta_turtle_read(
+    WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+	FILE *fp;
+	WT_DECL_RET;
+	const char *path;
+	char *p, line[1024];
+
+	fp = NULL;
+	path = NULL;
+
+	/* Open the turtle file. */
+	WT_RET(__wt_filename(session, WT_METADATA_TURTLE, &path));
+	WT_ERR_TEST((fp = fopen(path, "r")) == NULL, WT_NOTFOUND);
+
+	/* Search for the key. */
+	ret = WT_NOTFOUND;
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		if ((p = strchr(line, '\n')) == NULL)
+			goto format;
+		*p = '\0';
+		if (strcmp(key, line) == 0)
+			ret = 0;
+
+		/*
+		 * Every key is followed by a value line.  Always read it:
+		 * it's the result if the key matched, and has to be skipped
+		 * if it didn't.
+		 */
+		if (fgets(line, sizeof(line), fp) == NULL)
+			goto format;
+		if ((p = strchr(line, '\n')) == NULL)
+			goto format;
+		*p = '\0';
+		if (ret == 0)
+			break;
+	}
+
+	/* Check for an I/O error. */
+	if (ferror(fp))
+		WT_ERR(__wt_errno());
+	WT_ERR(ret);
+
+	/* Successful: copy the value for the caller. */
+	WT_ERR(__wt_strdup(session, line, valuep));
+
+	/*
+	 * A format error falls through to the cleanup code rather than
+	 * returning directly, otherwise the stream and the path are leaked.
+	 */
+	if (0) {
+format:		ret = __wt_illegal_value(session, WT_METADATA_TURTLE);
+	}
+
+err:	if (fp != NULL)
+		WT_TRET(fclose(fp));
+	__wt_free(session, path);
+	return (ret);
+}
+
+/*
+ * __wt_meta_turtle_update --
+ *	Update the turtle file.
+ *
+ *	The new contents (the WiredTiger version strings followed by the
+ *	key/value pair) are written to the turtle setup file, which is then
+ *	renamed to be the turtle file.  If anything fails before the rename,
+ *	the setup file is removed.
+ */
+int
+__wt_meta_turtle_update(
+    WT_SESSION_IMPL *session, const char *key,  const char *value)
+{
+	FILE *fp;
+	WT_DECL_RET;
+	int tret, vmajor, vminor, vpatch;
+	const char *path, *version;
+
+	fp = NULL;
+	path = NULL;		/* Freed on the error path, must be set. */
+
+	version = wiredtiger_version(&vmajor, &vminor, &vpatch);
+
+	/*
+	 * Create the turtle setup file: we currently re-write it from scratch
+	 * every time.
+	 */
+	WT_ERR(__wt_filename(session, WT_METADATA_TURTLE_SET, &path));
+	WT_ERR_TEST((fp = fopen(path, "w")) == NULL, __wt_errno());
+
+	WT_ERR_TEST((fprintf(fp,
+	    "%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
+	    WT_METADATA_VERSION_STR, version,
+	    WT_METADATA_VERSION, vmajor, vminor, vpatch,
+	    key, value) < 0), __wt_errno());
+
+	/*
+	 * The stream can't be used after fclose, whether or not the call
+	 * succeeded: clear the handle before checking the result so the
+	 * cleanup code doesn't close it a second time.
+	 */
+	tret = fclose(fp);
+	fp = NULL;
+	WT_ERR_TEST(tret == EOF, __wt_errno());
+
+	/* Move the setup file into place as the turtle file. */
+	ret = __wt_rename(session, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE);
+
+	if (0) {
+err:		(void)__wt_remove(session, WT_METADATA_TURTLE_SET);
+	}
+
+	if (fp != NULL)
+		(void)fclose(fp);
+	__wt_free(session, path);
+
+	return (ret);
+}
diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c
index 810455453fd..9962fb60624 100644
--- a/src/os_posix/os_abort.c
+++ b/src/os_posix/os_abort.c
@@ -13,6 +13,7 @@
  */
 void
 __wt_abort(WT_SESSION_IMPL *session)
+    WT_GCC_FUNC_ATTRIBUTE((noreturn))
 {
 	__wt_errx(session, "aborting WiredTiger library");
 
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index f52a6109709..bb104a4a6c3 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -104,7 +104,7 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
     size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
 {
 #if defined(HAVE_POSIX_MEMALIGN)
-	int ret;
+	WT_DECL_RET;
 
 	/*
 	 * !!!
diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c
index caf0c1d3d26..14f20501205 100644
--- a/src/os_posix/os_dlopen.c
+++ b/src/os_posix/os_dlopen.c
@@ -14,8 +14,8 @@
 int
 __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
 {
+	WT_DECL_RET;
 	WT_DLH *dlh;
-	int ret;
 
 	WT_RET(__wt_calloc_def(session, 1, &dlh));
 	WT_ERR(__wt_strdup(session, path, &dlh->name));
@@ -57,9 +57,7 @@ __wt_dlsym(
 int
 __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
 {
-	int ret;
-
-	ret = 0;
+	WT_DECL_RET;
 
 	/*
 	 * FreeBSD dies inside __cxa_finalize when closing handles.
diff --git a/src/os_posix/os_exist.c b/src/os_posix/os_exist.c
index 3f524f96e01..3b3174e7125 100644
--- a/src/os_posix/os_exist.c
+++ b/src/os_posix/os_exist.c
@@ -14,9 +14,9 @@
 int
 __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
 {
-	const char *path;
+	WT_DECL_RET;
 	struct stat sb;
-	int ret;
+	const char *path;
 
 	WT_RET(__wt_filename(session, filename, &path));
 
diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c
index 42daf86cd86..bfb9b39af6b 100644
--- a/src/os_posix/os_filesize.c
+++ b/src/os_posix/os_filesize.c
@@ -15,9 +15,9 @@ int
 __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, off_t *sizep)
 {
 	struct stat sb;
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, fileops, "%s: fstat", fh->name);
+	WT_VERBOSE_RET(session, fileops, "%s: fstat", fh->name);
 
 	WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret);
 	if (ret == 0) {
diff --git a/src/os_posix/os_flock.c b/src/os_posix/os_flock.c
index 8b317654105..2967719fbb6 100644
--- a/src/os_posix/os_flock.c
+++ b/src/os_posix/os_flock.c
@@ -15,7 +15,7 @@ int
 __wt_bytelock(WT_FH *fhp, off_t byte, int lock)
 {
 	struct flock fl;
-	int ret;
+	WT_DECL_RET;
 
 	/*
 	 * WiredTiger requires this function be able to acquire locks past
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
index 3874510d946..5a67dabb313 100644
--- a/src/os_posix/os_fsync.c
+++ b/src/os_posix/os_fsync.c
@@ -14,9 +14,9 @@
 int
 __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
 {
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, fileops, "%s: fsync", fh->name);
+	WT_VERBOSE_RET(session, fileops, "%s: fsync", fh->name);
 
 	WT_SYSCALL_RETRY(fsync(fh->fd), ret);
 	if (ret == 0)
diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c
index 6c98aecd852..aece569f8ba 100644
--- a/src/os_posix/os_ftruncate.c
+++ b/src/os_posix/os_ftruncate.c
@@ -14,7 +14,7 @@
 int
 __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, off_t len)
 {
-	int ret;
+	WT_DECL_RET;
 
 	WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
 	if (ret == 0) {
diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c
index f749d5d79f3..59a6fc06f7a 100644
--- a/src/os_posix/os_mtx.c
+++ b/src/os_posix/os_mtx.c
@@ -48,14 +48,14 @@ err:	__wt_free(session, cond);
 void
 __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
 {
-	int ret;
+	WT_DECL_RET;
 
 	/*
 	 * !!!
 	 * This function MUST handle a NULL session handle.
 	 */
 	if (session != NULL)
-		WT_VERBOSE(
+		WT_VERBOSE_VOID(
 		    session, mutex, "lock %s mutex (%p)", cond->name, cond);
 
 	WT_ERR(pthread_mutex_lock(&cond->mtx));
@@ -95,17 +95,16 @@ err:	__wt_err(session, ret, "mutex lock failed");
 void
 __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
 {
-	int ret;
+	WT_DECL_RET;
 
 	/*
 	 * !!!
 	 * This function MUST handle a NULL session handle.
 	 */
 	if (session != NULL)
-		WT_VERBOSE(
+		WT_VERBOSE_VOID(
 		    session, mutex, "signal %s cond (%p)", cond->name, cond);
 
-	ret = 0;
 	WT_ERR(pthread_mutex_lock(&cond->mtx));
 	if (cond->locked) {
 		cond->locked = 0;
@@ -125,7 +124,7 @@ err:	__wt_err(session, ret, "mutex unlock failed");
 int
 __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
 {
-	int ret;
+	WT_DECL_RET;
 
 	ret = pthread_cond_destroy(&cond->cond);
 	WT_TRET(pthread_mutex_destroy(&cond->mtx));
@@ -144,11 +143,10 @@ int
 __wt_rwlock_alloc(
     WT_SESSION_IMPL *session, const char *name, WT_RWLOCK **rwlockp)
 {
+	WT_DECL_RET;
 	WT_RWLOCK *rwlock;
-	int ret;
 
 	WT_RET(__wt_calloc(session, 1, sizeof(WT_RWLOCK), &rwlock));
-	ret = 0;
 	WT_ERR_TEST(pthread_rwlock_init(&rwlock->rwlock, NULL), WT_ERROR);
 
 	rwlock->name = name;
@@ -166,9 +164,9 @@ err:		__wt_free(session, rwlock);
 void
 __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, mutex,
+	WT_VERBOSE_VOID(session, mutex,
 	    "readlock %s rwlock (%p)", rwlock->name, rwlock);
 
 	WT_ERR(pthread_rwlock_rdlock(&rwlock->rwlock));
@@ -187,9 +185,9 @@ err:		__wt_err(session, ret, "rwlock readlock failed");
 int
 __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, mutex,
+	WT_VERBOSE_VOID(session, mutex,
 	    "try_writelock %s rwlock (%p)", rwlock->name, rwlock);
 
 	if ((ret = pthread_rwlock_trywrlock(&rwlock->rwlock)) == 0)
@@ -209,9 +207,9 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 void
 __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, mutex,
+	WT_VERBOSE_VOID(session, mutex,
 	    "writelock %s rwlock (%p)", rwlock->name, rwlock);
 
 	WT_ERR(pthread_rwlock_wrlock(&rwlock->rwlock));
@@ -230,9 +228,9 @@ err:		__wt_err(session, ret, "rwlock writelock failed");
 void
 __wt_rwunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	int ret;
+	WT_DECL_RET;
 
-	WT_VERBOSE(session, mutex,
+	WT_VERBOSE_VOID(session, mutex,
 	    "unlock %s rwlock (%p)", rwlock->name, rwlock);
 
 	WT_ERR(pthread_rwlock_unlock(&rwlock->rwlock));
@@ -250,7 +248,7 @@ err:		__wt_err(session, ret, "rwlock unlock failed");
 int
 __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
 {
-	int ret;
+	WT_DECL_RET;
 
 	ret = pthread_rwlock_destroy(&rwlock->rwlock);
 	if (ret == EBUSY)
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index 34f78b49a19..f8d75f5585e 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -15,18 +15,18 @@ int
 __wt_open(WT_SESSION_IMPL *session,
     const char *name, int ok_create, int exclusive, int is_tree, WT_FH **fhp)
 {
-	const char *path;
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_FH *fh;
 	mode_t mode;
-	int f, fd, matched, ret;
+	int f, fd, matched;
+	const char *path;
 
 	conn = S2C(session);
 	fh = NULL;
 	fd = -1;
-	ret = 0;
 
-	WT_VERBOSE(session, fileops, "%s: open", name);
+	WT_VERBOSE_RET(session, fileops, "%s: open", name);
 
 	/* Increment the reference count if we already have the file open. */
 	matched = 0;
@@ -118,10 +118,9 @@ int
 __wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
 {
 	WT_CONNECTION_IMPL *conn;
-	int ret;
+	WT_DECL_RET;
 
 	conn = S2C(session);
-	ret = 0;
 
 	if (fh == NULL || fh->refcnt == 0 || --fh->refcnt > 0)
 		return (0);
diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c
index 41ec89a08d9..c064228c9e8 100644
--- a/src/os_posix/os_remove.c
+++ b/src/os_posix/os_remove.c
@@ -14,15 +14,15 @@
 int
 __wt_remove(WT_SESSION_IMPL *session, const char *name)
 {
-	const char *path;
 	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
 	WT_FH *fh;
-	int ret;
+	const char *path;
 
 	conn = S2C(session);
 	fh = NULL;
 
-	WT_VERBOSE(session, fileops, "%s: remove", name);
+	WT_VERBOSE_RET(session, fileops, "%s: remove", name);
 
 	/* If the file is open, close/free it. */
 	__wt_spin_lock(session, &conn->fh_lock);
diff --git a/src/os_posix/os_rename.c b/src/os_posix/os_rename.c
index 67aaed1357a..dba15feab1c 100644
--- a/src/os_posix/os_rename.c
+++ b/src/os_posix/os_rename.c
@@ -14,10 +14,10 @@
 int
 __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
 {
-	int ret;
+	WT_DECL_RET;
 	const char *from_path, *to_path;
 
-	WT_VERBOSE(session, fileops, "rename %s to %s", from, to);
+	WT_VERBOSE_RET(session, fileops, "rename %s to %s", from, to);
 
 	WT_RET(__wt_filename(session, from, &from_path));
 	WT_RET(__wt_filename(session, to, &to_path));
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
index 8666ebe7eaa..bb210700960 100644
--- a/src/os_posix/os_rw.c
+++ b/src/os_posix/os_rw.c
@@ -17,7 +17,7 @@ __wt_read(WT_SESSION_IMPL *session,
 {
 	WT_CSTAT_INCR(session, total_read_io);
 
-	WT_VERBOSE(session, fileops,
+	WT_VERBOSE_RET(session, fileops,
 	    "%s: read %" PRIu32 " bytes at offset %" PRIuMAX,
 	    fh->name, bytes, (uintmax_t)offset);
 
@@ -40,7 +40,7 @@ __wt_write(WT_SESSION_IMPL *session,
 {
 	WT_CSTAT_INCR(session, total_write_io);
 
-	WT_VERBOSE(session, fileops,
+	WT_VERBOSE_RET(session, fileops,
 	    "%s: write %" PRIu32 " bytes at offset %" PRIuMAX,
 	    fh->name, bytes, (uintmax_t)offset);
 
diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c
index c6345829c28..f0c7dd37d74 100644
--- a/src/os_posix/os_time.c
+++ b/src/os_posix/os_time.c
@@ -12,18 +12,18 @@
  *	Return the seconds and nanoseconds since the Epoch.
  */
 int
-__wt_epoch(WT_SESSION_IMPL *session, time_t *secp, long *nsecp)
+__wt_epoch(WT_SESSION_IMPL *session, uintmax_t *secp, uintmax_t *nsecp)
 {
-	int ret;
+	WT_DECL_RET;
 
 #if defined(HAVE_CLOCK_GETTIME)
 	struct timespec v;
 	WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, &v), ret);
 	if (ret == 0) {
 		if (secp != NULL)
-			*secp = v.tv_sec;
+			*secp = (uintmax_t)v.tv_sec;
 		if (nsecp != NULL)
-			*nsecp = v.tv_nsec;
+			*nsecp = (uintmax_t)v.tv_nsec;
 		return (0);
 	}
 	WT_RET_MSG(session, ret, "clock_gettime");
@@ -33,9 +33,9 @@ __wt_epoch(WT_SESSION_IMPL *session, time_t *secp, long *nsecp)
 	WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
 	if (ret == 0) {
 		if (secp != NULL)
-			*secp = v.tv_sec;
+			*secp = (uintmax_t)v.tv_sec;
 		if (nsecp != NULL)	/* nanoseconds in a microsecond */
-			*nsecp = v.tv_usec * 1000;
+			*nsecp = (uintmax_t)(v.tv_usec * 1000);
 		return (0);
 	}
 	WT_RET_MSG(session, ret, "gettimeofday");
diff --git a/src/packing/packing.c b/src/packing/packing.c
index 7fe751f6d44..7f3286b2e0f 100644
--- a/src/packing/packing.c
+++ b/src/packing/packing.c
@@ -16,9 +16,10 @@ int
 __wt_struct_check(WT_SESSION_IMPL *session,
     const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp)
 {
+	WT_DECL_RET;
 	WT_PACK pack;
 	WT_PACK_VALUE pv;
-	int fields, ret;
+	int fields;
 
 	WT_CLEAR(pv);		/* -Wuninitialized. */
 
@@ -48,8 +49,9 @@ __wt_struct_check(WT_SESSION_IMPL *session,
  * __wt_struct_sizev --
  *	Calculate the size of a packed byte string (va_list version).
  */
-size_t
-__wt_struct_sizev(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+int
+__wt_struct_sizev(
+    WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, va_list ap)
 {
 	WT_PACK pack;
 	WT_PACK_VALUE pv;
@@ -57,31 +59,31 @@ __wt_struct_sizev(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
 
 	WT_CLEAR(pv);		/* -Wuninitialized */
 
-	if (__pack_init(session, &pack, fmt) != 0)
-		return ((size_t)-1);
+	WT_RET(__pack_init(session, &pack, fmt));
 
 	for (total = 0; __pack_next(&pack, &pv) == 0;) {
 		WT_PACK_GET(session, pv, ap);
 		total += __pack_size(session, &pv);
 	}
-	return (total);
+	*sizep = total;
+	return (0);
 }
 
 /*
  * __wt_struct_size --
  *	Calculate the size of a packed byte string.
  */
-size_t
-__wt_struct_size(WT_SESSION_IMPL *session, const char *fmt, ...)
+int
+__wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...)
 {
+	WT_DECL_RET;
 	va_list ap;
-	size_t size;
 
 	va_start(ap, fmt);
-	size = __wt_struct_sizev(session, fmt, ap);
+	ret = __wt_struct_sizev(session, sizep, fmt, ap);
 	va_end(ap);
 
-	return (size);
+	return (ret);
 }
 
 /*
@@ -92,10 +94,10 @@ int
 __wt_struct_packv(WT_SESSION_IMPL *session,
     void *buffer, size_t size, const char *fmt, va_list ap)
 {
+	WT_DECL_RET;
 	WT_PACK pack;
 	WT_PACK_VALUE pv;
 	uint8_t *p, *end;
-	int ret;
 
 	WT_CLEAR(pv);		/* -Wuninitialized */
 
@@ -125,8 +127,8 @@ int
 __wt_struct_pack(WT_SESSION_IMPL *session,
     void *buffer, size_t size, const char *fmt, ...)
 {
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
 
 	va_start(ap, fmt);
 	ret = __wt_struct_packv(session, buffer, size, fmt, ap);
@@ -143,10 +145,10 @@ int
 __wt_struct_unpackv(WT_SESSION_IMPL *session,
     const void *buffer, size_t size, const char *fmt, va_list ap)
 {
+	WT_DECL_RET;
 	WT_PACK pack;
 	WT_PACK_VALUE pv;
 	const uint8_t *p, *end;
-	int ret;
 
 	WT_RET(__pack_init(session, &pack, fmt));
 
@@ -174,8 +176,8 @@ int
 __wt_struct_unpack(WT_SESSION_IMPL *session,
     const void *buffer, size_t size, const char *fmt, ...)
 {
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
 
 	va_start(ap, fmt);
 	ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
diff --git a/src/packing/packing_api.c b/src/packing/packing_api.c
index ddfad1dd669..e0861c837b8 100644
--- a/src/packing/packing_api.c
+++ b/src/packing/packing_api.c
@@ -11,21 +11,18 @@
  * wiredtiger_struct_size --
  *	Calculate the size of a packed byte string.
  */
-size_t
-wiredtiger_struct_size(const char *fmt, ...)
+int
+wiredtiger_struct_size(
+    WT_SESSION *wt_session, size_t *sizep, const char *fmt, ...)
 {
-	WT_SESSION_IMPL session;
+	WT_DECL_RET;
 	va_list ap;
-	size_t size;
-
-	WT_CLEAR(session);
-	session.event_handler = __wt_event_handler_default;
 
 	va_start(ap, fmt);
-	size = __wt_struct_sizev(&session, fmt, ap);
+	ret = __wt_struct_sizev((WT_SESSION_IMPL *)wt_session, sizep, fmt, ap);
 	va_end(ap);
 
-	return (size);
+	return (ret);
 }
 
 /*
@@ -33,17 +30,15 @@ wiredtiger_struct_size(const char *fmt, ...)
  *	Pack a byte string.
  */
 int
-wiredtiger_struct_pack(void *buffer, size_t size, const char *fmt, ...)
+wiredtiger_struct_pack(
+    WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...)
 {
-	WT_SESSION_IMPL session;
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
-
-	WT_CLEAR(session);
-	session.event_handler = __wt_event_handler_default;
 
 	va_start(ap, fmt);
-	ret = __wt_struct_packv(&session, buffer, size, fmt, ap);
+	ret = __wt_struct_packv(
+	    (WT_SESSION_IMPL *)wt_session, buffer, size, fmt, ap);
 	va_end(ap);
 
 	return (ret);
@@ -54,17 +49,15 @@ wiredtiger_struct_pack(void *buffer, size_t size, const char *fmt, ...)
  *	Unpack a byte string.
  */
 int
-wiredtiger_struct_unpack(const void *buffer, size_t size, const char *fmt, ...)
+wiredtiger_struct_unpack(WT_SESSION *wt_session,
+    const void *buffer, size_t size, const char *fmt, ...)
 {
-	WT_SESSION_IMPL session;
+	WT_DECL_RET;
 	va_list ap;
-	int ret;
-
-	WT_CLEAR(session);
-	session.event_handler = __wt_event_handler_default;
 
 	va_start(ap, fmt);
-	ret = __wt_struct_unpackv(&session, buffer, size, fmt, ap);
+	ret = __wt_struct_unpackv(
+	    (WT_SESSION_IMPL *)wt_session, buffer, size, fmt, ap);
 	va_end(ap);
 
 	return (ret);
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index 4cc729c0818..77e50172671 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -7,83 +7,81 @@
 
 #include "wt_internal.h"
 
-int
-__wt_create_file(WT_SESSION_IMPL *session,
-    const char *name, const char *fileuri, int exclusive, const char *config)
+static int
+__create_file(WT_SESSION_IMPL *session,
+    const char *uri, int exclusive, const char *config)
 {
 	WT_ITEM *val;
+	WT_DECL_RET;
+	int is_metadata;
 	const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
 	const char *filecfg[4] = API_CONF_DEFAULTS(file, meta, config);
 	const char *filename, *treeconf;
-	int is_schema, vmajor, vminor, vpatch, ret;
 
 	val = NULL;
 	treeconf = NULL;
-	ret = 0;
 
-	filename = fileuri;
+	is_metadata = strcmp(uri, WT_METADATA_URI) == 0;
+
+	filename = uri;
 	if (!WT_PREFIX_SKIP(filename, "file:"))
-		WT_RET_MSG(
-		    session, EINVAL, "Expecting a 'file:' URI: %s", fileuri);
+		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);
 
-	/*
-	 * Opening the schema table is a special case, use the config
-	 * string we were passed to open the file.
-	 */
-	is_schema = (strcmp(filename, WT_SCHEMA_FILENAME) == 0);
+	/* Get an exclusive handle lock to protect the name. */
+	WT_RET(__wt_session_get_btree(
+	    session, uri, cfg, WT_BTREE_EXCLUSIVE | WT_BTREE_LOCK_ONLY));
 
-	/* If the file exists, don't try to recreate it. */
-	if ((ret = __wt_session_get_btree(session, name, fileuri,
-	    is_schema ? config : NULL,
-	    NULL, WT_BTREE_NO_LOCK)) != WT_NOTFOUND) {
-		if (ret == 0 && exclusive)
-			ret = EEXIST;
-		return (ret);
+	if (WT_META_TRACKING(session)) {
+		WT_RET(__wt_meta_track_handle_lock(session));
+		session->created_btree = session->btree;
 	}
 
-	WT_RET(__wt_btree_create(session, filename));
-	WT_ERR(__wt_schema_table_track_fileop(session, NULL, filename));
-
-	/* Insert WiredTiger version numbers into the schema file. */
-	WT_ERR(__wt_scr_alloc(session, 0, &val));
-	if (is_schema) {
-		WT_ERR(__wt_schema_table_insert(
-		    session, WT_SCHEMA_VERSION_STR,
-		    wiredtiger_version(&vmajor, &vminor, &vpatch)));
-		WT_ERR(__wt_buf_fmt(session, val,
-		    "major=%d,minor=%d,patch=%d", vmajor, vminor, vpatch));
-		WT_ERR(__wt_schema_table_insert(
-		    session, WT_SCHEMA_VERSION, val->data));
+	/* Check if the file already exists. */
+	if (!is_metadata && (ret =
+	    __wt_metadata_read(session, uri, &treeconf)) != WT_NOTFOUND) {
+		__wt_free(session, treeconf);
+		if (exclusive)
+			WT_TRET(EEXIST);
+		goto err;
 	}
 
+	/* Create the file. */
+	WT_ERR(__wt_btree_create(session, filename));
+	if (WT_META_TRACKING(session))
+		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
+
 	/*
-	 * Insert Btree version numbers into the schema file (including for
-	 * the schema file itself, although the schema file version numbers
-	 * can never be trusted, we have to get them from the turtle file).
+	 * If creating an ordinary file, append the current version numbers to
+	 * the passed-in configuration and insert the resulting configuration
+	 * into the metadata.
 	 */
-	WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)",
-	    WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
-	filecfg[2] = val->data;
-
-	if (is_schema)
-		WT_ERR(__wt_strdup(session, config, &treeconf));
-	else
+	if (!is_metadata) {
+		WT_ERR(__wt_scr_alloc(session, 0, &val));
+		WT_ERR(__wt_buf_fmt(session, val, "version=(major=%d,minor=%d)",
+		    WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
+		filecfg[2] = val->data;
 		WT_ERR(__wt_config_collapse(session, filecfg, &treeconf));
-	WT_ERR(__wt_schema_table_insert(session, fileuri, treeconf));
+		if ((ret = __wt_metadata_insert(session, uri, treeconf)) != 0) {
+			if (ret == WT_DUPLICATE_KEY)
+				ret = EEXIST;
+			goto err;
+		}
+	}
 
 	/*
-	 * Call the underlying connection function to allocate a WT_BTREE handle
-	 * and open the underlying file (note we no longer own the configuration
-	 * string after that call).
+	 * Open the file to check that it was setup correctly.
+	 *
+	 * Keep the handle exclusive until it is released at the end of the
+	 * call, otherwise we could race with a drop.
 	 */
-	ret = __wt_conn_btree_open(session, name, filename, treeconf, cfg, 0);
-	treeconf = NULL;
-	WT_ERR(ret);
-	WT_ERR(__wt_session_add_btree(session, NULL));
+	ret = __wt_conn_btree_get(session, uri, NULL, cfg, WT_BTREE_EXCLUSIVE);
+
+err:	if (!WT_META_TRACKING(session))
+		WT_TRET(__wt_session_release_btree(session));
 
-	/* If something goes wrong, throw away anything we created. */
-err:	__wt_scr_free(&val);
+	__wt_scr_free(&val);
 	__wt_free(session, treeconf);
+
 	return (ret);
 }
 
@@ -92,6 +90,7 @@ __create_colgroup(WT_SESSION_IMPL *session,
     const char *name, int exclusive, const char *config)
 {
 	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
 	WT_ITEM fmt, namebuf, uribuf;
 	WT_TABLE *table;
 	const char *cfg[] = { __wt_confdfl_colgroup_meta, config, NULL, NULL };
@@ -100,7 +99,6 @@ __create_colgroup(WT_SESSION_IMPL *session,
 	const char *cgconf, *cgname, *fileconf, *filename, *fileuri;
 	const char *oldconf, *tablename;
 	size_t tlen;
-	int ret;
 
 	cgconf = fileconf = oldconf = NULL;
 	WT_CLEAR(fmt);
@@ -172,20 +170,19 @@ __create_colgroup(WT_SESSION_IMPL *session,
 	WT_ERR(__wt_buf_fmt(session, &uribuf, "file:%s", filename));
 	fileuri = uribuf.data;
 
-	if ((ret = __wt_schema_table_insert(session, name, cgconf)) != 0) {
+	WT_ERR(__create_file(session, fileuri, exclusive, fileconf));
+	if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
 		/*
-		 * If the entry already exists in the schema table, we're done.
+		 * If the entry already exists in the metadata, we're done.
 		 * This is an error for exclusive creates but okay otherwise.
 		 */
 		if (ret == WT_DUPLICATE_KEY)
 			ret = exclusive ? EEXIST : 0;
 		goto err;
 	}
-	WT_ERR(__wt_create_file(session, name, fileuri, exclusive, fileconf));
-
 	WT_ERR(__wt_schema_open_colgroups(session, table));
 
-err:    __wt_free(session, cgconf);
+err:	__wt_free(session, cgconf);
 	__wt_free(session, fileconf);
 	__wt_free(session, oldconf);
 	__wt_buf_free(session, &fmt);
@@ -200,6 +197,7 @@ __create_index(WT_SESSION_IMPL *session,
 {
 	WT_CONFIG pkcols;
 	WT_CONFIG_ITEM ckey, cval, icols;
+	WT_DECL_RET;
 	WT_ITEM extra_cols, fmt, namebuf, uribuf;
 	WT_TABLE *table;
 	const char *cfg[] = { __wt_confdfl_index_meta, config, NULL, NULL };
@@ -207,7 +205,7 @@ __create_index(WT_SESSION_IMPL *session,
 	const char *fileconf, *filename, *fileuri, *idxconf, *idxname;
 	const char *tablename;
 	size_t tlen;
-	int i, ret;
+	int i;
 
 	idxconf = fileconf = NULL;
 	WT_CLEAR(fmt);
@@ -281,16 +279,16 @@ __create_index(WT_SESSION_IMPL *session,
 	WT_ERR(__wt_buf_fmt(session, &uribuf, "file:%s", filename));
 	fileuri = uribuf.data;
 
-	if ((ret = __wt_schema_table_insert(session, name, idxconf)) != 0) {
+	WT_ERR(__create_file(session, fileuri, exclusive, fileconf));
+	if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
 		/*
-		 * If the entry already exists in the schema table, we're done.
+		 * If the entry already exists in the metadata, we're done.
 		 * This is an error for exclusive creates but okay otherwise.
 		 */
 		if (ret == WT_DUPLICATE_KEY)
 			ret = exclusive ? EEXIST : 0;
 		goto err;
 	}
-	WT_ERR(__wt_create_file(session, name, fileuri, exclusive, fileconf));
 
 err:	__wt_free(session, fileconf);
 	__wt_free(session, idxconf);
@@ -307,12 +305,13 @@ __create_table(WT_SESSION_IMPL *session,
 {
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM cgkey, cgval, cval;
+	WT_DECL_RET;
 	WT_TABLE *table;
 	const char *cfg[] = { __wt_confdfl_table_meta, config, NULL, NULL };
 	const char *tableconf, *tablename;
 	char *cgname;
 	size_t cgsize;
-	int ncolgroups, ret;
+	int ncolgroups;
 
 	cgname = NULL;
 	table = NULL;
@@ -341,7 +340,15 @@ __create_table(WT_SESSION_IMPL *session,
 		return (ret);
 
 	WT_RET(__wt_config_collapse(session, cfg, &tableconf));
-	WT_ERR(__wt_schema_table_insert(session, name, tableconf));
+	if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
+		/*
+		 * If the entry already exists in the metadata, we're done.
+		 * This is an error for exclusive creates but okay otherwise.
+		 */
+		if (ret == WT_DUPLICATE_KEY)
+			ret = exclusive ? EEXIST : 0;
+		goto err;
+	}
 
 	/* Attempt to open the table now to catch any errors. */
 	WT_ERR(__wt_schema_get_table(
@@ -368,33 +375,33 @@ __wt_schema_create(
     WT_SESSION_IMPL *session, const char *name, const char *config)
 {
 	WT_CONFIG_ITEM cval;
-	int exclusive, ret;
-
-	/* Disallow objects in the WiredTiger name space. */
-	WT_RET(__wt_schema_name_check(session, name));
+	WT_DATA_SOURCE *dsrc;
+	WT_DECL_RET;
+	int exclusive;
 
 	exclusive = (
 	    __wt_config_getones(session, config, "exclusive", &cval) == 0 &&
 	    cval.val != 0);
 
 	/*
-	 * We track rename operations, if we fail in the middle, we want to
-	 * back it all out.
+	 * We track create operations: if we fail in the middle of creating a
+	 * complex object, we want to back it all out.
 	 */
-	WT_RET(__wt_schema_table_track_on(session));
+	WT_RET(__wt_meta_track_on(session));
 
 	if (WT_PREFIX_MATCH(name, "colgroup:"))
 		ret = __create_colgroup(session, name, exclusive, config);
 	else if (WT_PREFIX_MATCH(name, "file:"))
-		ret = __wt_create_file(session, name, name, exclusive, config);
+		ret = __create_file(session, name, exclusive, config);
 	else if (WT_PREFIX_MATCH(name, "index:"))
 		ret = __create_index(session, name, exclusive, config);
 	else if (WT_PREFIX_MATCH(name, "table:"))
 		ret = __create_table(session, name, exclusive, config);
-	else
-		ret = __wt_unknown_object_type(session, name);
+	else if ((ret = __wt_schema_get_source(session, name, &dsrc)) == 0)
+		ret = dsrc->create(dsrc, &session->iface, name, config);
 
-	WT_TRET(__wt_schema_table_track_off(session, ret != 0));
+	session->btree = NULL;
+	WT_TRET(__wt_meta_track_off(session, ret != 0));
 
 	return (ret);
 }
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index aba31969d2d..8d1386e0369 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -12,29 +12,43 @@
  *	Drop a file.
  */
 static int
-__drop_file(WT_SESSION_IMPL *session, const char *uri, int force)
+__drop_file(
+    WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
 {
-	int exist, ret;
+	WT_DECL_RET;
+	int exist;
 	const char *filename;
 
-	ret = 0;
 	filename = uri;
 	if (!WT_PREFIX_SKIP(filename, "file:"))
 		return (EINVAL);
 
-	/* If open, close the btree handle. */
-	WT_RET(__wt_session_close_any_open_btree(session, filename));
+	if (session->btree == NULL &&
+	    (ret = __wt_session_get_btree(session, uri, cfg,
+	    WT_BTREE_EXCLUSIVE | WT_BTREE_LOCK_ONLY)) != 0) {
+		if (ret == WT_NOTFOUND || ret == ENOENT)
+			ret = 0;
+		return (ret);
+	}
+
+	/* Close all btree handles associated with this file. */
+	WT_RET(__wt_conn_btree_close_all(session, uri));
 
-	/* Remove the schema table entry (ignore missing items). */
-	WT_TRET(__wt_schema_table_remove(session, uri));
+	/* Remove the metadata entry (ignore missing items). */
+	WT_TRET(__wt_metadata_remove(session, uri));
 	if (force && ret == WT_NOTFOUND)
 		ret = 0;
 
 	/* Remove the underlying physical file. */
 	exist = 0;
 	WT_TRET(__wt_exist(session, filename, &exist));
-	if (exist)
+	if (exist) {
+		/*
+		 * There is no point tracking this operation: there is no going
+		 * back from here.
+		 */
 		WT_TRET(__wt_remove(session, filename));
+	}
 
 	return (ret);
 }
@@ -44,26 +58,30 @@ __drop_file(WT_SESSION_IMPL *session, const char *uri, int force)
  *	Drop an index or colgroup reference.
  */
 static int
-__drop_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, int force)
+__drop_tree(
+    WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
 {
+	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_ITEM *buf;
-	int ret;
 
-	ret = 0;
+	btree = session->btree;
+	buf = NULL;
 
-	/* Remove the schema table entry (ignore missing items). */
-	WT_TRET(__wt_schema_table_remove(session, btree->name));
+	/* Remove the metadata entry (ignore missing items). */
+	WT_TRET(__wt_metadata_remove(session, uri));
 	if (force && ret == WT_NOTFOUND)
 		ret = 0;
 
 	/*
 	 * Drop the file.
 	 * __drop_file closes the WT_BTREE handle, so we copy the
-	 * WT_BTREE->filename field to make a URI.
+	 * WT_BTREE->name field to save the URI.
 	 */
 	WT_ERR(__wt_scr_alloc(session, 0, &buf));
-	WT_ERR(__wt_buf_fmt(session, buf, "file:%s", btree->filename));
-	WT_ERR(__drop_file(session, buf->data, force));
+	WT_ERR(__wt_buf_set(
+	    session, buf, btree->name, strlen(btree->name) + 1));
+	WT_ERR(__drop_file(session, buf->data, force, cfg));
 
 err:	__wt_scr_free(&buf);
 
@@ -78,11 +96,10 @@ static int
 __drop_colgroup(
     WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
 {
-	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_TABLE *table;
 	const char *cgname, *tablename;
 	size_t tlen;
-	int i, ret;
 
 	tablename = uri;
 	if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
@@ -95,35 +112,24 @@ __drop_colgroup(
 		tlen = strlen(tablename);
 
 	/*
-	 * Try to get the btree handle.  Ideally, we would use an exclusive
-	 * lock here to prevent access to the table while we are dropping it,
-	 * but conflicts with the exclusive lock taken by
-	 * __wt_session_close_any_open_btree.  If two threads race dropping
-	 * the same object, it will be caught there.
-	 *
-	 * If we can't get a tree, try to remove it from the schema table.
+	 * Try to get the btree handle.  It will be unlocked by
+	 * __wt_conn_btree_close_all.
 	 */
-	if ((ret = __wt_schema_get_btree(
-	    session, uri, strlen(uri), cfg, WT_BTREE_NO_LOCK)) != 0) {
-		(void)__wt_schema_table_remove(session, uri);
+	if ((ret = __wt_schema_get_btree(session, uri, strlen(uri), cfg,
+	    WT_BTREE_EXCLUSIVE | WT_BTREE_LOCK_ONLY)) != 0) {
+		if (ret == WT_NOTFOUND || ret == ENOENT)
+			ret = 0;
 		return (ret);
 	}
-	btree = session->btree;
 
 	/* If we can get the table, detach the colgroup from it. */
-	if ((ret = __wt_schema_get_table(
-	    session, tablename, tlen, &table)) == 0) {
-		for (i = 0; i < WT_COLGROUPS(table); i++) {
-			if (table->colgroup[i] == btree) {
-				table->colgroup[i] = NULL;
-				table->cg_complete = 0;
-				break;
-			}
-		}
-	} else if (ret != WT_NOTFOUND)
-		WT_TRET(ret);
-
-	WT_TRET(__drop_tree(session, btree, force));
+	if ((ret =
+	    __wt_schema_get_table(session, tablename, tlen, &table)) == 0)
+		table->cg_complete = 0;
+	else if (ret == WT_NOTFOUND)
+		ret = 0;
+
+	WT_TRET(__drop_tree(session, uri, force, cfg));
 
 	return (ret);
 }
@@ -136,11 +142,10 @@ static int
 __drop_index(
     WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
 {
-	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_TABLE *table;
 	const char *idxname, *tablename;
 	size_t tlen;
-	int i, ret;
 
 	tablename = uri;
 	if (!WT_PREFIX_SKIP(tablename, "index:") ||
@@ -150,35 +155,24 @@ __drop_index(
 	++idxname;
 
 	/*
-	 * Try to get the btree handle.  Ideally, we would use an exclusive
-	 * lock here to prevent access to the table while we are dropping it,
-	 * but conflicts with the exclusive lock taken by
-	 * __wt_session_close_any_open_btree.  If two threads race dropping
-	 * the same object, it will be caught there.
-	 *
-	 * If we can't get a tree, try to remove it from the schema table.
+	 * Try to get the btree handle.  It will be unlocked by
+	 * __wt_conn_btree_close_all.
 	 */
-	if ((ret = __wt_schema_get_btree(
-	    session, uri, strlen(uri), cfg, WT_BTREE_NO_LOCK)) != 0) {
-		(void)__wt_schema_table_remove(session, uri);
+	if ((ret = __wt_schema_get_btree(session, uri, strlen(uri), cfg,
+	    WT_BTREE_EXCLUSIVE | WT_BTREE_LOCK_ONLY)) != 0) {
+		if (ret == WT_NOTFOUND || ret == ENOENT)
+			ret = 0;
 		return (ret);
 	}
-	btree = session->btree;
 
 	/* If we can get the table, detach the index from it. */
 	if ((ret = __wt_schema_get_table(
-	    session, tablename, tlen, &table)) == 0 &&
-	    (ret = __wt_schema_open_index(
-	    session, table, idxname, strlen(idxname))) == 0) {
-		for (i = 0; i < table->nindices; i++)
-			if (table->index[i] == btree) {
-				table->index[i] = NULL;
-				table->idx_complete = 0;
-			}
-	} else if (ret != WT_NOTFOUND)
-		WT_TRET(ret);
-
-	WT_TRET(__drop_tree(session, btree, force));
+	    session, tablename, tlen, &table)) == 0)
+		table->idx_complete = 0;
+	else if (ret == WT_NOTFOUND)
+		ret = 0;
+
+	WT_TRET(__drop_tree(session, uri, force, cfg));
 
 	return (ret);
 }
@@ -188,15 +182,14 @@ __drop_index(
  *	WT_SESSION::drop for a table.
  */
 static int
-__drop_table(WT_SESSION_IMPL *session, const char *uri, int force)
+__drop_table(
+    WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
 {
-	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_TABLE *table;
-	int i, ret;
+	int i;
 	const char *name;
 
-	ret = 0;
-
 	name = uri;
 	(void)WT_PREFIX_SKIP(name, "table:");
 
@@ -204,24 +197,24 @@ __drop_table(WT_SESSION_IMPL *session, const char *uri, int force)
 
 	/* Drop the column groups. */
 	for (i = 0; i < WT_COLGROUPS(table); i++) {
-		if ((btree = table->colgroup[i]) == NULL)
+		if (table->cg_name[i] == NULL)
 			continue;
-		table->colgroup[i] = NULL;
-		WT_TRET(__drop_tree(session, btree, force));
+		WT_ERR(__drop_colgroup(
+		    session, table->cg_name[i], force, cfg));
 	}
 
 	/* Drop the indices. */
-	WT_TRET(__wt_schema_open_index(session, table, NULL, 0));
+	WT_ERR(__wt_schema_open_index(session, table, NULL, 0));
 	for (i = 0; i < table->nindices; i++) {
-		btree = table->index[i];
-		table->index[i] = NULL;
-		WT_TRET(__drop_tree(session, btree, force));
+		if (table->idx_name[i] == NULL)
+			continue;
+		WT_TRET(__drop_index(session, table->idx_name[i], force, cfg));
 	}
 
-	WT_TRET(__wt_schema_remove_table(session, table));
+	WT_ERR(__wt_schema_remove_table(session, table));
 
-	/* Remove the schema table entry (ignore missing items). */
-	WT_TRET(__wt_schema_table_remove(session, uri));
+	/* Remove the metadata entry (ignore missing items). */
+	WT_ERR(__wt_metadata_remove(session, uri));
 
 err:	if (force && ret == WT_NOTFOUND)
 		ret = 0;
@@ -232,7 +225,9 @@ int
 __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
 {
 	WT_CONFIG_ITEM cval;
-	int force, ret;
+	WT_DATA_SOURCE *dsrc;
+	WT_DECL_RET;
+	int force;
 
 	cval.val = 0;
 	ret = __wt_config_gets(session, cfg, "force", &cval);
@@ -246,17 +241,17 @@ __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
 	if (WT_PREFIX_MATCH(uri, "colgroup:"))
 		ret = __drop_colgroup(session, uri, force, cfg);
 	else if (WT_PREFIX_MATCH(uri, "file:"))
-		ret = __drop_file(session, uri, force);
+		ret = __drop_file(session, uri, force, cfg);
 	else if (WT_PREFIX_MATCH(uri, "index:"))
 		ret = __drop_index(session, uri, force, cfg);
 	else if (WT_PREFIX_MATCH(uri, "table:"))
-		ret = __drop_table(session, uri, force);
-	else
-		return (__wt_unknown_object_type(session, uri));
+		ret = __drop_table(session, uri, force, cfg);
+	else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
+		ret = dsrc->drop(dsrc, &session->iface, uri, cfg[1]);
 
 	/*
 	 * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the
-	 * assumption WT_NOTFOUND means there was no schema file entry.  The
+	 * assumption WT_NOTFOUND means there was no metadata entry.  The
 	 * underlying drop functions should handle this case (we passed them
 	 * the "force" value), but better safe than sorry.
 	 */
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index acc38880a19..f8128f8e3e3 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -52,7 +52,7 @@ int
 __wt_schema_get_table(WT_SESSION_IMPL *session,
     const char *name, size_t namelen, WT_TABLE **tablep)
 {
-	int ret;
+	WT_DECL_RET;
 
 	ret = __wt_schema_find_table(session, name, namelen, tablep);
 
@@ -71,13 +71,23 @@ __wt_schema_get_table(WT_SESSION_IMPL *session,
 void
 __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
 {
+	int i;
+
 	__wt_free(session, table->name);
 	__wt_free(session, table->config);
 	__wt_free(session, table->plan);
 	__wt_free(session, table->key_format);
 	__wt_free(session, table->value_format);
-	__wt_free(session, table->colgroup);
-	__wt_free(session, table->index);
+	if (table->cg_name != NULL) {
+		for (i = 0; i < WT_COLGROUPS(table); i++)
+			__wt_free(session, table->cg_name[i]);
+		__wt_free(session, table->cg_name);
+	}
+	if (table->idx_name != NULL) {
+		for (i = 0; i < table->nindices; i++)
+			__wt_free(session, table->idx_name[i]);
+		__wt_free(session, table->idx_name);
+	}
 	__wt_free(session, table);
 }
 
@@ -102,44 +112,11 @@ __wt_schema_remove_table(
 int
 __wt_schema_close_tables(WT_SESSION_IMPL *session)
 {
+	WT_DECL_RET;
 	WT_TABLE *table;
-	int ret;
 
-	ret = 0;
 	while ((table = TAILQ_FIRST(&session->tables)) != NULL)
 		WT_TRET(__wt_schema_remove_table(session, table));
 
 	return (ret);
 }
-
-/*
- * __wt_schema_detach tree --
- *	Remove any references to a tree from a table in the session.
- *
- * Note: this function should be called with an exclusive lock on the btree
- * handle to prevent races.
- */
-void
-__wt_schema_detach_tree(WT_SESSION_IMPL *session, WT_BTREE *btree)
-{
-	WT_TABLE *table;
-	int i;
-
-	TAILQ_FOREACH(table, &session->tables, q) {
-		/* Check the column groups. */
-		for (i = 0; i < WT_COLGROUPS(table); i++)
-			if (table->colgroup[i] == btree) {
-				table->colgroup[i] = NULL;
-				table->cg_complete = 0;
-				return;
-			}
-
-		/* Check the indices. */
-		for (i = 0; i < table->nindices; i++)
-			if (table->index[i] == btree) {
-				table->index[i] = NULL;
-				table->idx_complete = 0;
-				return;
-			}
-	}
-}
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index fd8f43b46ca..9875d9b649a 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -9,36 +9,23 @@
 
 /*
  * __wt_schema_colgroup_name --
- *	Get the URI for a column group.  This is used for schema table lookups.
+ *	Get the URI for a column group.  This is used for metadata lookups.
  *	The only complexity here is that simple tables (with a single column
  *	group) use a simpler naming scheme.
  */
 int
 __wt_schema_colgroup_name(WT_SESSION_IMPL *session,
-    WT_TABLE *table, const char *cgname, size_t len, char **namebufp)
+    WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *namebuf)
 {
 	const char *tablename;
-	char *namebuf;
-	size_t namesize;
 
-	namebuf = *namebufp;
 	tablename = table->name;
 	(void)WT_PREFIX_SKIP(tablename, "table:");
 
-	/* The primary filename is in the table config. */
-	if (table->ncolgroups == 0) {
-		namesize = strlen("colgroup:") + strlen(tablename) + 1;
-		WT_RET(__wt_realloc(session, NULL, namesize, &namebuf));
-		snprintf(namebuf, namesize, "colgroup:%s", tablename);
-	} else {
-		namesize = strlen("colgroup::") + strlen(tablename) + len + 1;
-		WT_RET(__wt_realloc(session, NULL, namesize, &namebuf));
-		snprintf(namebuf, namesize, "colgroup:%s:%.*s",
-		    tablename, (int)len, cgname);
-	}
-
-	*namebufp = namebuf;
-	return (0);
+	return ((table->ncolgroups == 0) ?
+	    __wt_buf_fmt(session, namebuf, "colgroup:%s", tablename) :
+	    __wt_buf_fmt(session, namebuf, "colgroup:%s:%.*s",
+	    tablename, (int)len, cgname));
 }
 
 /*
@@ -50,29 +37,30 @@ int
 __wt_schema_get_btree(WT_SESSION_IMPL *session,
     const char *objname, size_t len, const char *cfg[], uint32_t flags)
 {
-	WT_ITEM uribuf;
 	WT_CONFIG_ITEM cval;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
+	WT_ITEM *uribuf;
 	const char *fileuri, *name, *objconf;
-	int ret;
 
 	cursor = NULL;
-	WT_CLEAR(uribuf);
+	uribuf = NULL;
 
 	name = objname;
 	if (len != strlen(objname))
 		WT_ERR(__wt_strndup(session, objname, len, &name));
 
-	WT_ERR(__wt_schema_table_cursor(session, NULL, &cursor));
+	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
 	cursor->set_key(cursor, name);
 	WT_ERR(cursor->search(cursor));
 	WT_ERR(cursor->get_value(cursor, &objconf));
 
-	/* Get the filename from the schema table. */
+	/* Get the filename from the metadata. */
+	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
 	WT_ERR(__wt_config_getones(session, objconf, "filename", &cval));
 	WT_ERR(__wt_buf_fmt(
-	    session, &uribuf, "file:%.*s", (int)cval.len, cval.str));
-	fileuri = uribuf.data;
+	    session, uribuf, "file:%.*s", (int)cval.len, cval.str));
+	fileuri = uribuf->data;
 
 	/* !!! Close the schema cursor first, this overwrites session->btree. */
 	ret = cursor->close(cursor);
@@ -80,12 +68,12 @@ __wt_schema_get_btree(WT_SESSION_IMPL *session,
 	if (ret != 0)
 		goto err;
 
-	ret = __wt_session_get_btree(session, name, fileuri, NULL, cfg, flags);
+	ret = __wt_session_get_btree(session, fileuri, cfg, flags);
 	if (ret == ENOENT)
 		__wt_errx(session,
 		    "%s created but '%s' is missing", objname, fileuri);
 
-err:	__wt_buf_free(session, &uribuf);
+err:	__wt_scr_free(&uribuf);
 	if (name != objname)
 		__wt_free(session, name);
 	if (cursor != NULL)
@@ -102,17 +90,16 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
 {
 	WT_CONFIG cparser;
 	WT_CONFIG_ITEM ckey, cval;
-	WT_ITEM plan;
-	char *cgname;
-	const char *fileconf;
-	int i, ret;
+	WT_DECL_RET;
+	WT_ITEM namebuf, plan;
+	const char *cgname, *fileconf;
+	int i;
 
 	if (table->cg_complete)
 		return (0);
 
+	WT_CLEAR(namebuf);
 	fileconf = NULL;
-	cgname = NULL;
-	ret = 0;
 
 	WT_RET(__wt_config_subinit(session, &cparser, &table->cgconf));
 
@@ -122,20 +109,22 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
 			WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
 		else
 			WT_CLEAR(ckey);
-		if (table->colgroup[i] != NULL)
-			continue;
 
-		WT_ERR(__wt_schema_colgroup_name(session, table,
-		    ckey.str, ckey.len, &cgname));
+		if ((cgname = table->cg_name[i]) == NULL) {
+			WT_ERR(__wt_schema_colgroup_name(session, table,
+			    ckey.str, ckey.len, &namebuf));
+			cgname = table->cg_name[i] =
+			    __wt_buf_steal(session, &namebuf, NULL);
+		}
 		ret = __wt_schema_get_btree(session,
-		    cgname, strlen(cgname), NULL, WT_BTREE_NO_LOCK);
+		    cgname, strlen(cgname), NULL, 0);
 		if (ret != 0) {
 			/* It is okay if the table is not yet complete. */
 			if (ret == WT_NOTFOUND)
 				ret = 0;
 			goto err;
 		}
-		table->colgroup[i] = session->btree;
+		WT_ERR(__wt_session_release_btree(session));
 	}
 
 	if (!table->is_simple) {
@@ -149,7 +138,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
 
 	table->cg_complete = 1;
 
-err:	__wt_free(session, cgname);
+err:	__wt_buf_free(session, &namebuf);
 	__wt_free(session, fileconf);
 	return (ret);
 }
@@ -160,17 +149,18 @@ err:	__wt_free(session, cgname);
  */
 static int
 __open_index(WT_SESSION_IMPL *session, WT_TABLE *table,
-    const char *uri, const char *idxconf, WT_BTREE **btreep)
+    const char *uri, const char *idxconf)
 {
 	WT_BTREE *btree;
 	WT_CONFIG colconf;
 	WT_CONFIG_ITEM ckey, cval, icols;
+	WT_DECL_RET;
 	WT_ITEM cols, fmt, plan, uribuf;
 	const char *fileuri;
 	u_int cursor_key_cols;
-	int i, ret;
+	int i;
 
-	ret = 0;
+	btree = NULL;
 	WT_CLEAR(uribuf);
 
 	/* Get the filename from the index config. */
@@ -179,8 +169,9 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table,
 	    session, &uribuf, "file:%.*s", (int)cval.len, cval.str));
 	fileuri = uribuf.data;
 
-	ret = __wt_session_get_btree(session, uri, fileuri,
-	    NULL, NULL, WT_BTREE_NO_LOCK);
+	ret = __wt_session_get_btree(
+	    session, fileuri, NULL, WT_BTREE_EXCLUSIVE);
+	btree = session->btree;
 	if (ret == ENOENT)
 		__wt_errx(session,
 		    "Index '%s' created but '%s' is missing", uri, fileuri);
@@ -188,8 +179,6 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table,
 	if (ret != 0)
 		goto err;
 
-	btree = session->btree;
-
 	/*
 	 * The key format for an index is somewhat subtle: the application
 	 * specifies a set of columns that it will use for the key, but the
@@ -251,10 +240,12 @@ __open_index(WT_SESSION_IMPL *session, WT_TABLE *table,
 	    table, table->colconf.str, table->colconf.len, 1, &plan));
 	btree->value_plan = __wt_buf_steal(session, &plan, NULL);
 
-	*btreep = btree;
-
 err:	__wt_buf_free(session, &cols);
 	__wt_buf_free(session, &uribuf);
+	if (btree != NULL) {
+		session->btree = btree;
+		WT_TRET(__wt_session_release_btree(session));
+	}
 
 	return (ret);
 }
@@ -268,8 +259,9 @@ __wt_schema_open_index(
     WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len)
 {
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	const char *idxconf, *name, *tablename, *uri;
-	int i, match, ret, skipped;
+	int i, match, skipped;
 
 	cursor = NULL;
 	skipped = 0;
@@ -281,10 +273,11 @@ __wt_schema_open_index(
 		return (0);
 
 	/*
-	 * XXX Do a full scan through the schema table to find all matching
-	 * indices.  This scan be optimized when we have cursor search + next.
+	 * XXX
+	 * Do a full scan through the metadata to find all matching indices.
+	 * This scan can be optimized with search + next.
 	 */
-	WT_RET(__wt_schema_table_cursor(session, NULL, &cursor));
+	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
 
 	/* Open each index. */
 	for (i = 0; (ret = cursor->next(cursor)) == 0;) {
@@ -299,25 +292,23 @@ __wt_schema_open_index(
 		match = (len > 0 &&
 		   strncmp(name, idxname, len) == 0 && name[len] == '\0');
 
-		if (i * sizeof(WT_BTREE *) >= table->index_alloc)
-			WT_ERR(__wt_realloc(session, &table->index_alloc,
-			    WT_MAX(10 * sizeof(WT_BTREE *),
-			    2 * table->index_alloc),
-			    &table->index));
-
-		if (table->index[i] == NULL) {
-			if (len == 0 || match) {
-				WT_ERR(cursor->get_value(cursor, &idxconf));
-				WT_ERR(__open_index(session,
-				    table, uri, idxconf, &table->index[i]));
-			} else
-				skipped = 1;
-		}
+		if ((size_t)i * sizeof(const char *) >= table->idx_name_alloc)
+			WT_ERR(__wt_realloc(session, &table->idx_name_alloc,
+			    WT_MAX(10 * sizeof(const char *),
+			    2 * table->idx_name_alloc), &table->idx_name));
+
+		if (table->idx_name[i] == NULL)
+			WT_ERR(__wt_strdup(session, uri, &table->idx_name[i]));
+
+		if (len == 0 || match) {
+			WT_ERR(cursor->get_value(cursor, &idxconf));
+			WT_ERR(__open_index(session, table, uri, idxconf));
+		} else
+			skipped = 1;
 
 		if (match) {
 			ret = cursor->close(cursor);
 			cursor = NULL;
-			session->btree = table->index[i];
 			break;
 		}
 		i++;
@@ -348,11 +339,11 @@ __wt_schema_open_table(WT_SESSION_IMPL *session,
 	WT_CONFIG cparser;
 	WT_CONFIG_ITEM ckey, cval;
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_ITEM buf;
 	WT_TABLE *table;
 	const char *tconfig;
 	char *tablename;
-	int ret;
 
 	cursor = NULL;
 	table = NULL;
@@ -361,7 +352,7 @@ __wt_schema_open_table(WT_SESSION_IMPL *session,
 	WT_RET(__wt_buf_fmt(session, &buf, "table:%.*s", (int)namelen, name));
 	tablename = __wt_buf_steal(session, &buf, NULL);
 
-	WT_ERR(__wt_schema_table_cursor(session, NULL, &cursor));
+	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
 	cursor->set_key(cursor, tablename);
 	WT_ERR(cursor->search(cursor));
 	WT_ERR(cursor->get_value(cursor, &tconfig));
@@ -410,9 +401,8 @@ __wt_schema_open_table(WT_SESSION_IMPL *session,
 	if (ret != WT_NOTFOUND)
 		goto err;
 
-	WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->colgroup));
+	WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cg_name));
 	WT_ERR(__wt_schema_open_colgroups(session, table));
-
 	*tablep = table;
 
 	if (0) {
diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c
index 0c1ef1d01c9..92876b7994a 100644
--- a/src/schema/schema_plan.c
+++ b/src/schema/schema_plan.c
@@ -14,14 +14,18 @@ __find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
 	WT_BTREE *cgtree;
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM cval, k, v;
+	WT_DECL_RET;
 	int cg, col, foundcg, foundcol, getnext;
 
 	foundcg = foundcol = -1;
 
 	getnext = 1;
-	for (cg = 0; cg < WT_COLGROUPS(table); cg++) {
-		if ((cgtree = table->colgroup[cg]) == NULL)
-			continue;
+	for (cgtree = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
+		WT_RET(__wt_schema_get_btree(session,
+		    table->cg_name[cg], strlen(table->cg_name[cg]),
+		    NULL, 0));
+		cgtree = session->btree;
+
 		/*
 		 * If there is only one column group, we just scan through all
 		 * of the columns.  For tables with multiple column groups, we
@@ -32,11 +36,11 @@ __find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
 			cval = table->colconf;
 			col = 0;
 		} else {
-cgcols:			WT_RET(__wt_config_getones(session,
+cgcols:			WT_ERR(__wt_config_getones(session,
 			    cgtree->config, "columns", &cval));
 			col = table->nkey_columns;
 		}
-		WT_RET(__wt_config_subinit(session, &conf, &cval));
+		WT_ERR(__wt_config_subinit(session, &conf, &cval));
 		for (; __wt_config_next(&conf, &k, &v) == 0; col++) {
 			if (cg == *cgnump && col == *colnump)
 				getnext = 1;
@@ -50,8 +54,15 @@ cgcols:			WT_RET(__wt_config_getones(session,
 			    col == table->nkey_columns - 1)
 				goto cgcols;
 		}
+
+		cgtree = NULL;
+		WT_ERR(__wt_session_release_btree(session));
 	}
 
+err:	if (cgtree != NULL)
+		WT_TRET(__wt_session_release_btree(session));
+	WT_RET(ret);
+
 	if (foundcg == -1)
 		return (WT_NOTFOUND);
 
@@ -77,9 +88,10 @@ __wt_schema_colcheck(WT_SESSION_IMPL *session,
 {
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM k, v;
+	WT_DECL_RET;
 	WT_PACK pack;
 	WT_PACK_VALUE pv;
-	int kcols, ncols, ret, vcols;
+	int kcols, ncols, vcols;
 
 	WT_RET(__pack_init(session, &pack, key_format));
 	for (kcols = 0; (ret = __pack_next(&pack, &pv)) == 0; kcols++)
@@ -118,7 +130,8 @@ __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table)
 {
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM k, v;
-	int cg, col, i, ret;
+	WT_DECL_RET;
+	int cg, col, i;
 	char coltype;
 
 	if (table->is_simple)
@@ -161,20 +174,23 @@ int
 __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
     const char *columns, size_t len, int value_only, WT_ITEM *plan)
 {
+	WT_BTREE *saved_btree;
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM k, v;
+	WT_DECL_RET;
 	int cg, col, current_cg, current_col, start_cg, start_col;
 	int i, have_it;
 	char coltype, current_coltype;
 
+	saved_btree = session->btree;
 	start_cg = start_col = -1;      /* -Wuninitialized */
 
 	/* Work through the value columns by skipping over the key columns. */
-	WT_RET(__wt_config_initn(session, &conf, columns, len));
+	WT_ERR(__wt_config_initn(session, &conf, columns, len));
 
 	if (value_only)
 		for (i = 0; i < table->nkey_columns; i++)
-			WT_RET(__wt_config_next(&conf, &k, &v));
+			WT_ERR(__wt_config_next(&conf, &k, &v));
 
 	current_cg = cg = 0;
 	current_col = col = INT_MAX;
@@ -196,7 +212,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
 			    current_coltype != coltype) {
 				WT_ASSERT(session, !value_only ||
 				    coltype == WT_PROJ_VALUE);
-				WT_RET(__wt_buf_catfmt(
+				WT_ERR(__wt_buf_catfmt(
 				    session, plan, "%d%c", cg, coltype));
 
 				/*
@@ -210,9 +226,9 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
 			/* Now move to the column we want. */
 			if (current_col < col) {
 				if (col - current_col > 1)
-					WT_RET(__wt_buf_catfmt(session,
+					WT_ERR(__wt_buf_catfmt(session,
 					    plan, "%d", col - current_col));
-				WT_RET(__wt_buf_catfmt(session,
+				WT_ERR(__wt_buf_catfmt(session,
 				    plan, "%c", WT_PROJ_SKIP));
 			}
 			/*
@@ -222,20 +238,21 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
 			 * a "reuse" operation to avoid making another copy.
 			 */
 			if (!have_it) {
-				WT_RET(__wt_buf_catfmt(session,
+				WT_ERR(__wt_buf_catfmt(session,
 				    plan, "%c", WT_PROJ_NEXT));
 
 				start_cg = cg;
 				start_col = col;
 				have_it = 1;
 			} else
-				WT_RET(__wt_buf_catfmt(session,
+				WT_ERR(__wt_buf_catfmt(session,
 				    plan, "%c", WT_PROJ_REUSE));
 			current_col = col + 1;
 		}
 	}
 
-	return (0);
+err:	session->btree = saved_btree;
+	return (ret);
 }
 
 static int
@@ -244,8 +261,9 @@ __find_column_format(WT_SESSION_IMPL *session,
 {
 	WT_CONFIG conf;
 	WT_CONFIG_ITEM k, v;
+	WT_DECL_RET;
 	WT_PACK pack;
-	int inkey, ret;
+	int inkey;
 
 	WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
 	WT_RET(__pack_init(session, &pack, table->key_format));
@@ -285,8 +303,9 @@ __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
 {
 	WT_CONFIG config;
 	WT_CONFIG_ITEM k, next_k, next_v;
+	WT_DECL_RET;
 	WT_PACK_VALUE pv;
-	int have_next, ret;
+	int have_next;
 
 	WT_CLEAR(pv);		/* -Wuninitialized */
 
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index 1fef65ed416..ffa27043330 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -15,11 +15,11 @@ static int
 __rename_file(
     WT_SESSION_IMPL *session, const char *uri, const char *newuri)
 {
-	int exist, ret;
+	WT_DECL_RET;
+	int exist;
 	const char *filename, *newfile, *value;
 
 	value = NULL;
-	ret = 0;
 
 	filename = uri;
 	newfile = newuri;
@@ -27,14 +27,14 @@ __rename_file(
 	    !WT_PREFIX_SKIP(newfile, "file:"))
 		return (EINVAL);
 
-	/* If open, close the btree handle. */
-	WT_RET(__wt_session_close_any_open_btree(session, filename));
+	/* Close any btree handles in the file. */
+	WT_RET(__wt_conn_btree_close_all(session, uri));
 
 	/*
 	 * Check to see if the proposed name is already in use, in either
-	 * the schema table or the filesystem.
+	 * the metadata or the filesystem.
 	 */
-	switch (ret = __wt_schema_table_read(session, newuri, &value)) {
+	switch (ret = __wt_metadata_read(session, newuri, &value)) {
 	case 0:
 		WT_ERR_MSG(session, EEXIST, "%s", newuri);
 	case WT_NOTFOUND:
@@ -48,13 +48,14 @@ __rename_file(
 		WT_ERR_MSG(session, EEXIST, "%s", newfile);
 
 	/* Replace the old file entries with new file entries. */
-	WT_ERR(__wt_schema_table_read(session, uri, &value));
-	WT_ERR(__wt_schema_table_remove(session, uri));
-	WT_ERR(__wt_schema_table_insert(session, newuri, value));
+	WT_ERR(__wt_metadata_read(session, uri, &value));
+	WT_ERR(__wt_metadata_remove(session, uri));
+	WT_ERR(__wt_metadata_insert(session, newuri, value));
 
 	/* Rename the underlying file. */
-	WT_ERR(__wt_schema_table_track_fileop(session, filename, newfile));
 	WT_ERR(__wt_rename(session, filename, newfile));
+	if (WT_META_TRACKING(session))
+		WT_ERR(__wt_meta_track_fileop(session, uri, newuri));
 
 err:	__wt_free(session, value);
 
@@ -66,17 +67,16 @@ err:	__wt_free(session, value);
  *	Rename an index or colgroup reference.
  */
 static int
-__rename_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, const char *newname)
+__rename_tree(WT_SESSION_IMPL *session, const char *name, const char *newname)
 {
+	WT_DECL_RET;
 	WT_ITEM *of, *nf, *nk, *nv;
-	int ret;
 	const char *newfile, *p, *t, *value;
 
 	nf = nk = nv = of = NULL;
-	ret = 0;
 
 	/* Read the old schema value. */
-	WT_ERR(__wt_schema_table_read(session, btree->name, &value));
+	WT_ERR(__wt_metadata_read(session, name, &value));
 
 	/*
 	 * Create the new file name, new schema key, new schema value.
@@ -84,9 +84,9 @@ __rename_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, const char *newname)
 	 * Names are of the form "prefix.oldname:suffix", where suffix is
 	 * optional; we need prefix and suffix.
 	 */
-	if ((p = strchr(btree->name, ':')) == NULL)
+	if ((p = strchr(name, ':')) == NULL)
 		WT_ERR_MSG(session, EINVAL,
-		    "invalid index or column-group name: %s", btree->name);
+		    "invalid index or column-group name: %s", name);
 	t = strchr(p + 1, ':');
 
 	WT_ERR(__wt_scr_alloc(session, 0, &nf));
@@ -96,32 +96,34 @@ __rename_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, const char *newname)
 
 	WT_ERR(__wt_scr_alloc(session, 0, &nk));
 	WT_ERR(__wt_buf_fmt(session, nk, "%.*s:%s%s%s",
-	    (int)WT_PTRDIFF(p, btree->name), btree->name,
-	    newname, t == NULL ? "" : ":", t == NULL ? "" : t + 1));
+	    (int)WT_PTRDIFF(p, name), name, newname,
+	    t == NULL ? "" : ":", t == NULL ? "" : t + 1));
 
-	WT_ERR(__wt_scr_alloc(session, 0, &nv));
 	if ((p = strstr(value, "filename=")) == NULL)
 		WT_ERR_MSG(session, EINVAL,
 		    "index or column-group value has no file name: %s", value);
+	p += strlen("filename=");
 	t = strchr(p, ',');
-	WT_ERR(__wt_buf_fmt(session, nv, "%.*s" "filename=%s%s",
+
+	/* Take a copy of the old filename. */
+	WT_ERR(__wt_scr_alloc(session, 0, &of));
+	WT_ERR(__wt_buf_fmt(session, of, "file:%.*s",
+	    (int)((t == NULL) ? strlen(p) : WT_PTRDIFF(t, p)), p));
+
+	/* Overwrite it with the new filename. */
+	WT_ERR(__wt_scr_alloc(session, 0, &nv));
+	WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s",
 	    (int)WT_PTRDIFF(p, value), value,
 	    newfile, t == NULL ? "" : t));
 
 	/*
-	 * Remove the old schema table entry
-	 * Insert the new schema table entry
+	 * Remove the old metadata entry.
+	 * Insert the new metadata entry.
 	 */
-	WT_ERR(__wt_schema_table_remove(session, btree->name));
-	WT_ERR(__wt_schema_table_insert(session, nk->data, nv->data));
+	WT_ERR(__wt_metadata_remove(session, name));
+	WT_ERR(__wt_metadata_insert(session, nk->data, nv->data));
 
-	/*
-	 * Rename the file.
-	 * __rename_file closes the WT_BTREE handle, so we have to have a local
-	 * copy of the WT_BTREE->filename field.
-	 */
-	WT_ERR(__wt_scr_alloc(session, 0, &of));
-	WT_ERR(__wt_buf_fmt(session, of, "file:%s", btree->filename));
+	/* Rename the file. */
 	WT_ERR(__rename_file(session, of->data, nf->data));
 
 err:	__wt_scr_free(&nf);
@@ -140,43 +142,35 @@ static int
 __rename_table(
     WT_SESSION_IMPL *session, const char *oldname, const char *newname)
 {
-	WT_BTREE *btree;
+	WT_DECL_RET;
 	WT_ITEM *buf;
 	WT_TABLE *table;
-	int i, ret;
+	int i;
 	const char *value;
 
 	buf = NULL;
-	ret = 0;
 
 	WT_RET(
 	    __wt_schema_get_table(session, oldname, strlen(oldname), &table));
 
 	/* Rename the column groups. */
-	for (i = 0; i < WT_COLGROUPS(table); i++) {
-		if ((btree = table->colgroup[i]) == NULL)
-			continue;
-		table->colgroup[i] = NULL;
-		WT_RET(__rename_tree(session, btree, newname));
-	}
+	for (i = 0; i < WT_COLGROUPS(table); i++)
+		WT_RET(__rename_tree(session, table->cg_name[i], newname));
 
 	/* Rename the indices. */
 	WT_RET(__wt_schema_open_index(session, table, NULL, 0));
-	for (i = 0; i < table->nindices; i++) {
-		btree = table->index[i];
-		table->index[i] = NULL;
-		WT_RET(__rename_tree(session, btree, newname));
-	}
+	for (i = 0; i < table->nindices; i++)
+		WT_RET(__rename_tree(session, table->idx_name[i], newname));
 
 	WT_RET(__wt_schema_remove_table(session, table));
 
 	/* Rename the table. */
 	WT_ERR(__wt_scr_alloc(session, 0, &buf));
 	WT_ERR(__wt_buf_fmt(session, buf, "table:%s", oldname));
-	WT_ERR(__wt_schema_table_read(session, buf->data, &value));
-	WT_ERR(__wt_schema_table_remove(session, buf->data));
+	WT_ERR(__wt_metadata_read(session, buf->data, &value));
+	WT_ERR(__wt_metadata_remove(session, buf->data));
 	WT_ERR(__wt_buf_fmt(session, buf, "table:%s", newname));
-	WT_ERR(__wt_schema_table_insert(session, buf->data, value));
+	WT_ERR(__wt_metadata_insert(session, buf->data, value));
 
 err:	__wt_scr_free(&buf);
 	return (ret);
@@ -190,8 +184,9 @@ int
 __wt_schema_rename(WT_SESSION_IMPL *session,
     const char *uri, const char *newuri, const char *cfg[])
 {
+	WT_DATA_SOURCE *dsrc;
+	WT_DECL_RET;
 	const char *oldname, *newname;
-	int ret;
 
 	WT_UNUSED(cfg);
 
@@ -203,7 +198,7 @@ __wt_schema_rename(WT_SESSION_IMPL *session,
 	 * We track rename operations, if we fail in the middle, we want to
 	 * back it all out.
 	 */
-	WT_RET(__wt_schema_table_track_on(session));
+	WT_RET(__wt_meta_track_on(session));
 
 	oldname = uri;
 	newname = newuri;
@@ -219,11 +214,12 @@ __wt_schema_rename(WT_SESSION_IMPL *session,
 			    "rename target type must match URI: %s to %s",
 			    uri, newuri);
 		ret = __rename_table(session, oldname, newname);
-	} else
-		return (__wt_unknown_object_type(session, uri));
+	} else if ((ret = __wt_schema_get_source(session, oldname, &dsrc)) == 0)
+		ret = dsrc->rename(dsrc,
+		    &session->iface, oldname, newname, cfg[1]);
 
-	WT_TRET(__wt_schema_table_track_off(session, ret != 0));
+	WT_TRET(__wt_meta_track_off(session, ret != 0));
 
-	/* If we didn't find a schema file entry, map that error to ENOENT. */
+	/* If we didn't find a metadata entry, map that error to ENOENT. */
 	return (ret == WT_NOTFOUND ? ENOENT : ret);
 }
diff --git a/src/schema/schema_table.c b/src/schema/schema_table.c
deleted file mode 100644
index 41ba9055f68..00000000000
--- a/src/schema/schema_table.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- *	All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-static const char *schematab_config = "key_format=S,value_format=S";
-
-/*
- * __wt_open_schema_table --
- *	Opens the schema table, sets session->schematab.
- */
-int
-__wt_open_schema_table(WT_SESSION_IMPL *session)
-{
-	const char *cfg[] = API_CONF_DEFAULTS(file, meta, schematab_config);
-	const char *schemaconf;
-	int ret, tracking;
-
-	ret = 0;
-
-	if (session->schematab != NULL)
-		return (0);
-
-	WT_RET(__wt_config_collapse(session, cfg, &schemaconf));
-
-	/*
-	 * Turn off tracking when creating the schema file: this is always done
-	 * before any other schema operations and there is no going back.
-	 */
-	tracking = (session->schema_track != NULL);
-	if (tracking)
-		__wt_schema_table_track_off(session, 0);
-	WT_ERR(__wt_create_file(session,
-	    "file:" WT_SCHEMA_FILENAME,
-	    "file:" WT_SCHEMA_FILENAME, 0, schemaconf));
-	session->schematab = session->btree;
-err:	__wt_free(session, schemaconf);
-	if (tracking)
-		WT_TRET(__wt_schema_table_track_on(session));
-	return (ret);
-}
-
-/*
- * __wt_schema_table_cursor --
- *	Opens a cursor on the schema table.
- */
-int
-__wt_schema_table_cursor(
-    WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
-{
-	const char *cfg[] = API_CONF_DEFAULTS(session, open_cursor, config);
-
-	WT_RET(__wt_open_schema_table(session));
-	session->btree = session->schematab;
-	return (__wt_curfile_create(session, NULL, cfg, cursorp));
-}
-
-/*
- * __wt_schema_table_insert --
- *	Insert a row into the schema table.
- */
-int
-__wt_schema_table_insert(
-    WT_SESSION_IMPL *session, const char *key, const char *value)
-{
-	WT_BTREE *btree;
-	WT_CURSOR *cursor;
-	int ret;
-
-	ret = 0;
-
-	if (session->schema_track != NULL)		/* Optional tracking */
-		WT_RET(__wt_schema_table_track_insert(session, key));
-
-	/* Save the caller's btree: the schema cursor will overwrite it. */
-	btree = session->btree;
-	WT_RET(__wt_schema_table_cursor(session, NULL, &cursor));
-	cursor->set_key(cursor, key);
-	cursor->set_value(cursor, value);
-	WT_TRET(cursor->insert(cursor));
-	WT_TRET(cursor->close(cursor));
-
-	/* Restore the caller's btree. */
-	session->btree = btree;
-	return (ret);
-}
-
-/*
- * __wt_schema_table_update --
- *	Update a row in the schema table.
- */
-int
-__wt_schema_table_update(
-    WT_SESSION_IMPL *session, const char *key, const char *value)
-{
-	WT_BTREE *btree;
-	WT_CURSOR *cursor;
-	int ret;
-
-	ret = 0;
-
-	if (session->schema_track != NULL)		/* Optional tracking */
-		WT_RET(__wt_schema_table_track_update(session, key));
-
-	/* Save the caller's btree: the schema cursor will overwrite it. */
-	btree = session->btree;
-	WT_RET(__wt_schema_table_cursor(session, "overwrite", &cursor));
-	cursor->set_key(cursor, key);
-	cursor->set_value(cursor, value);
-	WT_TRET(cursor->insert(cursor));
-	WT_TRET(cursor->close(cursor));
-
-	/* Restore the caller's btree. */
-	session->btree = btree;
-	return (ret);
-}
-
-/*
- * __wt_schema_table_remove --
- *	Removes a row from the schema table.
- */
-int
-__wt_schema_table_remove(WT_SESSION_IMPL *session, const char *key)
-{
-	WT_BTREE *btree;
-	WT_CURSOR *cursor;
-	int ret;
-
-	ret = 0;
-
-	if (session->schema_track != NULL)		/* Optional tracking */
-		WT_RET(__wt_schema_table_track_update(session, key));
-
-	/* Save the caller's btree: the schema cursor will overwrite it. */
-	btree = session->btree;
-	WT_RET(__wt_schema_table_cursor(session, NULL, &cursor));
-	cursor->set_key(cursor, key);
-	WT_TRET(cursor->remove(cursor));
-	WT_TRET(cursor->close(cursor));
-
-	/* Restore the caller's btree. */
-	session->btree = btree;
-	return (ret);
-}
-
-/*
- * __wt_schema_table_read --
- *	Reads and copies a row from the schema table.
- *	The caller is responsible for freeing the allocated memory.
- */
-int
-__wt_schema_table_read(
-    WT_SESSION_IMPL *session, const char *key, const char **valuep)
-{
-	WT_BTREE *btree;
-	WT_CURSOR *cursor;
-	const char *value;
-	int ret;
-
-	ret = 0;
-
-	/* Save the caller's btree: the schema cursor will overwrite it. */
-	btree = session->btree;
-	WT_RET(__wt_schema_table_cursor(session, NULL, &cursor));
-	cursor->set_key(cursor, key);
-	WT_ERR(cursor->search(cursor));
-	WT_ERR(cursor->get_value(cursor, &value));
-	WT_ERR(__wt_strdup(session, value, valuep));
-
-err:    WT_TRET(cursor->close(cursor));
-	/* Restore the caller's btree. */
-	session->btree = btree;
-	return (ret);
-}
diff --git a/src/schema/schema_track.c b/src/schema/schema_track.c
deleted file mode 100644
index e903cb9e5ec..00000000000
--- a/src/schema/schema_track.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- *	All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * WT_SCHEMA_TRACK --
- *	A tracked schema operation: a non-transactional log, maintained to make
- * it easy to unroll simple schema table and filesystem operations.
- */
-typedef struct __wt_schema_track {
-	enum {
-		WT_ST_EMPTY=0,		/* Unused slot */
-		WT_ST_FILEOP=1,		/* File operation */
-		WT_ST_REMOVE=2,		/* Remove a schema table entry */
-		WT_ST_SET=3		/* Reset a schema table entry */
-	} op;
-	const char *a, *b;		/* Strings */
-} WT_SCHEMA_TRACK;
-
-/*
- * __schema_table_track_next --
- *	Return the next slot, and extend the list of operations we're tracking,
- * as necessary.
- */
-static int
-__schema_table_track_next(WT_SESSION_IMPL *session, WT_SCHEMA_TRACK **trkp)
-{
-	WT_SCHEMA_TRACK *trk;
-	size_t bytes_allocated;
-	u_int i;
-
-	/*
-	 * Slow, but we don't care -- it's a schema table op, searching an array
-	 * of maybe 20 items.
-	 */
-	for (trk = session->schema_track,
-	    i = 0; i <  session->schema_track_entries; ++trk, ++i)
-		if (trk->op == WT_ST_EMPTY) {
-			if (trkp != NULL)
-				*trkp = trk;
-			return (0);
-		}
-
-	/*
-	 * The __wt_realloc() function uses the "bytes allocated" value
-	 * to figure out how much of the memory it needs to clear (see
-	 * the function for an explanation of why the memory is cleared,
-	 * it's a security thing).
-	 */
-	bytes_allocated =
-	    session->schema_track_entries * sizeof(WT_SCHEMA_TRACK);
-	WT_RET(__wt_realloc(session, &bytes_allocated,
-	    (session->schema_track_entries + 20) * sizeof(WT_SCHEMA_TRACK),
-	    &session->schema_track));
-	if (trkp != NULL)
-		*trkp = &((WT_SCHEMA_TRACK *)
-		    session->schema_track)[session->schema_track_entries];
-	session->schema_track_entries += 20;
-	return (0);
-}
-
-/*
- * __wt_schema_table_track_on --
- *	Turn on schema table tracking.
- */
-int
-__wt_schema_table_track_on(WT_SESSION_IMPL *session)
-{
-	return (__schema_table_track_next(session, NULL));
-}
-
-/*
- * __wt_schema_table_track_off --
- *	Turn off schema table tracking, unrolling on error.
- */
-int
-__wt_schema_table_track_off(WT_SESSION_IMPL *session, int unroll)
-{
-	WT_SCHEMA_TRACK *trk, *trk_orig;
-	int ret, tret;
-
-	ret = 0;
-
-	if (session->schema_track == NULL || session->schema_track_entries == 0)
-		return (0);
-
-	trk_orig = session->schema_track;
-	trk = &trk_orig[session->schema_track_entries - 1];
-
-	/* Turn off tracking for unroll. */
-	session->schema_track = NULL;
-	session->schema_track_entries = 0;
-
-	for (;; --trk) {
-		if (unroll)
-			switch (trk->op) {
-			case WT_ST_EMPTY:	/* Unused slot */
-				break;
-			case WT_ST_FILEOP:	/* File operation */
-				/*
-				 * For renames, both a and b are set.
-				 * For creates, a is NULL.
-				 * For removes, b is NULL.
-				 */
-				if (trk->a != NULL && trk->b != NULL &&
-				    (tret = __wt_rename(
-				    session, trk->b, trk->a)) != 0) {
-					__wt_err(session, tret,
-					    "schema table unroll rename "
-					    "%s to %s",
-					    trk->b, trk->a);
-					WT_TRET(tret);
-				} else if (trk->a == NULL &&
-				    ((tret = __wt_session_close_any_open_btree(
-				    session, trk->b)) != 0 || (tret =
-				    __wt_remove(session, trk->b)) != 0)) {
-					__wt_err(session, tret,
-					    "schema table unroll create %s",
-					    trk->b);
-					WT_TRET(tret);
-				}
-				/*
-				 * We can't undo removes yet: that would imply
-				 * some kind of temporary rename and remove in
-				 * roll forward.
-				 */
-				break;
-			case WT_ST_REMOVE:	/* Remove trk.a */
-				if ((tret = __wt_schema_table_remove(
-				    session, trk->a)) != 0) {
-					__wt_err(session, ret,
-					    "schema table unroll remove: %s",
-					    trk->a);
-					WT_TRET(tret);
-				}
-				break;
-			case WT_ST_SET:		/* Set trk.a to trk.b */
-				if ((tret = __wt_schema_table_update(
-				    session, trk->a, trk->b)) != 0) {
-					__wt_err(session, ret,
-					    "schema table unroll update "
-					    "%s to %s",
-					    trk->a, trk->b);
-					WT_TRET(tret);
-				}
-				break;
-			WT_ILLEGAL_VALUE(session);
-			}
-
-		__wt_free(session, trk->a);
-		__wt_free(session, trk->b);
-
-		if (trk == trk_orig)
-			break;
-	}
-	__wt_free(session, trk_orig);
-	return (ret);
-}
-
-/*
- * __wt_schema_table_track_insert --
- *	Track an insert operation.
- */
-int
-__wt_schema_table_track_insert(WT_SESSION_IMPL *session, const char *key)
-{
-	WT_SCHEMA_TRACK *trk;
-
-	WT_RET(__schema_table_track_next(session, &trk));
-
-	trk->op = WT_ST_REMOVE;
-	WT_RET(__wt_strdup(session, key, &trk->a));
-
-	return (0);
-}
-
-/*
- * __wt_schema_table_track_update --
- *	Track a schema table update operation.
- */
-int
-__wt_schema_table_track_update(WT_SESSION_IMPL *session, const char *key)
-{
-	WT_SCHEMA_TRACK *trk;
-	int ret;
-
-	WT_RET(__schema_table_track_next(session, &trk));
-
-	trk->op = WT_ST_SET;
-	WT_RET(__wt_strdup(session, key, &trk->a));
-
-	/*
-	 * If there was a previous value, keep it around -- if not, then this
-	 * "update" is really an insert.
-	 */
-	if ((ret =
-	    __wt_schema_table_read(session, key, &trk->b)) == WT_NOTFOUND) {
-		trk->op = WT_ST_REMOVE;
-		ret = 0;
-	}
-	return (ret);
-}
-
-/*
- * __wt_schema_table_track_fs_rename --
- *	Track a filesystem rename operation.
- */
-int
-__wt_schema_table_track_fileop(
-    WT_SESSION_IMPL *session, const char *oldname, const char *newname)
-{
-	WT_SCHEMA_TRACK *trk;
-
-	WT_RET(__schema_table_track_next(session, &trk));
-
-	trk->op = WT_ST_FILEOP;
-	if (oldname != NULL)
-		WT_RET(__wt_strdup(session, oldname, &trk->a));
-	if (newname != NULL)
-		WT_RET(__wt_strdup(session, newname, &trk->b));
-	return (0);
-}
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index 67a93236c45..bd8a64736b4 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -12,13 +12,19 @@
  *	WT_SESSION::truncate for a file.
  */
 static int
-__truncate_file(WT_SESSION_IMPL *session, const char *filename)
+__truncate_file(WT_SESSION_IMPL *session, const char *name)
 {
-	/* If open, close the btree handle. */
-	WT_RET(__wt_session_close_any_open_btree(session, filename));
+	const char *filename;
+
+	filename = name;
+	if (!WT_PREFIX_SKIP(filename, "file:"))
+		return (EINVAL);
+
+	/* Close any btree handles in the file. */
+	WT_RET(__wt_conn_btree_close_all(session, name));
 
 	/* Delete the root address and truncate the file. */
-	WT_RET(__wt_btree_set_root(session, filename, NULL, 0));
+	WT_RET(__wt_meta_snapshot_clear(session, name));
 	WT_RET(__wt_btree_truncate(session, filename));
 
 	return (0);
@@ -32,53 +38,58 @@ static int
 __truncate_table(WT_SESSION_IMPL *session, const char *name)
 {
 	WT_BTREE *btree;
-	WT_ITEM *buf;
+	WT_DECL_RET;
+	WT_ITEM *namebuf;
 	WT_TABLE *table;
-	int i, ret;
-
-	ret = 0;
-	WT_RET(__wt_scr_alloc(session, 0, &buf));
+	int i, tret;
 
 	WT_RET(__wt_schema_get_table(session, name, strlen(name), &table));
-	/*
-	 * We are closing the column groups, they must be reopened for future
-	 * accesses to the table.
-	 */
-	table->cg_complete = 0;
+	WT_RET(__wt_scr_alloc(session, 0, &namebuf));
 
 	/* Truncate the column groups. */
 	for (i = 0; i < WT_COLGROUPS(table); i++) {
-		if ((btree = table->colgroup[i]) == NULL)
-			continue;
-		table->colgroup[i] = NULL;
 		/*
-		 * Take a copy of the file name: it will be freed when the
-		 * handle is closed.
+		 * Get an exclusive lock on the handle: it will be released by
+		 * __wt_conn_btree_close_all.
 		 */
-		WT_ERR(__wt_buf_set(session, buf,
-		    btree->filename, strlen(btree->filename) + 1));
-		WT_TRET(__truncate_file(session, buf->data));
+		if ((tret = __wt_schema_get_btree(session,
+		    table->cg_name[i], strlen(table->cg_name[i]),
+		    NULL, WT_BTREE_EXCLUSIVE)) != 0) {
+			WT_TRET(tret);
+			continue;
+		}
+		btree = session->btree;
+		WT_ERR(__wt_buf_set(
+		    session, namebuf, btree->name, strlen(btree->name) + 1));
+		WT_TRET(__truncate_file(session, namebuf->data));
 	}
 
 	/* Truncate the indices. */
 	WT_TRET(__wt_schema_open_index(session, table, NULL, 0));
 	for (i = 0; i < table->nindices; i++) {
-		btree = table->index[i];
-		table->index[i] = NULL;
 		/*
-		 * Take a copy of the file name: it will be freed when the
-		 * handle is closed.
+		 * Get an exclusive lock on the handle: it will be released by
+		 * __wt_conn_btree_close_all.
 		 */
-		WT_ERR(__wt_buf_set(session, buf,
-		    btree->filename, strlen(btree->filename) + 1));
-		WT_TRET(__truncate_file(session, buf->data));
+		if ((tret = __wt_schema_get_btree(session,
+		    table->idx_name[i], strlen(table->idx_name[i]),
+		    NULL, WT_BTREE_EXCLUSIVE)) != 0) {
+			WT_TRET(tret);
+			continue;
+		}
+		btree = session->btree;
+		WT_ERR(__wt_buf_set(
+		    session, namebuf, btree->name, strlen(btree->name) + 1));
+		WT_TRET(__truncate_file(session, namebuf->data));
 	}
 
+	table->idx_complete = 0;
+
 	/* Reopen the column groups. */
 	if (ret == 0)
 		ret = __wt_schema_open_colgroups(session, table);
 
-err:	__wt_scr_free(&buf);
+err:	__wt_scr_free(&namebuf);
 	return (ret);
 }
 
@@ -90,17 +101,20 @@ int
 __wt_schema_truncate(
     WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
 {
-	int ret;
+	WT_DATA_SOURCE *dsrc;
+	WT_DECL_RET;
+	const char *tablename;
 
 	WT_UNUSED(cfg);
+	tablename = uri;
 
-	if (WT_PREFIX_SKIP(uri, "file:"))
+	if (WT_PREFIX_MATCH(uri, "file:"))
 		ret = __truncate_file(session, uri);
-	else if (WT_PREFIX_SKIP(uri, "table:"))
-		ret = __truncate_table(session, uri);
-	else
-		return (__wt_unknown_object_type(session, uri));
+	else if (WT_PREFIX_SKIP(tablename, "table:"))
+		ret = __truncate_table(session, tablename);
+	else if ((ret = __wt_schema_get_source(session, uri, &dsrc)) == 0)
+		ret = dsrc->truncate(dsrc, &session->iface, uri, cfg[1]);
 
-	/* If we didn't find a schema file entry, map that error to ENOENT. */
+	/* If we didn't find a metadata entry, map that error to ENOENT. */
 	return (ret == WT_NOTFOUND ? ENOENT : ret);
 }
diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c
index 39053782d7d..b84429c89ae 100644
--- a/src/schema/schema_util.c
+++ b/src/schema/schema_util.c
@@ -8,6 +8,26 @@
 #include "wt_internal.h"
 
 /*
+ * __wt_schema_get_source --
+ *	Find a matching data source or report an error.
+ */
+int
+__wt_schema_get_source(
+    WT_SESSION_IMPL *session, const char *name, WT_DATA_SOURCE **dsrcp)
+{
+	WT_NAMED_DATA_SOURCE *ndsrc;
+
+	TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
+		if (!WT_PREFIX_MATCH(name, ndsrc->prefix))
+			continue;
+		*dsrcp = ndsrc->dsrc;
+		return (0);
+	}
+
+	return (__wt_unknown_object_type(session, name));
+}
+
+/*
  * __wt_schema_name_check --
  *	Disallow any use of the WiredTiger name space.
  */
@@ -18,7 +38,7 @@ __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri)
 
 	/*
 	 * Check if name is somewhere in the WiredTiger name space: it would be
-	 * "bad" if the application truncated the schema file.  We get passed
+	 * "bad" if the application truncated the metadata file.  We get passed
 	 * both objects and simple strings, skip any leading URI prefix.
 	 */
 	name = uri;
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index 1857a1009ca..0b4e83776a2 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -17,38 +17,33 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[],
    int (*func)(WT_SESSION_IMPL *, const char *[]), uint32_t open_flags)
 {
-	WT_BTREE *cg;
+	WT_DECL_RET;
 	WT_TABLE *table;
-	const char *tablename;
-	int i, ret;
+	const char *cgname, *tablename;
+	int i;
 
 	tablename = uri;
-	ret = 0;
 
 	/* Get the btree handle(s) and call the underlying function. */
 	if (WT_PREFIX_MATCH(uri, "file:")) {
-		WT_RET(__wt_session_get_btree(
-		    session, uri, uri, NULL, cfg, open_flags));
+		WT_RET(__wt_session_get_btree(session, uri, cfg, open_flags));
 		ret = func(session, cfg);
 		WT_TRET(__wt_session_release_btree(session));
-		WT_RET(ret);
-	} else if (
-	    WT_PREFIX_SKIP(uri, "colgroup:") || WT_PREFIX_SKIP(uri, "index:")) {
+	} else if (WT_PREFIX_MATCH(uri, "colgroup:") ||
+	    WT_PREFIX_MATCH(uri, "index:")) {
 		WT_RET(__wt_schema_get_btree(
 		    session, uri, strlen(uri), cfg, open_flags));
 		ret = func(session, cfg);
 		WT_TRET(__wt_session_release_btree(session));
-		WT_RET(ret);
 	} else if (WT_PREFIX_SKIP(tablename, "table:")) {
 		WT_RET(__wt_schema_get_table(session,
 		    tablename, strlen(tablename), &table));
+		WT_ASSERT(session, session->btree == NULL);
 
 		for (i = 0; i < WT_COLGROUPS(table); i++) {
-			if ((cg = table->colgroup[i]) == NULL)
-				continue;
-
-			WT_TRET(__wt_schema_get_btree(session,
-			    cg->name, strlen(cg->name), cfg, open_flags));
+			cgname = table->cg_name[i];
+			WT_RET(__wt_schema_get_btree(session,
+			    cgname, strlen(cgname), cfg, open_flags));
 			ret = func(session, cfg);
 			WT_TRET(__wt_session_release_btree(session));
 			WT_RET(ret);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index d8364b4f3b2..3d1c8eff80d 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -7,6 +7,24 @@
 
 #include "wt_internal.h"
 
+static int __session_rollback_transaction(WT_SESSION *, const char *);
+
+/*
+ * __session_close_cursors --
+ *	Close all cursors open in a session, returning any error seen.
+ */
+static int
+__session_close_cursors(WT_SESSION_IMPL *session)
+{
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+
+	/* Closing a cursor is expected to unlink it from the list. */
+	while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL)
+		WT_TRET(cursor->close(cursor));
+	return (ret);
+}
+
 /*
  * __session_close --
  *	WT_SESSION->close method.
@@ -16,9 +34,8 @@ __session_close(WT_SESSION *wt_session, const char *config)
 {
 	WT_BTREE_SESSION *btree_session;
 	WT_CONNECTION_IMPL *conn;
-	WT_CURSOR *cursor;
-	WT_SESSION_IMPL *session, **tp;
-	int ret;
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
 
 	conn = (WT_CONNECTION_IMPL *)wt_session->connection;
 	session = (WT_SESSION_IMPL *)wt_session;
@@ -26,19 +43,27 @@ __session_close(WT_SESSION *wt_session, const char *config)
 	SESSION_API_CALL(session, close, config, cfg);
 	WT_UNUSED(cfg);
 
-	while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL)
-		WT_TRET(cursor->close(cursor));
+	if (F_ISSET(&session->txn, TXN_RUNNING))
+		WT_TRET(__session_rollback_transaction(wt_session, NULL));
+
+	WT_TRET(__session_close_cursors(session));
 
 	while ((btree_session = TAILQ_FIRST(&session->btrees)) != NULL)
-		WT_TRET(__wt_session_remove_btree(session, btree_session, 0));
+		WT_TRET(__wt_session_discard_btree(session, btree_session));
 
 	WT_TRET(__wt_schema_close_tables(session));
 
 	__wt_spin_lock(session, &conn->spinlock);
 
+	/* Discard metadata tracking. */
+	__wt_meta_track_discard(session);
+
 	/* Discard scratch buffers. */
 	__wt_scr_discard(session);
 
+	/* Free transaction information. */
+	__wt_txn_destroy(session);
+
 	/* Confirm we're not holding any hazard references. */
 	__wt_hazard_empty(session);
 
@@ -53,26 +78,29 @@ __session_close(WT_SESSION *wt_session, const char *config)
 		(void)__wt_cond_destroy(session, session->cond);
 
 	/*
-	 * Replace the session reference we're closing with the last entry in
-	 * the table, then clear the last entry.  As far as the walk of the
-	 * server threads is concerned, it's OK if the session appears twice,
-	 * or if it doesn't appear at all, so these lines can race all they
-	 * want.
+	 * Sessions are re-used, clear the structure: this code sets the active
+	 * field to 0, which will exclude the hazard array from review by the
+	 * eviction thread.   Note: there's no serialization support around the
+	 * review of the hazard array, which means threads checking for hazard
+	 * references first check the active field (which may be 0) and then use
+	 * the hazard pointer (which cannot be NULL).  For this reason, clear
+	 * the session structure carefully.
+	 *
+	 * We don't need to publish here, because regardless of the active field
+	 * being non-zero, the hazard reference is always valid.
 	 */
-	for (tp = conn->sessions; *tp != session; ++tp)
-		;
-	--conn->session_cnt;
-	*tp = conn->sessions[conn->session_cnt];
-	conn->sessions[conn->session_cnt] = NULL;
+	WT_SESSION_CLEAR(session);
+	session = conn->default_session;
 
 	/*
-	 * Publish, making the session array entry available for re-use.  There
-	 * must be a barrier here to ensure the cleanup above completes before
-	 * the entry is re-used.
+	 * Decrement the count of active sessions if that's possible: a session
+	 * being closed may or may not be at the end of the array, step toward
+	 * the beginning of the array until we reach an active session.
 	 */
-	WT_PUBLISH(session->iface.connection, NULL);
+	while (conn->sessions[conn->session_cnt - 1].active == 0)
+		if (--conn->session_cnt == 0)
+			break;
 
-	session = &conn->default_session;
 	__wt_spin_unlock(session, &conn->spinlock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
@@ -86,8 +114,8 @@ static int
 __session_open_cursor(WT_SESSION *wt_session,
     const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 	SESSION_API_CALL(session, open_cursor, config, cfg);
@@ -99,11 +127,11 @@ __session_open_cursor(WT_SESSION *wt_session,
 	if (to_dup != NULL)
 		ret = __wt_cursor_dup(session, to_dup, config, cursorp);
 	else if (WT_PREFIX_MATCH(uri, "colgroup:"))
-		ret = __wt_curfile_open(session, uri, cfg, cursorp);
+		ret = __wt_curfile_open(session, uri, NULL, cfg, cursorp);
 	else if (WT_PREFIX_MATCH(uri, "config:"))
 		ret = __wt_curconfig_open(session, uri, cfg, cursorp);
 	else if (WT_PREFIX_MATCH(uri, "file:"))
-		ret = __wt_curfile_open(session, uri, cfg, cursorp);
+		ret = __wt_curfile_open(session, uri, NULL, cfg, cursorp);
 	else if (WT_PREFIX_MATCH(uri, "index:"))
 		ret = __wt_curindex_open(session, uri, cfg, cursorp);
 	else if (WT_PREFIX_MATCH(uri, "statistics:"))
@@ -139,15 +167,20 @@ __wt_session_create_strip(
  *	WT_SESSION->create method.
  */
 static int
-__session_create(WT_SESSION *wt_session, const char *name, const char *config)
+__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 	SESSION_API_CALL(session, create, config, cfg);
 	WT_UNUSED(cfg);
-	WT_ERR(__wt_schema_create(session, name, config));
+
+	/* Disallow objects in the WiredTiger name space. */
+	WT_ERR(__wt_schema_name_check(session, uri));
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
+	ret = __wt_schema_create(session, uri, config);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
 }
@@ -160,14 +193,17 @@ static int
 __session_rename(WT_SESSION *wt_session,
     const char *uri, const char *newname, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 	SESSION_API_CALL(session, rename, config, cfg);
+
+	WT_ERR(__wt_meta_track_on(session));
 	ret = __wt_schema_rename(session, uri, newname, cfg);
 
-err:	API_END_NOTFOUND_MAP(session, ret);
+err:	WT_TRET(__wt_meta_track_off(session, ret != 0));
+	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
@@ -175,16 +211,28 @@ err:	API_END_NOTFOUND_MAP(session, ret);
  *	WT_SESSION->drop method.
  */
 static int
-__session_drop(WT_SESSION *wt_session, const char *name, const char *config)
+__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 	SESSION_API_CALL(session, drop, config, cfg);
-	ret = __wt_schema_drop(session, name, cfg);
 
-err:	API_END_NOTFOUND_MAP(session, ret);
+	WT_ERR(__wt_meta_track_on(session));
+
+	/* Dropping snapshots is a different code path. */
+	WT_ERR(__wt_config_gets(session, cfg, "snapshot", &cval));
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
+	ret = (cval.len == 0) ? __wt_schema_drop(session, uri, cfg) :
+	    __wt_schema_worker(
+		session, uri, cfg, __wt_snapshot_drop, WT_BTREE_SNAPSHOT_OP);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
+
+err:	/* Note: drop operations cannot be unrolled (yet?). */
+	WT_TRET(__wt_meta_track_off(session, 0));
+	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
@@ -194,13 +242,15 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_dumpfile(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 	SESSION_API_CALL(session, dumpfile, config, cfg);
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
 	ret = __wt_schema_worker(session, uri, cfg,
 	    __wt_dumpfile, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
 }
@@ -212,14 +262,16 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 
 	SESSION_API_CALL(session, salvage, config, cfg);
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
 	ret = __wt_schema_worker(session, uri, cfg,
 	    __wt_salvage, WT_BTREE_EXCLUSIVE | WT_BTREE_SALVAGE);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
 }
@@ -231,15 +283,21 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_sync(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 
 	SESSION_API_CALL(session, sync, config, cfg);
-	ret = __wt_schema_worker(session, uri, cfg, __wt_btree_sync, 0);
+	WT_ERR(__wt_meta_track_on(session));
 
-err:	API_END_NOTFOUND_MAP(session, ret);
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
+	ret = __wt_schema_worker(
+	    session, uri, cfg, __wt_snapshot, WT_BTREE_SNAPSHOT_OP);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
+
+err:	WT_TRET(__wt_meta_track_off(session, ret != 0));
+	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
@@ -250,8 +308,8 @@ static int
 __session_truncate(WT_SESSION *wt_session,
     const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 
@@ -317,14 +375,16 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 
 	SESSION_API_CALL(session, upgrade, config, cfg);
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
 	ret = __wt_schema_worker(session, uri, cfg,
 	    __wt_upgrade, WT_BTREE_EXCLUSIVE | WT_BTREE_UPGRADE);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
 }
@@ -336,14 +396,16 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
 {
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session;
-	int ret;
 
 	session = (WT_SESSION_IMPL *)wt_session;
 
 	SESSION_API_CALL(session, verify, config, cfg);
+	__wt_spin_lock(session, &S2C(session)->schema_lock);
 	ret = __wt_schema_worker(session, uri, cfg,
 	    __wt_verify, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY);
+	__wt_spin_unlock(session, &S2C(session)->schema_lock);
 
 err:	API_END_NOTFOUND_MAP(session, ret);
 }
@@ -355,10 +417,22 @@ err:	API_END_NOTFOUND_MAP(session, ret);
 static int
 __session_begin_transaction(WT_SESSION *wt_session, const char *config)
 {
-	WT_UNUSED(wt_session);
-	WT_UNUSED(config);
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+
+	SESSION_API_CALL(session, begin_transaction, config, cfg);
+	if (!F_ISSET(S2C(session), WT_CONN_TRANSACTIONAL))
+		WT_ERR_MSG(session, EINVAL,
+		    "Database not configured for transactions");
+	if (TAILQ_FIRST(&session->cursors) != NULL)
+		WT_ERR_MSG(session, EINVAL, "Not permitted with open cursors");
 
-	return (ENOTSUP);
+	ret = __wt_txn_begin(session, cfg);
+
+err:	API_END(session);
+	return (ret);
 }
 
 /*
@@ -368,10 +442,26 @@ __session_begin_transaction(WT_SESSION *wt_session, const char *config)
 static int
 __session_commit_transaction(WT_SESSION *wt_session, const char *config)
 {
-	WT_UNUSED(wt_session);
-	WT_UNUSED(config);
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+	WT_TXN *txn;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+
+	SESSION_API_CALL(session, commit_transaction, config, cfg);
+	txn = &session->txn;
+	if (F_ISSET(txn, TXN_ERROR)) {
+		__wt_errx(session, "failed transaction requires rollback");
+		ret = EINVAL;
+	}
+	WT_TRET(__session_close_cursors(session));
+	if (ret == 0)
+		ret = __wt_txn_commit(session, cfg);
+	else
+		(void)__wt_txn_rollback(session, cfg);
 
-	return (ENOTSUP);
+err:	API_END(session);
+	return (ret);
 }
 
 /*
@@ -381,10 +471,16 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
 static int
 __session_rollback_transaction(WT_SESSION *wt_session, const char *config)
 {
-	WT_UNUSED(wt_session);
-	WT_UNUSED(config);
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
+
+	session = (WT_SESSION_IMPL *)wt_session;
+	SESSION_API_CALL(session, rollback_transaction, config, cfg);
+	WT_TRET(__session_close_cursors(session));
+	WT_TRET(__wt_txn_rollback(session, cfg));
 
-	return (ENOTSUP);
+err:	API_END(session);
+	return (ret);
 }
 
 /*
@@ -394,10 +490,14 @@ __session_rollback_transaction(WT_SESSION *wt_session, const char *config)
 static int
 __session_checkpoint(WT_SESSION *wt_session, const char *config)
 {
-	WT_UNUSED(wt_session);
-	WT_UNUSED(config);
+	WT_DECL_RET;
+	WT_SESSION_IMPL *session;
 
-	return (ENOTSUP);
+	session = (WT_SESSION_IMPL *)wt_session;
+	SESSION_API_CALL(session, checkpoint, config, cfg);
+	WT_TRET(__wt_txn_checkpoint(session, cfg));
+
+err:	API_END_NOTFOUND_MAP(session, ret);
 }
 
 /*
@@ -407,16 +507,14 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
 static int
 __session_msg_printf(WT_SESSION *wt_session, const char *fmt, ...)
 {
-	WT_SESSION_IMPL *session;
+	WT_DECL_RET;
 	va_list ap;
 
-	session = (WT_SESSION_IMPL *)wt_session;
-
 	va_start(ap, fmt);
-	__wt_msgv(session, fmt, ap);
+	ret = __wt_vmsg((WT_SESSION_IMPL *)wt_session, fmt, ap);
 	va_end(ap);
 
-	return (0);
+	return (ret);
 }
 
 /*
@@ -448,46 +546,58 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal,
 		__session_dumpfile,
 		__session_msg_printf
 	};
+	WT_DECL_RET;
 	WT_SESSION_IMPL *session, *session_ret;
-	uint32_t slot;
-	int ret;
+	uint32_t i;
 
 	WT_UNUSED(config);
-	ret = 0;
-	session = &conn->default_session;
+
+	session = conn->default_session;
 	session_ret = NULL;
 
 	__wt_spin_lock(session, &conn->spinlock);
 
-	/* Check to see if there's an available session slot. */
-	if (conn->session_cnt == conn->session_size - 1)
+	/* Find the first inactive session slot. */
+	for (session_ret = conn->sessions,
+	    i = 0; i < conn->session_size; ++session_ret, ++i)
+		if (!session_ret->active)
+			break;
+	if (i == conn->session_size)
 		WT_ERR_MSG(session, WT_ERROR,
-		    "WiredTiger only configured to support %d thread contexts",
+		    "only configured to support %d thread contexts",
 		    conn->session_size);
 
 	/*
-	 * The session reference list is compact, the session array is not.
-	 * Find the first empty session slot.
+	 * If the active session count is increasing, update it.  We don't worry
+	 * about correcting the session count on error: as long as we don't mark
+	 * this session as active, we'll clean it up on close.
 	 */
-	for (slot = 0, session_ret = conn->session_array;
-	    session_ret->iface.connection != NULL;
-	    ++session_ret, ++slot)
-		;
-
-	/* Session entries are re-used, clear the old contents. */
-	WT_CLEAR(*session_ret);
+	if (i >= conn->session_cnt)	/* Defend against off-by-one errors. */
+		conn->session_cnt = i + 1;
 
-	WT_ERR(__wt_cond_alloc(session, "session", 1, &session_ret->cond));
+	session_ret->id = i;
 	session_ret->iface = stds;
 	session_ret->iface.connection = &conn->iface;
-	WT_ASSERT(session, session->event_handler != NULL);
-	session_ret->event_handler = session->event_handler;
-	session_ret->hazard = conn->hazard + slot * conn->hazard_size;
+
+	WT_ERR(__wt_cond_alloc(session, "session", 1, &session_ret->cond));
+
+	__wt_event_handler_set(session_ret,
+	    event_handler == NULL ? session->event_handler : event_handler);
 
 	TAILQ_INIT(&session_ret->cursors);
 	TAILQ_INIT(&session_ret->btrees);
-	if (event_handler != NULL)
-		session_ret->event_handler = event_handler;
+
+	/* Initialize transaction support. */
+	WT_ERR(__wt_txn_init(session_ret));
+
+	/*
+	 * The session's hazard reference memory isn't discarded during normal
+	 * session close because access to it isn't serialized.  Allocate the
+	 * first time we open this session.
+	 */
+	if (session_ret->hazard == NULL)
+		WT_ERR(__wt_calloc(session, conn->hazard_size,
+		    sizeof(WT_HAZARD), &session_ret->hazard));
 
 	/*
 	 * Public sessions are automatically closed during WT_CONNECTION->close.
@@ -500,12 +610,13 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal,
 
 	/*
 	 * Publish: make the entry visible to server threads.  There must be a
-	 * barrier to ensure the structure fields are set before any other
-	 * thread can see the session.
+	 * barrier for two reasons: to ensure structure fields are set before
+	 * any other thread will consider the session, and to push the session
+	 * count to ensure the eviction thread can't review too few slots.
 	 */
-	WT_PUBLISH(conn->sessions[conn->session_cnt++], session_ret);
+	WT_PUBLISH(session_ret->active, 1);
 
-	STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
+	STATIC_ASSERT(offsetof(WT_SESSION_IMPL, iface) == 0);
 	*sessionp = session_ret;
 
 err:	__wt_spin_unlock(session, &conn->spinlock);
diff --git a/src/session/session_btree.c b/src/session/session_btree.c
index 6cec54874f2..00b01e25c05 100644
--- a/src/session/session_btree.c
+++ b/src/session/session_btree.c
@@ -33,15 +33,20 @@ __wt_session_add_btree(
  *	Lock a btree handle.
  */
 int
-__wt_session_lock_btree(
-    WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
+__wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags)
 {
 	WT_BTREE *btree;
-	uint32_t open_flags;
-	int ret;
+	uint32_t special_flags;
 
 	btree = session->btree;
-	ret = 0;
+
+	/*
+	 * Special operation flags will cause the handle to be reopened.
+	 * For example, a handle opened with WT_BTREE_BULK cannot use the same
+	 * internal data structures as a handle opened for ordinary access.
+	 */
+	special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
+	WT_ASSERT(session, special_flags == 0 || LF_ISSET(WT_BTREE_EXCLUSIVE));
 
 	if (LF_ISSET(WT_BTREE_EXCLUSIVE)) {
 		/*
@@ -50,23 +55,35 @@ __wt_session_lock_btree(
 		 * trees to be mixed with ordinary cursor access, but if there
 		 * is a use case in the future, we could make blocking here
 		 * configurable.
+		 *
+		 * Special flags will cause the handle to be reopened, which
+		 * will get the necessary lock, so don't bother here.
 		 */
-		WT_RET(__wt_try_writelock(session, btree->rwlock));
-
-		/*
-		 * Reopen the handle for this operation to set any special
-		 * flags.  For example, set WT_BTREE_BULK so the handle is
-		 * closed correctly.
-		 */
-		open_flags = LF_ISSET(WT_BTREE_BULK |
-		    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY);
-		if (open_flags != 0)
-			ret = __wt_conn_btree_reopen(session, cfg, open_flags);
-		F_SET(btree, WT_BTREE_EXCLUSIVE);
-	} else if (!LF_ISSET(WT_BTREE_NO_LOCK))
+		if (LF_ISSET(WT_BTREE_LOCK_ONLY) || special_flags == 0) {
+			WT_RET(__wt_try_writelock(session, btree->rwlock));
+			F_SET(btree, WT_BTREE_EXCLUSIVE);
+		}
+	} else
 		__wt_readlock(session, btree->rwlock);
 
-	return (ret);
+	/*
+	 * At this point, we have the requested lock -- if that is all that was
+	 * required, we're done.  Otherwise, check that the handle is open and
+	 * that no special flags are required.
+	 */
+	if (LF_ISSET(WT_BTREE_LOCK_ONLY) ||
+	    (F_ISSET(btree, WT_BTREE_OPEN) && special_flags == 0))
+		return (0);
+
+	/*
+	 * The handle needs to be opened.  If we locked the handle above,
+	 * unlock it before returning.
+	 */
+	if (!LF_ISSET(WT_BTREE_EXCLUSIVE) || special_flags == 0)
+		__wt_rwunlock(session, btree->rwlock);
+
+	/* Treat an unopened handle just like a non-existent handle. */
+	return (WT_NOTFOUND);
 }
 
 /*
@@ -77,142 +94,151 @@ int
 __wt_session_release_btree(WT_SESSION_IMPL *session)
 {
 	WT_BTREE *btree;
-	int ret;
+	WT_DECL_RET;
 
 	btree = session->btree;
-	ret = 0;
+
+	/* If the tree is being created, it is already locked and tracked. */
+	if (btree == session->created_btree)
+		return (0);
 
 	/*
-	 * If we had exclusive access, reopen the tree without special flags so
-	 * that other threads can use it (note the reopen call sets the flags).
+	 * If we had special flags set, close the handle so that future access
+	 * can get a handle without special flags.
 	 */
-	if (F_ISSET(btree, WT_BTREE_BULK |
-	    WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+	if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
 		WT_ASSERT(session, F_ISSET(btree, WT_BTREE_EXCLUSIVE));
-		ret = __wt_conn_btree_reopen(session, NULL, 0);
+
+		ret = __wt_conn_btree_sync_and_close(session);
 	}
 
 	if (F_ISSET(btree, WT_BTREE_EXCLUSIVE))
 		F_CLR(btree, WT_BTREE_EXCLUSIVE);
 
 	__wt_rwunlock(session, btree->rwlock);
+	session->btree = NULL;
 
 	return (ret);
 }
 
 /*
- * __wt_session_find_btree --
- *	Find an open btree handle for the named table.
+ * __wt_session_get_btree --
+ *	Get a btree handle for the given name, set session->btree.
  */
 int
-__wt_session_find_btree(WT_SESSION_IMPL *session,
-    const char *filename, size_t namelen, const char *cfg[], uint32_t flags,
-    WT_BTREE_SESSION **btree_sessionp)
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+    const char *uri, const char *cfg[], uint32_t flags)
 {
 	WT_BTREE *btree;
 	WT_BTREE_SESSION *btree_session;
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	const char *snapshot;
+	size_t snaplen;
+
+	btree = NULL;
+
+	/* Is this a snapshot operation? */
+	if (!LF_ISSET(WT_BTREE_SNAPSHOT_OP) && cfg != NULL &&
+	    __wt_config_gets(session, cfg, "snapshot", &cval) == 0 &&
+	    cval.len != 0) {
+		snapshot = cval.str;
+		snaplen = cval.len;
+	} else {
+		snapshot = NULL;
+		snaplen = 0;
+	}
 
 	TAILQ_FOREACH(btree_session, &session->btrees, q) {
 		btree = btree_session->btree;
-		if (strncmp(filename, btree->filename, namelen) == 0 &&
-		    btree->filename[namelen] == '\0') {
-			if (btree_sessionp != NULL)
-				*btree_sessionp = btree_session;
-			session->btree = btree;
-			return (__wt_session_lock_btree(session, cfg, flags));
-		}
+		if (strcmp(uri, btree->name) != 0)
+			continue;
+		if ((snapshot == NULL && btree->snapshot == NULL) ||
+		    (snapshot != NULL && btree->snapshot != NULL &&
+		    (strncmp(snapshot, btree->snapshot, snaplen) == 0 &&
+		    btree->snapshot[snaplen] == '\0')))
+			break;
 	}
 
-	return (WT_NOTFOUND);
-}
-
-/*
- * __wt_session_get_btree --
- *	Get a btree handle for the given name, set session->btree.
- */
-int
-__wt_session_get_btree(WT_SESSION_IMPL *session,
-    const char *name, const char *fileuri, const char *tconfig,
-    const char *cfg[], uint32_t flags)
-{
-	WT_BTREE_SESSION *btree_session;
-	const char *filename, *treeconf;
-	int exist, ret;
-
-	filename = fileuri;
-	if (!WT_PREFIX_SKIP(filename, "file:"))
-		WT_RET_MSG(
-		    session, EINVAL, "Expected a 'file:' URI: %s", fileuri);
-
-	if ((ret = __wt_session_find_btree(session,
-	    filename, strlen(filename), cfg, flags, &btree_session)) == 0) {
-		WT_ASSERT(session, btree_session->btree != NULL);
-		session->btree = btree_session->btree;
-		return (0);
+	if (btree_session == NULL)
+		session->btree = NULL;
+	else {
+		session->btree = btree;
+		/*
+		 * If the tree is being created, it is already locked and
+		 * tracked.
+		 */
+		if (btree == session->created_btree)
+			return (0);
+
+		if ((ret =
+		    __wt_session_lock_btree(session, flags)) != WT_NOTFOUND) {
+			WT_ASSERT(session, ret != 0 ||
+			    LF_ISSET(WT_BTREE_EXCLUSIVE) ==
+			    F_ISSET(session->btree, WT_BTREE_EXCLUSIVE));
+			return (ret);
+		}
+		ret = 0;
 	}
-	if (ret != WT_NOTFOUND)
-		return (ret);
 
-	WT_RET(__wt_exist(session, filename, &exist));
-	if (!exist)
-		return (WT_NOTFOUND);
+	WT_RET(__wt_conn_btree_get(session, uri, snapshot, cfg, flags));
 
-	/*
-	 * A fixed configuration is passed in for special files, such
-	 * as the schema table itself.
-	 */
-	if (tconfig != NULL)
-		WT_RET(__wt_strdup(session, tconfig, &treeconf));
-	else
-		WT_RET(__wt_schema_table_read(session, fileuri, &treeconf));
-	WT_RET(__wt_conn_btree_open(
-	    session, name, filename, treeconf, cfg, flags));
-	WT_RET(__wt_session_lock_btree(session, cfg, flags));
-	WT_RET(__wt_session_add_btree(session, NULL));
+	if (btree_session == NULL)
+		WT_RET(__wt_session_add_btree(session, NULL));
+
+	WT_ASSERT(session, LF_ISSET(WT_BTREE_LOCK_ONLY) ||
+	    F_ISSET(session->btree, WT_BTREE_OPEN));
+	WT_ASSERT(session, LF_ISSET(WT_BTREE_EXCLUSIVE) ==
+	    F_ISSET(session->btree, WT_BTREE_EXCLUSIVE));
 
 	return (0);
 }
 
 /*
- * __wt_session_remove_btree --
- *	Discard our reference to the btree.
+ * __wt_session_lock_snapshot --
+ *	Lock the btree handle for the given snapshot name.
  */
 int
-__wt_session_remove_btree(
-    WT_SESSION_IMPL *session, WT_BTREE_SESSION *btree_session, int locked)
+__wt_session_lock_snapshot(
+    WT_SESSION_IMPL *session, const char *snapshot, uint32_t flags)
 {
-	TAILQ_REMOVE(&session->btrees, btree_session, q);
-	session->btree = btree_session->btree;
-	__wt_free(session, btree_session);
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_ITEM *buf;
+	const char *cfg[] = { NULL, NULL };
 
-	return (__wt_conn_btree_close(session, locked));
+	buf = NULL;
+	btree = session->btree;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &buf));
+	WT_ERR(__wt_buf_fmt(session, buf, "snapshot=\"%s\"", snapshot));
+	cfg[0] = buf->data;
+
+	LF_SET(WT_BTREE_LOCK_ONLY);
+	WT_ERR(__wt_session_get_btree(session, btree->name, cfg, flags));
+
+	WT_ASSERT(session, WT_META_TRACKING(session));
+	WT_ERR(__wt_meta_track_handle_lock(session));
+
+	/* Restore the original btree in the session. */
+err:	session->btree = btree;
+	__wt_scr_free(&buf);
+
+	return (ret);
 }
 
 /*
- * __wt_session_close_any_open_btree --
- *	If open, close the btree handle.
+ * __wt_session_discard_btree --
+ *	Discard our reference to the btree.
  */
 int
-__wt_session_close_any_open_btree(WT_SESSION_IMPL *session, const char *name)
+__wt_session_discard_btree(
+    WT_SESSION_IMPL *session, WT_BTREE_SESSION *btree_session)
 {
-	WT_BTREE_SESSION *btree_session;
-	int ret;
+	TAILQ_REMOVE(&session->btrees, btree_session, q);
 
-	if ((ret = __wt_session_find_btree(session, name, strlen(name),
-	    NULL, WT_BTREE_EXCLUSIVE, &btree_session)) == 0) {
-		/*
-		 * XXX
-		 * We have an exclusive lock, which means there are no cursors
-		 * open but some other thread may have the handle cached.
-		 * Fixing this will mean adding additional synchronization to
-		 * the cursor open path.
-		 */
-		WT_ASSERT(session, btree_session->btree->refcnt == 1);
-		__wt_schema_detach_tree(session, btree_session->btree);
-		ret = __wt_session_remove_btree(session, btree_session, 1);
-		__wt_rwunlock(session, session->btree->rwlock);
-	} else if (ret == WT_NOTFOUND)
-		ret = 0;
+	session->btree = btree_session->btree;
+	__wt_overwrite_and_free(session, btree_session);
 
-	return (ret);
+	return (__wt_conn_btree_close(session, 0));
 }
diff --git a/src/session/session_root.c b/src/session/session_root.c
deleted file mode 100644
index 5d7d8b76540..00000000000
--- a/src/session/session_root.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- *	All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-static int __btree_get_root(
-	WT_SESSION_IMPL *, const char *, const char **, int *, int *);
-static int __btree_get_turtle(WT_SESSION_IMPL *, const char **, int *, int *);
-static int __btree_parse_root(
-	WT_SESSION_IMPL *, const char *, const char **, int *, int *);
-static int __btree_set_root(WT_SESSION_IMPL *, const char *, WT_ITEM *);
-static int __btree_set_turtle(WT_SESSION_IMPL *, WT_ITEM *);
-
-#define	WT_TURTLE_MSG		"The turtle."
-
-#define	WT_SCHEMA_TURTLE	"WiredTiger.turtle"	/* Schema root page */
-#define	WT_SCHEMA_TURTLE_SET	"WiredTiger.turtle.set"	/* Schema root prep */
-
-/*
- * __wt_btree_get_root --
- *	Get the file's root address.
- */
-int
-__wt_btree_get_root(WT_SESSION_IMPL *session, WT_ITEM *addr)
-{
-	WT_BTREE *btree;
-	int majorv, minorv, ret;
-	const char *v;
-
-	btree = session->btree;
-	v = NULL;
-	ret = 0;
-
-	/* If there's no root address, return a NULL with a size of 0. */
-	addr->data = NULL;
-	addr->size = 0;
-
-	/*
-	 * If we don't find a file, we're creating a new one, at the current
-	 * version.
-	 */
-	majorv = WT_BTREE_MAJOR_VERSION;
-	minorv = WT_BTREE_MINOR_VERSION;
-
-	/* Get the root address and major/minor numbers. */
-	WT_ERR(strcmp(btree->filename, WT_SCHEMA_FILENAME) == 0 ?
-	    __btree_get_turtle(session, &v, &majorv, &minorv) :
-	    __btree_get_root(session, btree->filename, &v, &majorv, &minorv));
-
-	if (majorv > WT_BTREE_MAJOR_VERSION ||
-	    (majorv == WT_BTREE_MAJOR_VERSION &&
-	    minorv > WT_BTREE_MINOR_VERSION))
-		WT_ERR_MSG(session, EACCES,
-		    "%s is an unsupported version of a WiredTiger file",
-		    btree->filename);
-
-	if (v != NULL && strlen(v) != 0 && strcmp(v, WT_NOADDR) != 0)
-		WT_ERR(__wt_hex_to_raw(session, v, addr));
-
-err:	if (ret != 0)
-		__wt_errx(session,
-		    "unable to find %s file's root address", btree->filename);
-
-	__wt_free(session, v);
-	return (ret);
-}
-
-/*
- * __wt_btree_free_root --
- *	Free the file's root address.
- */
-int
-__wt_btree_free_root(WT_SESSION_IMPL *session)
-{
-	WT_ITEM *addr, *as;
-	WT_BTREE *btree;
-	int ret;
-
-	btree = session->btree;
-	addr = as = NULL;
-
-	WT_RET(__wt_scr_alloc(session, WT_BM_MAX_ADDR_COOKIE, &addr));
-	WT_ERR(__wt_btree_get_root(session, addr));
-	if (addr->data != NULL) {
-		WT_RET(__wt_scr_alloc(session, 0, &as));
-		WT_VERBOSE(session, verify, "free %s root %s",
-		    btree->filename,
-		    __wt_addr_string(session, as, addr->data, addr->size));
-
-		WT_ERR(__wt_bm_free(session, addr->data, addr->size));
-	}
-
-err:	__wt_scr_free(&addr);
-	__wt_scr_free(&as);
-	return (ret);
-}
-
-/*
- * __wt_btree_set_root --
- *	Set the file's root address.
- */
-int
-__wt_btree_set_root(WT_SESSION_IMPL *session,
-    const char *filename, const uint8_t *addr, uint32_t size)
-{
-	WT_ITEM *v;
-	int ret;
-
-	v = NULL;
-	ret = 0;
-
-	/*
-	 * Every bytes is encoded as 2 bytes, plus a trailing nul byte,
-	 * and it needs to hold the "no address" string.
-	 */
-	WT_RET(__wt_scr_alloc(
-	    session, size * 2 + 1 + WT_STORE_SIZE(strlen(WT_NOADDR)), &v));
-
-	WT_VERBOSE(session, verify, "set %s root %s",
-	    filename, __wt_addr_string(session, v, addr, size));
-
-	/*
-	 * We're not using the WT_ITEM as a buffer going forward, but fill
-	 * in the values anyway, just for safety.
-	 */
-	if (addr == NULL) {
-		v->data = WT_NOADDR;
-		v->size = WT_STORE_SIZE(strlen(WT_NOADDR)) + 1;
-	} else
-		WT_ERR(__wt_raw_to_hex(session, addr, size, v));
-
-	WT_ERR(strcmp(filename, WT_SCHEMA_FILENAME) == 0 ?
-	    __btree_set_turtle(session, v) :
-	    __btree_set_root(session, filename, v));
-
-err:	/*
-	 * If we are unrolling a failed create, we may have already removed
-	 * the schema table entry.  If no entry is found to update and we're
-	 * trying to clear the root, just ignore it.
-	 */
-	if (ret == WT_NOTFOUND && addr == NULL)
-		ret = 0;
-	if (ret != 0)
-		__wt_errx(session,
-		    "unable to update %s file's root address", filename);
-
-	__wt_scr_free(&v);
-	return (ret);
-}
-
-/*
- * __btree_get_turtle --
- *	Get the schema file's root address.
- */
-static int
-__btree_get_turtle(
-    WT_SESSION_IMPL *session, const char **vp, int *majorp, int *minorp)
-{
-	FILE *fp;
-	int ret;
-	const char *path;
-	char line[1024], *p;
-
-	*vp = NULL;
-
-	fp = NULL;
-	ret = 0;
-	path = NULL;
-
-	WT_RET(__wt_filename(session, WT_SCHEMA_TURTLE, &path));
-	if ((fp = fopen(path, "r")) == NULL)
-		goto done;
-	while (fgets(line, (int)sizeof(line), fp) != NULL) {
-		if ((p = strchr(line, '\n')) == NULL)
-			break;
-		*p = '\0';
-		if (strcmp(line, WT_TURTLE_MSG) == 0)
-			continue;
-
-		WT_ERR(__btree_parse_root(session, line, vp, majorp, minorp));
-		goto done;
-	}
-
-	if (ferror(fp))
-		ret = __wt_errno();
-err:	if (ret == 0)
-		ret = WT_ERROR;
-	__wt_errx(session, "the %s file is corrupted", path);
-
-done:	if (fp != NULL)
-		WT_TRET(fclose(fp));
-	__wt_free(session, path);
-
-	return (ret);
-}
-
-/*
- * __btree_set_turtle --
- *	Set the schema file's root address.
- */
-static int
-__btree_set_turtle(WT_SESSION_IMPL *session, WT_ITEM *v)
-{
-	WT_ITEM *buf;
-	FILE *fp;
-	size_t len;
-	int ret;
-	const char *path;
-
-	buf = NULL;
-	ret = 0;
-	path = NULL;
-
-	WT_ERR(__wt_filename(session, WT_SCHEMA_TURTLE_SET, &path));
-	WT_ERR_TEST((fp = fopen(path, "w")) == NULL, WT_ERROR);
-
-	WT_RET(__wt_scr_alloc(session, 0, &buf));
-	WT_ERR(__wt_buf_fmt(session, buf,
-	    "%s\n"
-	    "root=%.*s,version=(major=%d,minor=%d)\n", WT_TURTLE_MSG,
-	    (int)v->size, (const char *)v->data,
-	    WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
-	len = (size_t)fprintf(fp, "%s", (char *)buf->data);
-	if (len != buf->size)
-		ret = WT_ERROR;
-
-	WT_TRET(fflush(fp));
-	WT_TRET(fclose(fp));
-
-	if (ret == 0)
-		ret = __wt_rename(
-		    session, WT_SCHEMA_TURTLE_SET, WT_SCHEMA_TURTLE);
-	else
-		(void)__wt_remove(session, WT_SCHEMA_TURTLE_SET);
-
-err:	if (path != NULL)
-		__wt_free(session, path);
-	__wt_scr_free(&buf);
-	return (ret);
-}
-
-/*
- * __btree_parse_root --
- *	Parse a btree config string to extract the version
- */
-static int
-__btree_parse_root(WT_SESSION_IMPL *session,
-    const char *config, const char **vp, int *majorp, int *minorp)
-{
-	WT_CONFIG_ITEM subv, v;
-
-	*vp = NULL;
-	*majorp = *minorp = 0;
-
-	WT_RET(__wt_config_getones(session, config, "version", &v));
-	WT_RET(__wt_config_subgets(session, &v, "major", &subv));
-	*majorp = (int)subv.val;
-
-	WT_RET(__wt_config_subgets(session, &v, "minor", &subv));
-	*minorp = (int)subv.val;
-
-	WT_RET(__wt_config_getones(session, config, "root", &v));
-	if (v.len > 0)
-		WT_RET(__wt_strndup(session, v.str, v.len, vp));
-
-	return (0);
-}
-
-/*
- * __btree_get_root --
- *	Parse a btree config string to extract the version
- */
-static int
-__btree_get_root(WT_SESSION_IMPL *session,
-    const char *filename, const char **vp, int *majorp, int *minorp)
-{
-	WT_ITEM *key;
-	const char *config;
-	int ret;
-
-	config = NULL;
-	key = NULL;
-	*vp = NULL;
-	*majorp = *minorp = 0;
-
-	WT_ERR(__wt_scr_alloc(session, 0, &key));
-	WT_ERR(__wt_buf_fmt(session, key, "file:%s", filename));
-	WT_ERR(__wt_schema_table_read(session, key->data, &config));
-	WT_ERR(__btree_parse_root(session, config, vp, majorp, minorp));
-
-err:	__wt_scr_free(&key);
-	__wt_free(session, config);
-	return (ret == WT_NOTFOUND ? 0 : ret);
-}
-
-/*
- * __btree_set_root --
- *	Set a non-schema file's root address.
- */
-static int
-__btree_set_root(WT_SESSION_IMPL *session, const char *filename, WT_ITEM *v)
-{
-	WT_ITEM *key, *newv;
-	const char *cfg[3], *newcfg;
-	int ret;
-
-	key = newv = NULL;
-	cfg[0] = newcfg = NULL;
-
-	WT_ERR(__wt_scr_alloc(session, 0, &key));
-	WT_ERR(__wt_buf_fmt(session, key, "file:%s", filename));
-	WT_ERR(__wt_schema_table_read(session, key->data, &cfg[0]));
-	WT_ERR(__wt_scr_alloc(session, 0, &newv));
-	WT_ERR(__wt_buf_fmt(session, newv, "root=%.*s",
-	    (int)v->size, (const char *)v->data));
-	cfg[1] = newv->data;
-	cfg[2] = NULL;
-	WT_ERR(__wt_config_collapse(session, cfg, &newcfg));
-	WT_ERR(__wt_schema_table_update(session, key->data, newcfg));
-
-err:	__wt_scr_free(&key);
-	__wt_scr_free(&newv);
-	__wt_free(session, cfg[0]);
-	__wt_free(session, newcfg);
-	return (ret);
-}
diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c
new file mode 100644
index 00000000000..87fb2d36854
--- /dev/null
+++ b/src/session/session_salvage.c
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_salvage --
+ *	Salvage the file behind session->btree, then reset its snapshot list.
+ */
+int
+__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_SNAPSHOT *snapbase;
+
+	btree = session->btree;
+
+	/*
+	 * XXX
+	 * The salvage process reads and discards previous snapshot blocks, so
+	 * the underlying block manager has to ignore any previous snapshot
+	 * entries when creating a new snapshot, in other words, we can't use
+	 * the metadata snapshot list, it has all of those snapshots listed and
+	 * we don't care about them.  Build a clean snapshot array and use it
+	 * instead: two slots, only the first is named and flagged WT_SNAP_ADD.
+	 *
+	 * Don't first clear the metadata snapshot list and call the snapshot
+	 * get routine: a crash between clearing the metadata snapshot list and
+	 * creating a new snapshot list would look like a create or open of a
+	 * file without a snapshot from which to roll-forward, and the contents
+	 * of the file would be discarded.
+	 */
+	WT_RET(__wt_calloc_def(session, 2, &snapbase));
+	WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snapbase[0].name));
+	F_SET(&snapbase[0], WT_SNAP_ADD);
+
+	WT_ERR(__wt_bt_salvage(session, snapbase, cfg));
+
+	/*
+	 * If no snapshot was created, well, it's probably bad news, but there
+	 * is nothing to do but clear any recorded snapshots for the file.  If
+	 * a snapshot was created, life is good, replace any recorded snapshots
+	 * with the new one.
+	 */
+	if (snapbase[0].raw.data == NULL)
+		WT_ERR(__wt_meta_snapshot_clear(session, btree->name));
+	else
+		WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase));
+
+err:	__wt_meta_snaplist_free(session, snapbase);
+	return (ret);
+}
diff --git a/src/session/session_snapshot.c b/src/session/session_snapshot.c
new file mode 100644
index 00000000000..131c37a3a2d
--- /dev/null
+++ b/src/session/session_snapshot.c
@@ -0,0 +1,256 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+typedef enum {
+	SNAPSHOT,			/* Create (possibly named) snapshot */
+	SNAPSHOT_DROP,			/* Drop named snapshot */
+	SNAPSHOT_DROP_ALL,		/* Drop all snapshots */
+	SNAPSHOT_DROP_FROM,		/* Drop named and later snapshots */
+	SNAPSHOT_DROP_TO		/* Drop named and earlier snapshots */
+} snapshot_op;
+
+static int __snapshot_worker(WT_SESSION_IMPL *, const char *, int, snapshot_op);
+
+/*
+ * __wt_snapshot --
+ *	Snapshot the tree, using the configured snapshot name if there is one.
+ */
+int
+__wt_snapshot(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	char *name;
+
+	name = NULL;
+
+	/* This may be a named snapshot, check the configuration. */
+	if ((ret = __wt_config_gets(
+	    session, cfg, "snapshot", &cval)) != 0 && ret != WT_NOTFOUND)
+		WT_RET(ret);
+	/* Only read cval if the lookup succeeded, it may be unset otherwise. */
+	if (ret == 0 && cval.len != 0)
+		WT_RET(__wt_strndup(session, cval.str, cval.len, &name));
+
+	ret = __snapshot_worker(session, name, 0, SNAPSHOT);
+	__wt_free(session, name);
+	return (ret);
+}
+
+/*
+ * __wt_snapshot_close --
+ *	Snapshot the tree at handle close: unnamed, with the discard flag set.
+ */
+int
+__wt_snapshot_close(WT_SESSION_IMPL *session)
+{
+	return (__snapshot_worker(session, NULL, 1, SNAPSHOT));
+}
+
+/*
+ * __wt_snapshot_drop --
+ *	Snapshot the tree, dropping snapshots by name or by all/from/to keys.
+ */
+int
+__wt_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONFIG_ITEM cval, sval;
+	WT_DECL_RET;
+	char *name;
+
+	name = NULL;
+	/* A plain value names one snapshot; a struct holds all, from or to. */
+	WT_RET(__wt_config_gets(session, cfg, "snapshot", &cval));
+	if (cval.type != ITEM_STRUCT) {
+		WT_RET(__wt_strndup(session, cval.str, cval.len, &name));
+		ret = __snapshot_worker(session, name, 0, SNAPSHOT_DROP);
+	} else if (__wt_config_subgets(session, &cval, "all", &sval) == 0 &&
+	    sval.val != 0)
+		ret = __snapshot_worker(session, name, 0, SNAPSHOT_DROP_ALL);
+	else if (__wt_config_subgets(session, &cval, "from", &sval) == 0 &&
+	    sval.len != 0) {
+		WT_RET(__wt_strndup(session, sval.str, sval.len, &name));
+		ret = __snapshot_worker(session, name, 0, SNAPSHOT_DROP_FROM);
+	} else if (__wt_config_subgets(session, &cval, "to", &sval) == 0 &&
+	    sval.len != 0) {
+		WT_RET(__wt_strndup(session, sval.str, sval.len, &name));
+		ret = __snapshot_worker(session, name, 0, SNAPSHOT_DROP_TO);
+	} else
+		WT_RET_MSG(session, EINVAL,
+		    "Unexpected value for 'snapshot' key: %.*s",
+		    (int)cval.len, cval.str);
+
+	__wt_free(session, name);
+	return (ret);
+}
+
+/*
+ * __snapshot_worker --
+ *	Shared create/drop path: mark snapshots, flush cache, update metadata.
+ */
+static int
+__snapshot_worker(
+    WT_SESSION_IMPL *session, const char *name, int discard, snapshot_op op)
+{
+	WT_BTREE *btree;
+	WT_DECL_RET;
+	WT_SNAPSHOT *deleted, *snap, *snapbase;
+	int force, matched;
+
+	btree = session->btree;
+	matched = 0;
+	snap = snapbase = NULL;
+
+	/* Snapshots are single-threaded. */
+	__wt_writelock(session, btree->snaplock);
+
+	/* Use the default name if none given; otherwise require a snapshot. */
+	if (op == SNAPSHOT && name == NULL) {
+		force = 0;
+		name = WT_INTERNAL_SNAPSHOT;
+	} else
+		force = 1;
+
+	/*
+	 * Get the list of snapshots for this file.  If there's no reference,
+	 * this file is dead.  Discard it from the cache without bothering to
+	 * write any dirty pages.
+	 */
+	if ((ret =
+	    __wt_meta_snaplist_get(session, btree->name, &snapbase)) != 0) {
+		if (ret == WT_NOTFOUND)
+			ret = __wt_bt_cache_flush(
+			    session, NULL, WT_SYNC_DISCARD_NOWRITE, 0);
+		goto err;
+	}
+
+	switch (op) {
+	case SNAPSHOT:
+		/*
+		 * Create a new, possibly named, snapshot.  Review existing
+		 * snapshots, deleting default snapshots and snapshots with
+		 * matching names, add the new snapshot entry at the end of
+		 * the list.
+		 */
+		WT_SNAPSHOT_FOREACH(snapbase, snap)
+			if (strcmp(snap->name, name) == 0 ||
+			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
+				F_SET(snap, WT_SNAP_DELETE);
+
+		WT_ERR(__wt_strdup(session, name, &snap->name));
+		F_SET(snap, WT_SNAP_ADD);
+		break;
+	case SNAPSHOT_DROP:
+		/*
+		 * Drop all snapshots with matching names.
+		 * Drop all snapshots with the default name.
+		 * Add a new snapshot with the default name.
+		 */
+		WT_SNAPSHOT_FOREACH(snapbase, snap) {
+			/*
+			 * There should be only one snapshot with a matching
+			 * name, but it doesn't hurt to check the rest.
+			 */
+			if (strcmp(snap->name, name) == 0)
+				matched = 1;
+			else if (strcmp(snap->name, WT_INTERNAL_SNAPSHOT) != 0)
+				continue;
+			F_SET(snap, WT_SNAP_DELETE);
+		}
+		if (!matched)
+			goto nomatch;
+
+		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
+		F_SET(snap, WT_SNAP_ADD);
+		break;
+	case SNAPSHOT_DROP_ALL:
+		/*
+		 * Drop all snapshots.
+		 * Add a new snapshot with the default name.
+		 */
+		WT_SNAPSHOT_FOREACH(snapbase, snap)
+			F_SET(snap, WT_SNAP_DELETE);
+
+		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
+		F_SET(snap, WT_SNAP_ADD);
+		break;
+	case SNAPSHOT_DROP_FROM:
+		/*
+		 * Drop all snapshots after, and including, the named snapshot.
+		 * Drop all snapshots with the default name.
+		 * Add a new snapshot with the default name.
+		 */
+		WT_SNAPSHOT_FOREACH(snapbase, snap) {
+			if (strcmp(snap->name, name) == 0)
+				matched = 1;
+			if (matched ||
+			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
+				F_SET(snap, WT_SNAP_DELETE);
+		}
+		if (!matched)
+			goto nomatch;
+
+		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
+		F_SET(snap, WT_SNAP_ADD);
+		break;
+	case SNAPSHOT_DROP_TO:
+		/*
+		 * Drop all snapshots before, and including, the named snapshot.
+		 * Drop all snapshots with the default name.
+		 * Add a new snapshot with the default name.
+		 */
+		WT_SNAPSHOT_FOREACH(snapbase, snap) {
+			if (!matched ||
+			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
+				F_SET(snap, WT_SNAP_DELETE);
+			if (strcmp(snap->name, name) == 0)
+				matched = 1;
+		}
+		if (!matched)
+nomatch:		WT_ERR_MSG(session,
+			    EINVAL, "no snapshot named %s was found", name);
+
+		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
+		F_SET(snap, WT_SNAP_ADD);
+		break;
+	}
+
+	/*
+	 * Lock the snapshots that will be deleted.
+	 *
+	 * Snapshots are only locked when tracking is enabled, which covers
+	 * sync and drop operations, but not close.  The reasoning is that
+	 * there should be no access to a snapshot during close, because any
+	 * thread accessing a snapshot will also have the current file handle
+	 * open.
+	 */
+	if (WT_META_TRACKING(session))
+		WT_SNAPSHOT_FOREACH(snapbase, deleted)
+			if (F_ISSET(deleted, WT_SNAP_DELETE))
+				WT_ERR(__wt_session_lock_snapshot(session,
+				    deleted->name, WT_BTREE_EXCLUSIVE));
+
+	WT_ERR(__wt_bt_cache_flush(
+	    session, snapbase, discard ? WT_SYNC_DISCARD : WT_SYNC, force));
+
+	/* If a snapshot was created, update the metadata. */
+	if (snap->raw.data == NULL) {
+		if (force)
+			WT_ERR_MSG(session,
+			    EINVAL, "cache flush failed to create a snapshot");
+	} else {
+		WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase));
+		WT_ERR(__wt_bm_snapshot_resolve(session, snapbase));
+	}
+
+err:	__wt_meta_snaplist_free(session, snapbase);
+	__wt_rwunlock(session, btree->snaplock);
+
+	return (ret);
+}
diff --git a/src/support/err.c b/src/support/err.c
index aac2f8ab3ea..53ff89829f2 100644
--- a/src/support/err.c
+++ b/src/support/err.c
@@ -8,19 +8,125 @@
 #include "wt_internal.h"
 
 /*
- * __wt_eventv --
- * 	Report a message to an event handler.
+ * __handle_error_default --
+ *	Default WT_EVENT_HANDLER->handle_error implementation: send to stderr.
+ */
+static int
+__handle_error_default(WT_EVENT_HANDLER *handler, int error, const char *errmsg)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(error);
+	/* A negative fprintf return means the write failed: report EIO. */
+	return (fprintf(stderr, "%s\n", errmsg) >= 0 ? 0 : EIO);
+}
+
+/*
+ * __handle_message_default --
+ *	Default WT_EVENT_HANDLER->handle_message implementation: send to stdout.
+ */
+static int
+__handle_message_default(WT_EVENT_HANDLER *handler, const char *message)
+{
+	WT_UNUSED(handler);
+	/* A negative printf return means the write failed: report EIO. */
+	return (printf("%s\n", message) >= 0 ? 0 : EIO);
+}
+
+/*
+ * __handle_progress_default --
+ *	Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
+ */
+static int
+__handle_progress_default(
+    WT_EVENT_HANDLER *handler, const char *operation, uint64_t progress)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(operation);
+	WT_UNUSED(progress);
+	/* Progress reports are dropped; always report success. */
+	return (0);
+}
+
+static WT_EVENT_HANDLER __event_handler_default = {
+	__handle_error_default,		/* Errors: written to stderr */
+	__handle_message_default,	/* Messages: written to stdout */
+	__handle_progress_default	/* Progress: ignored */
+};
+
+/*
+ * __handler_failure --
+ *	Report the failure of an application-configured event handler.
+ */
+static void
+__handler_failure(WT_SESSION_IMPL *session,
+    int error, const char *which, int error_handler_failed)
+{
+	WT_EVENT_HANDLER *handler;
+
+	/*
+	 * !!!
+	 * SECURITY:
+	 * Buffer placed at the end of the stack in case snprintf overflows.
+	 */
+	char s[256];
+
+	(void)snprintf(s, sizeof(s),
+	    "application %s event handler failed: %s",
+	    which, wiredtiger_strerror(error));
+
+	/*
+	 * Use the error handler to report the failure, unless it was the error
+	 * handler that failed.  If it was the error handler that failed, or a
+	 * call to the error handler fails, use the default error handler.
+	 */
+	handler = session->event_handler;
+	if (!error_handler_failed &&
+	    handler->handle_error != __handle_error_default &&
+	    handler->handle_error(handler, error, s) == 0)
+		return;
+	/* Fall back to the default handler, it ignores the NULL handler. */
+	(void)__handle_error_default(NULL, error, s);
+}
+
+/*
+ * __wt_event_handler_set --
+ *	Set an event handler, fill in any NULL methods with the defaults.
  */
 void
-__wt_eventv(WT_SESSION_IMPL *session, int msg_event,
-    int error,
+__wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler)
+{
+	if (handler == NULL)
+		handler = &__event_handler_default;
+	else {
+		if (handler->handle_error == NULL)
+			handler->handle_error = __handle_error_default;
+		if (handler->handle_message == NULL)
+			handler->handle_message = __handle_message_default;
+		if (handler->handle_progress == NULL)
+			handler->handle_progress = __handle_progress_default;
+	}
+
+	session->event_handler = handler;
+}
+
+/*
+ * __eventv --
+ * 	Report a message to an event handler.
+ */
+static int
+__eventv(WT_SESSION_IMPL *session, int msg_event, int error,
     const char *file_name, int line_number, const char *fmt, va_list ap)
 {
 	WT_EVENT_HANDLER *handler;
-	const char *prefix1, *prefix2;
+	WT_DECL_RET;
+	size_t len, remain, wlen;
+	const char *err, *prefix1, *prefix2;
 	char *end, *p;
 
 	/*
+	 * We're using a stack buffer because we want error messages no matter
+	 * what, and allocating a WT_ITEM, or the memory it needs, might fail.
+	 *
 	 * !!!
 	 * SECURITY:
 	 * Buffer placed at the end of the stack in case snprintf overflows.
@@ -33,27 +139,74 @@ __wt_eventv(WT_SESSION_IMPL *session, int msg_event,
 	prefix1 = (session->btree != NULL) ? session->btree->name : NULL;
 	prefix2 = session->name;
 
-	if (prefix1 != NULL && prefix2 != NULL && p < end)
-		p += snprintf(p, (size_t)(end - p),
-		    "%s [%s]: ", prefix1, prefix2);
-	else if (prefix1 != NULL && p < end)
-		p += snprintf(p, (size_t)(end - p), "%s: ", prefix1);
-	else if (prefix2 != NULL && p < end)
-		p += snprintf(p, (size_t)(end - p), "%s: ", prefix2);
-	if (file_name != NULL && p < end)
-		p += snprintf(p, (size_t)(end - p),
-		    "%s, %d: ", file_name, line_number);
-	if (p < end)
-		p += vsnprintf(p, (size_t)(end - p), fmt, ap);
-	if (error != 0 && p < end)
-		p += snprintf(p,
-		    (size_t)(end - p), ": %s", wiredtiger_strerror(error));
+	remain = WT_PTRDIFF(end, p);
+	if (prefix1 != NULL && prefix2 != NULL)
+		wlen =
+		    (size_t)snprintf(p, remain, "%s [%s]: ", prefix1, prefix2);
+	else if (prefix1 != NULL)
+		wlen = (size_t)snprintf(p, remain, "%s: ", prefix1);
+	else if (prefix2 != NULL)
+		wlen = (size_t)snprintf(p, remain, "%s: ", prefix2);
+	else
+		wlen = 0;
+	p = wlen >= remain ? end : p + wlen;
+
+	if (file_name != NULL) {
+		remain = WT_PTRDIFF(end, p);
+		wlen = (size_t)
+		    snprintf(p, remain, "%s, %d: ", file_name, line_number);
+		p = wlen >= remain ? end : p + wlen;
+	}
+
+	remain = WT_PTRDIFF(end, p);
+	wlen = (size_t)vsnprintf(p, remain, fmt, ap);
+	p = wlen >= remain ? end : p + wlen;
+
+	if (error != 0) {
+		/*
+		 * When the engine calls __wt_err on error, it often outputs an
+		 * error message including the string associated with the error
+		 * it's returning.  We could change the calls to call __wt_errx,
+		 * but it's simpler to not append an error string if all we are
+		 * doing is duplicating an existing error string.
+		 *
+		 * Use strcmp to compare: both strings are nul-terminated, and
+		 * we don't want to run past the end of the buffer.
+		 */
+		err = wiredtiger_strerror(error);
+		len = strlen(err);
+		if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) {
+			remain = WT_PTRDIFF(end, p);
+			(void)snprintf(p, remain, ": %s", err);
+		}
+	}
 
+	/*
+	 * If a handler fails, return the error status: if we're in the process
+	 * of handling an error, any return value we provide will be ignored by
+	 * our caller, our caller presumably already has an error value it will
+	 * be returning.
+	 *
+	 * If an application-specified or default informational message handler
+	 * fails, complain using the application-specified or default error
+	 * handler.
+	 *
+	 * If an application-specified error message handler fails, complain
+	 * using the default error handler.  If the default error handler fails,
+	 * there's nothing to do.
+	 */
 	handler = session->event_handler;
-	if (msg_event)
-		(void)handler->handle_message(handler, s);
-	else
-		handler->handle_error(handler, error, s);
+	if (msg_event) {
+		ret = handler->handle_message(handler, s);
+		if (ret != 0)
+			__handler_failure(session, ret, "message", 0);
+	} else {
+		ret = handler->handle_error(handler, error, s);
+		if (ret != 0 && handler->handle_error != __handle_error_default)
+			__handler_failure(session, ret, "error", 1);
+	}
+
+	return (ret);
 }
 
 /*
@@ -66,8 +219,12 @@ __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
 {
 	va_list ap;
 
+	/*
+	 * Ignore error returns from underlying event handlers, we already have
+	 * an error value to return.
+	 */
 	va_start(ap, fmt);
-	__wt_eventv(session, 0, error, NULL, 0, fmt, ap);
+	(void)__eventv(session, 0, error, NULL, 0, fmt, ap);
 	va_end(ap);
 }
 
@@ -81,17 +238,48 @@ __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
 {
 	va_list ap;
 
+	/*
+	 * Ignore error returns from underlying event handlers, we already have
+	 * an error value to return.
+	 */
 	va_start(ap, fmt);
-	__wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+	(void)__eventv(session, 0, 0, NULL, 0, fmt, ap);
 	va_end(ap);
 }
 
 /*
- * __wt_msg_call --
- *	Pass a message to an event handler.
+ * __wt_verrx --
+ *	Interface to support the extension API.
  */
-void
-__wt_msgv(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+int
+__wt_verrx(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+	return (__eventv(session, 0, 0, NULL, 0, fmt, ap));
+}
+/*
+ * __wt_msg --
+ *	Informational message: printf-style wrapper around __wt_vmsg.
+ */
+int
+__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+	WT_DECL_RET;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = __wt_vmsg(session, fmt, ap);
+	va_end(ap);
+
+	return (ret);
+}
+
+/*
+ * __wt_vmsg --
+ * 	Informational message.
+ */
+int
+__wt_vmsg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
 {
 	WT_EVENT_HANDLER *handler;
 
@@ -105,37 +293,42 @@ __wt_msgv(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
 	(void)vsnprintf(s, sizeof(s), fmt, ap);
 
 	handler = session->event_handler;
-	(void)handler->handle_message(handler, s);
+	return (handler->handle_message(handler, s));
 }
 
 /*
- * __wt_verbose --
- * 	Verbose message.
+ * __wt_progress --
+ *	Progress message.
  */
-void
-__wt_verbose(WT_SESSION_IMPL *session, const char *fmt, ...)
-    WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+int
+__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
 {
-	va_list ap;
+	WT_DECL_RET;
+	WT_EVENT_HANDLER *handler;
 
-	va_start(ap, fmt);
-	__wt_eventv(session, 1, 0, NULL, 0, fmt, ap);
-	va_end(ap);
+	handler = session->event_handler;
+	if (handler != NULL && handler->handle_progress != NULL)
+		if ((ret = handler->handle_progress(
+		    handler, s == NULL ? session->name : s, v)) != 0)
+			__handler_failure(session, ret, "progress", 0);
+	return (0);
 }
 
 /*
- * __wt_msg --
- * 	Report a message.
+ * __wt_verbose --
+ * 	Verbose message.
  */
-void
-__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+int
+__wt_verbose(WT_SESSION_IMPL *session, const char *fmt, ...)
     WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
 {
+	WT_DECL_RET;
 	va_list ap;
 
 	va_start(ap, fmt);
-	__wt_msgv(session, fmt, ap);
+	ret = __eventv(session, 1, 0, NULL, 0, fmt, ap);
 	va_end(ap);
+	return (ret);
 }
 
 /*
@@ -143,7 +336,7 @@ __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
  *	Assert and other unexpected failures, includes file/line information
  * for debugging.
  */
-int
+void
 __wt_assert(WT_SESSION_IMPL *session,
     int error, const char *file_name, int line_number, const char *fmt, ...)
     WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
@@ -151,14 +344,13 @@ __wt_assert(WT_SESSION_IMPL *session,
 	va_list ap;
 
 	va_start(ap, fmt);
-	__wt_eventv(session, 0, error, file_name, line_number, fmt, ap);
+	(void)__eventv(session, 0, error, file_name, line_number, fmt, ap);
 	va_end(ap);
 
 #ifdef HAVE_DIAGNOSTIC
 	__wt_abort(session);
 	/* NOTREACHED */
 #endif
-	return (error);
 }
 
 /*
@@ -166,13 +358,15 @@ __wt_assert(WT_SESSION_IMPL *session,
  *	Print a standard error message when we detect an illegal value.
  */
 int
-__wt_illegal_value(WT_SESSION_IMPL *session)
+__wt_illegal_value(WT_SESSION_IMPL *session, const char *name)
 {
 	WT_RET_MSG(session, WT_ERROR,
+	    "%s%s"
 	    "encountered an illegal file format or internal value; restart "
 	    "the system and verify the underlying files, if corruption is "
 	    "detected use the WT_SESSION salvage method or the wt utility's "
-	    "salvage command to repair the file");
+	    "salvage command to repair the file",
+	    name == NULL ? "" : name, name == NULL ? "" : " ");
 }
 
 /*
diff --git a/src/support/global.c b/src/support/global.c
index 7f78b620006..4d6420e6d14 100644
--- a/src/support/global.c
+++ b/src/support/global.c
@@ -32,7 +32,7 @@ __wt_library_init(void)
 {
 	static pthread_once_t once_control = PTHREAD_ONCE_INIT;
 	static int first = 1;
-	int ret;
+	WT_DECL_RET;
 
 	/*
 	 * Do per-process initialization once, before anything else, but only
diff --git a/src/support/hazard.c b/src/support/hazard.c
index e22d25be3c2..3846889dce2 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -16,7 +16,7 @@ static void __hazard_dump(WT_SESSION_IMPL *);
  *	Set a hazard reference.
  */
 int
-__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
 #ifdef HAVE_DIAGNOSTIC
     , const char *file, int line
 #endif
@@ -26,6 +26,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
 	WT_HAZARD *hp;
 
 	conn = S2C(session);
+	*busyp = 0;
 
 	/*
 	 * Do the dance:
@@ -60,7 +61,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
 		 */
 		if (ref->state == WT_REF_MEM ||
 		    ref->state == WT_REF_EVICT_WALK) {
-			WT_VERBOSE(session, hazard,
+			WT_VERBOSE_RET(session, hazard,
 			    "session %p hazard %p: set", session, ref->page);
 			return (0);
 		}
@@ -78,7 +79,8 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref
 		 * again until it loops around through the tree.
 		 */
 		hp->page = NULL;
-		return (EBUSY);
+		*busyp = 1;
+		return (0);
 	}
 
 	__wt_errx(session,
@@ -109,24 +111,39 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
 	 */
 	WT_ASSERT(session, page != NULL);
 
-	WT_VERBOSE(session, hazard,
-	    "session %p hazard %p: clr", session, page);
-
 	/* Clear the caller's hazard pointer. */
 	for (hp = session->hazard;
 	    hp < session->hazard + conn->hazard_size; ++hp)
 		if (hp->page == page) {
-			hp->page = NULL;
 			/*
-			 * We don't have to flush memory here for correctness;
-			 * it would give the page server thread faster access
-			 * to the block were the block selected to be evicted,
-			 * but the generation number was just set which makes
-			 * it unlikely to be selected for eviction.
+			 * Check to see if the page has grown too big and force
+			 * eviction.  We have to request eviction while holding
+			 * a hazard reference (else the page might disappear out
+			 * from under us), but we can't wake the eviction server
+			 * until we've released our hazard reference because our
+			 * hazard reference blocks the page eviction.  A little
+			 * dance: check the page, schedule the forced eviction,
+			 * clear/publish the hazard reference, wake the eviction
+			 * server.
+			 *
+			 * We don't publish the hazard reference clear in the
+			 * general case.  It's not required for correctness;
+			 * it gives the page server thread faster access to the
+			 * page were the page selected for eviction, but the
+			 * generation number was just set, so it's unlikely the
+			 * page will be selected for eviction.
 			 */
+			if (__wt_eviction_page_check(session, page)) {
+				__wt_evict_page_request(session, page);
+				WT_PUBLISH(hp->page, NULL);
+				__wt_evict_server_wake(session);
+			} else
+				hp->page = NULL;
 			return;
 		}
-	__wt_errx(session, "hazard reference not found");
+	__wt_errx(session,
+	    "clear hazard reference: session: %p reference %p: not found",
+	    session, page);
 }
 
 /*
@@ -190,26 +207,4 @@ __hazard_dump(WT_SESSION_IMPL *session)
 	if (fail)
 		__wt_errx(session, "unexpected hazard reference");
 }
-
-/*
- * __wt_hazard_validate --
- *	Confirm that a page isn't on the hazard list.
- */
-void
-__wt_hazard_validate(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
-	WT_CONNECTION_IMPL *conn;
-	WT_HAZARD *hp;
-	uint32_t elem, i;
-
-	conn = S2C(session);
-
-	elem = conn->session_size * conn->hazard_size;
-	for (i = 0, hp = conn->hazard; i < elem; ++i, ++hp)
-		if (hp->page == page)
-			__wt_errx(session,
-			    "discarded page has hazard reference: "
-			    "(%p: %s, line %d)",
-			    hp->page, hp->file, hp->line);
-}
 #endif
diff --git a/src/support/hex.c b/src/support/hex.c
index e582e9576ef..d9ee135599b 100644
--- a/src/support/hex.c
+++ b/src/support/hex.c
@@ -11,11 +11,11 @@ static const char hex[] = "0123456789abcdef";
 
 /*
  * __wt_raw_to_hex --
- *	Convert a chunk of data to a printable hex string.
+ *	Convert a chunk of data to a nul-terminated printable hex string.
  */
 int
-__wt_raw_to_hex(WT_SESSION_IMPL *session,
-    const uint8_t *from, uint32_t size, WT_ITEM *to)
+__wt_raw_to_hex(
+    WT_SESSION_IMPL *session, const uint8_t *from, uint32_t size, WT_ITEM *to)
 {
 	uint32_t i;
 	const uint8_t *p;
@@ -38,12 +38,12 @@ __wt_raw_to_hex(WT_SESSION_IMPL *session,
 
 /*
  * __wt_raw_to_esc_hex --
- *	Convert a chunk of data to an printable string using escaped hex as
- *	necessary.
+ *	Convert a chunk of data to a nul-terminated printable string using
+ * escaped hex, as necessary.
  */
 int
-__wt_raw_to_esc_hex(WT_SESSION_IMPL *session,
-    const uint8_t *from, size_t size, WT_ITEM *to)
+__wt_raw_to_esc_hex(
+    WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
 {
 	size_t i;
 	const uint8_t *p;
@@ -136,22 +136,31 @@ __hex_fmterr(WT_SESSION_IMPL *session)
 
 /*
  * __wt_hex_to_raw --
- *	Convert a printable hex string to a chunk of data.
+ *	Convert a nul-terminated printable hex string to a chunk of data.
  */
 int
 __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
 {
+	return (__wt_nhex_to_raw(session, from, strlen(from), to));
+}
+
+/*
+ * __wt_nhex_to_raw --
+ *	Convert a printable hex string to a chunk of data.
+ */
+int
+__wt_nhex_to_raw(
+    WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to)
+{
 	const char *p;
 	uint8_t *t;
-	size_t size;
 
-	size = strlen(from);
 	if (size % 2 != 0)
 		return (__hex_fmterr(session));
 
 	WT_RET(__wt_buf_init(session, to, size / 2));
 
-	for (p = from, t = to->mem; *p != '\0'; p += 2, ++t)
+	for (p = from, t = to->mem; size > 0; p += 2, size -= 2, ++t)
 		if (hex2byte(p, t))
 			return (__hex_fmterr(session));
 
@@ -161,8 +170,7 @@ __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
 
 /*
  * __wt_esc_hex_to_raw --
- *	Convert a printable string using escaped hex as necessary to a chunk
- *	of data.
+ *	Convert a printable string, encoded in escaped hex, to a chunk of data.
  */
 int
 __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
diff --git a/src/support/huffman.c b/src/support/huffman.c
index 7b66b321b9c..fa1f1499cff 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -62,7 +62,6 @@ typedef struct __wt_huffman_obj {
 	 * memory: code2symbol[1 << max_code_length]
 	 */
 	uint8_t *code2symbol;
-
 } WT_HUFFMAN_OBJ;
 
 /*
@@ -296,17 +295,16 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
 {
 	INDEXED_SYMBOL *indexed_freqs, *sym;
 	NODE_QUEUE *combined_nodes, *leaves;
+	WT_DECL_RET;
 	WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
 	WT_HUFFMAN_OBJ *huffman;
 	uint64_t w1, w2;
 	uint16_t i;
-	int ret;
 
 	indexed_freqs = symbol_frequency_array;
 
 	combined_nodes = leaves = NULL;
 	node = node2 = tempnode = NULL;
-	ret = 0;
 
 	WT_RET(__wt_calloc_def(session, 1, &huffman));
 
@@ -337,12 +335,12 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
 		if (i > 0 &&
 		    indexed_freqs[i].symbol == indexed_freqs[i - 1].symbol)
 			WT_ERR_MSG(session, EINVAL,
-			    "duplicate symbol %" PRIx16
+			    "duplicate symbol %" PRIx32
 			    " specified in a huffman table",
 			    indexed_freqs[i].symbol);
 		if (indexed_freqs[i].symbol > huffman->numSymbols)
 			WT_ERR_MSG(session, EINVAL,
-			    "illegal symbol %" PRIx16
+			    "illegal symbol %" PRIx32
 			    " specified in a huffman table",
 			    indexed_freqs[i].symbol);
 	}
@@ -590,6 +588,7 @@ int
 __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
     const uint8_t *from_arg, uint32_t from_len, WT_ITEM *to_buf)
 {
+	WT_DECL_RET;
 	WT_HUFFMAN_CODE code;
 	WT_HUFFMAN_OBJ *huffman;
 	WT_ITEM *tmp;
@@ -597,7 +596,6 @@ __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
 	uint32_t max_len, outlen, bytes;
 	const uint8_t *from;
 	uint8_t len, *out, padding_info, symbol;
-	int ret;
 
 	/*
 	 * Shift register to accumulate bits from input.
@@ -612,7 +610,6 @@ __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
 	huffman = huffman_arg;
 	from = from_arg;
 	tmp = NULL;
-	ret = 0;
 
 	/*
 	 * We don't want to find all of our callers and ensure they don't pass
@@ -729,6 +726,7 @@ int
 __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
     const uint8_t *from_arg, uint32_t from_len, WT_ITEM *to_buf)
 {
+	WT_DECL_RET;
 	WT_ITEM *tmp;
 	WT_HUFFMAN_OBJ *huffman;
 	uint64_t from_len_bits;
@@ -736,12 +734,10 @@ __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
 	uint16_t pattern;
 	const uint8_t *from;
 	uint8_t padding_info, symbol, *to, valid;
-	int ret;
 
 	huffman = huffman_arg;
 	from = from_arg;
 	tmp = NULL;
-	ret = 0;
 
 	/*
 	 * We don't want to find all of our callers and ensure they don't pass
diff --git a/src/support/scratch.c b/src/support/scratch.c
index 03e3d3a8b78..57b779bd5c2 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -248,12 +248,17 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
  *	Scratch buffer allocation function.
  */
 int
-__wt_scr_alloc(WT_SESSION_IMPL *session, uint32_t size, WT_ITEM **scratchp)
+__wt_scr_alloc_func(WT_SESSION_IMPL *session,
+    uint32_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+    , const char *file, int line
+#endif
+    )
 {
-	WT_ITEM *buf, **p, *best, **slot;
+	WT_DECL_RET;
+	WT_ITEM *buf, **p, **best, **slot;
 	size_t allocated;
 	u_int i;
-	int ret;
 
 	/* Don't risk the caller not catching the error. */
 	*scratchp = NULL;
@@ -267,7 +272,7 @@ __wt_scr_alloc(WT_SESSION_IMPL *session, uint32_t size, WT_ITEM **scratchp)
 	 *
 	 * Walk the array, looking for a buffer we can use.
 	 */
-	for (i = 0, best = NULL, slot = NULL,
+	for (i = 0, best = slot = NULL,
 	    p = session->scratch; i < session->scratch_alloc; ++i, ++p) {
 		/* If we find an empty slot, remember it. */
 		if ((buf = *p) == NULL) {
@@ -285,11 +290,13 @@ __wt_scr_alloc(WT_SESSION_IMPL *session, uint32_t size, WT_ITEM **scratchp)
 		 * or the largest buffer if none are large enough.
 		 */
 		if (best == NULL ||
-		    (best->memsize < size && buf->memsize > best->memsize) ||
-		    (buf->memsize >= size && buf->memsize < best->memsize))
-			best = buf;
+		    ((*best)->memsize < size &&
+		    buf->memsize > (*best)->memsize) ||
+		    (buf->memsize >= size && buf->memsize < (*best)->memsize))
+			best = p;
+
 		/* If we find a perfect match, use it. */
-		if (best->memsize == size)
+		if ((*best)->memsize == size)
 			break;
 	}
 
@@ -302,6 +309,12 @@ __wt_scr_alloc(WT_SESSION_IMPL *session, uint32_t size, WT_ITEM **scratchp)
 		WT_ERR(__wt_realloc(session, &allocated,
 		    (session->scratch_alloc + 10) * sizeof(WT_ITEM *),
 		    &session->scratch));
+#ifdef HAVE_DIAGNOSTIC
+		allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK);
+		WT_ERR(__wt_realloc(session, &allocated,
+		    (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK),
+		    &session->scratch_track));
+#endif
 		slot = session->scratch + session->scratch_alloc;
 		session->scratch_alloc += 10;
 	}
@@ -312,17 +325,24 @@ __wt_scr_alloc(WT_SESSION_IMPL *session, uint32_t size, WT_ITEM **scratchp)
 	 */
 	if (best == NULL) {
 		WT_ASSERT(session, slot != NULL);
-		WT_ERR(__wt_calloc_def(session, 1, slot));
-		best = *slot;
+		best = slot;
+
+		WT_ERR(__wt_calloc_def(session, 1, best));
 
 		/* Scratch buffers must be aligned. */
-		F_SET(best, WT_ITEM_ALIGNED);
+		F_SET(*best, WT_ITEM_ALIGNED);
 	}
 
 	/* Grow the buffer as necessary and return. */
-	WT_ERR(__wt_buf_init(session, best, size));
-	F_SET(best, WT_ITEM_INUSE);
-	*scratchp = best;
+	WT_ERR(__wt_buf_init(session, *best, size));
+	F_SET(*best, WT_ITEM_INUSE);
+
+#ifdef HAVE_DIAGNOSTIC
+	session->scratch_track[best - session->scratch].file = file;
+	session->scratch_track[best - session->scratch].line = line;
+#endif
+
+	*scratchp = *best;
 	return (0);
 
 err:	WT_RET_MSG(session, ret,
@@ -356,16 +376,26 @@ __wt_scr_discard(WT_SESSION_IMPL *session)
 	    bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) {
 		if (*bufp == NULL)
 			continue;
-#if 0
 		if (F_ISSET(*bufp, WT_ITEM_INUSE))
 			__wt_errx(session,
-			    "scratch buffer allocated and never discarded");
+			    "scratch buffer allocated and never discarded"
+#ifdef HAVE_DIAGNOSTIC
+			    ": %s: %d",
+			    session->
+			    scratch_track[bufp - session->scratch].file,
+			    session->
+			    scratch_track[bufp - session->scratch].line
 #endif
+			    );
+
 		__wt_buf_free(session, *bufp);
 		__wt_free(session, *bufp);
 	}
 
 	__wt_free(session, session->scratch);
+#ifdef HAVE_DIAGNOSTIC
+	__wt_free(session, session->scratch_track);
+#endif
 }
 
 /*
diff --git a/src/support/sess_dump.c b/src/support/sess_dump.c
index 962f50c532f..5fa5deb2b74 100644
--- a/src/support/sess_dump.c
+++ b/src/support/sess_dump.c
@@ -15,13 +15,17 @@
 void
 __wt_session_dump_all(WT_SESSION_IMPL *session)
 {
-	WT_SESSION_IMPL **tp;
+	WT_CONNECTION_IMPL *conn;
+	WT_SESSION_IMPL *s;
+	uint32_t i;
 
 	if (session == NULL)
 		return;
 
-	for (tp = S2C(session)->sessions; *tp != NULL; ++tp)
-		__wt_session_dump(*tp);
+	conn = S2C(session);
+	for (s = conn->sessions, i = 0; i < conn->session_size; ++s, ++i)
+		if (s->active)
+			__wt_session_dump(s);
 }
 
 /*
@@ -38,15 +42,15 @@ __wt_session_dump(WT_SESSION_IMPL *session)
 
 	conn = S2C(session);
 
-	__wt_msg(session, "session: %s%s%p",
+	(void)__wt_msg(session, "session: %s%s%p",
 	    session->name == NULL ? "" : session->name,
 	    session->name == NULL ? "" : " ", session);
 
 	first = 0;
 	TAILQ_FOREACH(cursor, &session->cursors, q) {
 		if (++first == 1)
-			__wt_msg(session, "\tcursors:");
-		__wt_msg(session, "\t\t%p", cursor);
+			(void)__wt_msg(session, "\tcursors:");
+		(void)__wt_msg(session, "\t\t%p", cursor);
 	}
 
 	first = 0;
@@ -55,12 +59,12 @@ __wt_session_dump(WT_SESSION_IMPL *session)
 		if (hp->page == NULL)
 			continue;
 		if (++first == 1)
-			__wt_msg(session, "\thazard references:");
+			(void)__wt_msg(session, "\thazard references:");
 #ifdef HAVE_DIAGNOSTIC
-		__wt_msg(session,
+		(void)__wt_msg(session,
 		    "\t\t%p (%s, line %d)", hp->page, hp->file, hp->line);
 #else
-		__wt_msg(session, "\t\t%p", hp->page);
+		(void)__wt_msg(session, "\t\t%p", hp->page);
 #endif
 	}
 }
diff --git a/src/support/stat.c b/src/support/stat.c
index bbd13b8b839..5741be441e8 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -28,9 +28,6 @@ __wt_stat_alloc_btree_stats(WT_SESSION_IMPL *session, WT_BTREE_STATS **statsp)
 	    "column-store variable-size leaf pages";
 	stats->file_entries.desc = "total entries";
 	stats->file_fixed_len.desc = "fixed-record size";
-	stats->file_freelist_bytes.desc = "number of bytes in the freelist";
-	stats->file_freelist_entries.desc =
-	    "number of entries in the freelist";
 	stats->file_magic.desc = "magic number";
 	stats->file_major.desc = "major version number";
 	stats->file_maxintlitem.desc = "maximum internal page item size";
@@ -42,6 +39,7 @@ __wt_stat_alloc_btree_stats(WT_SESSION_IMPL *session, WT_BTREE_STATS **statsp)
 	stats->file_row_int_pages.desc = "row-store internal pages";
 	stats->file_row_leaf_pages.desc = "row-store leaf pages";
 	stats->file_size.desc = "file: size";
+	stats->file_write_conflicts.desc = "write generation conflicts";
 	stats->free.desc = "file: block frees";
 	stats->overflow_read.desc = "file: overflow pages read from the file";
 	stats->page_read.desc = "file: pages read from the file";
@@ -85,8 +83,6 @@ __wt_stat_clear_btree_stats(WT_STATS *stats_arg)
 	stats->file_col_var_pages.v = 0;
 	stats->file_entries.v = 0;
 	stats->file_fixed_len.v = 0;
-	stats->file_freelist_bytes.v = 0;
-	stats->file_freelist_entries.v = 0;
 	stats->file_magic.v = 0;
 	stats->file_major.v = 0;
 	stats->file_maxintlitem.v = 0;
@@ -98,6 +94,7 @@ __wt_stat_clear_btree_stats(WT_STATS *stats_arg)
 	stats->file_row_int_pages.v = 0;
 	stats->file_row_leaf_pages.v = 0;
 	stats->file_size.v = 0;
+	stats->file_write_conflicts.v = 0;
 	stats->free.v = 0;
 	stats->overflow_read.v = 0;
 	stats->page_read.v = 0;
diff --git a/src/txn/interleave.py b/src/txn/interleave.py
deleted file mode 100755
index 75d2ff47a67..00000000000
--- a/src/txn/interleave.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2008-2012 WiredTiger, Inc.
-#
-# This is free and unencumbered software released into the public domain.
-#
-# Anyone is free to copy, modify, publish, use, compile, sell, or
-# distribute this software, either in source code form or as a compiled
-# binary, for any purpose, commercial or non-commercial, and by any
-# means.
-#
-# In jurisdictions that recognize copyright laws, the author or authors
-# of this software dedicate any and all copyright interest in the
-# software to the public domain. We make this dedication for the benefit
-# of the public at large and to the detriment of our heirs and
-# successors. We intend this dedication to be an overt act of
-# relinquishment in perpetuity of all present and future rights to this
-# software under copyright law.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-#
-
-# identical transactions
-t1 = ["r1(x)", "w1(x)"]
-t2 = ["r2(x)", "w2(x)"]
-t3 = ["r3(x)", "w3(x)"]
-t4 = ["r4(x)", "w4(x)"]
-
-def interleave(T1, T2):
-    """Given lists of operations as input, return all possible interleavings"""
-    if not T1:
-        return [T2]
-    elif not T2:
-        return [T1]
-    else:
-        return [T1[0:1] + l for l in interleave(T1[1:], T2)] + [T2[0:1] + l for l in interleave(T1, T2[1:])]
-
-for l1 in interleave(t1, t2):
- for l2 in interleave(l1, t3):
-  for l in interleave(l2, t4):
-    # timestamps of item x
-    readts = 0
-    writets = 0
-    skip = False
-    failure = ''
-
-    for op in l:
-        ts = int(op[1])
-        # Check whether the operation is valid:
-        if op[0] == 'r':
-            if writets < ts and readts < ts:
-                readts = ts
-        elif op[0] == 'w':
-            if writets > ts or readts > ts:
-                if ts <= 2 and l.index("r" + str(ts+2) + "(x)") < l.index("w" + str(ts+1) + "(x)"):
-                    skip = True
-                    break
-                failure += ' ' + op
-        elif op[0] == 'c':
-            pass
-    if skip:
-        continue
-    elif failure:
-        print '%s: failed at%s' % (' '.join(l), failure)
-    else:
-        print '%s: passed' % (' '.join(l),)
diff --git a/src/txn/txn.c b/src/txn/txn.c
new file mode 100644
index 00000000000..36149170239
--- /dev/null
+++ b/src/txn/txn.c
@@ -0,0 +1,311 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_txnid_cmp --
+ *	Comparator over transaction IDs, in the form qsort expects: returns 0
+ * when the two IDs are equal, -1 when TXNID_LT orders the first before the
+ * second, and 1 otherwise.
+ */
+int
+__wt_txnid_cmp(const void *v1, const void *v2)
+{
+	const wt_txnid_t lhs = *(const wt_txnid_t *)v1;
+	const wt_txnid_t rhs = *(const wt_txnid_t *)v2;
+
+	return (lhs == rhs ? 0 : (TXNID_LT(lhs, rhs) ? -1 : 1));
+}
+
+/*
+ * __txn_sort_snapshot --
+ *	Sort the first n snapshot IDs and record the count and the min/max
+ * bounds; when the snapshot is empty, both bounds are set to id.
+ */
+static void
+__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, wt_txnid_t id)
+{
+	WT_TXN *txn = &session->txn;
+
+	qsort(txn->snapshot, n, sizeof(wt_txnid_t), __wt_txnid_cmp);
+
+	txn->snapshot_count = n;
+	txn->snap_min = (n > 0) ? txn->snapshot[0] : id;
+	txn->snap_max = (n > 0) ? txn->snapshot[n - 1] : id;
+	WT_ASSERT(session, txn->snap_min != WT_TXN_NONE);
+}
+
+/*
+ * __wt_txn_get_snapshot --
+ *	Set up a snapshot in the current transaction, without allocating an ID.
+ * If max_id is not WT_TXN_NONE, only IDs ordered before it are included.
+ */
+int
+__wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_TXN *txn;
+	WT_TXN_GLOBAL *txn_global;
+	wt_txnid_t current_id, id;
+	uint32_t i, n, session_cnt;
+
+	conn = S2C(session);
+	txn = &session->txn;
+	txn_global = &conn->txn_global;
+
+	do {
+		/* Take a copy of the current transaction ID. */
+		current_id = txn_global->current;
+
+		/* Copy the concurrent IDs, restarting at slot 0 on retry. */
+		WT_ORDERED_READ(session_cnt, conn->session_cnt);
+		for (i = n = 0; i < session_cnt; i++)
+			if ((id = txn_global->ids[i]) != WT_TXN_NONE &&
+			    (max_id == WT_TXN_NONE || TXNID_LT(id, max_id)))
+				txn->snapshot[n++] = id;
+	} while (current_id != txn_global->current);
+
+	__txn_sort_snapshot(
+	    session, n, (max_id != WT_TXN_NONE) ? max_id : current_id);
+	return (0);
+}
+
+/*
+ * __wt_txn_begin --
+ *	Begin a transaction: allocate and publish an ID and, for snapshot
+ * isolation, record the concurrent IDs (rescanned whenever the CAS fails).
+ */
+int
+__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONFIG_ITEM cval;
+	WT_CONNECTION_IMPL *conn;
+	WT_TXN *txn;
+	WT_TXN_GLOBAL *txn_global;
+	wt_txnid_t id;
+	uint32_t i, n, session_cnt;
+
+	conn = S2C(session);
+	n = 0;
+	txn = &session->txn;
+	txn_global = &conn->txn_global;
+
+	if (F_ISSET(txn, TXN_RUNNING))
+		WT_RET_MSG(session, EINVAL, "Transaction already running");
+
+	WT_RET(__wt_config_gets(session, cfg, "isolation", &cval));
+	txn->isolation = (strcmp(cval.str, "snapshot") == 0) ?
+	    TXN_ISO_SNAPSHOT : TXN_ISO_READ_UNCOMMITTED;
+
+	WT_ASSERT(session, txn->id == WT_TXN_NONE);
+	WT_ASSERT(session, txn_global->ids[session->id] == WT_TXN_NONE);
+
+	do {
+		/* Take a copy of the current transaction ID. */
+		txn->id = txn_global->current;
+		WT_PUBLISH(txn_global->ids[session->id], txn->id);
+
+		if (txn->isolation == TXN_ISO_SNAPSHOT) {
+			/* Copy concurrent IDs, from slot 0 on every pass. */
+			WT_ORDERED_READ(session_cnt, conn->session_cnt);
+			for (i = n = 0; i < session_cnt; i++)
+				if ((id = txn_global->ids[i]) != WT_TXN_NONE &&
+				    TXNID_LT(id, txn->id))
+					txn->snapshot[n++] = id;
+		}
+	} while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1) ||
+	    txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
+
+	if (txn->isolation == TXN_ISO_SNAPSHOT)
+		__txn_sort_snapshot(session, n, txn->id);
+
+	F_SET(txn, TXN_RUNNING);
+	return (0);
+}
+
+/*
+ * __txn_release --
+ *	Release the resources associated with the current transaction: clear
+ * the session's published ID and the transaction's running/error flags.
+ */
+static int
+__txn_release(WT_SESSION_IMPL *session)
+{
+	WT_TXN *txn = &session->txn;
+	WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+
+	/* The modification count is reset even if the call is rejected. */
+	txn->mod_count = 0;
+	if (!F_ISSET(txn, TXN_RUNNING))
+		WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+	WT_ASSERT(session, txn_global->ids[session->id] != WT_TXN_NONE &&
+	    txn->id != WT_TXN_NONE);
+	/* Clear the private copy of the ID and the published slot together. */
+	WT_PUBLISH(txn_global->ids[session->id], txn->id = WT_TXN_NONE);
+	F_CLR(txn, TXN_ERROR | TXN_RUNNING);
+
+	return (0);
+}
+
+/*
+ * __wt_txn_commit --
+ *	Commit the current transaction by releasing it; cfg is not consulted.
+ */
+int
+__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_UNUSED(cfg);
+
+	return (__txn_release(session));
+}
+
+/*
+ * __wt_txn_rollback --
+ *	Roll back the current transaction: overwrite every ID referenced by the
+ * modification list with WT_TXN_ABORTED, then release the transaction.
+ */
+int
+__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_TXN *txn = &session->txn;
+	u_int slot;
+
+	WT_UNUSED(cfg);
+
+	/* Each mod entry points at a transaction ID; mark them all aborted. */
+	for (slot = 0; slot < txn->mod_count; ++slot)
+		*txn->mod[slot] = WT_TXN_ABORTED;
+
+	return (__txn_release(session));
+}
+
+/*
+ * __wt_txn_checkpoint --
+ *	Write a checkpoint: holding the connection's checkpoint lock, begin a
+ * snapshot-isolation transaction and apply __wt_snapshot to the open files,
+ * or to the files listed in the metadata when a "snapshot" name is
+ * configured.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+	WT_CONFIG_ITEM cval;
+	WT_DECL_RET;
+	WT_TXN_GLOBAL *txn_global;
+	const char *snapshot;
+	const char *txn_cfg[] = { "isolation=snapshot", NULL };
+
+	snapshot = NULL;
+	txn_global = &S2C(session)->txn_global;
+
+	/*
+	 * The lookup may return WT_NOTFOUND; only read cval if it succeeded.
+	 */
+	if ((ret = __wt_config_gets(
+	    session, cfg, "snapshot", &cval)) != 0 && ret != WT_NOTFOUND)
+		WT_RET(ret);
+	if (ret == 0 && cval.len != 0)
+		WT_RET(__wt_strndup(session, cval.str, cval.len, &snapshot));
+
+	/* Only one checkpoint can be active at a time. */
+	__wt_writelock(session, S2C(session)->ckpt_rwlock);
+
+	WT_ERR(__wt_txn_begin(session, txn_cfg));
+
+	/* Prevent eviction from evicting anything newer than this. */
+	txn_global->ckpt_txnid = session->txn.snap_min;
+
+	/*
+	 * If we're doing an ordinary unnamed checkpoint, we only need to flush
+	 * open files.  If we're creating a named snapshot, we need to walk the
+	 * entire list of files in the metadata.
+	 */
+	WT_TRET((snapshot == NULL) ?
+	    __wt_conn_btree_apply(session, __wt_snapshot, cfg) :
+	    __wt_meta_btree_apply(session, __wt_snapshot, cfg, 0));
+
+	txn_global->ckpt_txnid = WT_TXN_NONE;
+
+	WT_TRET(__txn_release(session));
+
+err:	__wt_rwunlock(session, S2C(session)->ckpt_rwlock);
+	__wt_free(session, snapshot);
+	return (ret);
+}
+
+/*
+ * __wt_txn_init --
+ *	Initialize a session's transaction data: no ID is assigned, and the
+ * snapshot array gets one slot per entry in the connection's session array.
+ */
+int
+__wt_txn_init(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn = S2C(session);
+	WT_TXN *txn = &session->txn;
+
+	txn->id = WT_TXN_NONE;
+
+	/* The array is released again by __wt_txn_destroy. */
+	WT_RET(__wt_calloc_def(session, conn->session_size, &txn->snapshot));
+	return (0);
+}
+
+/*
+ * __wt_txn_destroy --
+ *	Destroy a session's transaction data: free the snapshot array that
+ * __wt_txn_init allocated.
+ */
+void
+__wt_txn_destroy(WT_SESSION_IMPL *session)
+{
+	WT_TXN *txn = &session->txn;
+
+	__wt_free(session, txn->snapshot);
+}
+
+/*
+ * __wt_txn_global_init --
+ *	Initialize the global transaction state: the ID counter starts at 1, no
+ * checkpoint transaction ID is set, and every per-session ID slot is empty.
+ */
+int
+__wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+	WT_TXN_GLOBAL *txn_global = &conn->txn_global;
+	u_int slot;
+
+	WT_UNUSED(cfg);
+
+	txn_global->current = 1;
+	txn_global->ckpt_txnid = WT_TXN_NONE;
+
+	/* Allocate one ID slot per session slot, then mark them all unused. */
+	WT_RET(__wt_calloc_def(
+	    conn->default_session, conn->session_size, &txn_global->ids));
+	for (slot = 0; slot < conn->session_size; ++slot)
+		txn_global->ids[slot] = WT_TXN_NONE;
+	return (0);
+}
+
+/*
+ * __wt_txn_global_destroy --
+ *	Destroy the global transaction state.  The per-session ID array is
+ * freed through the connection's default session.
+ */
+void
+__wt_txn_global_destroy(WT_CONNECTION_IMPL *conn)
+{
+	WT_SESSION_IMPL *session = conn->default_session;
+	WT_TXN_GLOBAL *txn_global = &conn->txn_global;
+
+	/* The ID array is the only allocation __wt_txn_global_init makes. */
+	__wt_free(session, txn_global->ids);
+
+}
diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c
index 106a49d3537..750777a7c3a 100644
--- a/src/utilities/util_create.c
+++ b/src/utilities/util_create.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_create(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	const char *config, *uri;
 
 	config = NULL;
diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c
index efd69c98841..3e6536deb5c 100644
--- a/src/utilities/util_drop.c
+++ b/src/utilities/util_drop.c
@@ -12,11 +12,19 @@ static int usage(void);
 int
 util_drop(WT_SESSION *session, int argc, char *argv[])
 {
+	size_t len;
+	WT_DECL_RET;
 	int ch;
-	const char *name;
+	const char *snapshot;
+	char *name, *config;
 
-	while ((ch = util_getopt(argc, argv, "")) != EOF)
+	config = NULL;
+	snapshot = NULL;
+	while ((ch = util_getopt(argc, argv, "s:")) != EOF)
 		switch (ch) {
+		case 's':
+			snapshot = util_optarg;
+			break;
 		case '?':
 		default:
 			return (usage());
@@ -31,7 +39,25 @@ util_drop(WT_SESSION *session, int argc, char *argv[])
 	if ((name = util_name(*argv, "table", UTIL_ALL_OK)) == NULL)
 		return (1);
 
-	return (session->drop(session, name, "force"));
+	if (snapshot == NULL)
+		ret = session->drop(session, name, "force");
+	else {
+		len = strlen(snapshot) +
+		    strlen("snapshot=") + strlen("force") + 10;
+		if ((config = malloc(len)) == NULL)
+			goto err;
+		(void)snprintf(config, len, "snapshot=%s,force", snapshot);
+		ret = session->drop(session, name, config);
+	}
+
+	if (0) {
+err:		ret = 1;
+	}
+	if (config != NULL)
+		free(config);
+	if (name != NULL)
+		free(name);
+	return (ret);
 }
 
 static int
@@ -39,7 +65,7 @@ usage(void)
 {
 	(void)fprintf(stderr,
 	    "usage: %s %s "
-	    "drop uri\n",
+	    "drop [-s snapshot] uri\n",
 	    progname, usage_prefix);
 	return (1);
 }
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index 37a26089f7c..6f918b51774 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -7,19 +7,19 @@
 
 #include "util.h"
 
+static int dump_config(WT_SESSION *, const char *);
+static int dump_file_config(WT_SESSION *, const char *);
 static int dump_prefix(int);
 static int dump_suffix(void);
+static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
 static int print_config(WT_SESSION *, const char *, const char *, const char *);
-static int schema(WT_SESSION *, const char *);
-static int schema_file(WT_SESSION *, WT_CURSOR *, const char *);
-static int schema_table(WT_SESSION *, WT_CURSOR *, const char *);
 static int usage(void);
 
 static inline int
 dump_forward(WT_CURSOR *cursor, const char *name)
 {
+	WT_DECL_RET;
 	const char *key, *value;
-	int ret;
 
 	while ((ret = cursor->next(cursor)) == 0) {
 		if ((ret = cursor->get_key(cursor, &key)) != 0)
@@ -35,8 +35,8 @@ dump_forward(WT_CURSOR *cursor, const char *name)
 static inline int
 dump_reverse(WT_CURSOR *cursor, const char *name)
 {
+	WT_DECL_RET;
 	const char *key, *value;
-	int ret;
 
 	while ((ret = cursor->prev(cursor)) == 0) {
 		if ((ret = cursor->get_key(cursor, &key)) != 0)
@@ -53,12 +53,14 @@ int
 util_dump(WT_SESSION *session, int argc, char *argv[])
 {
 	WT_CURSOR *cursor;
-	int ch, hex, ret, reverse;
-	char *name;
+	WT_DECL_RET;
+	size_t len;
+	int ch, hex, reverse;
+	char *config, *name, *snapshot;
 
 	hex = reverse = 0;
-	name = NULL;
-	while ((ch = util_getopt(argc, argv, "f:rx")) != EOF)
+	config = name = snapshot = NULL;
+	while ((ch = util_getopt(argc, argv, "f:rs:x")) != EOF)
 		switch (ch) {
 		case 'f':			/* output file */
 			if (freopen(util_optarg, "w", stdout) == NULL)
@@ -68,6 +70,9 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
 		case 'r':
 			reverse = 1;
 			break;
+		case 's':
+			snapshot = util_optarg;
+			break;
 		case 'x':
 			hex = 1;
 			break;
@@ -86,12 +91,24 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
 		goto err;
 
 	if (dump_prefix(hex) != 0 ||
-	    schema(session, name) != 0 ||
+	    dump_config(session, name) != 0 ||
 	    dump_suffix() != 0)
 		goto err;
 
-	if ((ret = session->open_cursor(session,
-	    name, NULL, hex ? "dump=hex" : "dump=print", &cursor)) != 0) {
+	len = snapshot == NULL ? 0 : strlen("snapshot=") + strlen(snapshot);
+	len += strlen(hex ? "dump=hex" : "dump=print");
+	if ((config = malloc(len + 10)) == NULL)
+		goto err;
+	if (snapshot == NULL)
+		config[0] = '\0';
+	else {
+		(void)strcpy(config, "snapshot=");
+		(void)strcat(config, snapshot);
+		(void)strcat(config, ",");
+	}
+	(void)strcat(config, hex ? "dump=hex" : "dump=print");
+	if ((ret = session->open_cursor(
+	    session, name, NULL, config, &cursor)) != 0) {
 		fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
 		    progname, name, wiredtiger_strerror(ret));
 		goto err;
@@ -106,6 +123,8 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
 err:		ret = 1;
 	}
 
+	if (config != NULL)
+		free(config);
 	if (name != NULL)
 		free(name);
 
@@ -113,54 +132,53 @@ err:		ret = 1;
 }
 
 /*
- * schema --
- *	Dump the schema for the uri.
+ * dump_config --
+ *	Dump the config for the uri.
  */
 static int
-schema(WT_SESSION *session, const char *uri)
+dump_config(WT_SESSION *session, const char *uri)
 {
 	WT_CURSOR *cursor;
-	int ret, tret;
-
-	ret = 0;
-
-	/* Open the schema file. */
-	if ((ret = session->open_cursor(
-	    session, WT_SCHEMA_URI, NULL, NULL, &cursor)) != 0) {
-		fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
-		    progname, WT_SCHEMA_URI, wiredtiger_strerror(ret));
-		return (1);
-	}
+	WT_DECL_RET;
+	int tret;
+
+	/* Dump the config. */
+	if (strncmp(uri, "table:", strlen("table:")) == 0) {
+		/* Open a metadata cursor. */
+		if ((ret = session->open_cursor(
+		    session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+			fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+			    progname,
+			    WT_METADATA_URI, wiredtiger_strerror(ret));
+			return (1);
+		}
 
-	/* Dump the schema. */
-	if (strncmp(uri, "table:", strlen("table:")) == 0)
-		ret = schema_table(session, cursor, uri);
-	else
-		ret = schema_file(session, cursor, uri);
+		ret = dump_table_config(session, cursor, uri);
 
-	if ((tret = cursor->close(cursor)) != 0 && ret == 0)
-		ret = tret;
+		if ((tret = cursor->close(cursor)) != 0 && ret == 0)
+			ret = tret;
+	} else
+		ret = dump_file_config(session, uri);
 
 	return (ret);
 }
 
 /*
- * schema_table --
- *	Dump the schema for a table.
+ * dump_table_config --
+ *	Dump the config for a table.
  */
 static int
-schema_table(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 {
 	struct {
-		char *key;			/* Schema key */
-		char *value;			/* Schema value */
+		char *key;			/* Metadata key */
+		char *value;			/* Metadata value */
 	} *list;
-	int i, elem, list_elem, ret;
+	WT_DECL_RET;
+	int i, elem, list_elem;
 	const char *key, *name, *value;
 	char *buf, *filename, *p, *t, *sep;
 
-	ret = 0;
-
 	/* Get the name. */
 	if ((name = strchr(uri, ':')) == NULL) {
 		fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
@@ -176,7 +194,7 @@ schema_table(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 			return (util_cerr(uri, "get_key", ret));
 		if ((buf = strdup(key)) == NULL)
 			return (util_err(errno, NULL));
-			
+
 		/* Check for the dump table's column groups or indices. */
 		if ((p = strchr(buf, ':')) == NULL)
 			continue;
@@ -206,7 +224,7 @@ schema_table(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 	ret = 0;
 
 	/*
-	 * Dump out the schema information: first, dump the uri entry itself
+	 * Dump out the config information: first, dump the uri entry itself
 	 * (requires a lookup).
 	 */
 	cursor->set_key(cursor, uri);
@@ -248,7 +266,7 @@ schema_table(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 		cursor->set_key(cursor, p);
 		if ((ret = cursor->search(cursor)) != 0) {
 			fprintf(stderr,
-			    "%s: %s: unable to find schema reference for the "
+			    "%s: %s: unable to find metadata for the "
 			    "underlying file %s\n",
 			    progname, list[i].key, p);
 			return (1);
@@ -270,25 +288,26 @@ schema_table(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
 }
 
 /*
- * schema_file --
- *	Dump the schema for a file.
+ * dump_file_config --
+ *	Dump the config for a file.
  */
 static int
-schema_file(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+dump_file_config(WT_SESSION *session, const char *uri)
 {
-	const char *key, *value;
-	int ret;
+	WT_DECL_RET;
+	const char *value;
 
-	ret = 0;
+	/*
+	 * We want to be able to dump the metadata file itself, but the
+	 * configuration for that file lives in the turtle file.  Reach
+	 * down into the library and ask for the file's configuration,
+	 * that will work in all cases.
+	 */
+	if ((ret = __wt_metadata_get(session, uri, &value)) != 0)
+		return (util_err(ret, "metadata read: %s", uri));
 
-	cursor->set_key(cursor, uri);
-	if ((ret = cursor->search(cursor)) != 0)
-		return (util_cerr(uri, "search", ret));
-	if ((ret = cursor->get_key(cursor, &key)) != 0)
-		return (util_cerr(uri, "get_key", ret));
-	if ((ret = cursor->get_value(cursor, &value)) != 0)
-		return (util_cerr(uri, "get_value", ret));
-	return (print_config(session, key, value, NULL));
+	/* Leak the memory, I don't care. */
+	return (print_config(session, uri, value, NULL));
 }
 
 /*
@@ -331,8 +350,8 @@ static int
 print_config(WT_SESSION *session,
     const char *key, const char *v1, const char *v2)
 {
+	WT_DECL_RET;
 	const char *value_ret;
-	int ret;
 
 	/*
 	 * The underlying call will ignore v2 if v1 is NULL -- check here and
@@ -357,7 +376,7 @@ usage(void)
 {
 	(void)fprintf(stderr,
 	    "usage: %s %s "
-	    "dump [-rx] [-f output-file] uri\n",
+	    "dump [-rx] [-f output-file] [-s snapshot] uri\n",
 	    progname, usage_prefix);
 	return (1);
 }
diff --git a/src/utilities/util_dumpfile.c b/src/utilities/util_dumpfile.c
index c415bf9f1ba..2edbbf88322 100644
--- a/src/utilities/util_dumpfile.c
+++ b/src/utilities/util_dumpfile.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_dumpfile(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	char *name;
 
 	name = NULL;
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index 4fbf2f76542..3cf3a988427 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -7,16 +7,27 @@
 
 #include "util.h"
 
-static int list_print(WT_SESSION *);
+static int list_print(WT_SESSION *, const char *, int, int);
+static int list_print_snapshot(WT_SESSION *, const char *);
 static int usage(void);
 
 int
 util_list(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch;
+	WT_DECL_RET;
+	int ch, sflag, vflag;
+	char *name;
 
-	while ((ch = util_getopt(argc, argv, "")) != EOF)
+	sflag = vflag = 0;
+	name = NULL;
+	while ((ch = util_getopt(argc, argv, "sv")) != EOF)
 		switch (ch) {
+		case 's':
+			sflag = 1;
+			break;
+		case 'v':
+			vflag = 1;
+			break;
 		case '?':
 		default:
 			return (usage());
@@ -24,10 +35,24 @@ util_list(WT_SESSION *session, int argc, char *argv[])
 	argc -= util_optind;
 	argv += util_optind;
 
-	if (argc != 0)
+	switch (argc) {
+	case 0:
+		break;
+	case 1:
+		if ((name = util_name(
+		    *argv, "table", UTIL_FILE_OK | UTIL_TABLE_OK)) == NULL)
+			return (1);
+		break;
+	default:
 		return (usage());
+	}
+
+	ret = list_print(session, name, sflag, vflag);
 
-	return (list_print(session));
+	if (name != NULL)
+		free(name);
+
+	return (ret);
 }
 
 /*
@@ -35,44 +60,141 @@ util_list(WT_SESSION *session, int argc, char *argv[])
  *	List the high-level objects in the database.
  */
 static int
-list_print(WT_SESSION *session)
+list_print(WT_SESSION *session, const char *name, int sflag, int vflag)
 {
 	WT_CURSOR *cursor;
-	int ret;
-	const char *key;
+	WT_DECL_RET;
+	int found;
+	const char *key, *value, *uri;
 
-	ret = 0;
+	/*
+	 * XXX
+	 * Normally, we don't say anything about the WiredTiger metadata file,
+	 * it's not an "object" in the database.  I'm making an exception for
+	 * -s and -v, the snapshot and verbose options.
+	 */
+	if (sflag || vflag) {
+		uri = WT_METADATA_URI;
+		printf("%s\n", uri);
+		if (sflag && (ret = list_print_snapshot(session, uri)) != 0)
+			return (ret);
+		if (vflag) {
+			if ((ret =
+			    __wt_metadata_get(session, uri, &value)) != 0)
+				return (
+				    util_err(ret, "metadata read: %s", uri));
+			printf("%s\n", value);
+		}
+	}
 
-	/* Open the schema file. */
+	/* Open the metadata file. */
 	if ((ret = session->open_cursor(
-	    session, WT_SCHEMA_URI, NULL, NULL, &cursor)) != 0) {
+	    session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
 		/*
-		 * If there is no schema (yet), this will return ENOENT.
-		 * Treat that the same as an empty schema.
+		 * If there is no metadata (yet), this will return ENOENT.
+		 * Treat that the same as an empty metadata.
 		 */
 		if (ret == ENOENT)
 			return (0);
 
 		fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
-		    progname, WT_SCHEMA_URI, wiredtiger_strerror(ret));
+		    progname, WT_METADATA_URI, wiredtiger_strerror(ret));
 		return (1);
 	}
 
 #define	MATCH(s, tag)							\
 	(strncmp(s, tag, strlen(tag)) == 0)
 
+	found = name == NULL;
 	while ((ret = cursor->next(cursor)) == 0) {
 		/* Get the key. */
 		if ((ret = cursor->get_key(cursor, &key)) != 0)
-			return (util_cerr("schema", "get_key", ret));
-			
-		/* All we care about are top-level objects (files or tables). */
-		if (MATCH(key, "table:") || MATCH(key, "file:"))
-			printf("%s\n", key);
+			return (util_cerr("metadata", "get_key", ret));
+
+		/*
+		 * If no object specified, show top-level objects (files and
+		 * tables).
+		 */
+		if (name == NULL) {
+			if (!MATCH(key, "file:") && !MATCH(key, "table:"))
+				continue;
+		} else {
+			if (!MATCH(key, name))
+				continue;
+			found = 1;
+		}
+		printf("%s\n", key);
+		if (!sflag && !vflag)
+			continue;
+
+		if (sflag && (ret = list_print_snapshot(session, key)) != 0)
+			return (ret);
+		if (vflag) {
+			if ((ret = cursor->get_value(cursor, &value)) != 0)
+				return (
+				    util_cerr("metadata", "get_value", ret));
+			printf("%s\n", value);
+		}
 	}
 	if (ret != WT_NOTFOUND)
-		return (util_cerr("schema", "next", ret));
+		return (util_cerr("metadata", "next", ret));
+	if (!found) {
+		fprintf(stderr, "%s: %s: not found\n", progname, name);
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * list_print_snapshot --
+ *	List the snapshot information.
+ */
+static int
+list_print_snapshot(WT_SESSION *session, const char *key)
+{
+	WT_DECL_RET;
+	WT_SNAPSHOT *snap, *snapbase;
+	size_t len;
+	time_t t;
+	uint64_t v;
+	char buf[256];
+
+	/*
+	 * We may not find any snapshots for this file, in which case we don't
+	 * report an error, and continue our caller's loop.  Otherwise, report
+	 * each snapshot's name and time.
+	 */
+	if ((ret = __wt_metadata_get_snaplist(session, key, &snapbase)) != 0)
+		return (ret == WT_NOTFOUND ? 0 : ret);
+
+	/* Find the longest name, so we can pretty-print. */
+	len = 0;
+	WT_SNAPSHOT_FOREACH(snapbase, snap)
+		if (strlen(snap->name) > len)
+			len = strlen(snap->name);
+	++len;
+
+	WT_SNAPSHOT_FOREACH(snapbase, snap) {
+		t = (time_t)snap->sec;
+		printf("\t%*s: %.24s", (int)len, snap->name, ctime_r(&t, buf));
+
+		v = snap->snapshot_size;
+		if (v >= WT_PETABYTE)
+			printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE);
+		else if (v >= WT_TERABYTE)
+			printf(" (%" PRIu64 " TB)\n", v / WT_TERABYTE);
+		else if (v >= WT_GIGABYTE)
+			printf(" (%" PRIu64 " GB)\n", v / WT_GIGABYTE);
+		else if (v >= WT_MEGABYTE)
+			printf(" (%" PRIu64 " MB)\n", v / WT_MEGABYTE);
+		else if (v >= WT_KILOBYTE)
+			printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
+		else
+			printf(" (%" PRIu64 " B)\n", v);
+	}
 
+	__wt_metadata_free_snaplist(session, snapbase);
 	return (0);
 }
 
@@ -81,7 +203,7 @@ usage(void)
 {
 	(void)fprintf(stderr,
 	    "usage: %s %s "
-	    "list\n",
+	    "list [-sv] [uri]\n",
 	    progname, usage_prefix);
 	return (1);
 }
diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c
index 0d562ebfe6d..26402b7f0a7 100644
--- a/src/utilities/util_load.c
+++ b/src/utilities/util_load.c
@@ -10,9 +10,9 @@
 static int format(void);
 static int insert(WT_CURSOR *, const char *);
 static int load_dump(WT_SESSION *);
-static int schema_read(char ***, int *);
-static int schema_rename(char **, const char *);
-static int schema_update(WT_SESSION *, char **);
+static int config_read(char ***, int *);
+static int config_rename(char **, const char *);
+static int config_update(WT_SESSION *, char **);
 static int usage(void);
 
 static int	append;		/* -a append (ignore record number keys) */
@@ -72,14 +72,15 @@ static int
 load_dump(WT_SESSION *session)
 {
 	WT_CURSOR *cursor;
-	int hex, ret, tret;
+	WT_DECL_RET;
+	int hex, tret;
 	char **entry, **list, *p, *uri, config[64];
 
-        list = NULL;            /* -Wuninitialized */
-        hex = 0;                /* -Wuninitialized */
+	list = NULL;            /* -Wuninitialized */
+	hex = 0;                /* -Wuninitialized */
 
-	/* Read the schema file. */
-	if ((ret = schema_read(&list, &hex)) != 0)
+	/* Read the metadata file. */
+	if ((ret = config_read(&list, &hex)) != 0)
 		return (ret);
 
 	/*
@@ -111,8 +112,8 @@ load_dump(WT_SESSION *session)
 		p = list[1]; list[1] = entry[1]; entry[1] = p;
 	}
 
-	/* Update the schema based on any command-line configuration. */
-	if ((ret = schema_update(session, list)) != 0)
+	/* Update the config based on any command-line configuration. */
+	if ((ret = config_update(session, list)) != 0)
 		return (ret);
 
 	uri = list[0];
@@ -127,7 +128,7 @@ load_dump(WT_SESSION *session)
 	    append ? ",append" : "", overwrite ? ",overwrite" : "");
 	if ((ret = session->open_cursor(
 	    session, uri, NULL, config, &cursor)) != 0)
-		return(util_err(ret, "%s: session.open", uri));
+		return (util_err(ret, "%s: session.open", uri));
 
 	/*
 	 * Check the append flag (it only applies to objects where the primary
@@ -154,16 +155,16 @@ load_dump(WT_SESSION *session)
 	}
 	if (ret == 0 && (ret = session->sync(session, uri, NULL)) != 0)
 		ret = util_err(ret, "%s: session.sync", uri);
-		
+
 	return (ret == 0 ? 0 : 1);
 }
 
 /*
- * schema_read --
- *	Read the schema lines and do some basic validation.
+ * config_read --
+ *	Read the config lines and do some basic validation.
  */
 static int
-schema_read(char ***listp, int *hexp)
+config_read(char ***listp, int *hexp)
 {
 	ULINE l;
 	int entry, eof, max_entry;
@@ -222,18 +223,18 @@ schema_read(char ***listp, int *hexp)
 }
 
 /*
- * schema_update --
+ * config_update --
  *	Reconcile and update the command line configuration against the
- * schema we found.
+ * config we found.
  */
 static int
-schema_update(WT_SESSION *session, char **list)
+config_update(WT_SESSION *session, char **list)
 {
 	int found;
 	const char *cfg[] = { NULL, NULL, NULL };
 	char **configp, **listp, *p, *t;
 
-#define MATCH(s, tag)                                           	\
+#define	MATCH(s, tag)                                           	\
 	(strncmp(s, tag, strlen(tag)) == 0)
 
 	/*
@@ -246,7 +247,7 @@ schema_update(WT_SESSION *session, char **list)
 			    MATCH(*listp, "file:") ||
 			    MATCH(*listp, "index:") ||
 			    MATCH(*listp, "table:"))
-				if (schema_rename(listp, cmdname))
+				if (config_rename(listp, cmdname))
 					return (1);
 
 		/*
@@ -256,7 +257,7 @@ schema_update(WT_SESSION *session, char **list)
 		 */
 		for (configp = cmdconfig;
 		    cmdconfig != NULL && *configp != NULL; configp += 2)
-			if (schema_rename(configp, cmdname))
+			if (config_rename(configp, cmdname))
 				return (1);
 	}
 
@@ -337,11 +338,11 @@ schema_update(WT_SESSION *session, char **list)
 }
 
 /*
- * schema_rename --
+ * config_rename --
  *	Update the URI name.
  */
 static int
-schema_rename(char **urip, const char *name)
+config_rename(char **urip, const char *name)
 {
 	size_t len;
 	char *buf, *p;
@@ -383,12 +384,13 @@ static int
 insert(WT_CURSOR *cursor, const char *name)
 {
 	ULINE key, value;
+	WT_DECL_RET;
 	uint64_t insert_count;
-	int eof, ret;
+	int eof;
 
 	memset(&key, 0, sizeof(key));
 	memset(&value, 0, sizeof(value));
-	
+
 	/* Read key/value pairs and insert them into the file. */
 	for (insert_count = 0;;) {
 		/*
diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c
index 1121a02c31f..c3973c78a90 100644
--- a/src/utilities/util_loadtext.c
+++ b/src/utilities/util_loadtext.c
@@ -49,7 +49,8 @@ static int
 text(WT_SESSION *session, const char *uri)
 {
 	WT_CURSOR *cursor;
-	int readkey, ret, tret;
+	WT_DECL_RET;
+	int readkey, tret;
 
 	/*
 	 * Open the cursor, configured to append new records (in the case of
@@ -90,7 +91,7 @@ text(WT_SESSION *session, const char *uri)
 	}
 	if (ret == 0 && (ret = session->sync(session, uri, NULL)) != 0)
 		ret = util_err(ret, "%s: session.sync", uri);
-		
+
 	return (ret == 0 ? 0 : 1);
 }
 
@@ -102,12 +103,13 @@ static int
 insert(WT_CURSOR *cursor, const char *name, int readkey)
 {
 	ULINE key, value;
+	WT_DECL_RET;
 	uint64_t insert_count;
-	int eof, ret;
+	int eof;
 
 	memset(&key, 0, sizeof(key));
 	memset(&value, 0, sizeof(value));
-	
+
 	/* Read key/value pairs and insert them into the file. */
 	for (insert_count = 0;;) {
 		/*
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 7fe3015f1b9..74a6b1278a5 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -20,12 +20,12 @@ int
 main(int argc, char *argv[])
 {
 	WT_CONNECTION *conn;
+	WT_DECL_RET;
 	WT_SESSION *session;
-	int ch, major_v, minor_v, ret, tret;
+	int ch, major_v, minor_v, tret;
 	const char *config;
 
 	conn = NULL;
-	ret = 0;
 
 	/* Get the program name. */
 	if ((progname = strrchr(argv[0], '/')) == NULL)
@@ -186,7 +186,7 @@ static int
 usage(void)
 {
 	fprintf(stderr,
-	    "WiredTiger Data Engine (version %d.%d)\n", 
+	    "WiredTiger Data Engine (version %d.%d)\n",
 	    WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
 	fprintf(stderr,
 	    "global options:\n"
diff --git a/src/utilities/util_misc.c b/src/utilities/util_misc.c
index cf5f3e9c690..444ac162a74 100644
--- a/src/utilities/util_misc.c
+++ b/src/utilities/util_misc.c
@@ -57,7 +57,7 @@ util_read_line(ULINE *l, int eof_expected, int *eofp)
 					*eofp = 1;
 					return (0);
 				}
-				return (util_err(0, 
+				return (util_err(0,
 				    "line %llu: unexpected end-of-file", line));
 			}
 			return (util_err(0,
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index 300cedd9daf..e19c1ed787c 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -13,8 +13,9 @@ int
 util_printlog(WT_SESSION *session, int argc, char *argv[])
 {
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	WT_ITEM key, value;
-	int ch, printable, ret;
+	int ch, printable;
 
 	printable = 0;
 	while ((ch = util_getopt(argc, argv, "f:p")) != EOF)
diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c
index c3d10368105..0956adf06c4 100644
--- a/src/utilities/util_read.c
+++ b/src/utilities/util_read.c
@@ -13,12 +13,11 @@ int
 util_read(WT_SESSION *session, int argc, char *argv[])
 {
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	uint64_t recno;
-	int ch, rkey, ret, rval;
+	int ch, rkey, rval;
 	const char *uri, *value;
 
-	ret = 0;
-
 	while ((ch = util_getopt(argc, argv, "")) != EOF)
 		switch (ch) {
 		case '?':
@@ -87,7 +86,7 @@ util_read(WT_SESSION *session, int argc, char *argv[])
 			return (util_cerr(uri, "search", ret));
 		}
 	}
-		
+
 	return (rval);
 }
 
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index 22e5067a460..6f894e2e9c7 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_rename(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	char *uri, *newname;
 
 	uri = NULL;
diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c
index 8f61fa3a906..8821e673df1 100644
--- a/src/utilities/util_salvage.c
+++ b/src/utilities/util_salvage.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_salvage(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	const char *force;
 	char *name;
 
diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c
index 988ebba86b6..770e68a81f8 100644
--- a/src/utilities/util_stat.c
+++ b/src/utilities/util_stat.c
@@ -13,9 +13,10 @@ int
 util_stat(WT_SESSION *session, int argc, char *argv[])
 {
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	uint64_t v;
 	size_t urilen;
-	int ch, objname_free, ret;
+	int ch, objname_free;
 	const char *pval, *desc;
 	char *objname, *uri;
 
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 9f8ac6c8949..1529d0b611c 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_upgrade(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	char *name;
 
 	name = NULL;
diff --git a/src/utilities/util_verbose.c b/src/utilities/util_verbose.c
index 6e3703201e5..fcf757a9d3d 100644
--- a/src/utilities/util_verbose.c
+++ b/src/utilities/util_verbose.c
@@ -11,13 +11,13 @@
  * __handle_error_verbose --
  *	Verbose WT_EVENT_HANDLER->handle_error implementation: send to stderr.
  */
-static void
+static int
 __handle_error_verbose(WT_EVENT_HANDLER *handler, int error, const char *errmsg)
 {
 	WT_UNUSED(handler);
 	WT_UNUSED(error);
 
-	fprintf(stderr, "%s\n", errmsg);
+	return (fprintf(stderr, "%s\n", errmsg) < 0 ? EIO : 0);
 }
 
 /*
@@ -29,8 +29,7 @@ __handle_message_verbose(WT_EVENT_HANDLER *handler, const char *message)
 {
 	WT_UNUSED(handler);
 
-	(void)printf("%s\n", message);
-	return (0);
+	return (printf("%s\n", message) < 0 ? EIO : 0);
 }
 
 /*
@@ -43,8 +42,8 @@ __handle_progress_verbose(WT_EVENT_HANDLER *handler,
 {
 	WT_UNUSED(handler);
 
-	(void)printf("\r\t%s %-20" PRIu64, operation, progress);
-	return (0);
+	return (
+	    printf("\r\t%s %-20" PRIu64, operation, progress) < 0 ? EIO : 0);
 }
 
 static WT_EVENT_HANDLER __event_handler_verbose = {
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index 1efa143cedd..7ac74178217 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -12,7 +12,8 @@ static int usage(void);
 int
 util_verify(WT_SESSION *session, int argc, char *argv[])
 {
-	int ch, ret;
+	WT_DECL_RET;
+	int ch;
 	char *name;
 
 	name = NULL;
diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c
index c7bb6fb654e..95def2f652f 100644
--- a/src/utilities/util_write.c
+++ b/src/utilities/util_write.c
@@ -13,13 +13,13 @@ int
 util_write(WT_SESSION *session, int argc, char *argv[])
 {
 	WT_CURSOR *cursor;
+	WT_DECL_RET;
 	uint64_t recno;
-	int append, ch, overwrite, rkey, ret;
+	int append, ch, overwrite, rkey;
 	const char *uri;
 	char config[100];
 
-	append = overwrite = ret = 0;
-
+	append = overwrite = 0;
 	while ((ch = util_getopt(argc, argv, "ao")) != EOF)
 		switch (ch) {
 		case 'a':
@@ -93,7 +93,7 @@ util_write(WT_SESSION *session, int argc, char *argv[])
 		if ((ret = cursor->insert(cursor)) != 0)
 			return (util_cerr(uri, "search", ret));
 	}
-		
+
 	return (0);
 }
 
diff --git a/test/fops/Makefile.am b/test/fops/Makefile.am
new file mode 100644
index 00000000000..5ad18e6c29b
--- /dev/null
+++ b/test/fops/Makefile.am
@@ -0,0 +1,11 @@
+INCLUDES = -I$(top_builddir)
+
+noinst_PROGRAMS = t
+t_LDADD = $(top_builddir)/libwiredtiger.la
+t_SOURCES = thread.h file.c fops.c t.c
+t_LDFLAGS = -static
+
+TESTS = $(noinst_PROGRAMS)
+
+clean-local:
+	rm -rf WiredTiger* __* *.core
diff --git a/test/fops/file.c b/test/fops/file.c
new file mode 100644
index 00000000000..2817fefef8e
--- /dev/null
+++ b/test/fops/file.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "thread.h"
+
+/* obj_create -- create the object in its own session; EEXIST is okay. */
+void
+obj_create(void)
+{
+	WT_SESSION *session;
+	int ret;
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
+
+	ret = session->create(session, uri, NULL);
+	if (ret != 0 && ret != EEXIST)
+		die("session.create", ret);
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
+
+/* obj_drop -- drop the object in its own session; ENOENT/EBUSY are okay. */
+void
+obj_drop(void)
+{
+	WT_SESSION *session;
+	int ret;
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
+
+	ret = session->drop(session, uri, NULL);
+	if (ret != 0 && ret != ENOENT && ret != EBUSY)
+		die("session.drop", ret);
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
+
+/* obj_sync -- sync the object in its own session; ENOENT is okay. */
+void
+obj_sync(void)
+{
+	WT_SESSION *session;
+	int ret;
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
+
+	ret = session->sync(session, uri, NULL);
+	if (ret != 0 && ret != ENOENT)
+		die("session.sync", ret);
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
+
+/* obj_upgrade -- upgrade the object in its own session; ENOENT is okay. */
+void
+obj_upgrade(void)
+{
+	WT_SESSION *session;
+	int ret;
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
+
+	ret = session->upgrade(session, uri, NULL);
+	if (ret != 0 && ret != ENOENT)
+		die("session.upgrade", ret);
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
+
+/* obj_verify -- verify the object in its own session; ENOENT is okay. */
+void
+obj_verify(void)
+{
+	WT_SESSION *session;
+	int ret;
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
+
+	ret = session->verify(session, uri, NULL);
+	if (ret != 0 && ret != ENOENT)
+		die("session.verify", ret);
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
diff --git a/test/fops/fops.c b/test/fops/fops.c
new file mode 100644
index 00000000000..c1a3abb64bd
--- /dev/null
+++ b/test/fops/fops.c
@@ -0,0 +1,154 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "thread.h"
+
+static void *fop(void *);
+static void  print_stats(u_int);
+
+typedef struct {				/* Per-thread operation counts */
+	int create;				/* obj_create (session.create) */
+	int drop;				/* obj_drop (session.drop) */
+	int sync;				/* obj_sync (session.sync) */
+	int upgrade;				/* obj_upgrade (session.upgrade) */
+	int verify;				/* obj_verify (session.verify) */
+} STATS;
+
+static STATS *run_stats;
+
+/*
+ * r --
+ *	Return a 32-bit pseudo-random number (Marsaglia multiply-with-carry).
+ */
+static inline uint32_t
+r(void)
+{
+	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+	static uint32_t m_w = 0, m_z = 0;	/* Shared by all threads */
+	struct timeval t;
+	uint32_t v;
+
+	(void)pthread_mutex_lock(&lock);	/* Serialize callers */
+	if (m_w == 0) {	/* Seed from the clock once */
+		(void)gettimeofday(&t, NULL);
+		m_w = (uint32_t)t.tv_sec;
+		m_z = (uint32_t)t.tv_usec;
+	}
+	m_z = 36969 * (m_z & 65535) + (m_z >> 16);
+	m_w = 18000 * (m_w & 65535) + (m_w >> 16);
+	v = (m_z << 16) + (m_w & 65535);
+	(void)pthread_mutex_unlock(&lock);
+	return (v);
+}
+
+/*
+ * fop_start --
+ *	Run nthreads file operation threads to completion, then report.
+ */
+int
+fop_start(u_int nthreads)
+{
+	clock_t start, stop;
+	double seconds;
+	pthread_t *tids;
+	u_int i;
+	int ret;
+
+	/* Create statistics and thread structures. */
+	if ((run_stats = calloc(
+	    (size_t)(nthreads), sizeof(*run_stats))) == NULL ||
+	    (tids = calloc((size_t)(nthreads), sizeof(*tids))) == NULL)
+		die("calloc", errno);
+
+	/* Create threads. */
+	start = clock();
+	for (i = 0; i < nthreads; ++i)
+		if ((ret = pthread_create(
+		    &tids[i], NULL, fop, (void *)(uintptr_t)i)) != 0)
+			die("pthread_create", ret);
+
+	/* Wait for the threads. */
+	for (i = 0; i < nthreads; ++i)
+		(void)pthread_join(tids[i], NULL);
+	stop = clock();
+
+	/* Don't divide by zero if no measurable CPU time was used. */
+	seconds = (stop - start) / (double)CLOCKS_PER_SEC;
+	fprintf(stderr, "timer: %.2lf seconds (%d ops/second)\n", seconds,
+	    seconds > 0 ? (int)((nthreads * nops) / seconds) : 0);
+	print_stats(nthreads);
+	free(run_stats);
+	free(tids);
+	return (0);
+}
+
+/*
+ * fop --
+ *	Thread start routine: run nops randomly chosen file operations.
+ */
+static void *
+fop(void *arg)
+{
+	STATS *stats;
+	pthread_t self;
+	u_int cnt;
+	int thread_id;
+
+	thread_id = (int)(uintptr_t)arg;
+	self = pthread_self();
+	printf("file operation thread %2d starting: tid: %p\n",
+	    thread_id, (void *)self);
+	sched_yield();		/* Get all the threads created. */
+
+	/* Count each operation in this thread's statistics slot. */
+	stats = &run_stats[thread_id];
+	for (cnt = 0; cnt < nops; ++cnt, sched_yield())
+		switch (r() % 5) {
+		case 0:
+			++stats->create;
+			obj_create();
+			break;
+		case 1:
+			++stats->drop;
+			obj_drop();
+			break;
+		case 2:
+			++stats->sync;
+			obj_sync();
+			break;
+		case 3:
+			++stats->upgrade;
+			obj_upgrade();
+			break;
+		case 4:
+			++stats->verify;
+			obj_verify();
+			break;
+		default:
+			break;
+		}
+
+	return (NULL);
+}
+
+/*
+ * print_stats --
+ *	Display each thread's operation counts, one line per thread.
+ */
+static void
+print_stats(u_int nthreads)
+{
+	STATS *s;
+	u_int id;
+
+	s = run_stats;
+	for (id = 0; id < nthreads; ++id, ++s)
+		printf(		/* id is unsigned, print it with %u */
+		    "%3u: create %6d, drop %6d, sync %6d, upgrade %6d, "
+		    "verify %6d\n",
+		    id, s->create, s->drop, s->sync, s->upgrade, s->verify);
+}
diff --git a/test/fops/t.c b/test/fops/t.c
new file mode 100644
index 00000000000..ad1d7811771
--- /dev/null
+++ b/test/fops/t.c
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "thread.h"
+
+WT_CONNECTION *conn;				/* WiredTiger connection */
+u_int nops;					/* Operations */
+const char *uri;				/* Object */
+
+static char *progname;				/* Program name */
+static FILE *logfp;				/* Log file */
+
+static int  handle_error(WT_EVENT_HANDLER *, int, const char *);
+static int  handle_message(WT_EVENT_HANDLER *, const char *);
+static void onint(int);
+static void shutdown(void);
+static int  usage(void);
+static void wt_startup(char *);
+static void wt_shutdown(void);
+
+int
+main(int argc, char *argv[])
+{
+	u_int nthreads;
+	int ch, cnt, runs;
+	char *config_open;
+	const char **objp, *objs[] = { "file:__wt", "table:__wt", NULL };
+
+	if ((progname = strrchr(argv[0], '/')) == NULL)
+		progname = argv[0];
+	else
+		++progname;
+
+	config_open = NULL;
+	nops = 1000;			/* Default operations per thread */
+	nthreads = 10;			/* Default thread count */
+	runs = 1;			/* Default run count */
+
+	while ((ch = getopt(argc, argv, "C:l:n:r:t:")) != EOF)
+		switch (ch) {
+		case 'C':			/* wiredtiger_open config */
+			config_open = optarg;
+			break;
+		case 'l':			/* log */
+			if ((logfp = fopen(optarg, "w")) == NULL) {
+				fprintf(stderr,
+				    "%s: %s\n", optarg, strerror(errno));
+				return (EXIT_FAILURE);
+			}
+			break;
+		case 'n':			/* operations */
+			nops = (u_int)atoi(optarg);
+			break;
+		case 'r':			/* runs (0 means run forever) */
+			runs = atoi(optarg);
+			break;
+		case 't':			/* threads */
+			nthreads = (u_int)atoi(optarg);
+			break;
+		default:
+			return (usage());
+		}
+
+	argc -= optind;
+	argv += optind;
+	if (argc != 0)
+		return (usage());
+
+	/* Clean up on signal. */
+	(void)signal(SIGINT, onint);
+
+	printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
+	for (cnt = 1; runs == 0 || cnt <= runs; ++cnt) {
+		shutdown();			/* Clean up previous runs */
+
+		for (objp = objs; *objp != NULL; objp++) {
+			uri = *objp;		/* Object for this pass */
+			printf("%5d: %u threads on %s\n", cnt, nthreads, uri);
+			wt_startup(config_open);
+			if (fop_start(nthreads))
+				return (EXIT_FAILURE);
+			wt_shutdown();
+			printf("\n");
+		}
+	}
+	return (0);
+}
+
+/*
+ * wt_startup --
+ *	Open the WiredTiger connection, appending any user configuration.
+ */
+static void
+wt_startup(char *config_open)
+{
+	static WT_EVENT_HANDLER event_handler = {
+		handle_error, handle_message, NULL
+	};
+	int len, ret;
+	char config[128];
+
+	len = snprintf(config, sizeof(config),
+	    "create,error_prefix=\"%s\",cache_size=5MB%s%s",
+	    progname,
+	    config_open == NULL ? "" : ",",
+	    config_open == NULL ? "" : config_open);
+	if (len < 0 || (size_t)len >= sizeof(config))	/* Truncated */
+		die("snprintf: configuration too long", EINVAL);
+
+	if ((ret = wiredtiger_open(NULL, &event_handler, config, &conn)) != 0)
+		die("wiredtiger_open", ret);
+}
+
+/*
+ * wt_shutdown --
+ *	Flush the file to disk and shut down the WiredTiger connection.
+ */
+static void
+wt_shutdown(void)
+{
+	int ret;
+
+	if ((ret = conn->close(conn, NULL)) != 0)
+		die("conn.close", ret);	/* Fatal: die() exits */
+}
+
+/*
+ * shutdown --
+ *	Remove files left in the current directory by previous runs.
+ */
+static void
+shutdown(void)
+{
+	(void)system("rm -f WiredTiger WiredTiger.* __wt*");
+}
+
+/* handle_error -- error event handler: report all but ENOENT on stderr. */
+static int
+handle_error(WT_EVENT_HANDLER *handler, int error, const char *errmsg)
+{
+	UNUSED(handler);
+
+	/* Ignore complaints about missing files. */
+	if (error == ENOENT)
+		return (0);
+	return (fprintf(stderr, "%s\n", errmsg) < 0 ? -1 : 0);
+}
+
+static int
+handle_message(WT_EVENT_HANDLER *handler, const char *message)
+{
+	FILE *out;
+
+	UNUSED(handler);
+
+	out = logfp != NULL ? logfp : stdout;	/* Prefer the log file */
+	return (fprintf(out, "%s\n", message) < 0 ? -1 : 0);
+}
+
+/*
+ * onint --
+ *	Interrupt signal handler: remove the run's files and exit failure.
+ */
+static void
+onint(int signo)
+{
+	UNUSED(signo);
+
+	shutdown();		/* NOTE(review): system() isn't async-signal-safe */
+
+	fprintf(stderr, "\n");	/* NOTE(review): nor are fprintf()/exit() */
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * die --
+ *	Print m and the error string for e to stderr, then exit failure.
+ */
+void
+die(const char *m, int e)
+{
+	fprintf(stderr, "%s: %s: %s\n", progname, m, wiredtiger_strerror(e));
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * usage --
+ *	Display usage statement and return the failure exit status.
+ */
+static int
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: %s "
+	    "[-C wiredtiger-config] [-l log]\n\t"
+	    "[-n ops] [-r runs] [-t threads]\n",
+	    progname);
+	fprintf(stderr, "%s",
+	    "\t-C specify wiredtiger_open configuration arguments\n"
+	    "\t-l specify a log file\n"
+	    "\t-n set number of operations each thread does\n"
+	    "\t-r set number of runs\n"
+	    "\t-t set number of threads\n");
+	return (EXIT_FAILURE);
+}
diff --git a/test/fops/thread.h b/test/fops/thread.h
new file mode 100644
index 00000000000..f3f17abeae9
--- /dev/null
+++ b/test/fops/thread.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <wiredtiger.h>
+
+#define	UNUSED(v)	(void)(v)		/* Quiet unused var warnings */
+
+extern WT_CONNECTION *conn;			/* WiredTiger connection */
+
+extern u_int nops;				/* Operations per thread */
+
+extern const char *uri;				/* Object */
+
+#if defined (__GNUC__)
+void die(const char *, int) __attribute__((noreturn));
+#else
+void die(const char *, int);
+#endif
+int  fop_start(u_int);
+void obj_create(void);
+void obj_drop(void);
+void obj_sync(void);
+void obj_upgrade(void);
+void obj_verify(void);
diff --git a/test/format/CONFIG.example b/test/format/CONFIG.example
index fa759cd9a7f..7517cbf644c 100644
--- a/test/format/CONFIG.example
+++ b/test/format/CONFIG.example
@@ -49,6 +49,9 @@
 #runs
 #	the number of runs
 
+#threads
+#	the number of threads
+
 #value_max
 #	maximum size of values
 
diff --git a/test/format/Makefile.am b/test/format/Makefile.am
index 81aac6d041f..9077d33e073 100644
--- a/test/format/Makefile.am
+++ b/test/format/Makefile.am
@@ -2,17 +2,18 @@ BDB = $(top_builddir)/db
 INCLUDES = -I$(top_builddir) -I$(BDB)
 
 noinst_PROGRAMS = t
-noinst_DATA = s_dumpcmp
+noinst_SCRIPTS = s_dumpcmp
 t_SOURCES =\
 	config.h format.h bdb.c config.c t.c util.c wts.c wts_bulk.c wts_ops.c
 t_LDADD = $(top_builddir)/libwiredtiger.la -L$(BDB)/build_unix -ldb
 t_LDFLAGS = -static
 
-s_dumpcmp: $(srcdir)/s_dumpcmp.in
-	cp $(srcdir)/s_dumpcmp.in $@
+s_dumpcmp: $(srcdir)/s_dumpcmp.sh
+	cp $(srcdir)/s_dumpcmp.sh $@
+	chmod +x $@
 
 backup:
 	rm -rf BACKUP && mkdir BACKUP && cp -p -r WiredTiger* __* BACKUP/
 
 clean-local:
-	rm -rf WiredTiger WiredTiger.* __* *.core
+	rm -rf WiredTiger* *.core __*
diff --git a/test/format/bdb.c b/test/format/bdb.c
index 9cbe8012583..9691e32fc1f 100644
--- a/test/format/bdb.c
+++ b/test/format/bdb.c
@@ -136,7 +136,7 @@ bdb_read(uint64_t keyno, void *valuep, uint32_t *valuesizep, int *notfoundp)
 }
 
 void
-bdb_put(const void *arg_key, uint32_t arg_key_size,
+bdb_update(const void *arg_key, uint32_t arg_key_size,
     const void *arg_value, uint32_t arg_value_size, int *notfoundp)
 {
 	DBC *dbc = g.dbc;
@@ -159,7 +159,7 @@ bdb_put(const void *arg_key, uint32_t arg_key_size,
 }
 
 void
-bdb_del(uint64_t keyno, int *notfoundp)
+bdb_remove(uint64_t keyno, int *notfoundp)
 {
 	DBC *dbc = g.dbc;
 	int ret;
diff --git a/test/format/config.c b/test/format/config.c
index e141fd0248b..e6e37a4d88f 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -64,6 +64,19 @@ config_setup(void)
 			*cp->v = CONF_RAND(cp);
 	}
 
+	/* Clear operations values if the whole run is read-only. */
+	if (g.c_ops == 0)
+		for (cp = c; cp->name != NULL; ++cp)
+			if (cp->flags & C_OPS)
+				*cp->v = 0;
+
+	/* Multi-threaded runs cannot be replayed. */
+	if (g.replay && g.c_threads != 1) {
+		fprintf(stderr,
+		    "%s: -r is incompatible with threaded runs\n", g.progname);
+		exit(EXIT_FAILURE);
+	}
+
 	/*
 	 * Periodically, set the delete percentage to 0 so salvage gets run,
 	 * as long as the delete percentage isn't nailed down.
diff --git a/test/format/config.h b/test/format/config.h
index 39f04cf91d3..2b71300adaf 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -19,16 +19,19 @@ typedef struct {
 	uint8_t	 	type_mask;		/* File type mask */
 
 	/* Value is a boolean, yes if roll of 1-to-100 is <= CONFIG->min. */
-#define	C_BOOL		0x01
+#define	C_BOOL		0x001
 
 	/* Not a simple randomization, handle outside the main loop. */ 
-#define	C_IGNORE	0x02
+#define	C_IGNORE	0x002
+
+	/* Operation, only set if doing operations. */
+#define	C_OPS		0x004
 
 	/* Value was set from command-line or file, ignore for all runs. */
-#define	C_PERM		0x04
+#define	C_PERM		0x008
 
 	/* Value isn't random for this run, ignore just for this run. */
-#define	C_TEMP		0x08
+#define	C_TEMP		0x010
 	uint32_t 	flags;
 
 	uint32_t	 min;			/* Minimum value */
@@ -57,7 +60,7 @@ static CONFIG c[] = {
 
 	{ "delete_pct",
 	  "percent operations that are deletes",
-	  0, 0, 0, 45, &g.c_delete_pct },
+	  0, C_OPS, 0, 45, &g.c_delete_pct },
 
 	{ "file_type",
 	  "type of file to create (fix | var | row)",
@@ -73,7 +76,7 @@ static CONFIG c[] = {
 
 	{ "insert_pct",
 	  "percent operations that are inserts",
-	  0, 0, 0, 45, &g.c_insert_pct },
+	  0, C_OPS, 0, 45, &g.c_insert_pct },
 
 	{ "internal_page_max",
 	  "maximum size of Btree internal nodes",
@@ -125,7 +128,7 @@ static CONFIG c[] = {
 
 	{ "write_pct",
 	  "percent operations that are writes",
-	  0, 0, 0, 90, &g.c_write_pct },
+	  0, C_OPS, 0, 90, &g.c_write_pct },
 
 	{ NULL, NULL, 0, 0, 0, 0, NULL }
 };
diff --git a/test/format/format.h b/test/format/format.h
index 03ee02a4d28..f6aed7ba126 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -96,6 +96,7 @@ typedef struct {
 	uint64_t update;
 	uint64_t remove;
 
+	int       id;					/* simple thread ID */
 	pthread_t tid;					/* thread ID */
 
 #define	TINFO_RUNNING	1				/* Running */
@@ -105,12 +106,12 @@ typedef struct {
 } TINFO;
 
 void	 bdb_close(void);
-void	 bdb_del(uint64_t, int *);
 void	 bdb_insert(const void *, uint32_t, const void *, uint32_t);
 void	 bdb_np(int, void *, uint32_t *, void *, uint32_t *, int *);
 void	 bdb_open(void);
-void	 bdb_put(const void *, uint32_t, const void *, uint32_t, int *);
 void	 bdb_read(uint64_t, void *, uint32_t *, int *);
+void	 bdb_remove(uint64_t, int *);
+void	 bdb_update(const void *, uint32_t, const void *, uint32_t, int *);
 
 void	 config_error(void);
 const char *
diff --git a/test/format/s_dumpcmp.in b/test/format/s_dumpcmp.sh
index 71eed7b055f..71eed7b055f 100644..100755
--- a/test/format/s_dumpcmp.in
+++ b/test/format/s_dumpcmp.sh
diff --git a/test/format/t.c b/test/format/t.c
index e3e6fc771c5..85ca9b7f5e7 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -27,10 +27,8 @@ main(int argc, char *argv[])
 	(void)setenv("MALLOC_OPTIONS", "AJZ", 1);
 
 	/* Set values from the "CONFIG" file, if it exists. */
-	if (access("CONFIG", R_OK) == 0) {
-		printf("... reading CONFIG file\n");
+	if (access("CONFIG", R_OK) == 0)
 		config_file("CONFIG");
-	}
 
 	/* Track progress unless we're re-directing output to a file. */
 	g.track = isatty(STDOUT_FILENO) ? 1 : 0;
@@ -69,13 +67,6 @@ main(int argc, char *argv[])
 			usage();
 		}
 
-	/* Multi-threaded runs cannot be replayed. */
-	if (g.c_threads != 1 && g.replay) {
-		fprintf(stderr,
-		    "%s: -r and -t are mutually exclusive\n", g.progname);
-		return (EXIT_FAILURE);
-	}
-
 	argc -= optind;
 	argv += optind;
 	for (; *argv != NULL; ++argv)
diff --git a/test/format/util.c b/test/format/util.c
index 347352fd310..a15de447257 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -63,6 +63,8 @@ key_gen(uint8_t *key, uint32_t *sizep, uint64_t keyno, int insert)
 	*sizep = (uint32_t)len;
 }
 
+static uint32_t val_dup_data_len;	/* Length of duplicate data items */
+
 void
 val_gen_setup(uint8_t **valp)
 {
@@ -83,13 +85,13 @@ val_gen_setup(uint8_t **valp)
 		val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]);
 
 	*valp = val;
+
+	val_dup_data_len = MMRAND(g.c_value_min, g.c_value_max);
 }
 
 void
 value_gen(uint8_t *val, uint32_t *sizep, uint64_t keyno)
 {
-	static const char *dup_data = "duplicate data item";
-
 	/*
 	 * Fixed-length records: take the low N bits from the last digit of
 	 * the record number.
@@ -113,7 +115,7 @@ value_gen(uint8_t *val, uint32_t *sizep, uint64_t keyno)
 	 * WiredTiger doesn't store zero-length data items in row-store files,
 	 * test that by inserting a zero-length data item every so often.
 	 */
-	if (++keyno % 63 == 0) {
+	if (keyno % 63 == 0) {
 		val[0] = '\0';
 		*sizep = 0;
 		return;
@@ -129,15 +131,15 @@ value_gen(uint8_t *val, uint32_t *sizep, uint64_t keyno)
 	 */
 	if (g.c_file_type == VAR &&
 	    g.c_repeat_data_pct != 0 &&
-	    (u_int)wts_rand() % 100 > g.c_repeat_data_pct) {
-		(void)strcpy((char *)val, dup_data);
-		*sizep = (uint32_t)strlen(dup_data);
-		return;
+	    MMRAND(1, 100) > g.c_repeat_data_pct) {
+		(void)strcpy((char *)val, "DUPLICATEV");
+		val[10] = '/';
+		*sizep = val_dup_data_len;
+	} else {
+		(void)sprintf((char *)val, "%010" PRIu64, keyno);
+		val[10] = '/';
+		*sizep = MMRAND(g.c_value_min, g.c_value_max);
 	}
-
-	sprintf((char *)val, "%010" PRIu64, keyno);
-	val[10] = '/';
-	*sizep = MMRAND(g.c_value_min, g.c_value_max);
 }
 
 void
@@ -168,8 +170,10 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo)
 	}
 	lastlen = len;
 
-	(void)printf("%s\r", msg);
-	(void)fflush(stdout);
+	if (printf("%s\r", msg) < 0)
+		die(EIO, "printf");
+	if (fflush(stdout) == EOF)
+		die(errno, "fflush");
 }
 
 /*
diff --git a/test/format/wts.c b/test/format/wts.c
index 6474bd7447b..6852fd05386 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -15,10 +15,9 @@ handle_message(WT_EVENT_HANDLER *handler, const char *message)
 	UNUSED(handler);
 
 	if (g.logfp != NULL)
-		fprintf(g.logfp, "%s\n", message);
-	else
-		printf("%s\n", message);
-	return (0);
+		return (fprintf(g.logfp, "%s\n", message) < 0 ? -1 : 0);
+
+	return (printf("%s\n", message) < 0 ? -1 : 0);
 }
 
 /*
@@ -64,7 +63,7 @@ wts_open(void)
 	 * end so they can override "standard" configuration.
 	 */
 	snprintf(config, sizeof(config),
-	    "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,"
+	    "create,error_prefix=\"%s\",cache_size=%" PRIu32 "MB,sync=false,"
 	    "extensions=[\"%s\",\"%s\"],%s",
 	    g.progname, g.c_cache, ext1, ext2,
 	    g.config_open == NULL ? "" : g.config_open);
diff --git a/test/format/wts_ops.c b/test/format/wts_ops.c
index e4292ed7694..0ec003d186a 100644
--- a/test/format/wts_ops.c
+++ b/test/format/wts_ops.c
@@ -7,16 +7,16 @@
 
 #include "format.h"
 
-static void  col_del(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
 static void  col_insert(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t *);
-static void  col_put(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
+static void  col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
+static void  col_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
 static void  nextprev(WT_CURSOR *, int, int *);
 static int   notfound_chk(const char *, int, int, uint64_t);
 static void *ops(void *);
-static void  read_row(WT_CURSOR *, WT_ITEM *, uint64_t);
-static void  row_del(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
-static void  row_put(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t, int);
 static void  print_item(const char *, WT_ITEM *);
+static void  read_row(WT_CURSOR *, WT_ITEM *, uint64_t);
+static void  row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
+static void  row_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t, int);
 
 /*
  * wts_ops --
@@ -48,6 +48,7 @@ wts_ops(void)
 
 	if (SINGLETHREADED) {
 		memset(&total, 0, sizeof(total));
+		total.id = 1;
 		(void)ops(&total);
 	} else {
 		/* Create thread structure. */
@@ -55,6 +56,7 @@ wts_ops(void)
 		    calloc((size_t)g.c_threads, sizeof(*tinfo))) == NULL)
 			die(errno, "calloc");
 		for (i = 0; i < g.c_threads; ++i) {
+			tinfo[i].id = (int)i + 1;
 			tinfo[i].state = TINFO_RUNNING;
 			if ((ret = pthread_create(
 			    &tinfo[i].tid, NULL, ops, &tinfo[i])) != 0)
@@ -109,11 +111,12 @@ ops(void *arg)
 	WT_CURSOR *cursor, *cursor_insert;
 	WT_SESSION *session;
 	WT_ITEM key, value;
-	uint64_t cnt, keyno;
+	uint64_t cnt, keyno, sync_op, thread_ops;
 	uint32_t op;
-	u_int np;
-	int dir, insert, notfound, ret;
 	uint8_t *keybuf, *valbuf;
+	u_int np;
+	int dir, insert, notfound, ret, sync_drop;
+	char sync_name[64];
 
 	conn = g.wts_conn;
 
@@ -148,10 +151,40 @@ ops(void *arg)
 	    WT_TABLENAME, NULL, "append", &cursor_insert)) != 0)
 		die(ret, "session.open_cursor");
 
-	for (cnt = 0; cnt < g.c_ops / g.c_threads; ++cnt) {
+	/* Each thread does its share of the total operations. */
+	thread_ops = g.c_ops / g.c_threads;
+
+	/* Pick an operation where we'll do a sync and create the name. */
+	sync_drop = 0;
+	sync_op = MMRAND(1, thread_ops);
+	snprintf(sync_name, sizeof(sync_name), "snapshot=thread-%d", tinfo->id);
+
+	for (cnt = 0; cnt < thread_ops; ++cnt) {
 		if (SINGLETHREADED && cnt % 100 == 0)
 			track("read/write ops", 0ULL, tinfo);
 
+		if (cnt == sync_op) {
+			if (sync_drop && (int)MMRAND(1, 4) == 1) {
+				if ((ret = session->drop(
+				    session, WT_TABLENAME, sync_name)) != 0)
+					die(ret, "session.drop: %s: %s",
+					    WT_TABLENAME, sync_name);
+				sync_drop = 0;
+			} else {
+				if ((ret = session->sync(
+				    session, WT_TABLENAME, sync_name)) != 0)
+					die(ret, "session.sync: %s: %s",
+					    WT_TABLENAME, sync_name);
+				sync_drop = 1;
+			}
+
+			/*
+			 * Pick the next sync operation, try for roughly five
+			 * snapshot operations per thread run.
+			 */
+			sync_op += MMRAND(1, thread_ops) / 5;
+		}
+
 		insert = notfound = 0;
 
 		keyno = MMRAND(1, g.rows);
@@ -174,18 +207,18 @@ ops(void *arg)
 				 * If deleting a non-existent record, the cursor
 				 * won't be positioned, and so can't do a next.
 				 */
-				row_del(cursor, &key, keyno, &notfound);
+				row_remove(cursor, &key, keyno, &notfound);
 				break;
 			case FIX:
 			case VAR:
-				col_del(cursor, &key, keyno, &notfound);
+				col_remove(cursor, &key, keyno, &notfound);
 				break;
 			}
 		} else if (op < g.c_delete_pct + g.c_insert_pct) {
 			++tinfo->insert;
 			switch (g.c_file_type) {
 			case ROW:
-				row_put(cursor, &key, &value, keyno, 1);
+				row_update(cursor, &key, &value, keyno, 1);
 				break;
 			case FIX:
 			case VAR:
@@ -204,11 +237,11 @@ ops(void *arg)
 			++tinfo->update;
 			switch (g.c_file_type) {
 			case ROW:
-				row_put(cursor, &key, &value, keyno, 0);
+				row_update(cursor, &key, &value, keyno, 0);
 				break;
 			case FIX:
 			case VAR:
-				col_put(cursor, &key, &value, keyno);
+				col_update(cursor, &key, &value, keyno);
 				break;
 			}
 		} else {
@@ -221,7 +254,7 @@ ops(void *arg)
 		 * If we did any operation, we've set the cursor, do a small
 		 * number of next/prev cursor operations in a random direction.
 		 */
-		dir = MMRAND(0, 1);
+		dir = (int)MMRAND(0, 1);
 		for (np = 0; np < MMRAND(1, 8); ++np) {
 			if (notfound)
 				break;
@@ -294,15 +327,6 @@ wts_read_scan(void)
 	free(keybuf);
 }
 
-#define	NTF_CHK(a) do {							\
-	switch (a) {							\
-	case 0:								\
-		break;							\
-	case 1:								\
-		return;							\
-	}								\
-} while (0)
-
 /*
  * read_row --
  *	Read and verify a single element in a row- or column-store file.
@@ -366,7 +390,8 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
 		ret = 0;
 	}
 
-	NTF_CHK(notfound_chk("read_row", ret, notfound, keyno));
+	if (notfound_chk("read_row", ret, notfound, keyno))
+		return;
 
 	/* Compare the two. */
 	if (value.size != bdb_value.size ||
@@ -427,8 +452,9 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
 	/* Retrieve the BDB value. */
 	bdb_np(next, &bdb_key.data, &bdb_key.size,
 	    &bdb_value.data, &bdb_value.size, &notfound);
-	NTF_CHK(notfound_chk(
-	    next ? "nextprev(next)" : "nextprev(prev)", ret, notfound, keyno));
+	if (notfound_chk(
+	    next ? "nextprev(next)" : "nextprev(prev)", ret, notfound, keyno))
+		return;
 
 	/* Compare the two. */
 	if (g.c_file_type == ROW) {
@@ -480,11 +506,11 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
 }
 
 /*
- * row_put --
- *	Update an element in a row-store file.
+ * row_update --
+ *	Update a row in a row-store file.
  */
 static void
-row_put(
+row_update(
     WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno, int insert)
 {
 	WT_SESSION *session;
@@ -508,22 +534,22 @@ row_put(
 	ret = cursor->insert(cursor);
 	if (ret != 0 && ret != WT_NOTFOUND)
 		die(ret,
-		    "row_put: %s row %" PRIu64 " by key",
+		    "row_update: %s row %" PRIu64 " by key",
 		    insert ? "insert" : "update", keyno);
 
 	if (!SINGLETHREADED)
 		return;
 
-	bdb_put(key->data, key->size, value->data, value->size, &notfound);
-	NTF_CHK(notfound_chk("row_put", ret, notfound, keyno));
+	bdb_update(key->data, key->size, value->data, value->size, &notfound);
+	(void)notfound_chk("row_update", ret, notfound, keyno);
 }
 
 /*
- * col_put --
- *	Update an element in a column-store file.
+ * col_update --
+ *	Update a row in a column-store file.
  */
 static void
-col_put(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
+col_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
 {
 	WT_SESSION *session;
 	int notfound, ret;
@@ -553,14 +579,14 @@ col_put(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
 		cursor->set_value(cursor, value);
 	ret = cursor->insert(cursor);
 	if (ret != 0 && ret != WT_NOTFOUND)
-		die(ret, "col_put: %" PRIu64, keyno);
+		die(ret, "col_update: %" PRIu64, keyno);
 
 	if (!SINGLETHREADED)
 		return;
 
 	key_gen((uint8_t *)key->data, &key->size, keyno, 0);
-	bdb_put(key->data, key->size, value->data, value->size, &notfound);
-	NTF_CHK(notfound_chk("col_put", ret, notfound, keyno));
+	bdb_update(key->data, key->size, value->data, value->size, &notfound);
+	(void)notfound_chk("col_update", ret, notfound, keyno);
 }
 
 /*
@@ -613,15 +639,15 @@ col_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop)
 		return;
 
 	key_gen((uint8_t *)key->data, &key->size, keyno, 0);
-	bdb_put(key->data, key->size, value->data, value->size, &notfound);
+	bdb_update(key->data, key->size, value->data, value->size, &notfound);
 }
 
 /*
- * row_del --
- *	Delete an element from a row-store file.
+ * row_remove --
+ *	Remove a row from a row-store file.
  */
 static void
-row_del(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
+row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
 {
 	WT_SESSION *session;
 	int notfound, ret;
@@ -638,22 +664,22 @@ row_del(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
 	cursor->set_key(cursor, key);
 	ret = cursor->remove(cursor);
 	if (ret != 0 && ret != WT_NOTFOUND)
-		die(ret, "row_del: remove %" PRIu64 " by key", keyno);
+		die(ret, "row_remove: remove %" PRIu64 " by key", keyno);
 	*notfoundp = ret == WT_NOTFOUND;
 
 	if (!SINGLETHREADED)
 		return;
 
-	bdb_del(keyno, &notfound);
-	NTF_CHK(notfound_chk("row_del", ret, notfound, keyno));
+	bdb_remove(keyno, &notfound);
+	(void)notfound_chk("row_remove", ret, notfound, keyno);
 }
 
 /*
- * col_del --
- *	Delete an element from a column-store file.
+ * col_remove --
+ *	Remove a row from a column-store file.
  */
 static void
-col_del(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
+col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
 {
 	WT_SESSION *session;
 	int notfound, ret;
@@ -668,7 +694,7 @@ col_del(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
 	cursor->set_key(cursor, keyno);
 	ret = cursor->remove(cursor);
 	if (ret != 0 && ret != WT_NOTFOUND)
-		die(ret, "col_del: remove %" PRIu64 " by key", keyno);
+		die(ret, "col_remove: remove %" PRIu64 " by key", keyno);
 	*notfoundp = ret == WT_NOTFOUND;
 
 	if (!SINGLETHREADED)
@@ -680,11 +706,11 @@ col_del(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
 	 */
 	if (g.c_file_type == FIX) {
 		key_gen((uint8_t *)key->data, &key->size, keyno, 0);
-		bdb_put(key->data, key->size, "\0", 1, &notfound);
+		bdb_update(key->data, key->size, "\0", 1, &notfound);
 	} else
-		bdb_del(keyno, &notfound);
+		bdb_remove(keyno, &notfound);
 
-	NTF_CHK(notfound_chk("col_del", ret, notfound, keyno));
+	(void)notfound_chk("col_remove", ret, notfound, keyno);
 }
 
 /*
diff --git a/test/salvage/Makefile.am b/test/salvage/Makefile.am
index ca0ef890e72..f51caab964f 100644
--- a/test/salvage/Makefile.am
+++ b/test/salvage/Makefile.am
@@ -5,5 +5,8 @@ t_SOURCES = salvage.c
 t_LDADD = $(top_builddir)/libwiredtiger.la
 t_LDFLAGS = -static
 
+# Run this during a "make check" smoke test.
+TESTS = $(noinst_PROGRAMS)
+
 clean-local:
-	rm -rf WiredTiger WiredTiger.* __slvg.*
+	rm -rf WiredTiger* *.core __*
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index b5a0913f320..fee3ef68fac 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -551,7 +551,7 @@ copy(u_int gen, u_int recno)
 	if (gen != 0) {
 		assert(fseek(ifp, (long)512, SEEK_SET) == 0);
 		assert(fread(buf, 1, PSIZE, ifp) == PSIZE);
-		dsk = (WT_PAGE_HEADER *)buf;
+		dsk = (void *)buf;
 		if (page_type != WT_PAGE_ROW_LEAF)
 			dsk->recno = recno;
 		blk = WT_BLOCK_HEADER_REF(buf);
diff --git a/test/snapshot/Makefile.am b/test/snapshot/Makefile.am
new file mode 100644
index 00000000000..d6fb5cf86d3
--- /dev/null
+++ b/test/snapshot/Makefile.am
@@ -0,0 +1,12 @@
+INCLUDES = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+noinst_PROGRAMS = t
+t_SOURCES = snapshot.c
+t_LDADD = $(top_builddir)/libwiredtiger.la
+t_LDFLAGS = -static
+
+# Run this during a "make check" smoke test.
+TESTS = $(noinst_PROGRAMS)
+
+clean-local:
+	rm -rf WiredTiger WiredTiger.* __* *.core
diff --git a/test/snapshot/snapshot.c b/test/snapshot/snapshot.c
new file mode 100644
index 00000000000..af6d1d36c27
--- /dev/null
+++ b/test/snapshot/snapshot.c
@@ -0,0 +1,290 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <wiredtiger.h>
+
+#define	URI	"file:__snap"
+
+struct L {
+	int start, stop;			/* starting/stopping id */
+	const char *name;			/* snapshot name */
+} list[] = {
+	{ 100, 120, "snapshot-1" },
+	{ 200, 220, "snapshot-2" },
+	{ 300, 320, "snapshot-3" },
+	{ 400, 420, "snapshot-4" },
+	{ 500, 520, "snapshot-5" },
+	{ 100, 620, "snapshot-6" },
+	{ 200, 720, "snapshot-7" },
+	{ 300, 820, "snapshot-8" },
+	{ 400, 920, "snapshot-9" },
+	{ 500, 600, "snapshot-a" },
+	{ 0, 0, NULL }
+};
+
+void add(int, int);
+void build(void);
+void check(struct L *);
+void cursor_lock(void);
+void drop(void);
+void dump_cat(struct L *, const char *);
+void dump_snap(struct L *, const char *);
+void run(void);
+int  usage(void);
+
+WT_CONNECTION *conn;
+WT_SESSION *session;
+const char *progname;
+
+int
+main(int argc, char *argv[])
+{
+	int ch;
+
+	(void)system("rm -f WiredTiger* __*");
+
+	if ((progname = strrchr(argv[0], '/')) == NULL)
+		progname = argv[0];
+	else
+		++progname;
+
+	while ((ch = getopt(argc, argv, "")) != EOF)
+		switch (ch) {
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= optind;
+	argv += optind;
+
+	run();
+
+	return (EXIT_SUCCESS);
+}
+
+int
+usage(void)
+{
+	(void)fprintf(stderr, "usage: %s\n", progname);
+	return (EXIT_FAILURE);
+}
+
+/*
+ * run --
+ *	Worker function: build, check, lock-test and drop the snapshots.
+ */
+void
+run(void)
+{
+	struct L *p;
+	char config[128];
+
+	/* Open the connection and create the file. */
+	assert(wiredtiger_open(
+	    NULL, NULL, "create,cache_size=100MB", &conn) == 0);
+	assert(conn->open_session(conn, NULL, NULL, &session) == 0);
+	(void)snprintf(config, sizeof(config),
+	    "key_format=S,value_format=S,"
+	    "internal_page_max=512,leaf_page_max=512");
+	assert(session->create(session, URI, config) == 0);
+
+	printf("building...\n");
+	build();				/* Build a set of snapshots */
+
+	printf("checking build...\n");
+	for (p = list; p->start != 0; ++p)
+		check(p);			/* Check the contents */
+
+	printf("checking cursor_lock...\n");
+	cursor_lock();
+
+	printf("checking delete...\n");
+	drop();
+
+	assert(conn->close(conn, 0) == 0);
+}
+
+/*
+ * build --
+ *	Build a file with a set of snapshots.
+ */
+void
+build(void)
+{
+	struct L *p;
+	char buf[64];
+
+	for (p = list; p->start != 0; ++p) {
+		add(p->start, p->stop);
+
+		snprintf(buf, sizeof(buf), "snapshot=%s", p->name);
+		assert(session->sync(session, URI, buf) == 0);
+		assert(session->verify(session, URI, NULL) == 0);
+	}
+}
+
+/*
+ * add --
+ *	Add records.
+ */
+void
+add(int start, int stop)
+{
+	WT_CURSOR *cursor;
+	int ret;
+	char kbuf[64], vbuf[64];
+
+	assert(session->open_cursor(
+	    session, URI, NULL, "overwrite", &cursor) == 0);
+
+	/* Insert the key/value pairs. */
+	for (; start < stop; ++start) {
+		snprintf(kbuf, sizeof(kbuf), "%010d KEY------", start);
+		cursor->set_key(cursor, kbuf);
+		snprintf(vbuf, sizeof(vbuf), "%010d VALUE----", start);
+		cursor->set_value(cursor, vbuf);
+
+		if ((ret = cursor->insert(cursor)) != 0) {
+			fprintf(stderr,
+			    "cursor->insert: %s\n", wiredtiger_strerror(ret));
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	assert(cursor->close(cursor) == 0);
+}
+
+/*
+ * check --
+ *	Check the contents of an individual snapshot.
+ */
+void
+check(struct L *snap)
+{
+	dump_cat(snap, "__dump.1");		/* Dump out the records */
+	dump_snap(snap, "__dump.2");		/* Dump out the snapshot */
+
+	/*
+	 * Sort the two versions of the snapshot, discarding overlapping
+	 * entries, and compare the results.
+	 */
+	if (system(
+	    "sort -u -o __dump.1 __dump.1 && "
+	    "sort -u -o __dump.2 __dump.2 && "
+	    "cmp __dump.1 __dump.2 > /dev/null")) {
+		fprintf(stderr,
+		    "check failed, snapshot results for %s were incorrect\n",
+		    snap->name);
+		exit(EXIT_FAILURE);
+	 }
+}
+
+/*
+ * dump_cat --
+ *	Output the expected rows into a file.
+ */
+void
+dump_cat(struct L *snap, const char *f)
+{
+	struct L *p;
+	FILE *fp;
+	int row;
+
+	assert((fp = fopen(f, "w")) != NULL);
+
+	for (p = list; p <= snap; ++p) {
+		for (row = p->start; row < p->stop; ++row)
+			fprintf(fp,
+			    "%010d KEY------\n%010d VALUE----\n", row, row);
+	}
+
+	assert(fclose(fp) == 0);
+}
+
+/*
+ * dump_snap --
+ *	Dump a snapshot into a file.
+ */
+void
+dump_snap(struct L *snap, const char *f)
+{
+	FILE *fp;
+	WT_CURSOR *cursor;
+	int ret;
+	const char *key, *value;
+	char buf[64];
+
+	assert((fp = fopen(f, "w")) != NULL);
+
+	snprintf(buf, sizeof(buf), "snapshot=%s", snap->name);
+	assert(session->open_cursor(session, URI, NULL, buf, &cursor) == 0);
+
+	while ((ret = cursor->next(cursor)) == 0) {
+		assert(cursor->get_key(cursor, &key) == 0);
+		assert(cursor->get_value(cursor, &value) == 0);
+		fprintf(fp, "%s\n%s\n", key, value);
+	}
+	assert(ret == WT_NOTFOUND);
+
+	assert(cursor->close(cursor) == 0);
+	assert(fclose(fp) == 0);
+}
+
+/*
+ * cursor_lock --
+ *	Check locking cases.
+ */
+void
+cursor_lock(void)
+{
+	WT_CURSOR *cursor, *c1, *c2, *c3;
+	char buf[64];
+
+	/* Check that you can't drop a snapshot if it's in use. */
+	snprintf(buf, sizeof(buf), "snapshot=%s", list[0].name);
+	assert(session->open_cursor(session, URI, NULL, buf, &cursor) == 0);
+	assert(session->drop(session, URI, buf) != 0);
+	assert(cursor->close(cursor) == 0);
+
+	/* Check you can open two snapshots and a non-snapshot cursor at once. */
+	snprintf(buf, sizeof(buf), "snapshot=%s", list[0].name);
+	assert(session->open_cursor(session, URI, NULL, buf, &c1) == 0);
+	snprintf(buf, sizeof(buf), "snapshot=%s", list[1].name);
+	assert(session->open_cursor(session, URI, NULL, buf, &c2) == 0);
+	assert(session->open_cursor(session, URI, NULL, NULL, &c3) == 0);
+	assert(c2->close(c2) == 0);
+	assert(c1->close(c1) == 0);
+	assert(c3->close(c3) == 0);
+}
+
+/*
+ * drop --
+ *	Drop each snapshot in turn, verifying the file after each drop.
+ */
+void
+drop(void)
+{
+	struct L *p;
+	char buf[64];
+
+	for (p = list; p->start != 0; ++p) {
+		snprintf(buf, sizeof(buf), "snapshot=%s", p->name);
+		assert(session->drop(session, URI, buf) == 0);
+		assert(session->verify(session, URI, NULL) == 0);
+	}
+}
diff --git a/test/suite/test_config03.py b/test/suite/test_config03.py
index 7c73fec8419..a76b67cd76f 100644
--- a/test/suite/test_config03.py
+++ b/test/suite/test_config03.py
@@ -83,9 +83,9 @@ class test_config03(test_base03.test_base03):
     scenarios = wtscenario.prune_scenarios(all_scenarios, 1000)
     scenarios = wtscenario.number_scenarios(scenarios)
 
-    wttest.WiredTigerTestCase.printVerbose(2, 'test_config03: running ' + \
-                          str(len(scenarios)) + ' of ' + \
-                          str(len(all_scenarios)) + ' possible scenarios')
+    #wttest.WiredTigerTestCase.printVerbose(2, 'test_config03: running ' + \
+    #                      str(len(scenarios)) + ' of ' + \
+    #                      str(len(all_scenarios)) + ' possible scenarios')
 
     def setUpConnectionOpen(self, dir):
         args = ''
diff --git a/test/suite/test_schema03.py b/test/suite/test_schema03.py
index c28eaed5bdc..ea6a9343ab3 100644
--- a/test/suite/test_schema03.py
+++ b/test/suite/test_schema03.py
@@ -268,9 +268,9 @@ class test_schema03(wttest.WiredTigerTestCase):
     #   or
     # scenarios = [ scenarios[0], scenarios[30], scenarios[40] ]
 
-    wttest.WiredTigerTestCase.printVerbose(2, 'test_schema03: running ' + \
-                          str(len(scenarios)) + ' of ' + \
-                          str(len(all_scenarios)) + ' possible scenarios')
+    #wttest.WiredTigerTestCase.printVerbose(2, 'test_schema03: running ' + \
+    #                      str(len(scenarios)) + ' of ' + \
+    #                      str(len(all_scenarios)) + ' possible scenarios')
 
     # This test requires a large number of open files.
     # Increase our resource limits before we start
diff --git a/test/suite/test_txn01.py b/test/suite/test_txn01.py
new file mode 100644
index 00000000000..c0a4c6d78cd
--- /dev/null
+++ b/test/suite/test_txn01.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2008-2012 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_txn01.py
+#	Transactions: basic functionality
+#
+
+import os, struct
+import wiredtiger, wttest
+
+class test_txn01(wttest.WiredTigerTestCase):
+    tablename = 'test_txn01'
+    uri = 'table:' + tablename
+    nentries = 10000
+    create_params = 'key_format=r,value_format=S'
+
+    # Overrides WiredTigerTestCase
+    def setUpConnectionOpen(self, dir):
+        conn = wiredtiger.wiredtiger_open(dir, 'create,' +
+                ('error_prefix="%s: ",' % self.shortid()) +
+                'transactional,')
+        self.pr(`conn`)
+        return conn
+
+    def check_checkpoint(self, expected):
+        s = self.conn.open_session()
+        s.checkpoint("snapshot=test")
+        try:
+            cursor = s.open_cursor(self.uri, None, "snapshot=test")
+            count = 0
+            for r in cursor:
+                count += 1
+        finally:
+            s.close()
+        self.assertEqual(count, expected)
+
+    def check_transaction(self, expected):
+        s = self.conn.open_session()
+        s.begin_transaction('isolation=snapshot')
+        try:
+            cursor = s.open_cursor(self.uri, None)
+            count = 0
+            for r in cursor:
+                count += 1
+        finally:
+            s.close()
+        self.assertEqual(count, expected)
+
+    def check_count(self, expected):
+        self.check_transaction(expected)
+        self.check_checkpoint(expected)
+
+    def test_visibilty(self):
+        self.session.create(self.uri, self.create_params)
+        committed_inserts = 0
+        self.check_count(committed_inserts)
+        self.session.begin_transaction()
+        cursor = self.session.open_cursor(self.uri, None, "append")
+        for i in xrange(self.nentries):
+            if i > 0 and (i * 10) % self.nentries == 0:
+                self.check_count(committed_inserts)
+                self.session.commit_transaction()
+                committed_inserts = i
+                self.session.begin_transaction()
+                cursor = self.session.open_cursor(self.uri, None, "append")
+            cursor.set_value(("value%06d" % i) * 100)
+            cursor.insert()
+        cursor.close()
+        self.check_count(committed_inserts)
+        self.session.commit_transaction()
+        committed_inserts = self.nentries
+        self.check_count(committed_inserts)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
new file mode 100644
index 00000000000..b406ad16b3b
--- /dev/null
+++ b/test/suite/test_txn02.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2008-2012 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_txn02.py
+#   Transactions: commits and rollbacks
+#
+
+import os, struct
+import wiredtiger, wttest
+from wtscenario import multiply_scenarios, number_scenarios
+
+class test_txn02(wttest.WiredTigerTestCase):
+    tablename = 'test_txn02'
+    uri = 'table:' + tablename
+
+    types = [
+        ('row', dict(tabletype='row',
+                    create_params = 'key_format=i,value_format=i')),
+        ('var', dict(tabletype='var',
+                    create_params = 'key_format=r,value_format=i')),
+        ('fix', dict(tabletype='fix',
+                    create_params = 'key_format=r,value_format=8t')),
+    ]
+    op1s = [
+        ('i4', dict(op1=('insert', 4))),
+        ('r1', dict(op1=('remove', 1))),
+        ('u10', dict(op1=('update', 10))),
+    ]
+    op2s = [
+        ('i6', dict(op2=('insert', 6))),
+        ('r4', dict(op2=('remove', 4))),
+        ('u4', dict(op2=('update', 4))),
+    ]
+    op3s = [
+        ('i12', dict(op3=('insert', 12))),
+        ('r4', dict(op3=('remove', 4))),
+        ('u4', dict(op3=('update', 4))),
+    ]
+    op4s = [
+        ('i14', dict(op4=('insert', 14))),
+        ('r12', dict(op4=('remove', 12))),
+        ('u12', dict(op4=('update', 12))),
+    ]
+    txn1s = [('t1c', dict(txn1='commit')), ('t1r', dict(txn1='rollback'))]
+    txn2s = [('t2c', dict(txn2='commit')), ('t2r', dict(txn2='rollback'))]
+    txn3s = [('t3c', dict(txn3='commit')), ('t3r', dict(txn3='rollback'))]
+    txn4s = [('t4c', dict(txn4='commit')), ('t4r', dict(txn4='rollback'))]
+    scenarios = number_scenarios(multiply_scenarios('.', types,
+        op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s))
+
+    # Overrides WiredTigerTestCase
+    def setUpConnectionOpen(self, dir):
+        conn = wiredtiger.wiredtiger_open(dir, 'create,' +
+                ('error_prefix="%s: ",' % self.shortid()) +
+                'transactional,')
+        self.pr(`conn`)
+        return conn
+
+    def check(self, expected):
+        c = self.session.open_cursor(self.uri, None)
+        actual = dict((k, v) for k, v in c if v != 0)
+        c.close()
+        self.assertEqual(actual, expected)
+
+    def test_ops(self):
+        self.session.create(self.uri, self.create_params)
+        # Set up the table with entries for 1 and 10
+        c = self.session.open_cursor(self.uri, None)
+        c.set_value(1)
+        c.set_key(1)
+        c.insert()
+        c.set_key(10)
+        c.insert()
+        c.close()
+        expected = {1:1, 10:1}
+
+        ops = (self.op1, self.op2, self.op3, self.op4)
+        txns = (self.txn1, self.txn2, self.txn3, self.txn4)
+        for i, ot in enumerate(zip(ops, txns)):
+            self.session.begin_transaction()
+            c = self.session.open_cursor(self.uri, None, 'overwrite')
+            ok, txn = ot
+            op, k = ok
+            # We use the overwrite config so insert can update as needed.
+            if op == 'insert' or op == 'update':
+                c.set_key(k)
+                c.set_value(i + 2)
+                c.insert()
+                if txn == 'commit':
+                    expected[k] = i + 2
+            elif op == 'remove':
+                c.set_key(k)
+                c.remove()
+                if txn == 'commit' and k in expected:
+                    del expected[k]
+            else:
+                print "UNKNOWN op", op
+            if txn == 'commit':
+                # The transaction should see its own changes
+                self.check(expected)
+                self.session.commit_transaction()
+            elif txn == 'rollback':
+                self.session.rollback_transaction()
+            else:
+                print "UNKNOWN op", op
+            self.check(expected)
+        self.session.drop(self.uri)
+
+if __name__ == '__main__':
+    wttest.run()
diff --git a/test/suite/test_util05.py b/test/suite/test_util05.py
index b6571b9e903..5139e71d67b 100644
--- a/test/suite/test_util05.py
+++ b/test/suite/test_util05.py
@@ -172,48 +172,6 @@ class test_util05(wttest.WiredTigerTestCase, suite_subprocess):
         self.runWt(["verify", "table:" + self.tablename], errfilename="verifyerr.out")
         self.check_non_empty_file("verifyerr.out")
 
-    def test_verify_process_appended_null(self):
-        """
-        Test verify in a 'wt' process on a table that is purposely damaged,
-        with some null bytes at the end of the file.
-        """
-        params = 'key_format=S,value_format=S'
-        self.session.create('table:' + self.tablename, params)
-        self.populate(self.tablename)
-        with self.open_and_position(self.tablename, 100) as f:
-            for i in range(0, 6):
-                f.write(struct.pack('B', 0))
-        self.runWt(["verify", "table:" + self.tablename], errfilename="verifyerr.out")
-        self.check_non_empty_file("verifyerr.out")
-
-    def test_verify_process_appended_null_block(self):
-        """
-        Test verify in a 'wt' process on a table that is purposely damaged,
-        with some null bytes at the end of the file.
-        """
-        params = 'key_format=S,value_format=S'
-        self.session.create('table:' + self.tablename, params)
-        self.populate(self.tablename)
-        with self.open_and_position(self.tablename, 100) as f:
-            for i in range(0, 4096):
-                f.write(struct.pack('B', 0))
-        self.runWt(["verify", "table:" + self.tablename], errfilename="verifyerr.out")
-        self.check_non_empty_file("verifyerr.out")
-
-    def test_verify_process_appended_junk(self):
-        """
-        Test verify in a 'wt' process on a table that is purposely damaged,
-        with some junk bytes at the end of the file.
-        """
-        params = 'key_format=S,value_format=S'
-        self.session.create('table:' + self.tablename, params)
-        self.populate(self.tablename)
-        with self.open_and_position(self.tablename, 100) as f:
-            for i in range(0, 1024):
-                f.write('\x01\0x02\x03\x04')
-        self.runWt(["verify", "table:" + self.tablename], errfilename="verifyerr.out")
-        self.check_non_empty_file("verifyerr.out")
-
     def test_verify_process_truncated(self):
         """
         Test verify in a 'wt' process on a table that is purposely damaged,
@@ -229,7 +187,7 @@ class test_util05(wttest.WiredTigerTestCase, suite_subprocess):
 
     def test_verify_process_zero_length(self):
         """
-        Test verify in a 'wt' process on a table that has junk added
+        Test verify in a 'wt' process on a zero-length table.
         """
         params = 'key_format=S,value_format=S'
         self.session.create('table:' + self.tablename, params)
diff --git a/test/suite/test_util11.py b/test/suite/test_util11.py
index 0bbb946a780..cb279b5d998 100644
--- a/test/suite/test_util11.py
+++ b/test/suite/test_util11.py
@@ -52,7 +52,7 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
         """
 
         # Construct what we think we'll find
-        filelist = 'file:WiredTiger.wt\n'
+        filelist = ''
         outfile = "listout.txt"
         self.runWt(["list"], outfilename=outfile)
         self.check_file_content(outfile, filelist)
@@ -72,7 +72,7 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
         self.populate(pfx + '3')
 
         # Construct what we think we'll find
-        filelist = 'file:WiredTiger.wt\n'
+        filelist = ''
         tablelist = ''
         for i in range(1, 6):
             filelist += 'file:' + pfx + str(i) + '.wt\n'
@@ -100,7 +100,7 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
         self.session.drop('table:' + pfx + '4', None)
 
         # Construct what we think we'll find
-        filelist = 'file:WiredTiger.wt\n'
+        filelist = ''
         tablelist = ''
         filelist += 'file:' + pfx + '1.wt\n'
         tablelist += 'table:' + pfx + '1\n'
@@ -134,7 +134,7 @@ class test_util11(wttest.WiredTigerTestCase, suite_subprocess):
         self.session.drop('table:' + pfx + '1', None)
 
         # Construct what we think we'll find
-        filelist = 'file:WiredTiger.wt\n'
+        filelist = ''
         outfile = "listout.txt"
         self.runWt(["list"], outfilename=outfile)
         self.check_file_content(outfile, filelist)
diff --git a/test/thread/Makefile.am b/test/thread/Makefile.am
index 5ffa1a8aada..d43e13b2bbe 100644
--- a/test/thread/Makefile.am
+++ b/test/thread/Makefile.am
@@ -2,8 +2,10 @@ INCLUDES = -I$(top_builddir)
 
 noinst_PROGRAMS = t
 t_LDADD = $(top_builddir)/libwiredtiger.la
-t_SOURCES = thread.h t.c load.c run.c stats.c
+t_SOURCES = thread.h file.c rw.c stats.c t.c
 t_LDFLAGS = -static
 
+TESTS = $(noinst_PROGRAMS)
+
 clean-local:
-	rm -rf WiredTiger __*
+	rm -rf WiredTiger* __* *.core
diff --git a/test/thread/load.c b/test/thread/file.c
index 71450ac973a..6012681d51b 100644
--- a/test/thread/load.c
+++ b/test/thread/file.c
@@ -7,18 +7,12 @@
 
 #include "thread.h"
 
-void
-load(void)
+static void
+file_create(void)
 {
-	WT_CURSOR *cursor;
-	WT_ITEM *key, _key, *value, _value;
 	WT_SESSION *session;
-	char *p, *end, keybuf[64], valuebuf[64], config[128];
-	u_int keyno;
 	int ret;
-
-	key = &_key;
-	value = &_value;
+	char *p, *end, config[128];
 
 	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
 		die("conn.session", ret);
@@ -34,12 +28,34 @@ load(void)
 		(void)snprintf(p, (size_t)(end - p), ",value_format=3t");
 
 	if ((ret = session->create(session, FNAME, config)) != 0)
-		die("session.create", ret);
+		if (ret != EEXIST)
+			die("session.create", ret);
+
+	if ((ret = session->close(session, NULL)) != 0)
+		die("session.close", ret);
+}
+
+void
+load(void)
+{
+	WT_CURSOR *cursor;
+	WT_ITEM *key, _key, *value, _value;
+	WT_SESSION *session;
+	char keybuf[64], valuebuf[64];
+	u_int keyno;
+	int ret;
+
+	file_create();
+
+	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+		die("conn.session", ret);
 
 	if ((ret =
 	    session->open_cursor(session, FNAME, NULL, "bulk", &cursor)) != 0)
 		die("cursor.open", ret);
 
+	key = &_key;
+	value = &_value;
 	for (keyno = 0; keyno < nkeys; ++keyno) {
 		if (ftype == ROW) {
 			key->data = keybuf;
diff --git a/test/thread/run.c b/test/thread/rw.c
index 0488bbc6cf4..613ddb46100 100644
--- a/test/thread/run.c
+++ b/test/thread/rw.c
@@ -45,7 +45,7 @@ r(void)
 }
 
 int
-run(u_int readers, u_int writers)
+rw_start(u_int readers, u_int writers)
 {
 	clock_t start, stop;
 	double seconds;
@@ -269,6 +269,6 @@ print_stats(u_int nthreads)
 
 	s = run_stats;
 	for (id = 0; id < nthreads; ++id, ++s)
-		printf("%2d: read: %6d, remove: %6d, update: %6d\n",
+		printf("%3d: read %6d, remove %6d, update %6d\n",
 		    id, s->reads, s->remove, s->update);
 }
diff --git a/test/thread/t.c b/test/thread/t.c
index f64b6112837..554c624073b 100644
--- a/test/thread/t.c
+++ b/test/thread/t.c
@@ -15,6 +15,7 @@ int session_per_op;				/* New session per operation */
 static char *progname;				/* Program name */
 static FILE *logfp;				/* Log file */
 
+static int  handle_error(WT_EVENT_HANDLER *, int, const char *);
 static int  handle_message(WT_EVENT_HANDLER *, const char *);
 static void onint(int);
 static void shutdown(void);
@@ -39,15 +40,12 @@ main(int argc, char *argv[])
 	nkeys = 1000;
 	nops = 10000;
 	readers = 10;
-	runs = 0;
+	runs = 1;
 	session_per_op = 0;
 	writers = 10;
 
-	while ((ch = getopt(argc, argv, "1C:k:l:n:R:r:St:W:")) != EOF)
+	while ((ch = getopt(argc, argv, "C:k:l:n:R:r:St:W:")) != EOF)
 		switch (ch) {
-		case '1':			/* One run */
-			runs = 1;
-			break;
 		case 'C':			/* wiredtiger_open config */
 			config_open = optarg;
 			break;
@@ -114,7 +112,7 @@ main(int argc, char *argv[])
 
 		load();				/* Load initial records */
 						/* Loop operations */
-		if (run(readers, writers))
+		if (rw_start(readers, writers))
 			return (EXIT_FAILURE);
 
 		stats();			/* Statistics */
@@ -132,7 +130,7 @@ static void
 wt_connect(char *config_open)
 {
 	static WT_EVENT_HANDLER event_handler = {
-		NULL,
+		handle_error,
 		handle_message,
 		NULL
 	};
@@ -183,15 +181,23 @@ shutdown(void)
 }
 
 static int
+handle_error(WT_EVENT_HANDLER *handler, int error, const char *errmsg)
+{
+	UNUSED(handler);
+	UNUSED(error);
+
+	return (fprintf(stderr, "%s\n", errmsg) < 0 ? -1 : 0);
+}
+
+static int
 handle_message(WT_EVENT_HANDLER *handler, const char *message)
 {
 	UNUSED(handler);
 
-	if (logfp == NULL)
-		printf("%s\n", message);
-	else
-		fprintf(logfp, "%s\n", message);
-	return (0);
+	if (logfp != NULL)
+		return (fprintf(logfp, "%s\n", message) < 0 ? -1 : 0);
+
+	return (printf("%s\n", message) < 0 ? -1 : 0);
 }
 
 /*
@@ -229,17 +235,16 @@ usage(void)
 {
 	fprintf(stderr,
 	    "usage: %s "
-	    "[-1S] [-C wiredtiger-config] [-k keys] [-l log]\n\t"
+	    "[-S] [-C wiredtiger-config] [-k keys] [-l log]\n\t"
 	    "[-n ops] [-R readers] [-r runs] [-t f|r|v] [-W writers]\n",
 	    progname);
 	fprintf(stderr, "%s",
-	    "\t-1 run once\n"
 	    "\t-C specify wiredtiger_open configuration arguments\n"
 	    "\t-k set number of keys to load\n"
 	    "\t-l specify a log file\n"
 	    "\t-n set number of operations each thread does\n"
 	    "\t-R set number of reading threads\n"
-	    "\t-r set number of runs\n"
+	    "\t-r set number of runs (0 for continuous)\n"
 	    "\t-S open/close a session on every operation\n"
 	    "\t-t set a file type (fix | row | var)\n"
 	    "\t-W set number of writing threads\n");
diff --git a/test/thread/thread.h b/test/thread/thread.h
index 5fb9a1f4e2f..0cb70e43f51 100644
--- a/test/thread/thread.h
+++ b/test/thread/thread.h
@@ -39,5 +39,5 @@ void die(const char *, int) __attribute__((noreturn));
 void die(const char *, int);
 #endif
 void load(void);
-int  run(u_int, u_int);
+int  rw_start(u_int, u_int);
 void stats(void);
author	Michael Cahill <michael.cahill@wiredtiger.com>	2012-06-04 17:05:36 +1000
committer	Michael Cahill <michael.cahill@wiredtiger.com>	2012-06-04 17:05:36 +1000
commit	953b622700125746202c116452638e0181db9165 (patch)
tree	bc03d820966af3cc87a2181137df6374ab7faacc
parent	d20711f22bad7fcd401367977d883cd1bba9c017 (diff)
parent	97cb94c0ccdf8e4554e4b233271f9ba219fee811 (diff)
download	mongo-953b622700125746202c116452638e0181db9165.tar.gz