summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
commit7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63 (patch)
tree0e0fd0f6b190dbcd283ca3c4040b5dcd89a94014 /src
parent6f87637341366fb90f890a5ef860e90c57b36d1f (diff)
downloadmongo-7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63.tar.gz
Restructure the tree to ease merge.
refs #27 --HG-- branch : keith rename : lint/fl => dist/lint/fl rename : lint/lint.current => dist/lint/lint.current rename : inc_posix/bitstring.h => include/bitstring.h rename : inc_posix/btree.h => include/btree.h rename : inc_posix/cache.h => include/cache.h rename : inc_posix/debug.h => include/debug.h rename : inc_posix/extern.h => include/extern.h rename : inc_posix/fh.h => include/fh.h rename : inc_posix/mem.h => include/mem.h rename : inc_posix/misc.h => include/misc.h rename : inc_posix/mutex.h => include/mutex.h rename : inc_posix/queue.h => include/queue.h rename : inc_posix/serial.h => include/serial.h rename : inc_posix/stat.h => include/stat.h rename : inc_posix/util.h => include/util.h rename : inc_posix/walk.h => include/walk.h rename : inc_posix/wiredtiger.in => include/wiredtiger.in rename : inc_posix/wt_internal.in => include/wt_internal.in rename : btree/bt_alloc.c => src/btree/bt_alloc.c rename : btree/bt_bulk.c => src/btree/bt_bulk.c rename : btree/bt_cache.c => src/btree/bt_cache.c rename : btree/bt_close.c => src/btree/bt_close.c rename : btree/bt_cmp.c => src/btree/bt_cmp.c rename : btree/bt_debug.c => src/btree/bt_debug.c rename : btree/bt_desc.c => src/btree/bt_desc.c rename : btree/bt_discard.c => src/btree/bt_discard.c rename : btree/bt_dump.c => src/btree/bt_dump.c rename : btree/bt_evict.c => src/btree/bt_evict.c rename : btree/bt_misc.c => src/btree/bt_misc.c rename : btree/bt_open.c => src/btree/bt_open.c rename : btree/bt_ovfl.c => src/btree/bt_ovfl.c rename : btree/bt_page.c => src/btree/bt_page.c rename : btree/bt_read.c => src/btree/bt_read.c rename : btree/bt_reconcile.c => src/btree/bt_reconcile.c rename : btree/bt_ret.c => src/btree/bt_ret.c rename : btree/bt_rw.c => src/btree/bt_rw.c rename : btree/bt_stat.c => src/btree/bt_stat.c rename : btree/bt_sync.c => src/btree/bt_sync.c rename : btree/bt_vrfy.c => src/btree/bt_vrfy.c rename : btree/bt_walk.c => src/btree/bt_walk.c rename : btree/c_drain.c => src/btree/c_drain.c 
rename : btree/c_init.c => src/btree/c_init.c rename : btree/c_page.c => src/btree/c_page.c rename : btree/c_read.c => src/btree/c_read.c rename : btree/col_get.c => src/btree/col_get.c rename : btree/col_put.c => src/btree/col_put.c rename : btree/col_srch.c => src/btree/col_srch.c rename : btree/row_get.c => src/btree/row_get.c rename : btree/row_put.c => src/btree/row_put.c rename : btree/row_srch.c => src/btree/row_srch.c rename : db/db_err.c => src/db/db_err.c rename : db/db_getset.c => src/db/db_getset.c rename : db/db_handle.c => src/db/db_handle.c rename : db/db_huffman.c => src/db/db_huffman.c rename : db/db_open.c => src/db/db_open.c rename : db/db_stat.c => src/db/db_stat.c rename : db/db_sync.c => src/db/db_sync.c rename : env/env_err.c => src/env/env_err.c rename : env/env_getset.c => src/env/env_getset.c rename : env/env_global.c => src/env/env_global.c rename : env/env_handle.c => src/env/env_handle.c rename : env/env_init.c => src/env/env_init.c rename : env/env_msg.c => src/env/env_msg.c rename : env/env_open.c => src/env/env_open.c rename : env/env_stat.c => src/env/env_stat.c rename : env/env_sync.c => src/env/env_sync.c rename : env/env_toc.c => src/env/env_toc.c rename : env/env_workq.c => src/env/env_workq.c rename : os_posix/os_abort.c => src/os_posix/os_abort.c rename : os_posix/os_alloc.c => src/os_posix/os_alloc.c rename : os_posix/os_filesize.c => src/os_posix/os_filesize.c rename : os_posix/os_fsync.c => src/os_posix/os_fsync.c rename : os_posix/os_mtx.c => src/os_posix/os_mtx.c rename : os_posix/os_open.c => src/os_posix/os_open.c rename : os_posix/os_rw.c => src/os_posix/os_rw.c rename : os_posix/os_sleep.c => src/os_posix/os_sleep.c rename : os_posix/os_thread.c => src/os_posix/os_thread.c rename : os_posix/os_yield.c => src/os_posix/os_yield.c rename : support/api.c => src/support/api.c rename : support/cksum.c => src/support/cksum.c rename : support/err.c => src/support/err.c rename : support/hazard.c => src/support/hazard.c rename 
: support/huffman.c => src/support/huffman.c rename : support/pow.c => src/support/pow.c rename : support/prime.c => src/support/prime.c rename : support/progress.c => src/support/progress.c rename : support/scratch.c => src/support/scratch.c rename : support/serial.c => src/support/serial.c rename : support/simple_setup.c => src/support/simple_setup.c rename : support/stat.c => src/support/stat.c rename : support/strerror.c => src/support/strerror.c rename : support/version.c => src/support/version.c rename : utilities/db_dump/util_dump.c => src/utilities/db_dump/util_dump.c rename : utilities/db_load/util_load.c => src/utilities/db_load/util_load.c rename : utilities/db_stat/util_stat.c => src/utilities/db_stat/util_stat.c rename : utilities/db_verify/util_verify.c => src/utilities/db_verify/util_verify.c
Diffstat (limited to 'src')
-rw-r--r--src/btree/bt_alloc.c106
-rw-r--r--src/btree/bt_bulk.c1467
-rw-r--r--src/btree/bt_cache.c133
-rw-r--r--src/btree/bt_close.c86
-rw-r--r--src/btree/bt_cmp.c74
-rw-r--r--src/btree/bt_debug.c661
-rw-r--r--src/btree/bt_desc.c132
-rw-r--r--src/btree/bt_discard.c234
-rw-r--r--src/btree/bt_dump.c472
-rw-r--r--src/btree/bt_evict.c944
-rw-r--r--src/btree/bt_misc.c175
-rw-r--r--src/btree/bt_open.c279
-rw-r--r--src/btree/bt_ovfl.c72
-rw-r--r--src/btree/bt_page.c656
-rw-r--r--src/btree/bt_read.c272
-rw-r--r--src/btree/bt_reconcile.c982
-rw-r--r--src/btree/bt_ret.c179
-rw-r--r--src/btree/bt_rw.c85
-rw-r--r--src/btree/bt_stat.c348
-rw-r--r--src/btree/bt_sync.c61
-rw-r--r--src/btree/bt_vrfy.c1346
-rw-r--r--src/btree/bt_walk.c306
-rw-r--r--src/btree/c_drain.c940
-rw-r--r--src/btree/c_init.c133
-rw-r--r--src/btree/c_page.c69
-rw-r--r--src/btree/c_read.c273
-rw-r--r--src/btree/col_get.c40
-rw-r--r--src/btree/col_put.c229
-rw-r--r--src/btree/col_srch.c211
-rw-r--r--src/btree/row_get.c61
-rw-r--r--src/btree/row_put.c288
-rw-r--r--src/btree/row_srch.c196
-rw-r--r--src/db/db_err.c64
-rw-r--r--src/db/db_getset.c85
-rw-r--r--src/db/db_handle.c184
-rw-r--r--src/db/db_huffman.c233
-rw-r--r--src/db/db_open.c104
-rw-r--r--src/db/db_stat.c72
-rw-r--r--src/db/db_sync.c20
-rw-r--r--src/env/env_err.c83
-rw-r--r--src/env/env_getset.c70
-rw-r--r--src/env/env_global.c72
-rw-r--r--src/env/env_handle.c137
-rw-r--r--src/env/env_init.c41
-rw-r--r--src/env/env_msg.c138
-rw-r--r--src/env/env_open.c132
-rw-r--r--src/env/env_stat.c86
-rw-r--r--src/env/env_sync.c30
-rw-r--r--src/env/env_toc.c238
-rw-r--r--src/env/env_workq.c94
-rw-r--r--src/os_posix/os_abort.c25
-rw-r--r--src/os_posix/os_alloc.c359
-rw-r--r--src/os_posix/os_filesize.c27
-rw-r--r--src/os_posix/os_fsync.c29
-rw-r--r--src/os_posix/os_mtx.c148
-rw-r--r--src/os_posix/os_open.c128
-rw-r--r--src/os_posix/os_rw.c56
-rw-r--r--src/os_posix/os_sleep.c25
-rw-r--r--src/os_posix/os_thread.c31
-rw-r--r--src/os_posix/os_yield.c24
-rw-r--r--src/support/api.c1597
-rw-r--r--src/support/cksum.c134
-rw-r--r--src/support/err.c247
-rw-r--r--src/support/hazard.c133
-rw-r--r--src/support/huffman.c692
-rw-r--r--src/support/pow.c56
-rw-r--r--src/support/prime.c75
-rw-r--r--src/support/progress.c17
-rw-r--r--src/support/scratch.c98
-rw-r--r--src/support/serial.c123
-rw-r--r--src/support/simple_setup.c94
-rw-r--r--src/support/stat.c370
-rw-r--r--src/support/strerror.c41
-rw-r--r--src/support/version.c26
-rw-r--r--src/utilities/db_dump/util_dump.c83
-rw-r--r--src/utilities/db_load/util_load.c292
-rw-r--r--src/utilities/db_stat/util_stat.c67
-rw-r--r--src/utilities/db_verify/util_verify.c74
78 files changed, 18464 insertions, 0 deletions
diff --git a/src/btree/bt_alloc.c b/src/btree/bt_alloc.c
new file mode 100644
index 00000000000..4477ce4e0f9
--- /dev/null
+++ b/src/btree/bt_alloc.c
@@ -0,0 +1,106 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_file_extend(WT_TOC *, uint32_t *, uint32_t);
+
+#ifdef HAVE_DIAGNOSTIC
+static int __wt_file_free_write(WT_TOC *, uint32_t, uint32_t);
+#endif
+
+/*
+ * __wt_file_alloc --
+ * Alloc a chunk of space from the underlying file.
+ *
+ * Sets *addrp to the address of the newly allocated chunk; size is in
+ * bytes.  Space always comes from extending the file (see
+ * __wt_file_extend below), so this cannot fail and always returns 0.
+ */
+int
+__wt_file_alloc(WT_TOC *toc, uint32_t *addrp, uint32_t size)
+{
+ IDB *idb;
+
+ idb = toc->db->idb;
+
+ /* Take the space by growing the file. */
+ __wt_file_extend(toc, addrp, size);
+
+ WT_STAT_INCR(idb->stats, DB_ALLOC);
+
+ return (0);
+}
+
+/*
+ * __wt_file_extend --
+ * Extend the file to allocate space.
+ *
+ * Returns the current end-of-file (converted to a page address with
+ * WT_OFF_TO_ADDR) through addrp, then bumps the cached file size by
+ * the allocation size.  Only the in-memory WT_FH file size is
+ * updated; no I/O happens here.
+ */
+static void
+__wt_file_extend(WT_TOC *toc, uint32_t *addrp, uint32_t size)
+{
+ DB *db;
+ IDB *idb;
+ WT_FH *fh;
+
+ db = toc->db;
+ idb = db->idb;
+ fh = idb->fh;
+
+ /* Extend the file. */
+ *addrp = WT_OFF_TO_ADDR(db, fh->file_size);
+ fh->file_size += size;
+
+ WT_STAT_INCR(idb->stats, DB_ALLOC_FILE);
+}
+
+/*
+ * __wt_file_free --
+ * Free a chunk of space to the underlying file.
+ *
+ * No free list is maintained here: freeing only updates statistics,
+ * and in HAVE_DIAGNOSTIC builds overwrites the chunk so a stale page
+ * image can't be read back by accident.
+ */
+int
+__wt_file_free(WT_TOC *toc, uint32_t addr, uint32_t size)
+{
+ WT_STATS *stats;
+
+ stats = toc->db->idb->stats;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_file_free_write(toc, addr, size));
+#endif
+
+ WT_STAT_INCR(stats, DB_FREE);
+
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_file_free_write --
+ * Overwrite the space in the file so future reads don't get fooled.
+ * DIAGNOSTIC only.
+ */
+static int
+__wt_file_free_write(WT_TOC *toc, uint32_t addr, uint32_t size)
+{
+ DBT *tmp;
+ WT_PAGE_DISK *dsk;
+ uint32_t allocsize;
+ int ret;
+
+ allocsize = toc->db->allocsize;
+ ret = 0;
+
+ /* Build one allocation unit of zeroes in a scratch buffer. */
+ WT_RET(__wt_scr_alloc(toc, allocsize, &tmp));
+ memset(tmp->data, 0, allocsize);
+
+ /* Stamp every allocation unit in the chunk as a free page. */
+ dsk = tmp->data;
+ dsk->type = WT_PAGE_FREE;
+ for (; size >= allocsize; size -= allocsize)
+ WT_ERR(__wt_page_disk_write(toc, dsk, addr++, allocsize));
+
+err: __wt_scr_release(&tmp);
+ return (ret);
+}
+#endif
diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c
new file mode 100644
index 00000000000..f88c0d5e8ae
--- /dev/null
+++ b/src/btree/bt_bulk.c
@@ -0,0 +1,1467 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_STACK --
+ * We maintain a stack of parent pages as we build the tree, encapsulated
+ * in this structure.
+ */
+typedef struct {
+ WT_PAGE *page; /* page header */
+ uint8_t *first_free; /* page's first free byte */
+ uint32_t space_avail; /* page's space available */
+
+ DBT *tmp; /* page-in-a-buffer */
+ void *data; /* last on-page WT_COL/WT_ROW */
+} WT_STACK_ELEM;
+/* The stack itself: an array of per-level elements plus its size. */
+typedef struct {
+ WT_STACK_ELEM *elem; /* stack */
+ u_int size; /* stack size */
+} WT_STACK;
+
+static int __wt_bulk_dbt_copy(ENV *, DBT *, DBT *);
+static int __wt_bulk_dup_offpage(WT_TOC *, DBT **, DBT **, DBT *, WT_ITEM *,
+ uint32_t, uint32_t, WT_OFF *, int (*)(DB *, DBT **, DBT **));
+static int __wt_bulk_fix(WT_TOC *, void (*)(const char *,
+ uint64_t), int (*)(DB *, DBT **, DBT **));
+static int __wt_bulk_ovfl_copy(WT_TOC *, WT_OVFL *, WT_OVFL *);
+static int __wt_bulk_ovfl_write(WT_TOC *, DBT *, WT_OVFL *);
+static int __wt_bulk_promote(
+ WT_TOC *, WT_PAGE *, uint64_t, WT_STACK *, u_int, uint32_t *);
+static int __wt_bulk_scratch_page(
+ WT_TOC *, uint32_t, uint32_t, uint32_t, WT_PAGE **, DBT **);
+static int __wt_bulk_stack_put(WT_TOC *, WT_STACK *);
+static int __wt_bulk_var(WT_TOC *, uint32_t, void (*)(const char *,
+ uint64_t), int (*)(DB *, DBT **, DBT **));
+static int __wt_item_build_key(WT_TOC *, DBT *, WT_ITEM *, WT_OVFL *);
+
+/*
+ * __wt_db_bulk_load --
+ * Db.bulk_load method.
+ *
+ * f is an optional progress-reporting callback (called with the
+ * table name and running insert count); cb is the application's
+ * input callback, returning key/data pairs until it returns non-zero.
+ */
+int
+__wt_db_bulk_load(WT_TOC *toc, uint32_t flags,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ IDB *idb;
+ uint32_t addr;
+
+ db = toc->db;
+ idb = db->idb;
+
+ /*
+ * XXX
+ * Write out the description record -- this goes away when we figure
+ * out how the table schema is going to work, but for now, we use the
+ * first sector, and this file extend makes sure we don't allocate it
+ * as a table page.
+ */
+ WT_RET(__wt_file_alloc(toc, &addr, 512));
+
+ /* Column stores accept no flags. */
+ if (F_ISSET(idb, WT_COLUMN))
+ WT_DB_FCHK(db, "DB.bulk_load", flags, 0);
+
+ /*
+ * There are two styles of bulk-load: variable length pages or
+ * fixed-length pages.
+ */
+ if (F_ISSET(idb, WT_COLUMN) && db->fixed_len != 0)
+ WT_RET(__wt_bulk_fix(toc, f, cb));
+ else
+ WT_RET(__wt_bulk_var(toc, flags, f, cb));
+
+ /* Get a permanent root page reference. */
+ return (__wt_root_pin(toc));
+}
+
+/*
+ * __wt_bulk_fix
+ * Db.bulk_load method for column-store, fixed-length database pages.
+ *
+ * Reads data items from cb() until it returns non-zero (1 means
+ * end-of-input), packing fixed-length records -- optionally with a
+ * leading uint16_t repeat count when run-length encoding is on --
+ * onto leaf pages, promoting a key to the parent as each page fills.
+ */
+static int
+__wt_bulk_fix(WT_TOC *toc,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, *tmp;
+ IDB *idb;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_STACK stack;
+ uint64_t insert_cnt;
+ uint32_t len, space_avail;
+ uint16_t *last_repeat;
+ uint8_t *first_free, *last_data;
+ int rle, ret;
+
+ db = toc->db;
+ tmp = NULL;
+ idb = db->idb;
+ insert_cnt = 0;
+ WT_CLEAR(stack);
+
+ rle = F_ISSET(idb, WT_RLE) ? 1 : 0;
+
+ /* Figure out how large is the chunk we're storing on the page. */
+ len = db->fixed_len;
+ if (rle)
+ len += sizeof(uint16_t);
+
+ /* Get a scratch buffer and make it look like our work page. */
+ WT_ERR(__wt_bulk_scratch_page(toc, db->leafmin,
+ rle ? WT_PAGE_COL_RLE : WT_PAGE_COL_FIX, WT_LLEAF, &page, &tmp));
+ dsk = page->dsk;
+ dsk->start_recno = 1;
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (key != NULL) {
+ __wt_api_db_errx(db,
+ "column database keys are implied and so should "
+ "not be set by the bulk load input routine");
+ ret = WT_ERROR;
+ goto err;
+ }
+ if (data->size != db->fixed_len)
+ WT_ERR(__wt_database_wrong_fixed_size(toc, data->size));
+
+ /*
+ * We use the high bit of the data field as a "deleted" value,
+ * make sure the user's data doesn't set it.
+ */
+ if (WT_FIX_DELETE_ISSET(data->data)) {
+ __wt_api_db_errx(db,
+ "the first bit may not be stored in fixed-length "
+ "column-store database items");
+ ret = WT_ERROR;
+ goto err;
+ }
+
+ /* Report on progress every 100 inserts. */
+ if (f != NULL && ++insert_cnt % 100 == 0)
+ f(toc->name, insert_cnt);
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+
+ /*
+ * If doing run-length encoding, check to see if this record
+ * matches the last data inserted. If there's a match try
+ * and increment that item's repeat count instead of entering
+ * new data.
+ *
+ * last_repeat/last_data are only valid once an item has been
+ * stored with rle set; the dsk->u.entries != 0 guard ensures
+ * they aren't read before that first store.
+ */
+ if (rle && dsk->u.entries != 0)
+ if (*last_repeat < UINT16_MAX &&
+ memcmp(last_data, data->data, data->size) == 0) {
+ ++*last_repeat;
+ ++page->records;
+ WT_STAT_INCR(idb->stats, REPEAT_COUNT);
+ continue;
+ }
+
+ /*
+ * We now have the data item to store on the page. If there
+ * is insufficient space on the current page, allocate a new
+ * one.
+ */
+ if (len > space_avail) {
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ */
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ dsk->u.entries = 0;
+ page->records = 0;
+ dsk->start_recno = insert_cnt;
+ WT_ERR(
+ __wt_file_alloc(toc, &page->addr, db->leafmin));
+ __wt_set_ff_and_sa_from_offset(page,
+ WT_PAGE_BYTE(page), &first_free, &space_avail);
+ }
+
+ ++dsk->u.entries;
+ ++page->records;
+
+ /*
+ * Copy the data item onto the page -- if doing run-length
+ * encoding, track the location of the item for comparison.
+ */
+ if (rle) {
+ last_repeat = (uint16_t *)first_free;
+ *last_repeat = 1;
+ first_free += sizeof(uint16_t);
+ space_avail -= sizeof(uint16_t);
+ last_data = first_free;
+ }
+ memcpy(first_free, data->data, data->size);
+ first_free += data->size;
+ space_avail -= data->size;
+ }
+
+ /* A ret of 1 just means we've reached the end of the input. */
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ /* Promote a key from any partially-filled page and write it. */
+ if (dsk->u.entries != 0) {
+ /*
+ * NOTE(review): unlike __wt_bulk_var, this promote's return
+ * is not wrapped in WT_ERR -- a promote failure here can be
+ * masked if the following page write succeeds.  Confirm this
+ * is intentional.
+ */
+ ret = __wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL);
+ WT_ERR(__wt_page_write(toc, page));
+ }
+
+ /* Wrap up reporting. */
+ if (f != NULL)
+ f(toc->name, insert_cnt);
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_var --
+ * Db.bulk_load method for row or column-store variable-length database
+ * pages.
+ *
+ * Reads key/data pairs from cb() until it returns non-zero (1 means
+ * end-of-input), building WT_ITEMs and packing them onto leaf pages,
+ * promoting a key to the parent as each page fills.  For row stores
+ * with WT_DUPLICATES set, runs of equal keys are tracked and, once a
+ * run crosses roughly 25% of a page, pushed into an off-page
+ * duplicate tree via __wt_bulk_dup_offpage.
+ */
+static int
+__wt_bulk_var(WT_TOC *toc, uint32_t flags,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, key_copy, data_copy;
+ DBT *lastkey, *lastkey_copy, lastkey_std;
+ DBT *tmp1, *tmp2;
+ ENV *env;
+ IDB *idb;
+ WT_ITEM key_item, data_item, *dup_key, *dup_data;
+ WT_OFF off;
+ WT_OVFL key_ovfl, data_ovfl;
+ WT_PAGE *page, *next;
+ WT_STACK stack;
+ uint64_t insert_cnt;
+ uint32_t dup_count, dup_space, len, next_space_avail, space_avail;
+ uint8_t *first_free, *next_first_free, *p, type;
+ int ret;
+
+ db = toc->db;
+ tmp1 = tmp2 = NULL;
+ env = toc->env;
+ idb = db->idb;
+ ret = 0;
+
+ WT_CLEAR(stack);
+ dup_space = dup_count = 0;
+ insert_cnt = 0;
+ type = F_ISSET(idb, WT_COLUMN) ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF;
+
+ lastkey = &lastkey_std;
+ WT_CLEAR(data_copy);
+ WT_CLEAR(key_copy);
+ WT_CLEAR(key_item);
+ WT_CLEAR(lastkey_std);
+ /*
+ * NOTE(review): if this first allocation fails, the err: label
+ * tests lastkey_copy before it has been assigned -- confirm
+ * __wt_scr_alloc clears *p on failure.
+ */
+ WT_ERR(__wt_scr_alloc(toc, 0, &lastkey_copy));
+
+ /* Get a scratch buffer and make it look like our work page. */
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, db->leafmin, type, WT_LLEAF, &page, &tmp1));
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+ if (type == WT_PAGE_COL_VAR)
+ page->dsk->start_recno = 1;
+
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (F_ISSET(idb, WT_COLUMN) ) {
+ if (key != NULL) {
+ __wt_api_db_errx(db,
+ "column database keys are implied and "
+ "so should not be returned by the bulk "
+ "load input routine");
+ ret = WT_ERROR;
+ goto err;
+ }
+ } else {
+ if (key == NULL && !LF_ISSET(WT_DUPLICATES)) {
+ __wt_api_db_errx(db,
+ "keys must be specified unless duplicates "
+ "are configured");
+ ret = WT_ERROR;
+ goto err;
+ }
+ if (key != NULL && key->size == 0) {
+ __wt_api_db_errx(db,
+ "zero-length keys are not supported");
+ ret = WT_ERROR;
+ goto err;
+ }
+ }
+
+ /* Report on progress every 100 inserts. */
+ if (f != NULL && ++insert_cnt % 100 == 0)
+ f(toc->name, insert_cnt);
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+
+ /*
+ * We don't have a key to store on the page if we're building a
+ * column-store, and we don't store the key on the page in the
+ * case of a row-store duplicate data item. The check from here
+ * on is if "key == NULL" for both cases, that is, there's no
+ * key to store.
+ */
+
+skip_read: /*
+ * We pushed a set of duplicates off-page, and that routine
+ * returned an ending key/data pair to us.
+ */
+
+ /*
+ * Copy the caller's DBTs, we don't want to modify them. But,
+ * copy them carefully, all we want is a pointer and a length.
+ */
+ if (key != NULL) {
+ key_copy.data = key->data;
+ key_copy.size = key->size;
+ key = &key_copy;
+ }
+ /* NOTE(review): assumes cb() always returns a non-NULL data
+ * item -- confirm against the bulk-load callers. */
+ data_copy.data = data->data;
+ data_copy.size = data->size;
+ data = &data_copy;
+
+ /* Build the data item we're going to store on the page. */
+ WT_ERR(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, 0));
+
+ /*
+ * Check for duplicate keys; we don't store the key on the page
+ * in the case of a duplicate.
+ *
+ * !!!
+ * Do a fast check of the old and new sizes -- note checking
+ * lastkey->size is safe -- it's initialized to 0, and we do
+ * not allow zero-length keys.
+ */
+ if (LF_ISSET(WT_DUPLICATES) &&
+ (key == NULL ||
+ (lastkey->size == key->size &&
+ db->btree_compare(db, lastkey, key) == 0))) {
+ /*
+ * The first duplicate in the set is already on the
+ * page, but with an item type set to WT_ITEM_DATA or
+ * WT_ITEM_DATA_OVFL. Correct the type and dup_count.
+ */
+ if (++dup_count == 1) {
+ dup_count = 2;
+ WT_ITEM_SET_TYPE(dup_data,
+ WT_ITEM_TYPE(dup_data) == WT_ITEM_DATA ?
+ WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL);
+ }
+
+ /* Reset the type of the current item to a duplicate. */
+ WT_ITEM_SET_TYPE(&data_item,
+ WT_ITEM_TYPE(&data_item) == WT_ITEM_DATA ?
+ WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL);
+
+ WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED);
+
+ key = NULL;
+ } else {
+ /*
+ * It's a new key, but if duplicates are possible we'll
+ * need a copy of the key for comparison with the next
+ * key. If the key is Huffman encoded or an overflow
+ * object, we can't use the on-page version, we have to
+ * save a copy.
+ */
+ if (LF_ISSET(WT_DUPLICATES) &&
+ (key->size > db->leafitemsize ||
+ idb->huffman_key != NULL)) {
+ WT_ERR(
+ __wt_bulk_dbt_copy(env, key, lastkey_copy));
+ lastkey = lastkey_copy;
+ } else
+ lastkey = NULL;
+
+ dup_count = 0;
+ }
+
+ /* Build the key item we're going to store on the page. */
+ if (key != NULL)
+ WT_ERR(__wt_item_build_key(
+ toc, key, &key_item, &key_ovfl));
+
+ /*
+ * We now have the key/data items to store on the page. If
+ * there is insufficient space on the current page, allocate
+ * a new one.
+ */
+ if ((key == NULL ? 0 : WT_ITEM_SPACE_REQ(key->size)) +
+ WT_ITEM_SPACE_REQ(data->size) > space_avail) {
+ WT_ERR(__wt_bulk_scratch_page(toc,
+ db->leafmin, type, WT_LLEAF, &next, &tmp2));
+ __wt_set_ff_and_sa_from_offset(next,
+ WT_PAGE_BYTE(next),
+ &next_first_free, &next_space_avail);
+ if (type == WT_PAGE_COL_VAR)
+ next->dsk->start_recno = insert_cnt;
+
+ /*
+ * If in the middle of loading a set of duplicates, but
+ * the set hasn't yet reached the boundary where we'd
+ * push them offpage, we can't split them across the two
+ * pages. Move the entire set to the new page. This
+ * can waste up to 25% of the old page, but it would be
+ * difficult and messy to move them and then go back
+ * and fix things up if and when they moved offpage.
+ *
+ * We use a check of dup_count instead of checking the
+ * WT_DUPLICATES flag, since we have to check it anyway.
+ */
+ if (dup_count != 0) {
+ /*
+ * Reset the page entry and record counts -- we
+ * are moving a single key plus the duplicate
+ * set.
+ *
+ * Since dup_count was already incremented to
+ * reflect the data item we're loading now, it
+ * is the right number of elements to move, that
+ * is, move (dup_count - 1) + 1 for the key.
+ */
+ page->dsk->u.entries -= dup_count;
+ page->records -= dup_count - 1;
+ next->dsk->u.entries += dup_count;
+ next->records += dup_count - 1;
+
+ /*
+ * Move the duplicate set and adjust the page
+ * information for "next" -- we don't have to
+ * fix up "page", we're never going to use it
+ * again.
+ */
+ len =
+ (uint32_t)(first_free - (uint8_t *)dup_key);
+ memcpy(next_first_free, dup_key, len);
+ next_first_free += len;
+ next_space_avail -= len;
+
+ /*
+ * We'll never have to move this dup set to
+ * another primary page -- if the dup set
+ * continues to grow, it will be moved
+ * off-page. We still need to know where
+ * the dup set starts, though, for the
+ * possible move off-page: it's the second
+ * entry on the page, where the first entry
+ * is the dup set's key.
+ */
+ dup_key = (WT_ITEM *)WT_PAGE_BYTE(next);
+ dup_data = (WT_ITEM *)((uint8_t *)dup_key +
+ WT_ITEM_SPACE_REQ(WT_ITEM_LEN(dup_key)));
+
+ /*
+ * The "lastkey" value just moved to a new page.
+ * If it's an overflow item, we have a copy; if
+ * it's not, then we need to reset it.
+ */
+ if (lastkey == &lastkey_std) {
+ lastkey_std.data =
+ WT_ITEM_BYTE(dup_key);
+ lastkey_std.size = WT_ITEM_LEN(dup_key);
+ }
+ }
+
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ */
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ __wt_scr_release(&tmp1);
+
+ /*
+ * Discard the last page, and switch to the next page.
+ *
+ * XXX
+ * The obvious speed-up here is to re-initialize page
+ * instead of discarding it and acquiring it again as
+ * as soon as the just-allocated page fills up. I am
+ * not doing that deliberately: eventually we'll use
+ * asynchronous I/O in bulk load, which means the page
+ * won't be reusable until the I/O completes.
+ */
+ page = next;
+ first_free = next_first_free;
+ space_avail = next_space_avail;
+ next = NULL;
+ next_first_free = NULL;
+ next_space_avail = 0;
+ tmp1 = tmp2;
+ tmp2 = NULL;
+ }
+
+ ++page->records;
+
+ /* Copy the key item onto the page. */
+ if (key != NULL) {
+ ++page->dsk->u.entries;
+
+ memcpy(first_free, &key_item, sizeof(key_item));
+ memcpy(first_free +
+ sizeof(key_item), key->data, key->size);
+ space_avail -= WT_ITEM_SPACE_REQ(key->size);
+
+ /*
+ * If processing duplicates we'll need a copy of the key
+ * for comparison with the next key. If the key was an
+ * overflow or Huffman encoded item, we already have a
+ * copy -- otherwise, use the copy we just put on the
+ * page.
+ *
+ * We also save the location for the key of any current
+ * duplicate set in case we have to move the set to a
+ * different page (the case where a duplicate set isn't
+ * large enough to move offpage, but doesn't entirely
+ * fit on this page).
+ */
+ if (LF_ISSET(WT_DUPLICATES)) {
+ if (lastkey == NULL) {
+ lastkey = &lastkey_std;
+ lastkey_std.data =
+ WT_ITEM_BYTE(first_free);
+ lastkey_std.size = key->size;
+ }
+ dup_key = (WT_ITEM *)first_free;
+ }
+ first_free += WT_ITEM_SPACE_REQ(key->size);
+ }
+
+ /* Copy the data item onto the page. */
+ ++page->dsk->u.entries;
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free + sizeof(data_item), data->data, data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+
+ /*
+ * If duplicates: if this isn't a duplicate data item, save
+ * the item location, since it's potentially the first of a
+ * duplicate data set, and we need to know where duplicate
+ * data sets start. Additionally, reset the counter and
+ * space calculation.
+ */
+ if (LF_ISSET(WT_DUPLICATES) && dup_count == 0) {
+ dup_space = data->size;
+ dup_data = (WT_ITEM *)first_free;
+ }
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+
+ /*
+ * If duplicates: check to see if the duplicate set crosses
+ * the (roughly) 25% of the page space boundary. If it does,
+ * move it offpage.
+ */
+ if (LF_ISSET(WT_DUPLICATES) && dup_count != 0) {
+ dup_space += data->size;
+
+ if (dup_space < db->leafmin / db->btree_dup_offpage)
+ continue;
+
+ /*
+ * Move the duplicate set off our page, and read in the
+ * rest of the off-page duplicate set.
+ */
+ WT_ERR(__wt_bulk_dup_offpage(toc, &key, &data, lastkey,
+ dup_data,
+ (uint32_t)(first_free - (uint8_t *)dup_data),
+ dup_count, &off, cb));
+
+ /* Reset the page entry and record counts. */
+ page->dsk->u.entries -= (dup_count - 1);
+ page->records -= dup_count;
+ page->records += WT_RECORDS(&off);
+
+ /*
+ * Replace the duplicate set with a WT_OFF structure,
+ * that is, we've replaced dup_count entries with a
+ * single entry.
+ */
+ WT_ITEM_SET(&data_item, WT_ITEM_OFF, sizeof(WT_OFF));
+ p = (uint8_t *)dup_data;
+ memcpy(p, &data_item, sizeof(data_item));
+ memcpy(p + sizeof(data_item), &off, sizeof(WT_OFF));
+ __wt_set_ff_and_sa_from_offset(page,
+ (uint8_t *)p + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)),
+ &first_free, &space_avail);
+
+ /* Reset local counters. */
+ dup_count = dup_space = 0;
+
+ goto skip_read;
+ }
+ }
+
+ /* A ret of 1 just means we've reached the end of the input. */
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ /* Promote a key from any partially-filled page and write it. */
+ if (page->dsk->u.entries != 0) {
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ }
+
+ /* Wrap up reporting. */
+ if (f != NULL)
+ f(toc->name, insert_cnt);
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (lastkey_copy != NULL)
+ __wt_scr_release(&lastkey_copy);
+ if (tmp1 != NULL)
+ __wt_scr_release(&tmp1);
+ if (tmp2 != NULL)
+ __wt_scr_release(&tmp2);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_dup_offpage --
+ * Move the last set of duplicates on the page to a page of their own,
+ * then load the rest of the duplicate set.
+ *
+ * Returns 0 on success, 1 when cb() signalled end-of-input, or an
+ * error code.  dup_len is the byte length of the on-page duplicate
+ * set being moved.
+ */
+static int
+__wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey,
+ WT_ITEM *dup_data, uint32_t dup_len, uint32_t dup_count, WT_OFF *off,
+ int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, *tmp;
+ IDB *idb;
+ WT_ITEM data_item;
+ WT_OVFL data_ovfl;
+ WT_PAGE *page;
+ WT_STACK stack;
+ uint32_t root_addr, space_avail;
+ uint8_t *first_free;
+ int ret, success_return;
+
+ db = toc->db;
+ idb = db->idb;
+ success_return = 0;
+
+ /*
+ * This routine is the same as the bulk load routine, except it loads
+ * only data items into off-page duplicate trees. It's passed a lot
+ * of state from the bulk load routine, and updates that state as a
+ * side-effect.
+ *
+ * In summary, the bulk load routine stops loading a primary btree leaf
+ * page, calls us to load a set of duplicate data items into a separate
+ * btree, and then continues on with its primary leaf page when we
+ * return. The arguments are complex enough that it's worth describing
+ * them:
+ *
+ * keyp/datap --
+ * The key and data pairs the application is filling in -- we
+ * get them passed to us because we get additional key/data
+ * pairs returned to us, and the last one we get is likely to
+ * be consumed by our caller.
+ * lastkey --
+ * The last key pushed onto the caller's page -- we use this to
+ * compare against future keys we read.
+ * dup_data --
+ * On-page reference to the first duplicate data item in the set.
+ * dup_count --
+ * Count of duplicates in the set.
+ * off --
+ * Callers WT_OFF structure, which we have to fill in.
+ * cb --
+ * User's callback function.
+ */
+
+ WT_CLEAR(data_item);
+ WT_CLEAR(stack);
+ ret = 0;
+
+ /*
+ * Get a scratch buffer and make it look like our work page.
+ *
+ * NOTE(review): tmp is not initialized to NULL, so if this first
+ * call fails the err: path tests an uninitialized pointer --
+ * confirm __wt_bulk_scratch_page clears *tmp on failure.
+ */
+ WT_ERR(__wt_bulk_scratch_page(toc,
+ db->leafmin, WT_PAGE_DUP_LEAF, WT_LLEAF, &page, &tmp));
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+
+ /* Move the duplicates onto the newly allocated page. */
+ page->records = dup_count;
+ page->dsk->u.entries = dup_count;
+ memcpy(first_free, dup_data, (size_t)dup_len);
+ first_free += dup_len;
+ space_avail -= dup_len;
+
+ /*
+ * Unless we have enough duplicates to split this page, it will be the
+ * "root" of the offpage duplicates.
+ */
+ root_addr = page->addr;
+
+ /* Read in new duplicate records until the key changes. */
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (key->size == 0) {
+ __wt_api_db_errx(
+ db, "zero-length keys are not supported");
+ /*
+ * NOTE(review): this direct return bypasses the err:
+ * cleanup -- the stack and the scratch buffer are not
+ * released on this path.
+ */
+ return (WT_ERROR);
+ }
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+ WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED);
+
+ /* Loading duplicates, so a key change means we're done. */
+ if (lastkey->size != key->size ||
+ db->btree_compare_dup(db, lastkey, key) != 0) {
+ *keyp = key;
+ *datap = data;
+ break;
+ }
+
+ /* Build the data item we're going to store on the page. */
+ WT_ERR(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, WT_IS_DUP));
+
+ /*
+ * If there's insufficient space available, allocate a new
+ * page.
+ */
+ if (WT_ITEM_SPACE_REQ(data->size) > space_avail) {
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ *
+ * If we promoted a key, we might have split, and so
+ * there may be a new offpage duplicates root page.
+ *
+ * NOTE(review): WT_RET here (vs. WT_ERR on the next
+ * line) also skips the err: cleanup -- confirm this
+ * inconsistency is intentional.
+ */
+ WT_RET(__wt_bulk_promote(toc,
+ page, page->records, &stack, 0, &root_addr));
+ WT_ERR(__wt_page_write(toc, page));
+ page->records = 0;
+ page->dsk->u.entries = 0;
+ __wt_set_ff_and_sa_from_offset(page,
+ WT_PAGE_BYTE(page), &first_free, &space_avail);
+ }
+
+ ++dup_count; /* Total duplicate count */
+ ++page->records; /* On-page key/data count */
+ ++page->dsk->u.entries; /* On-page entry count */
+
+ /* Copy the data item onto the page. */
+ WT_ITEM_SET_LEN(&data_item, data->size);
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free + sizeof(data_item), data->data, data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+ }
+
+ /*
+ * Ret values of 1 and 0 are both "OK", the ret value of 1 means we
+ * reached the end of the bulk input. Save the successful return
+ * for our final return value.
+ */
+ if (ret != 0 && ret != 1)
+ goto err;
+ success_return = ret;
+
+ /* Promote a key from the partially-filled page and write it. */
+ WT_ERR(
+ __wt_bulk_promote(toc, page, page->records, &stack, 0, &root_addr));
+ WT_ERR(__wt_page_write(toc, page));
+
+ /* Fill in the caller's WT_OFF structure. */
+ WT_RECORDS(off) = dup_count;
+ off->addr = root_addr;
+ off->size = db->intlmin;
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret == 0 ? success_return : ret);
+}
+
+/*
+ * __wt_bulk_promote --
+ * Promote the first entry on a page to its parent.
+ */
+static int
+__wt_bulk_promote(WT_TOC *toc, WT_PAGE *page, uint64_t incr,
+ WT_STACK *stack, u_int level, uint32_t *dup_root_addrp)
+{
+ DB *db;
+ DBT *key, key_build, *next_tmp;
+ ENV *env;
+ WT_ITEM *key_item, item;
+ WT_OFF off;
+ WT_OVFL tmp_ovfl;
+ WT_PAGE *next, *parent;
+ WT_PAGE_DISK *dsk;
+ WT_STACK_ELEM *elem;
+ uint32_t next_space_avail;
+ uint8_t *next_first_free;
+ u_int type;
+ int need_promotion, ret;
+ void *parent_data;
+
+ db = toc->db;
+ env = toc->env;
+ dsk = page->dsk;
+ WT_CLEAR(item);
+ next_tmp = NULL;
+ next = parent = NULL;
+ ret = 0;
+
+ /*
+  * "incr" is the record count this promotion adds; it is pushed into
+  * every parent page's count below.  "dup_root_addrp" is written only
+  * when a new off-page duplicates root (WT_PAGE_DUP_INT) is created.
+  */
+
+ /*
+  * If it's a row-store, get a copy of the first item on the page -- it
+  * might be an overflow item, in which case we need to make a copy for
+  * the database. Most versions of Berkeley DB tried to reference count
+  * overflow items if they were promoted to internal pages. That turned
+  * out to be hard to get right, so I'm not doing it again.
+  *
+  * If it's a column-store page, we don't promote a key at all.
+  */
+ switch (dsk->type) {
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ key = &key_build;
+ WT_CLEAR(key_build);
+
+ key_item = (WT_ITEM *)WT_PAGE_BYTE(page);
+ switch (WT_ITEM_TYPE(key_item)) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_DATA_DUP:
+ key->data = WT_ITEM_BYTE(key_item);
+ key->size = WT_ITEM_LEN(key_item);
+ switch (dsk->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ITEM_SET(&item, WT_ITEM_KEY, key->size);
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ WT_ITEM_SET(&item, WT_ITEM_KEY_DUP, key->size);
+ break;
+ default: /* Not possible */
+ break;
+ }
+ break;
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ /*
+  * Assume overflow keys remain overflow keys when they
+  * are promoted; not necessarily true if internal nodes
+  * are larger than leaf nodes), but that's unlikely.
+  */
+ WT_CLEAR(tmp_ovfl);
+ WT_RET(__wt_bulk_ovfl_copy(toc,
+ WT_ITEM_BYTE_OVFL(key_item), &tmp_ovfl));
+ key->data = &tmp_ovfl;
+ key->size = sizeof(tmp_ovfl);
+ switch (dsk->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ITEM_SET(&item,
+ WT_ITEM_KEY_OVFL, sizeof(WT_OVFL));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ WT_ITEM_SET(&item,
+ WT_ITEM_KEY_DUP_OVFL, sizeof(WT_OVFL));
+ break;
+ default: /* Not possible */
+ break;
+ }
+ break;
+ /* NOTE(review): WT_ILLEGAL_FORMAT presumably expands to the
+  * switch's default: case and returns an error -- confirm. */
+ WT_ILLEGAL_FORMAT(db);
+ }
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ key = NULL;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+  * There are two paths into this code based on whether the page already
+  * has a parent.
+  *
+  * If we have a page with no parent page, create the parent page. In
+  * this path, there's not much to do -- allocate a parent page, copy
+  * reference information from the page to the parent, and we're done.
+  * This is a modified root-split: we're putting a single key on an
+  * internal page, which is illegal, but we know another page on this
+  * page's level will be created, and it will be promoted to the parent
+  * at some point. This is case #1.
+  *
+  * The second path into this code is if we have a page and its parent,
+  * but the page's reference information doesn't fit on the parent and
+  * we have to split the parent. This path has two different cases,
+  * based on whether the page's parent itself has a parent.
+  *
+  * Here's a diagram of case #2, where the parent also has a parent:
+  *
+  * P2 -> P1 -> L (case #2)
+  *
+  * The promoted key from leaf L won't fit onto P1, and so we split P1:
+  *
+  * P2 -> P1
+  * -> P3 -> L
+  *
+  * In case #2, allocate P3 and copy reference information from the leaf
+  * page to it, then recursively call the promote code to promote the
+  * first entry from P3 to P2.
+  *
+  * Here's a diagram of case #3, where the parent does not have a parent,
+  * in other words, a root split:
+  *
+  * P1 -> L (case #3)
+  *
+  * The promoted key from leaf L won't fit onto P1, and so we split P1:
+  *
+  * P1 ->
+  * P2 -> L
+  *
+  * In case #3, we allocate P2, copy reference information from the page
+  * to it, and then recursively call the promote code twice: first to
+  * promote the first entry from P1 to a new page, and again to promote
+  * the first entry from P2 to a new page, creating a new root level of
+  * the tree:
+  *
+  * P3 -> P1
+  * -> P2 -> L
+  */
+ /*
+  * To simplify the rest of the code, check to see if there's room for
+  * another entry in our stack structure. Allocate the stack in groups
+  * of 20, which is probably big enough for any tree we'll ever see in
+  * the field, we'll never test the realloc code unless we work at it.
+  */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_STACK_ALLOC_INCR 2
+#else
+#define WT_STACK_ALLOC_INCR 20
+#endif
+ /*
+  * Grow when "level" references the last slot so there is always a
+  * trailing all-zero element; stack walkers (see __wt_bulk_stack_put
+  * and the record-count loop below) stop at page == NULL.
+  */
+ if (stack->size == 0 || level == stack->size - 1) {
+ uint32_t bytes_allocated = stack->size * sizeof(WT_STACK_ELEM);
+ WT_RET(__wt_realloc(env, &bytes_allocated,
+ (stack->size + WT_STACK_ALLOC_INCR) * sizeof(WT_STACK_ELEM),
+ &stack->elem));
+ stack->size += WT_STACK_ALLOC_INCR;
+ /*
+  * Note, the stack structure may be entirely uninitialized here,
+  * that is, everything set to 0 bytes. That's OK: the level of
+  * the stack starts out at 0, that is, the 0th element of the
+  * stack is the 1st level of internal/parent pages in the tree.
+  */
+ }
+
+ elem = &stack->elem[level];
+ parent = elem->page;
+ if (parent == NULL) {
+split: switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ type = WT_PAGE_COL_INT;
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ type = WT_PAGE_DUP_INT;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ type = WT_PAGE_ROW_INT;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, db->intlmin, type, dsk->level + 1, &next, &next_tmp));
+ __wt_set_ff_and_sa_from_offset(next,
+ WT_PAGE_BYTE(next), &next_first_free, &next_space_avail);
+
+ /*
+  * Column stores set the starting record number to the starting
+  * record number of the promoted leaf -- the new leaf is always
+  * the first record in the new parent's page. Ignore the type
+  * of the database, it's simpler ot just promote 0 up the tree
+  * in row store databases.
+  */
+ next->dsk->start_recno = page->dsk->start_recno;
+
+ /*
+  * If we don't have a parent page, it's case #1 -- allocate the
+  * parent page immediately.
+  */
+ if (parent == NULL) {
+ /*
+  * Case #1 -- there's no parent, it's a root split. No
+  * additional work in the main tree. In an off-page
+  * duplicates tree, return the new root of the off-page
+  * tree.
+  */
+ if (type == WT_PAGE_DUP_INT)
+ *dup_root_addrp = next->addr;
+ need_promotion = 0;
+ } else {
+ /*
+  * Case #2 and #3.
+  *
+  * Case #3: a root split, so we have to promote a key
+  * from both of the parent pages: promote the key from
+  * the existing parent page.
+  */
+ if (stack->elem[level + 1].page == NULL)
+ WT_ERR(__wt_bulk_promote(toc, parent,
+ incr, stack, level + 1, dup_root_addrp));
+ need_promotion = 1;
+
+ /* Write the last parent page, we have a new one. */
+ WT_ERR(__wt_page_write(toc, parent));
+ __wt_scr_release(&stack->elem[level].tmp);
+ }
+
+ /* There's a new parent page, reset the stack. */
+ elem = &stack->elem[level];
+ elem->page = parent = next;
+ elem->first_free = next_first_free;
+ elem->space_avail = next_space_avail;
+ elem->tmp = next_tmp;
+ next = NULL;
+ next_first_free = NULL;
+ next_space_avail = 0;
+ next_tmp = NULL;
+ } else
+ need_promotion = 0;
+
+ /*
+  * See if the promoted data will fit (if they don't, we have to split).
+  * We don't need to check for overflow keys: if the key was an overflow,
+  * we already created a smaller, on-page version of it.
+  *
+  * If there's room, copy the promoted data onto the parent's page.
+  */
+ switch (parent->dsk->type) {
+ case WT_PAGE_COL_INT:
+ if (elem->space_avail < sizeof(WT_OFF))
+ goto split;
+
+ /* Create the WT_OFF reference. */
+ WT_RECORDS(&off) = page->records;
+ off.addr = page->addr;
+ off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
+
+ /* Store the data item. */
+ ++parent->dsk->u.entries;
+ parent_data = elem->first_free;
+ memcpy(elem->first_free, &off, sizeof(off));
+ elem->first_free += sizeof(WT_OFF);
+ elem->space_avail -= sizeof(WT_OFF);
+
+ /* Track the last entry on the page for record count updates. */
+ stack->elem[level].data = parent_data;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_DUP_INT:
+ if (elem->space_avail <
+ WT_ITEM_SPACE_REQ(sizeof(WT_OFF)) +
+ WT_ITEM_SPACE_REQ(key->size))
+ goto split;
+
+ /* Store the key. */
+ ++parent->dsk->u.entries;
+ memcpy(elem->first_free, &item, sizeof(item));
+ memcpy(elem->first_free + sizeof(item), key->data, key->size);
+ elem->first_free += WT_ITEM_SPACE_REQ(key->size);
+ elem->space_avail -= WT_ITEM_SPACE_REQ(key->size);
+
+ /* Create the WT_ITEM(WT_OFF) reference. */
+ WT_ITEM_SET(&item, WT_ITEM_OFF, sizeof(WT_OFF));
+ WT_RECORDS(&off) = page->records;
+ off.addr = page->addr;
+ off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
+
+ /* Store the data item. */
+ ++parent->dsk->u.entries;
+ parent_data = elem->first_free;
+ memcpy(elem->first_free, &item, sizeof(item));
+ memcpy(elem->first_free + sizeof(item), &off, sizeof(off));
+ elem->first_free += WT_ITEM_SPACE_REQ(sizeof(WT_OFF));
+ elem->space_avail -= WT_ITEM_SPACE_REQ(sizeof(WT_OFF));
+
+ /* Track the last entry on the page for record count updates. */
+ stack->elem[level].data = parent_data;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ parent->records += page->records;
+
+ /*
+  * The promotion for case #2 and the second part of case #3 -- promote
+  * the key from the newly allocated internal page to its parent.
+  *
+  * NOTE(review): WT_RET (not WT_ERR) is safe here: on this path the
+  * split code above already cleared next_tmp, so the err label would
+  * have nothing to release anyway.
+  */
+ if (need_promotion)
+ WT_RET(__wt_bulk_promote(
+ toc, parent, incr, stack, level + 1, dup_root_addrp));
+ else {
+ /*
+  * We've finished promoting the new page's key into the tree.
+  * What remains is to push the new record counts all the way
+  * to the root. We've already corrected our current "parent"
+  * page, so proceed from there to the root.
+  */
+ for (elem =
+ &stack->elem[level + 1]; elem->page != NULL; ++elem) {
+ switch (elem->page->dsk->type) {
+ case WT_PAGE_COL_INT:
+ WT_RECORDS((WT_OFF *)elem->data) += incr;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_DUP_INT:
+ WT_RECORDS(
+ (WT_OFF *)WT_ITEM_BYTE(elem->data)) += incr;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ elem->page->records += incr;
+ }
+ }
+
+err: if (next_tmp != NULL)
+ __wt_scr_release(&next_tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_item_build_key --
+ * Process an inserted key item and return an WT_ITEM structure and byte
+ * string to be stored on the page.
+ */
+static int
+__wt_item_build_key(WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl)
+{
+ DB *db;
+ IDB *idb;
+ WT_STATS *stats;
+
+ db = toc->db;
+ idb = db->idb;
+ stats = idb->stats;
+
+ /*
+  * The caller's DBT is only ever re-pointed at other memory, never
+  * grown in place.  Huffman-encoded output lands in the WT_TOC key
+  * return area, which is free for our use during bulk insert and
+  * reconciliation -- we aren't returning key/data pairs there.
+  */
+
+ /* Huffman-compress the key if the database is configured for it. */
+ if (idb->huffman_key != NULL) {
+ WT_RET(__wt_huffman_encode(
+ idb->huffman_key, dbt->data, dbt->size,
+ &toc->key.data, &toc->key.mem_size, &toc->key.size));
+ if (toc->key.size > dbt->size)
+ WT_STAT_INCRV(stats,
+ HUFFMAN_KEY, toc->key.size - dbt->size);
+ dbt->data = toc->key.data;
+ dbt->size = toc->key.size;
+ }
+
+ /* Keys that fit on a leaf page are stored as-is. */
+ if (dbt->size <= db->leafitemsize) {
+ WT_ITEM_SET(item, WT_ITEM_KEY, dbt->size);
+ return (0);
+ }
+
+ /* Too-large keys move to an overflow page; re-point the DBT at it. */
+ WT_STAT_INCR(stats, OVERFLOW_KEY);
+ WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl));
+ dbt->data = ovfl;
+ dbt->size = sizeof(*ovfl);
+ WT_ITEM_SET(item, WT_ITEM_KEY_OVFL, dbt->size);
+ return (0);
+}
+
+/*
+ * __wt_item_build_data --
+ * Process an inserted data item and return an WT_ITEM structure and byte
+ * string to be stored on the page.
+ */
+int
+__wt_item_build_data(
+ WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl, u_int flags)
+{
+ DB *db;
+ IDB *idb;
+ WT_STATS *stats;
+
+ WT_ENV_FCHK(toc->env,
+ "__wt_item_build_data", flags, WT_APIMASK_BT_BUILD_DATA_ITEM);
+
+ db = toc->db;
+ idb = db->idb;
+ stats = idb->stats;
+
+ /*
+  * The caller's DBT is only ever re-pointed at other memory, never
+  * grown in place.  Huffman-encoded output lands in the WT_TOC data
+  * return area, which is free for our use during bulk insert and
+  * reconciliation -- we aren't returning key/data pairs there.
+  */
+ WT_CLEAR(*item);
+ if (LF_ISSET(WT_IS_DUP))
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_DUP);
+ else
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA);
+
+ /*
+  * A zero-length item is a common value (a deleted column-store
+  * variable-length item) -- it needs no further work.
+  */
+ if (dbt->size == 0) {
+ WT_ITEM_SET_LEN(item, 0);
+ return (0);
+ }
+
+ /* Huffman-compress the data if the database is configured for it. */
+ if (idb->huffman_data != NULL) {
+ WT_RET(__wt_huffman_encode(
+ idb->huffman_data, dbt->data, dbt->size,
+ &toc->data.data, &toc->data.mem_size, &toc->data.size));
+ if (toc->data.size > dbt->size)
+ WT_STAT_INCRV(stats,
+ HUFFMAN_DATA, toc->data.size - dbt->size);
+ dbt->data = toc->data.data;
+ dbt->size = toc->data.size;
+ }
+
+ /* Too-large items move to an overflow page; re-point the DBT at it. */
+ if (dbt->size > db->leafitemsize) {
+ WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl));
+ dbt->data = ovfl;
+ dbt->size = sizeof(*ovfl);
+ if (LF_ISSET(WT_IS_DUP))
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_DUP_OVFL);
+ else
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_OVFL);
+ WT_STAT_INCR(stats, OVERFLOW_DATA);
+ }
+
+ WT_ITEM_SET_LEN(item, dbt->size);
+ return (0);
+}
+
+/*
+ * __wt_bulk_ovfl_copy --
+ * Copy bulk-loaded overflow items in the database, returning the WT_OVFL
+ * structure, filled in.
+ */
+static int
+__wt_bulk_ovfl_copy(WT_TOC *toc, WT_OVFL *from, WT_OVFL *to)
+{
+ DB *db;
+ DBT *scratch;
+ WT_PAGE *page;
+ uint32_t bytes;
+ int ret;
+
+ db = toc->db;
+ scratch = NULL;
+
+ /* Build a scratch buffer shaped like an overflow page. */
+ bytes = WT_ALIGN(sizeof(WT_PAGE_DISK) + from->size, db->allocsize);
+ WT_RET(__wt_bulk_scratch_page(
+ toc, bytes, WT_PAGE_OVFL, WT_LLEAF, &page, &scratch));
+ page->dsk->u.datalen = from->size;
+
+ /* Hand the caller the new address; the size is unchanged. */
+ to->addr = page->addr;
+ to->size = from->size;
+
+ /* Copy the item: read the old page, write it to its new location. */
+ ret = __wt_page_disk_read(toc, page->dsk, from->addr, from->size);
+ if (ret == 0)
+ ret = __wt_page_disk_write(toc, page->dsk, to->addr, from->size);
+
+ __wt_scr_release(&scratch);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_ovfl_write --
+ * Store bulk-loaded overflow items in the database, returning the page
+ * addr.
+ */
+static int
+__wt_bulk_ovfl_write(WT_TOC *toc, DBT *dbt, WT_OVFL *to)
+{
+ DB *db;
+ DBT *scratch;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ uint32_t bytes;
+ int ret;
+
+ db = toc->db;
+ scratch = NULL;
+
+ /* Build a scratch buffer shaped like an overflow page. */
+ bytes = WT_ALIGN(sizeof(WT_PAGE_DISK) + dbt->size, db->allocsize);
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, bytes, WT_PAGE_OVFL, WT_LLEAF, &page, &scratch));
+
+ /* Hand the caller the page's address and the item's size. */
+ to->addr = page->addr;
+ to->size = dbt->size;
+
+ /* Set the on-disk length, copy the item into place, write it out. */
+ dsk = page->dsk;
+ dsk->u.datalen = dbt->size;
+ memcpy((uint8_t *)dsk + sizeof(WT_PAGE_DISK), dbt->data, dbt->size);
+
+ ret = __wt_page_write(toc, page);
+
+err: if (scratch != NULL)
+ __wt_scr_release(&scratch);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_scratch_page --
+ * Allocate a scratch buffer and make it look like a database page.
+ */
+static int
+__wt_bulk_scratch_page(WT_TOC *toc, uint32_t page_size,
+ uint32_t page_type, uint32_t page_level, WT_PAGE **page_ret, DBT **tmp_ret)
+{
+ DBT *tmp;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ uint32_t size;
+ int ret;
+
+ /*
+  * Initialize tmp before the first WT_ERR: if __wt_scr_alloc fails
+  * without setting it, the err label below would otherwise test (and
+  * possibly release) an uninitialized pointer.
+  */
+ tmp = NULL;
+ ret = 0;
+
+ /*
+  * Allocate a scratch buffer and make sure it's big enough to hold a
+  * WT_PAGE structure plus the page itself, and clear the memory so
+  * it's never random bytes.
+  */
+ size = page_size + sizeof(WT_PAGE);
+ WT_ERR(__wt_scr_alloc(toc, size, &tmp));
+ memset(tmp->data, 0, size);
+
+ /*
+  * Set up the page and allocate a file address.
+  *
+  * We don't run the leaf pages through the cache -- that means passing
+  * a lot of messages we don't want to bother with. We're the only user
+  * of the file, which means we can grab file space whenever we want.
+  */
+ page = tmp->data;
+ page->dsk = dsk =
+ (WT_PAGE_DISK *)((uint8_t *)tmp->data + sizeof(WT_PAGE));
+ WT_ERR(__wt_file_alloc(toc, &page->addr, page_size));
+ page->size = page_size;
+ dsk->type = (uint8_t)page_type;
+ dsk->level = (uint8_t)page_level;
+
+ *page_ret = page;
+ *tmp_ret = tmp;
+ return (0);
+
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_bulk_stack_put --
+ * Push out the tree's stack of pages.
+ */
+static int
+__wt_bulk_stack_put(WT_TOC *toc, WT_STACK *stack)
+{
+ ENV *env;
+ IDB *idb;
+ WT_STACK_ELEM *elem;
+ int ret;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ret = 0;
+
+ /* Write out every page on the stack; errors accumulate in ret. */
+ for (elem = stack->elem; elem->page != NULL; ++elem) {
+ WT_TRET(__wt_page_write(toc, elem->page));
+
+ /*
+  * If we've reached the last element in the stack, it's the
+  * root page of the tree. Update the in-memory root address
+  * and the descriptor record.
+  */
+ if ((elem + 1)->page == NULL) {
+ idb->root_off.addr = elem->page->addr;
+ idb->root_off.size = elem->page->size;
+ WT_RECORDS(&idb->root_off) = elem->page->records;
+ WT_TRET(__wt_desc_write(toc));
+ }
+
+ __wt_scr_release(&elem->tmp);
+ }
+ __wt_free(env, stack->elem, stack->size * sizeof(WT_STACK_ELEM));
+
+ /*
+  * Return the accumulated error status: the previous code returned 0
+  * unconditionally, silently discarding any page-write or descriptor
+  * write failure collected by WT_TRET above.
+  */
+ return (ret);
+}
+
+/*
+ * __wt_bulk_dbt_copy --
+ * Get a copy of DBT referenced object.
+ */
+static int
+__wt_bulk_dbt_copy(ENV *env, DBT *orig, DBT *copy)
+{
+ /* Grow the target buffer if it's too small, then copy the bytes. */
+ if (orig->size > copy->mem_size)
+ WT_RET(__wt_realloc(
+ env, &copy->mem_size, orig->size, &copy->data));
+ memcpy(copy->data, orig->data, orig->size);
+ copy->size = orig->size;
+
+ return (0);
+}
diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c
new file mode 100644
index 00000000000..43d4f7e6596
--- /dev/null
+++ b/src/btree/bt_cache.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache.
+ */
+int
+__wt_cache_create(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* The cache is allocated zeroed and hangs off the IENV. */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache));
+ cache = ienv->cache;
+
+ /*
+  * Allocate the cache's mutexes.  NOTE(review): the third argument to
+  * __wt_mtx_alloc differs (1, 1, 0) -- presumably "initially locked";
+  * confirm against the mutex implementation.
+  */
+ WT_ERR(
+ __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict));
+ WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read));
+ WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile));
+
+ /* Allocate statistics and record the configured maximum cache size. */
+ WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats));
+
+ WT_STAT_SET(
+ cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE);
+
+ return (0);
+
+ /* On error, tear down whatever was partially created. */
+err: (void)__wt_cache_destroy(env);
+ return (ret);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ uint64_t in, out;
+
+ /*
+  * Other threads update these 64-bit counters, and we may be running
+  * on a 32-bit machine, so the reads can be stale or inconsistent.
+  * Both values only ever grow; clamp the difference at zero rather
+  * than returning a huge bogus count.
+  */
+ in = cache->stat_pages_in;
+ out = cache->stat_pages_out;
+ return (in > out ? in - out : 0);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ uint64_t in, out;
+
+ /*
+  * Other threads update these 64-bit counters, and we may be running
+  * on a 32-bit machine, so the reads can be stale or inconsistent.
+  * Both values only ever grow; clamp the difference at zero rather
+  * than returning a huge bogus count.
+  */
+ in = cache->stat_bytes_in;
+ out = cache->stat_bytes_out;
+ return (in > out ? in - out : 0);
+}
+
+/*
+ * __wt_cache_stats --
+ * Update the cache statistics for return to the application.
+ */
+void
+__wt_cache_stats(ENV *env)
+{
+ WT_CACHE *cache;
+ WT_STATS *stats;
+
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ /* Refresh the derived in-use statistics from the raw counters. */
+ WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache));
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache.
+ */
+int
+__wt_cache_destroy(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ /* Nothing to do if the cache was never created. */
+ if (cache == NULL)
+ return (0);
+
+ /* Discard mutexes. */
+ if (cache->mtx_evict != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_evict);
+ if (cache->mtx_read != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_read);
+ if (cache->mtx_reconcile != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_reconcile);
+
+ /* Discard allocated memory, and clear. */
+ __wt_free(env, cache->stats, 0);
+ __wt_free(env, ienv->cache, sizeof(WT_CACHE));
+
+ /* NOTE(review): ret is never set non-zero here; kept for symmetry. */
+ return (ret);
+}
diff --git a/src/btree/bt_close.c b/src/btree/bt_close.c
new file mode 100644
index 00000000000..6bf58e98d7e
--- /dev/null
+++ b/src/btree/bt_close.c
@@ -0,0 +1,86 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_bt_close_page(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_bt_close --
+ * Close the tree.
+ */
+int
+__wt_bt_close(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ WT_CACHE *cache;
+ int ret;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ cache = env->ienv->cache;
+ ret = 0;
+
+ /*
+  * XXX
+  * We assume two threads can't call the close method at the same time,
+  * nor can close be called while other threads are in the tree -- the
+  * higher level API has to ensure this.
+  */
+
+ /* If the file was never opened, there's nothing to close. */
+ if (WT_UNOPENED_DATABASE(idb))
+ return (0);
+
+ /*
+  * The tree walk is depth first, that is, the worker function is not
+  * called on internal pages until all children have been visited; so,
+  * we don't have to worry about a page being dirtied after the visit.
+  *
+  * Lock out the cache evictions thread, though, we don't want it trying
+  * to evict pages we're flushing.
+  */
+ __wt_lock(env, cache->mtx_reconcile);
+ /* WT_TRET accumulates any walk failure but lets cleanup continue. */
+ WT_TRET(__wt_tree_walk(toc, NULL,
+ WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_close_page, NULL));
+ __wt_evict_db_clear(toc);
+ __wt_unlock(env, cache->mtx_reconcile);
+
+ /* There's no root page any more, kill the pointer to catch mistakes. */
+ idb->root_page.page = NULL;
+
+ /* Close the underlying file handle. */
+ WT_TRET(__wt_close(env, idb->fh));
+ idb->fh = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_bt_close_page --
+ * Close a page.
+ */
+static int
+__wt_bt_close_page(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+ WT_CC_QUIET(arg, NULL);
+
+ /* Flush any pending modifications before the page goes away. */
+ if (WT_PAGE_IS_MODIFIED(page))
+ WT_RET(__wt_page_reconcile(toc, page));
+
+ /*
+  * Discarding is safe: the walk is depth first, so all of this page's
+  * children were visited (and discarded) before we got here, and no
+  * one will read the page after we drop it.
+  */
+ __wt_page_discard(toc, page);
+
+ return (0);
+}
diff --git a/src/btree/bt_cmp.c b/src/btree/bt_cmp.c
new file mode 100644
index 00000000000..8cfddc0496a
--- /dev/null
+++ b/src/btree/bt_cmp.c
@@ -0,0 +1,74 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_lex_compare --
+ * Lexicographic comparison routine.
+ */
+int
+__wt_bt_lex_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt)
+{
+ uint32_t len;
+ uint8_t *us, *ts;
+
+ /* The DB handle is unused here; it's part of the comparator API. */
+ WT_CC_QUIET(db, NULL);
+
+ /*
+  * Returns < 0, 0 or > 0 as the user's item sorts before, equal to or
+  * after the tree's item: byte-by-byte unsigned comparison, ties
+  * broken by length (the shorter item sorts first).  The "user" and
+  * "tree" names make clear which item the application supplied.
+  */
+ len = user_dbt->size < tree_dbt->size ? user_dbt->size : tree_dbt->size;
+ for (us = user_dbt->data,
+ ts = tree_dbt->data; len > 0; --len, ++us, ++ts)
+ if (*us != *ts)
+ return (*us < *ts ? -1 : 1);
+
+ /* Identical up to the shorter length: compare the lengths. */
+ if (user_dbt->size == tree_dbt->size)
+ return (0);
+ return (user_dbt->size < tree_dbt->size ? -1 : 1);
+}
+
+/*
+ * __wt_bt_int_compare --
+ * Integer comparison routine.
+ */
+int
+__wt_bt_int_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt)
+{
+ uint64_t u, t;
+
+ /*
+  * Each DBT holds the integer's low-order bytes in machine order;
+  * db->btree_compare_int is the number of significant bytes.  Returns
+  * < 0, 0 or > 0 as the user's value compares to the tree's value.
+  */
+ u = t = 0;
+ memcpy(&u, user_dbt->data, (size_t)db->btree_compare_int);
+ memcpy(&t, tree_dbt->data, (size_t)db->btree_compare_int);
+
+ if (u < t)
+ return (-1);
+ return (u > t ? 1 : 0);
+}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
new file mode 100644
index 00000000000..e27607aba6a
--- /dev/null
+++ b/src/btree/bt_debug.c
@@ -0,0 +1,661 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_debug_dsk_col_fix(DB *, WT_PAGE_DISK *, FILE *);
+static void __wt_debug_dsk_col_int(WT_PAGE_DISK *, FILE *);
+static void __wt_debug_dsk_col_rle(DB *, WT_PAGE_DISK *, FILE *);
+static int __wt_debug_dsk_item(WT_TOC *, WT_PAGE_DISK *, FILE *);
+static void __wt_debug_inmem_col_fix(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_inmem_col_int(WT_PAGE *, FILE *);
+static void __wt_debug_inmem_col_rle(WT_TOC *, WT_PAGE *, FILE *);
+static int __wt_debug_inmem_col_var(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_inmem_row_int(WT_PAGE *, FILE *);
+static int __wt_debug_inmem_row_leaf(WT_TOC *, WT_PAGE *, FILE *);
+static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *);
+static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp);
+static void __wt_debug_off(WT_OFF *, const char *, FILE *);
+static void __wt_debug_page_hdr(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_pair(const char *, void *, uint32_t, FILE *);
+static void __wt_debug_repl(WT_REPL *, FILE *);
+static void __wt_debug_rleexp(WT_RLE_EXPAND *, FILE *);
+static int __wt_debug_set_fp(const char *, FILE **, int *);
+
+static int
+__wt_debug_set_fp(const char *ofile, FILE **fpp, int *close_varp)
+{
+ FILE *fp;
+
+ *close_varp = 0;
+
+ /* If we were giving a stream, use it. */
+ if ((fp = *fpp) != NULL)
+ return (0);
+
+ /* If we were given a file, use it. */
+ if (ofile != NULL) {
+ if ((fp = fopen(ofile, "w")) == NULL)
+ return (WT_ERROR);
+ *fpp = fp;
+ *close_varp = 1;
+ return (0);
+ }
+
+ /* Default to stdout. */
+ *fpp = stdout;
+ return (0);
+}
+
/*
 * __wt_debug_dump --
 *	Dump a database in debugging mode.
 *
 *	ofile/fp select the output target (see __wt_debug_set_fp); the
 *	stream is closed here only if this function opened it.
 */
int
__wt_debug_dump(WT_TOC *toc, char *ofile, FILE *fp)
{
    int do_close, ret;

    WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));

    /*
     * We use the verification code to do debugging dumps because if we're
     * dumping in debugging mode, we want to confirm the page is OK before
     * walking it.
     */
    ret = __wt_verify(toc, NULL, fp);

    if (do_close)
        (void)fclose(fp);

    return (ret);
}
+
+/*
+ * __wt_debug_page --
+ * Dump a page in debugging mode.
+ */
+int
+__wt_debug_page(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp)
+{
+ WT_PAGE_DISK *dsk;
+ DB *db;
+ int do_close, ret;
+
+ db = toc->db;
+ dsk = page->dsk;
+ ret = 0;
+
+ WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));
+
+ __wt_debug_page_hdr(toc, page, fp);
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_INT:
+ fprintf(fp,
+ "\trecords %llu, starting recno %llu, level %lu, "
+ "entries %lu, lsn %lu/%lu\n",
+ (unsigned long long)page->records,
+ (unsigned long long)dsk->start_recno,
+ (u_long)dsk->level, (u_long)dsk->u.entries,
+ (u_long)dsk->lsn[0], (u_long)dsk->lsn[1]);
+ break;
+ case WT_PAGE_OVFL:
+ fprintf(fp, "size %lu\n", (u_long)dsk->u.datalen);
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ret = __wt_debug_dsk_item(toc, dsk, fp);
+ break;
+ case WT_PAGE_COL_FIX:
+ __wt_debug_dsk_col_fix(db, dsk, fp);
+ break;
+ case WT_PAGE_COL_RLE:
+ __wt_debug_dsk_col_rle(db, dsk, fp);
+ break;
+ case WT_PAGE_COL_INT:
+ __wt_debug_dsk_col_int(dsk, fp);
+ break;
+ default:
+ break;
+ }
+
+ fprintf(fp, "}\n");
+
+ if (do_close)
+ (void)fclose(fp);
+
+ return (ret);
+}
+
+/*
+ * __wt_debug_inmem --
+ * Dump the in-memory information for a page.
+ */
+int
+__wt_debug_inmem(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp)
+{
+ DB *db;
+ int do_close;
+
+ db = toc->db;
+
+ WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));
+
+ __wt_debug_page_hdr(toc, page, fp);
+
+ /* Dump the WT_{ROW,COL}_INDX array. */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_FIX:
+ __wt_debug_inmem_col_fix(toc, page, fp);
+ break;
+ case WT_PAGE_COL_INT:
+ __wt_debug_inmem_col_int(page, fp);
+ break;
+ case WT_PAGE_COL_RLE:
+ __wt_debug_inmem_col_rle(toc, page, fp);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_RET(__wt_debug_inmem_col_var(toc, page, fp));
+ break;
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__wt_debug_inmem_row_leaf(toc, page, fp));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ __wt_debug_inmem_row_int(page, fp);
+ break;
+ case WT_PAGE_OVFL:
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ fprintf(fp, "}\n");
+
+ if (do_close)
+ (void)fclose(fp);
+
+ return (0);
+}
+
/*
 * __wt_debug_inmem_col_fix --
 *	Dump an in-memory WT_PAGE_COL_FIX page.
 *
 *	Each entry is a fixed-length record; deleted records are flagged
 *	in-band via the WT_FIX_DELETE bit.
 */
static void
__wt_debug_inmem_col_fix(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_REPL *repl;
    uint32_t fixed_len, i;

    /* All records on the page share the database's fixed length. */
    fixed_len = toc->db->fixed_len;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp, "\tdata {");
        if (WT_FIX_DELETE_ISSET(cip->data))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(cip->data, fixed_len, fp);
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL)
            __wt_debug_repl(repl, fp);
    }
}
+
/*
 * __wt_debug_inmem_col_int --
 *	Dump an in-memory WT_PAGE_COL_INT page.
 *
 *	Internal column-store entries are WT_OFF references to subtrees.
 */
static void
__wt_debug_inmem_col_int(WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i)
        __wt_debug_off(cip->data, "\t", fp);
}
+
/*
 * __wt_debug_inmem_col_rle --
 *	Dump an in-memory WT_PAGE_COL_RLE page.
 *
 *	Each entry is a repeat-count plus a single fixed-length record;
 *	per-record modifications live in the RLE expansion array.
 */
static void
__wt_debug_inmem_col_rle(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_RLE_EXPAND *exp;
    uint32_t fixed_len, i;

    fixed_len = toc->db->fixed_len;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp,
            "\trepeat %lu {", (u_long)WT_RLE_REPEAT_COUNT(cip->data));
        if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(
                WT_RLE_REPEAT_DATA(cip->data), fixed_len, fp);
        fprintf(fp, "}\n");

        /* Dump any expansion entries overriding individual records. */
        if ((exp = WT_COL_RLEEXP(page, cip)) != NULL)
            __wt_debug_rleexp(exp, fp);
    }
}
+
/*
 * __wt_debug_inmem_col_var --
 *	Dump an in-memory WT_PAGE_COL_VAR page.
 *
 *	Returns non-zero if dumping an item's data fails (e.g., overflow
 *	item processing).
 */
static int
__wt_debug_inmem_col_var(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_REPL *repl;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp, "\tdata {");
        WT_RET(__wt_debug_item_data(toc, cip->data, fp));
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL)
            __wt_debug_repl(repl, fp);
    }
    return (0);
}
+
/*
 * __wt_debug_inmem_row_leaf --
 *	Dump an in-memory WT_PAGE_DUP_LEAF or WT_PAGE_ROW_LEAF page.
 *
 *	Keys still in compressed/overflow form are reported as requiring
 *	processing rather than decoded here.
 */
static int
__wt_debug_inmem_row_leaf(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_REPL *repl;
    WT_ROW *rip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, rip, i) {
        if (__wt_key_process(rip))
            fprintf(fp, "\tkey: {requires processing}\n");
        else
            __wt_debug_dbt("\tkey", rip, fp);

        fprintf(fp, "\tdata: {");
        WT_RET(__wt_debug_item_data(toc, rip->data, fp));
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_ROW_REPL(page, rip)) != NULL)
            __wt_debug_repl(repl, fp);
    }

    return (0);
}
+
/*
 * __wt_debug_inmem_row_int --
 *	Dump an in-memory WT_PAGE_DUP_INT or WT_PAGE_ROW_INT page.
 *
 *	Each entry is a key plus a WT_OFF reference to a subtree.
 */
static void
__wt_debug_inmem_row_int(WT_PAGE *page, FILE *fp)
{
    WT_ROW *rip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, rip, i) {
        if (__wt_key_process(rip))
            fprintf(fp, "\tkey: {requires processing}\n");
        else
            __wt_debug_dbt("\tkey", rip, fp);

        __wt_debug_off(rip->data, "\t", fp);
    }
}
+
+/*
+ * __wt_debug_repl --
+ * Dump a replacement array.
+ */
+static void
+__wt_debug_repl(WT_REPL *repl, FILE *fp)
+{
+ if (fp == NULL) /* Default to stderr */
+ fp = stderr;
+
+ for (; repl != NULL; repl = repl->next)
+ if (WT_REPL_DELETED_ISSET(repl))
+ fprintf(fp, "\trepl: {deleted}\n");
+ else
+ __wt_debug_pair(
+ "\trepl", WT_REPL_DATA(repl), repl->size, fp);
+}
+
/*
 * __wt_debug_rleexp --
 *	Dump a column store expansion array.
 *
 *	Only the head of each entry's replacement chain is dumped.
 */
static void
__wt_debug_rleexp(WT_RLE_EXPAND *exp, FILE *fp)
{
    WT_REPL *repl;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    for (; exp != NULL; exp = exp->next) {
        repl = exp->repl;
        if (WT_REPL_DELETED_ISSET(repl))
            fprintf(fp, "\trepl: {deleted}\n");
        else
            __wt_debug_pair(
                "\trepl", WT_REPL_DATA(repl), repl->size, fp);
    }
}
+
/*
 * __wt_debug_dsk_item --
 *	Dump a page of WT_ITEM's.
 *
 *	Stops and returns the error from the first item that fails.
 */
static int
__wt_debug_dsk_item(WT_TOC *toc, WT_PAGE_DISK *dsk, FILE *fp)
{
    WT_ITEM *item;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_ITEM_FOREACH(dsk, item, i)
        WT_RET(__wt_debug_item(toc, item, fp));
    return (0);
}
+
/*
 * __wt_debug_item --
 *	Dump a single WT_ITEM.
 *
 *	Prints the type/length header, per-type extras (overflow address
 *	and size), then the item data.  Deleted and off-page items return
 *	early: there is no data to dump (off-page items print the WT_OFF
 *	reference instead).
 */
static int
__wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    WT_OVFL *ovfl;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    db = toc->db;

    fprintf(fp, "\t%s: len %lu",
        __wt_item_type_string(item), (u_long)WT_ITEM_LEN(item));

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Overflow items: show where the data actually lives. */
        ovfl = WT_ITEM_BYTE_OVFL(item);
        fprintf(fp, ", addr %lu, size %lu",
            (u_long)ovfl->addr, (u_long)ovfl->size);
        break;
    case WT_ITEM_DEL:
        fprintf(fp, "\n");
        return (0);
    case WT_ITEM_OFF:
        __wt_debug_off(WT_ITEM_BYTE_OFF(item), ", ", fp);
        return (0);
    WT_ILLEGAL_FORMAT(db);
    }

    fprintf(fp, "\n\t{");
    WT_RET(__wt_debug_item_data(toc, item, fp));
    fprintf(fp, "}\n");
    return (0);
}
+
/*
 * __wt_debug_dsk_col_int --
 *	Dump a WT_PAGE_COL_INT page: one WT_OFF subtree reference per entry.
 */
static void
__wt_debug_dsk_col_int(WT_PAGE_DISK *dsk, FILE *fp)
{
    WT_OFF *off;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_OFF_FOREACH(dsk, off, i)
        __wt_debug_off(off, "\t", fp);
}
+
/*
 * __wt_debug_dsk_col_fix --
 *	Dump a WT_PAGE_COL_FIX page: fixed-length records with in-band
 *	delete flags.
 */
static void
__wt_debug_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_FIX_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\t{");
        if (WT_FIX_DELETE_ISSET(p))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(p, db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}
+
/*
 * __wt_debug_dsk_col_rle --
 *	Dump a WT_PAGE_COL_RLE page: repeat-counted fixed-length records.
 */
static void
__wt_debug_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_RLE_REPEAT_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\trepeat %lu {",
            (u_long)WT_RLE_REPEAT_COUNT(p));
        if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(p)))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(
                WT_RLE_REPEAT_DATA(p), db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}
+
/*
 * __wt_debug_item_data --
 *	Dump a single item's data in debugging mode.
 *
 *	On-page, uncompressed items are printed directly ("onpage"); items
 *	that are Huffman-compressed or stored off-page are first expanded
 *	into a scratch buffer ("process").  Deleted and off-page items are
 *	printed as fixed placeholder strings.
 */
static int
__wt_debug_item_data(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    DBT *tmp;
    IDB *idb;
    uint32_t size;
    uint8_t *p;
    int ret;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    db = toc->db;
    tmp = NULL;
    idb = db->idb;
    ret = 0;

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
        if (idb->huffman_key != NULL)
            goto process;
        goto onpage;
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        if (idb->huffman_data != NULL)
            goto process;
        /* Uncompressed, on-page: print the bytes in place. */
onpage:    p = WT_ITEM_BYTE(item);
        size = WT_ITEM_LEN(item);
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Compressed or off-page: expand into a scratch buffer. */
process:    WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
        WT_ERR(__wt_item_process(toc, item, tmp));
        p = tmp->data;
        size = tmp->size;
        break;
    case WT_ITEM_DEL:
        p = (uint8_t *)"deleted";
        size = 7;
        break;
    case WT_ITEM_OFF:
        p = (uint8_t *)"offpage";
        size = 7;
        break;
    WT_ILLEGAL_FORMAT_ERR(db, ret);
    }

    __wt_print_byte_string(p, size, fp);

err:    if (tmp != NULL)
        __wt_scr_release(&tmp);
    return (ret);
}
+
/*
 * __wt_debug_off --
 *	Dump a WT_OFF structure (an off-page subtree reference).
 *
 *	The prefix string is printed verbatim before the text.
 */
static void
__wt_debug_off(WT_OFF *off, const char *prefix, FILE *fp)
{
    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    fprintf(fp, "%soffpage: addr %lu, size %lu, records %llu\n",
        prefix, (u_long)off->addr, (u_long)off->size,
        (unsigned long long)WT_RECORDS(off));
}
+
/*
 * __wt_debug_dbt --
 *	Dump a single DBT in debugging mode, with an optional tag.
 */
void
__wt_debug_dbt(const char *tag, void *arg_dbt, FILE *fp)
{
    DBT *dbt;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    /*
     * The argument isn't necessarily a DBT structure, but the first two
     * fields of the argument are always a void *data/uint32_t size pair.
     */
    dbt = arg_dbt;
    __wt_debug_pair(tag, dbt->data, dbt->size, fp);
}
+
+/*
+ * __wt_debug_pair --
+ * Dump a single data/size pair, with an optional tag.
+ */
+static void
+__wt_debug_pair(const char *tag, void *data, uint32_t size, FILE *fp)
+{
+ if (fp == NULL) /* Default to stderr */
+ fp = stderr;
+
+ if (tag != NULL)
+ fprintf(fp, "%s: ", tag);
+ fprintf(fp, "%lu {", (u_long)size);
+ __wt_print_byte_string(data, size, fp);
+ fprintf(fp, "}\n");
+}
+#endif
+
+/*
+ * __wt_debug_page_hdr --
+ * Standard debug page-header output.
+ */
+static void
+__wt_debug_page_hdr(WT_TOC *toc, WT_PAGE *page, FILE *fp)
+{
+ DB *db;
+
+ db = toc->db;
+
+ fprintf(fp,
+ "addr: %lu-%lu {\n\t%s: size %lu\n",
+ (u_long)page->addr,
+ (u_long)page->addr + (WT_OFF_TO_ADDR(db, page->size) - 1),
+ __wt_page_type_string(page->dsk), (u_long)page->size);
+
+}
diff --git a/src/btree/bt_desc.c b/src/btree/bt_desc.c
new file mode 100644
index 00000000000..2fc024d1e8c
--- /dev/null
+++ b/src/btree/bt_desc.c
@@ -0,0 +1,132 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_desc_io(WT_TOC *, void *, int);
+
/*
 * __wt_desc_stat --
 *	Fill in the statistics from the database description.
 *
 *	Reads the on-disk description record from page 0 and copies its
 *	fields into the database's statistics structure.
 */
int
__wt_desc_stat(WT_TOC *toc)
{
    WT_PAGE_DESC desc;
    WT_STATS *stats;

    stats = toc->db->idb->dstats;

    WT_RET(__wt_desc_io(toc, &desc, 1));

    WT_STAT_SET(stats, MAGIC, desc.magic);
    WT_STAT_SET(stats, MAJOR, desc.majorv);
    WT_STAT_SET(stats, MINOR, desc.minorv);
    WT_STAT_SET(stats, INTLMAX, desc.intlmax);
    WT_STAT_SET(stats, INTLMIN, desc.intlmin);
    WT_STAT_SET(stats, LEAFMAX, desc.leafmax);
    WT_STAT_SET(stats, LEAFMIN, desc.leafmin);
    WT_STAT_SET(stats, BASE_RECNO, desc.recno_offset);
    WT_STAT_SET(stats, FIXED_LEN, desc.fixed_len);

    return (0);
}
+
/*
 * __wt_desc_read --
 *	Read the descriptor structure from page 0.
 *
 *	Copies the on-disk configuration (page sizes, root/free references,
 *	fixed record length) into the DB and IDB handles.
 */
int
__wt_desc_read(WT_TOC *toc)
{
    DB *db;
    WT_PAGE_DESC desc;

    db = toc->db;

    WT_RET(__wt_desc_io(toc, &desc, 1));

    db->intlmax = desc.intlmax;        /* Update DB handle */
    db->intlmin = desc.intlmin;
    db->leafmax = desc.leafmax;
    db->leafmin = desc.leafmin;
    db->idb->root_off.addr = desc.root_addr;
    db->idb->root_off.size = desc.root_size;
    WT_RECORDS(&db->idb->root_off) = desc.records;
    db->idb->free_addr = desc.free_addr;
    db->idb->free_size = desc.free_size;
    db->fixed_len = desc.fixed_len;

    /*
     * XXX
     * This is the wrong place to do this -- need to think about how
     * to update open/configuration information in a reasonable way.
     */
    if (db->fixed_len != 0)
        F_SET(db->idb, WT_COLUMN);

    return (0);
}
+
+/*
+ * __wt_desc_write --
+ * Update the description page.
+ */
+int
+__wt_desc_write(WT_TOC *toc)
+{
+ DB *db;
+ IDB *idb;
+ WT_PAGE_DESC desc;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+ ret = 0;
+
+ desc.magic = WT_BTREE_MAGIC;
+ desc.majorv = WT_BTREE_MAJOR_VERSION;
+ desc.minorv = WT_BTREE_MINOR_VERSION;
+ desc.intlmax = db->intlmax;
+ desc.intlmin = db->intlmin;
+ desc.leafmax = db->leafmax;
+ desc.leafmin = db->leafmin;
+ desc.recno_offset = 0;
+ desc.root_addr = idb->root_off.addr;
+ desc.root_size = idb->root_off.size;
+ desc.records = WT_RECORDS(&idb->root_off);
+ desc.free_addr = idb->free_addr;
+ desc.free_size = idb->free_size;
+ desc.fixed_len = (uint8_t)db->fixed_len;
+ desc.flags = 0;
+ if (F_ISSET(idb, WT_RLE))
+ F_SET(&desc, WT_PAGE_DESC_RLE);
+
+ WT_RET(__wt_desc_io(toc, &desc, 0));
+
+ return (ret);
+}
+
/*
 * __wt_desc_io --
 *	Read/write the WT_DESC sector.
 *
 *	NOTE(review): 512 bytes are transferred to/from the caller's
 *	buffer, but every caller passes a stack WT_PAGE_DESC -- this
 *	assumes sizeof(WT_PAGE_DESC) >= 512, otherwise the read overruns
 *	the structure.  TODO: confirm against the WT_PAGE_DESC definition.
 */
static int
__wt_desc_io(WT_TOC *toc, void *p, int is_read)
{
    WT_FH *fh;
    ENV *env;

    fh = toc->db->idb->fh;
    env = toc->env;

    return (is_read ?
        __wt_read(env, fh, (off_t)0, 512, p) :
        __wt_write(env, fh, (off_t)0, 512, p));
}
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
new file mode 100644
index 00000000000..8e189204ce0
--- /dev/null
+++ b/src/btree/bt_discard.c
@@ -0,0 +1,234 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_page_discard_dup(ENV *, WT_PAGE *);
+static void __wt_page_discard_rleexp(ENV *, WT_PAGE *);
+static void __wt_page_discard_repl(ENV *, WT_PAGE *);
+static void __wt_page_discard_repl_list(ENV *, WT_REPL *);
+static inline int __wt_row_key_on_page(WT_PAGE *, WT_ROW *);
+
/*
 * __wt_page_discard --
 *	Free all memory associated with a page: the in-memory index array,
 *	the replacement/expansion arrays, the subtree-reference array, the
 *	disk image, and the WT_PAGE structure itself.
 */
void
__wt_page_discard(WT_TOC *toc, WT_PAGE *page)
{
    ENV *env;
    WT_ROW *rip;
    uint32_t i, type;
    void *last_key;

    env = toc->env;
    type = page->dsk->type;

    /* Never discard a dirty page. */
    WT_ASSERT(env, !WT_PAGE_IS_MODIFIED(page));

    /* Free the in-memory index array. */
    switch (type) {
    case WT_PAGE_DUP_INT:
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        /*
         * For each entry, see if the key was an allocation (that is,
         * if it points somewhere other than the original page), and
         * if so, free the memory.  This test is a superset of the
         * __wt_key_process test, that is, any key requiring processing
         * but not yet processed, must reference on-page information.
         */
        last_key = NULL;
        WT_INDX_FOREACH(page, rip, i) {
            if (__wt_row_key_on_page(page, rip))
                continue;

            /*
             * Only test the first entry for duplicate key/data
             * pairs, the others reference the same memory.  (This
             * test only makes sense for WT_PAGE_ROW_LEAF pages,
             * but there is no cost in doing the test for duplicate
             * leaf pages as well.)
             */
            if (rip->key == last_key)
                continue;
            last_key = rip->key;
            __wt_free(env, rip->key, rip->size);
        }
        __wt_free(env, page->u.irow, page->indx_count * sizeof(WT_ROW));
        break;
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_COL_RLE:
    case WT_PAGE_COL_VAR:
        __wt_free(env, page->u.icol, page->indx_count * sizeof(WT_COL));
        break;
    default:
        break;
    }

    /* Free the modified/deletion replacements array. */
    switch (type) {
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_LEAF:
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
        if (page->u2.repl != NULL)
            __wt_page_discard_repl(env, page);
        break;
    default:
        break;
    }

    /* Free the run-length encoded column store expansion array. */
    switch (type) {
    case WT_PAGE_COL_RLE:
        if (page->u2.rleexp != NULL)
            __wt_page_discard_rleexp(env, page);
        break;
    default:
        break;
    }

    /* Free the subtree-reference array. */
    switch (type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        if (page->u3.ref != NULL)
            __wt_free(env, page->u3.ref,
                page->indx_count * sizeof(WT_REF));
        break;
    case WT_PAGE_ROW_LEAF:
        if (WT_PAGE_DUP_TREES(page))
            __wt_page_discard_dup(env, page);
        break;
    default:
        break;
    }

    /* Finally, the disk image and the page structure itself. */
    if (page->dsk != NULL)
        __wt_free(env, page->dsk, page->size);
    __wt_free(env, page, sizeof(WT_PAGE));
}
+
/*
 * __wt_page_discard_repl --
 *	Discard the replacement array.
 */
static void
__wt_page_discard_repl(ENV *env, WT_PAGE *page)
{
    WT_REPL **replp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of replacements, free the
     * linked list anchored in that slot.
     */
    WT_REPL_FOREACH(page, replp, i)
        if (*replp != NULL)
            __wt_page_discard_repl_list(env, *replp);

    /* Free the page's array of replacements. */
    __wt_free(env, page->u2.repl, page->indx_count * sizeof(WT_REPL *));
}
+
/*
 * __wt_page_discard_rleexp --
 *	Discard the run-length encoded column store expansion array.
 */
static void
__wt_page_discard_rleexp(ENV *env, WT_PAGE *page)
{
    WT_RLE_EXPAND **expp, *exp, *a;
    u_int i;

    /*
     * For each non-NULL slot in the page's run-length encoded column
     * store expansion array, free the linked list of WT_RLE_EXPAND
     * structures anchored in that slot.
     */
    WT_RLE_EXPAND_FOREACH(page, expp, i) {
        if ((exp = *expp) == NULL)
            continue;
        /*
         * Free the linked list of WT_REPL structures anchored in the
         * WT_RLE_EXPAND entry.
         */
        __wt_page_discard_repl_list(env, exp->repl);
        /* Then free the WT_RLE_EXPAND list itself. */
        do {
            a = exp->next;
            __wt_free(env, exp, sizeof(WT_RLE_EXPAND));
        } while ((exp = a) != NULL);
    }

    /* Free the page's expansion array. */
    __wt_free(
        env, page->u2.rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *));
}
+
/*
 * __wt_page_discard_repl_list --
 *	Walk a WT_REPL forward-linked list and free the per-thread combination
 *	of a WT_REPL structure and its associated data.
 */
static void
__wt_page_discard_repl_list(ENV *env, WT_REPL *repl)
{
    WT_REPL *a;
    WT_TOC_UPDATE *update;

    do {
        a = repl->next;

        /*
         * Entries are carved from a shared WT_TOC_UPDATE buffer; the
         * buffer is freed once the number of entries discarded (out)
         * catches up with the number allocated (in).
         */
        update = repl->update;
        WT_ASSERT(env, update->out < update->in);
        if (++update->out == update->in)
            __wt_free(env, update, update->len);
    } while ((repl = a) != NULL);
}
+
/*
 * __wt_page_discard_dup --
 *	Walk the off-page duplicates tree array.
 */
static void
__wt_page_discard_dup(ENV *env, WT_PAGE *page)
{
    WT_REF **dupp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of off-page duplicate
     * references, free the reference.
     */
    WT_DUP_FOREACH(page, dupp, i)
        if (*dupp != NULL)
            __wt_free(env, *dupp, sizeof(WT_REF));

    /* Free the page's array of off-page duplicate references. */
    __wt_free(env, page->u3.dup, page->indx_count * sizeof(WT_REF *));
}
+
+/*
+ * __wt_row_key_on_page --
+ * Return if a WT_ROW structure's key references on-page data.
+ */
+static inline int
+__wt_row_key_on_page(WT_PAGE *page, WT_ROW *rip)
+{
+ uint8_t *p;
+
+ p = rip->key;
+ return (p >= (uint8_t *)page->dsk &&
+ p < (uint8_t *)page->dsk + page->size ? 1 : 0);
+}
diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c
new file mode 100644
index 00000000000..4d46fceff27
--- /dev/null
+++ b/src/btree/bt_dump.c
@@ -0,0 +1,472 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * WT_DSTUFF --
 *	Per-dump state threaded through the tree-walk callback.
 */
typedef struct {
    void (*p)                /* Print function */
        (uint8_t *, uint32_t, FILE *);
    FILE *stream;                /* Dump stream */

    void (*f)(const char *, uint64_t);    /* Progress callback */
    uint64_t fcnt;                /* Progress counter */

    DBT *dupkey;                /* Offpage duplicate tree key */
} WT_DSTUFF;
+
+static int __wt_dump_page(WT_TOC *, WT_PAGE *, void *);
+static void __wt_dump_page_col_fix(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_col_rle(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_col_var(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_dup_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_row_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static void __wt_print_byte_string_hex(uint8_t *, uint32_t, FILE *);
+static void __wt_print_byte_string_nl(uint8_t *, uint32_t, FILE *);
+
/*
 * __wt_db_dump --
 *	Db.dump method.
 *
 *	flags selects the output format: WT_DEBUG routes through the
 *	verifier, WT_PRINTABLES selects printable-character output,
 *	anything else hex output.
 */
int
__wt_db_dump(WT_TOC *toc,
    FILE *stream, void (*f)(const char *, uint64_t), uint32_t flags)
{
    WT_DSTUFF dstuff;
    int ret;

    if (LF_ISSET(WT_DEBUG)) {
        /*
         * We use the verification code to do debugging dumps because
         * if we're dumping in debugging mode, we want to confirm the
         * page is OK before blindly reading it.
         */
        return (__wt_verify(toc, f, stream));
    }

    dstuff.p = flags == WT_PRINTABLES ?
        __wt_print_byte_string_nl : __wt_print_byte_string_hex;
    dstuff.stream = stream;
    dstuff.f = f;
    dstuff.fcnt = 0;
    dstuff.dupkey = NULL;

    /*
     * Note we do not have a hazard reference for the root page, and that's
     * safe -- root pages are pinned into memory when a database is opened,
     * and never re-written until the database is closed.
     */
    fprintf(stream, "VERSION=1\n");
    fprintf(stream, "HEADER=END\n");
    ret = __wt_tree_walk(toc, NULL, 0, __wt_dump_page, &dstuff);
    fprintf(stream, "DATA=END\n");

    /* Wrap up reporting. */
    if (f != NULL)
        f(toc->name, dstuff.fcnt);

    return (ret);
}
+
/*
 * __wt_dump_page --
 *	Depth-first recursive walk of a btree.
 *
 *	Tree-walk callback: leaf pages are dumped, internal pages produce
 *	no output of their own.
 */
static int
__wt_dump_page(WT_TOC *toc, WT_PAGE *page, void *arg)
{
    DB *db;
    WT_DSTUFF *dp;

    db = toc->db;
    dp = arg;

    switch (page->dsk->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        break;
    case WT_PAGE_COL_FIX:
        __wt_dump_page_col_fix(toc, page, dp);
        break;
    case WT_PAGE_COL_RLE:
        WT_RET(__wt_dump_page_col_rle(toc, page, dp));
        break;
    case WT_PAGE_COL_VAR:
        WT_RET(__wt_dump_page_col_var(toc, page, dp));
        break;
    case WT_PAGE_DUP_LEAF:
        WT_RET(__wt_dump_page_dup_leaf(toc, page, dp));
        break;
    case WT_PAGE_ROW_LEAF:
        WT_RET(__wt_dump_page_row_leaf(toc, page, dp));
        break;
    WT_ILLEGAL_FORMAT(db);
    }

    /* Report progress every 10 pages. */
    if (dp->f != NULL && ++dp->fcnt % 10 == 0)
        dp->f(toc->name, dp->fcnt);

    return (0);
}
+
+/*
+ * __wt_dump_page_col_fix --
+ * Dump a WT_PAGE_COL_FIX page.
+ */
+static void
+__wt_dump_page_col_fix(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ WT_COL *cip;
+ WT_REPL *repl;
+ uint32_t i;
+
+ db = toc->db;
+
+ /* Walk the page, dumping data items. */
+ WT_INDX_FOREACH(page, cip, i) {
+ if ((repl = WT_COL_REPL(page, cip)) == NULL) {
+ if (!WT_FIX_DELETE_ISSET(cip->data))
+ dp->p(cip->data, db->fixed_len, dp->stream);
+ } else
+ if (!WT_REPL_DELETED_ISSET(repl))
+ dp->p(WT_REPL_DATA(repl),
+ db->fixed_len, dp->stream);
+ }
+}
+
/*
 * __wt_dump_page_col_rle --
 *	Dump a WT_PAGE_COL_RLE page.
 *
 *	Each on-page entry covers a run of identical records; individual
 *	records modified since the page was read live in the expansion
 *	array and override the run's value at their record number.
 */
static int
__wt_dump_page_col_rle(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
{
    DB *db;
    ENV *env;
    WT_COL *cip;
    WT_RLE_EXPAND *exp, **expsort, **expp;
    WT_REPL *repl;
    uint64_t recno;
    uint32_t i, n_expsort;
    uint16_t n_repeat;

    db = toc->db;
    env = toc->env;
    expsort = NULL;
    n_expsort = 0;

    recno = page->dsk->start_recno;
    WT_INDX_FOREACH(page, cip, i) {
        /*
         * Get a sorted list of any expansion entries we've created for
         * this set of records.  The sort function returns a NULL-
         * terminated array of references to WT_RLE_EXPAND structures,
         * sorted by record number.
         */
        WT_RET(__wt_rle_expand_sort(
            env, page, cip, &expsort, &n_expsort));

        /*
         * Dump the records.  We use the WT_REPL entry for records in
         * in the WT_RLE_EXPAND array, and original data otherwise.
         */
        for (expp = expsort,
            n_repeat = WT_RLE_REPEAT_COUNT(cip->data);
            n_repeat > 0; --n_repeat, ++recno)
            if ((exp = *expp) != NULL && exp->recno == recno) {
                ++expp;
                repl = exp->repl;
                if (WT_REPL_DELETED_ISSET(repl))
                    continue;
                dp->p(
                    WT_REPL_DATA(repl), repl->size, dp->stream);
            } else
                dp->p(WT_RLE_REPEAT_DATA(cip->data),
                    db->fixed_len, dp->stream);
    }
    /* Free the sort array. */
    if (expsort != NULL)
        __wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *));

    return (0);
}
+
/*
 * __wt_dump_page_col_var --
 *	Dump a WT_PAGE_COL_VAR page.
 *
 *	Replacements win over on-page items; Huffman-compressed and
 *	overflow items are expanded through a scratch buffer first.
 */
static int
__wt_dump_page_col_var(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
{
    DB *db;
    DBT *tmp;
    WT_COL *cip;
    WT_ITEM *item;
    WT_REPL *repl;
    uint32_t i;
    int ret;
    void *huffman;

    db = toc->db;
    huffman = db->idb->huffman_data;
    ret = 0;

    WT_RET(__wt_scr_alloc(toc, 0, &tmp));
    WT_INDX_FOREACH(page, cip, i) {
        /* Check for replace or deletion. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL) {
            if (!WT_REPL_DELETED_ISSET(repl))
                dp->p(
                    WT_REPL_DATA(repl), repl->size, dp->stream);
            continue;
        }

        /* Process the original data. */
        item = cip->data;
        switch (WT_ITEM_TYPE(item)) {
        case WT_ITEM_DATA:
            if (huffman == NULL) {
                dp->p(WT_ITEM_BYTE(item),
                    WT_ITEM_LEN(item), dp->stream);
                break;
            }
            /* FALLTHROUGH */
        case WT_ITEM_DATA_OVFL:
            WT_ERR(__wt_item_process(toc, item, tmp));
            dp->p(tmp->data, tmp->size, dp->stream);
            break;
        case WT_ITEM_DEL:
            break;
        WT_ILLEGAL_FORMAT_ERR(db, ret);
        }
    }

err:    __wt_scr_release(&tmp);
    return (ret);
}
+
+/*
+ * __wt_dump_page_dup_leaf --
+ * Dump a WT_PAGE_DUP_LEAF page.
+ */
+static int
+__wt_dump_page_dup_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ DBT *dupkey, *tmp;
+ WT_ITEM *item;
+ WT_REPL *repl;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+ void *huffman;
+
+ db = toc->db;
+ dupkey = dp->dupkey;
+ huffman = db->idb->huffman_data;
+ ret = 0;
+
+ WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Check for deletion. */
+ if ((repl = WT_ROW_REPL(
+ page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /* Output the key, we're going to need it. */
+ dp->p(dupkey->data, dupkey->size, dp->stream);
+
+ /* Output the replacement item. */
+ if (repl != NULL) {
+ dp->p(WT_REPL_DATA(repl), repl->size, dp->stream);
+ continue;
+ }
+
+ /* Process the original data. */
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA_DUP:
+ if (huffman == NULL) {
+ dp->p(WT_ITEM_BYTE(item),
+ WT_ITEM_LEN(item), dp->stream);
+ break;
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_ERR(__wt_item_process(toc, item, tmp));
+ dp->p(tmp->data, tmp->size, dp->stream);
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+ }
+
+err: __wt_scr_release(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_dump_page_row_leaf --
+ * Dump a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ DBT *key, *data, *key_tmp, *data_tmp, key_local, data_local;
+ WT_ITEM *item;
+ WT_OFF *off;
+ WT_REF *ref;
+ WT_REPL *repl;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+ void *huffman;
+
+ db = toc->db;
+ key = data = key_tmp = data_tmp = NULL;
+ huffman = db->idb->huffman_data;
+ ret = 0;
+
+ WT_ERR(__wt_scr_alloc(toc, 0, &key_tmp));
+ WT_ERR(__wt_scr_alloc(toc, 0, &data_tmp));
+ WT_CLEAR(key_local);
+ WT_CLEAR(data_local);
+
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Check for deletion. */
+ if ((repl = WT_ROW_REPL(
+ page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /*
+ * The key and data variables reference the DBT's we'll print.
+ * Set the key.
+ */
+ if (__wt_key_process(rip)) {
+ WT_ERR(__wt_item_process(toc, rip->key, key_tmp));
+ key = key_tmp;
+ } else
+ key = (DBT *)rip;
+
+ /*
+ * If the item was ever replaced, we're done: it can't be an
+ * off-page tree, and we don't care what kind of item it was
+ * originally. Dump the data from the replacement entry.
+ *
+ * XXX
+ * This is wrong -- if an off-page dup tree is reconciled,
+ * the off-page reference will change underfoot.
+ */
+ if (repl != NULL) {
+ dp->p(key->data, key->size, dp->stream);
+ dp->p(WT_REPL_DATA(repl), repl->size, dp->stream);
+ continue;
+ }
+
+ /* Set data to reference the data we'll dump. */
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ if (huffman == NULL) {
+ data_local.data = WT_ITEM_BYTE(item);
+ data_local.size = WT_ITEM_LEN(item);
+ data = &data_local;
+ break;
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ WT_ERR(__wt_item_process(toc, item, data_tmp));
+ data = data_tmp;
+ break;
+ case WT_ITEM_OFF:
+ /*
+ * Set the key and recursively call the tree-walk code
+ * for any off-page duplicate trees. (Check for any
+ * off-page duplicate trees locally because we already
+ * have to walk the page, so it's faster than walking
+ * the page both here and in the tree-walk function.)
+ */
+ dp->dupkey = key;
+
+ ref = WT_ROW_DUP(page, rip);
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, 0, __wt_dump_page, dp);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ goto err;
+ continue;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ dp->p(key->data, key->size, dp->stream);
+ dp->p(data->data, data->size, dp->stream);
+ }
+
+err: /* Discard any space allocated to hold off-page key/data items. */
+ if (key_tmp != NULL)
+ __wt_scr_release(&key_tmp);
+ if (data_tmp != NULL)
+ __wt_scr_release(&data_tmp);
+
+ return (ret);
+}
+
/* Map a nibble (0-15) to its lowercase hexadecimal digit character. */
static const char hex[] = "0123456789abcdef";
+
/*
 * __wt_print_byte_string_nl --
 *	Output a single byte string in printable characters, where possible.
 *	In addition, terminate with a <newline> character, unless the entry
 *	is itself terminated with a <newline> character.
 */
static void
__wt_print_byte_string_nl(uint8_t *data, uint32_t size, FILE *stream)
{
    /* Guard the empty string: data[size - 1] would read out of bounds. */
    if (size > 0 && data[size - 1] == '\n')
        --size;
    __wt_print_byte_string(data, size, stream);
    fprintf(stream, "\n");
}
+
/*
 * __wt_print_byte_string --
 *	Output a single byte string in printable characters, where possible;
 *	non-printable bytes are output as two lowercase hex digits.
 */
void
__wt_print_byte_string(uint8_t *data, uint32_t size, FILE *stream)
{
    int ch;

    for (; size > 0; --size, ++data) {
        ch = data[0];
        if (isprint(ch))
            fprintf(stream, "%c", ch);
        else
            /*
             * "%02x" prints the byte as two hex digits.  The
             * original passed hex-digit *characters* to a "%x%x"
             * format, which printed the hex of their ASCII codes
             * (e.g. byte 0x00 printed as "3030", not "00").
             */
            fprintf(stream, "%02x", data[0]);
    }
}
+
/*
 * __wt_print_byte_string_hex --
 *	Output a single byte string in hexadecimal characters, followed by
 *	a <newline>.
 */
static void
__wt_print_byte_string_hex(uint8_t *data, uint32_t size, FILE *stream)
{
    /*
     * "%02x" prints each byte as two lowercase hex digits; the original
     * "%x%x" on hex-digit characters printed their ASCII codes instead.
     */
    for (; size > 0; --size, ++data)
        fprintf(stream, "%02x", data[0]);
    fprintf(stream, "\n");
}
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
new file mode 100644
index 00000000000..cd4cb87bfb4
--- /dev/null
+++ b/src/btree/bt_evict.c
@@ -0,0 +1,944 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_evict(WT_TOC *);
+static int __wt_evict_compare_lru(const void *a, const void *b);
+static int __wt_evict_compare_page(const void *a, const void *b);
+static void __wt_evict_hazard_check(WT_TOC *);
+static int __wt_evict_hazard_compare(const void *a, const void *b);
+static void __wt_evict_page(WT_TOC *, int);
+static int __wt_evict_page_subtrees(WT_PAGE *);
+static void __wt_evict_set(WT_TOC *);
+static void __wt_evict_state_check(WT_TOC *);
+static int __wt_evict_walk(WT_TOC *);
+static int __wt_evict_walk_single(WT_TOC *, IDB *, uint);
+static void __wt_evict_write(WT_TOC *);
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_evict_hazard_validate(ENV *, WT_PAGE *);
+#endif
+
+/*
+ * Tuning constants -- I hesitate to call this tuning, but we should review some
+ * number of pages from each file's in-memory tree for each page we evict, and
+ * we should amortize the comparison of the hazard references across some number
+ * of eviction candidates.
+ */
+#define WT_EVICT_GROUP 10 /* Evict N pages at a time */
+#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */
+
+/*
+ * WT_EVICT_FOREACH --
+ * Walk a list of eviction candidates.
+ */
+#define WT_EVICT_FOREACH(cache, p, i) \
+ for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p))
+
+/*
+ * WT_EVICT_CLR --
+ *	Clear an eviction list entry.
+ */
+#define WT_EVICT_CLR(p) do { \
+ (p)->ref = NULL; \
+ (p)->idb = WT_DEBUG_POINT; \
+} while (0)
+
/*
 * __wt_workq_evict_server --
 *    See if the eviction server thread needs to be awakened.
 */
void
__wt_workq_evict_server(ENV *env, int force)
{
    WT_CACHE *cache;
    uint64_t bytes_inuse, bytes_max;

    cache = env->ienv->cache;

    /* If the eviction server is running, there's nothing to do. */
    if (!cache->evict_sleeping)
        return;

    /*
     * If we're locking out reads, or over our cache limit, or forcing the
     * issue (when closing the environment), run the eviction server.
     */
    bytes_inuse = __wt_cache_bytes_inuse(cache);
    bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
    if (!force && !cache->read_lockout && bytes_inuse < bytes_max)
        return;

    WT_VERBOSE(env, WT_VERB_EVICT, (env,
        "waking eviction server: force %sset, read lockout %sset, "
        "bytes inuse %s max (%lluMB %s %lluMB), ",
        force ? "" : "not ", cache->read_lockout ? "" : "not ",
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_inuse / WT_MEGABYTE),
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_max / WT_MEGABYTE)));

    /*
     * Clear the sleeping flag before releasing the mutex the server
     * blocks on; the server re-sets the flag before its next sleep.
     */
    cache->evict_sleeping = 0;
    __wt_unlock(env, cache->mtx_evict);
}
+
/*
 * __wt_cache_evict_server --
 *    Thread to evict pages from the cache.  Sleeps on cache->mtx_evict
 *    until awakened by __wt_workq_evict_server, then evicts until the
 *    cache is sufficiently below its maximum size.
 */
void *
__wt_cache_evict_server(void *arg)
{
    ENV *env;
    IENV *ienv;
    WT_CACHE *cache;
    WT_TOC *toc;
    uint64_t bytes_inuse, bytes_max;
    int ret;

    env = arg;
    ienv = env->ienv;
    cache = ienv->cache;
    ret = 0;

    /* We need a thread of control because we're reading/writing pages. */
    toc = NULL;
    WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc));

    /*
     * Multiple pages are marked for eviction by the eviction server, which
     * means nobody can read them -- but, this thread of control has to
     * update higher pages in the tree when it writes this page, which
     * requires reading other pages, which might themselves be marked for
     * eviction.  Set a flag to allow this thread of control to see pages
     * marked for eviction -- we know it's safe, because only this thread
     * is writing pages.
     *
     * Reconciliation is probably running because the cache is full, which
     * means reads are locked out -- reconciliation can read, regardless.
     */
    F_SET(toc, WT_READ_EVICT | WT_READ_PRIORITY);

    /*
     * Allocate memory for a copy of the hazard references -- it's a fixed
     * size so doesn't need run-time adjustments.
     */
    cache->hazard_elem = env->toc_size * env->hazard_size;
    WT_ERR(__wt_calloc(
        env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard));
    cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *);

    for (;;) {
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server sleeping"));
        cache->evict_sleeping = 1;
        __wt_lock(env, cache->mtx_evict);
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server waking"));

        /*
         * Check for environment exit; do it here, instead of the top of
         * the loop, because doing it here keeps us from doing a bunch of
         * work when simply awakened to quit.
         */
        if (!F_ISSET(ienv, WT_SERVER_RUN))
            break;

        for (;;) {
            /* Single-thread reconciliation. */
            __wt_lock(env, cache->mtx_reconcile);
            ret = __wt_evict(toc);
            __wt_unlock(env, cache->mtx_reconcile);
            if (ret != 0)
                goto err;

            /*
             * If we've locked out reads, keep evicting until we
             * get to at least 5% under the maximum cache.  Else,
             * quit evicting as soon as we get under the maximum
             * cache.
             */
            bytes_inuse = __wt_cache_bytes_inuse(cache);
            bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
            if (cache->read_lockout) {
                if (bytes_inuse <= bytes_max - (bytes_max / 20))
                    break;
            } else if (bytes_inuse < bytes_max)
                break;
        }
    }

err:    /* Discard the eviction and hazard-copy arrays, then the WT_TOC. */
    if (cache->evict != NULL)
        __wt_free(env, cache->evict, cache->evict_len);
    if (cache->hazard != NULL)
        __wt_free(env, cache->hazard, cache->hazard_len);
    if (toc != NULL)
        WT_TRET(toc->close(toc, 0));

    if (ret != 0)
        __wt_api_env_err(env, ret, "cache eviction server error");

    WT_VERBOSE(
        env, WT_VERB_EVICT, (env, "cache eviction server exiting"));

    return (NULL);
}
+
/*
 * __wt_evict --
 *    Evict pages from the cache.  One call selects candidates, removes
 *    duplicates, orders them by LRU and discards up to WT_EVICT_GROUP
 *    pages.
 */
static int
__wt_evict(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    uint elem, i, j;

    env = toc->env;
    cache = env->ienv->cache;

    /* Get some more pages to consider for eviction. */
    WT_RET(__wt_evict_walk(toc));

    /*
     * We have an array of page eviction references that may contain NULLs,
     * as well as duplicate entries.
     *
     * First, sort the array by WT_REF address, then delete any duplicates.
     * The reason is because we might evict the page but leave a duplicate
     * entry in the "saved" area of the array, and that would be a NULL
     * dereference on the next run.  (If someone ever tries to remove this
     * duplicate cleanup for better performance, you can't fix it just by
     * checking the WT_REF state -- that only works if you are discarding
     * a page from a single level of the tree; if you are discarding a
     * page and its parent, the duplicate of the page's WT_REF might have
     * been free'd before a subsequent review of the eviction array.)
     */
    evict = cache->evict;
    elem = cache->evict_elem;
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
    for (i = 0; i < elem; i = j)
        for (j = i + 1; j < elem; ++j) {
            /*
             * If the leading pointer hits a NULL, we're done, the
             * NULLs all sorted to the top of the array.
             */
            if (evict[j].ref == NULL)
                goto done_duplicates;

            /* Delete the second and any subsequent duplicates. */
            if (evict[i].ref == evict[j].ref)
                WT_EVICT_CLR(&evict[j]);
            else
                break;
        }
done_duplicates:

    /* Second, sort the array by LRU. */
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru);

    /*
     * Discarding pages is done in 6 steps:
     *    Set the WT_EVICT state
     *    Check for any hazard references
     *    Confirm the pages are evictable (not pinned, no subtrees)
     *    Discard clean pages
     *    Reconcile dirty pages (making them clean)
     *    Discard clean pages
     *
     * The reason we release clean pages, then reconcile dirty pages, then
     * release clean pages again is because reconciling a dirty page is a
     * slow operation, and this releases space sooner.  (Arguably, we are
     * going to discard all of the pages anyway, so what does it matter if
     * we make clean pages wait for the dirty page writes?  On the other
     * hand, it's a small change and benefits any thread waiting to read a
     * clean page we picked for discarding, unlikely though that may be.)
     */
    __wt_evict_set(toc);
    __wt_evict_hazard_check(toc);
    __wt_evict_state_check(toc);
    __wt_evict_page(toc, 0);
    __wt_evict_write(toc);
    __wt_evict_page(toc, 1);

    return (0);
}
+
/*
 * __wt_evict_walk --
 *    Fill in the candidate array by walking the next set of pages in
 *    each open file.
 */
static int
__wt_evict_walk(WT_TOC *toc)
{
    ENV *env;
    IDB *idb;
    IENV *ienv;
    WT_CACHE *cache;
    uint elem, i;
    int ret;

    env = toc->env;
    ienv = env->ienv;
    cache = ienv->cache;

    /*
     * Resize the array in which we're tracking pages, as necessary, then
     * get some pages from each underlying file.  We hold a mutex for the
     * entire time -- it's slow, but (1) how often do new files get added
     * or removed to/from the system, and (2) it's all in-memory stuff, so
     * it's not that slow.
     */
    ret = 0;
    __wt_lock(env, ienv->mtx);
    /* WT_EVICT_WALK_BASE slots are reserved for carried-over candidates. */
    elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE);
    if (elem <= cache->evict_elem || (ret = __wt_realloc(env,
        &cache->evict_len,
        elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) {
        cache->evict_elem = elem;

        i = WT_EVICT_WALK_BASE;
        TAILQ_FOREACH(idb, &ienv->dbqh, q) {
            if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0)
                break;
            i += WT_EVICT_WALK_PER_TABLE;
        }
    }
    __wt_unlock(env, ienv->mtx);
    return (ret);
}
+
/*
 * __wt_evict_walk_single --
 *    Get a few page eviction candidates from a single underlying file,
 *    storing them in the eviction array starting at "slot".
 */
static int
__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot)
{
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    int i, restarted_once;

    cache = toc->env->ienv->cache;

    /*
     * Tricky little loop that restarts the walk as necessary, without
     * resetting the count of pages retrieved.
     */
    i = restarted_once = 0;

    /*
     * If we haven't yet opened a tree-walk structure, do so.  (Note the
     * restart label is inside the if-body: jumping to it re-begins the
     * walk without re-testing the condition, which is the intent.)
     */
    if (idb->evict_walk.tree == NULL)
restart:    WT_RET(__wt_walk_begin(toc, &idb->root_page, &idb->evict_walk));

    /* Get the next WT_EVICT_WALK_PER_TABLE entries. */
    do {
        evict = &cache->evict[slot];
        WT_RET(__wt_walk_next(toc, &idb->evict_walk, &evict->ref));

        /*
         * Restart the walk as necessary, but only once (after one
         * restart we've already acquired all of the pages, and we
         * could loop infinitely on a tree with a single, pinned, page).
         */
        if (evict->ref == NULL) {
            if (restarted_once++)
                break;
            goto restart;
        }

        evict->idb = idb;
        ++slot;
    } while (++i < WT_EVICT_WALK_PER_TABLE);

    return (0);
}
+
+/*
+ * __wt_evict_db_clear --
+ * Remove any entries for a file from the eviction list.
+ */
+void
+__wt_evict_db_clear(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint i;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Discard any entries in the eviction list to a file we're closing
+ * (the caller better have locked out the eviction thread).
+ */
+ if (cache->evict == NULL)
+ return;
+ WT_EVICT_FOREACH(cache, evict, i)
+ if (evict->ref != NULL && evict->idb == idb)
+ WT_EVICT_CLR(evict);
+}
+
+/*
+ * __wt_evict_set --
+ * Set the WT_EVICT flag on a set of pages.
+ */
+static void
+__wt_evict_set(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * Set the entry state so readers don't try and use the pages. Once
+ * that's done, any thread searching for a page will either see our
+ * state value, or will have already set a hazard reference to the page.
+ * We don't evict a page with a hazard reference set, so we can't race.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ ref->state = WT_EVICT;
+ }
+}
+
/*
 * __wt_evict_hazard_check --
 *    Compare the list of hazard references to the list of pages to be
 *    discarded; any page with a hazard reference is returned to service.
 */
static void
__wt_evict_hazard_check(WT_TOC *toc)
{
    ENV *env;
    IENV *ienv;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE **hazard, **end_hazard, *page;
    WT_REF *ref;
    WT_STATS *stats;
    uint i;

    env = toc->env;
    ienv = env->ienv;
    cache = ienv->cache;
    stats = cache->stats;

    /* Sort the eviction candidates by WT_PAGE address. */
    qsort(cache->evict, (size_t)WT_EVICT_GROUP,
        sizeof(WT_EVICT_LIST), __wt_evict_compare_page);

    /* Copy the hazard reference array and sort it by WT_PAGE address. */
    hazard = cache->hazard;
    end_hazard = hazard + cache->hazard_elem;
    memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *));
    qsort(hazard, (size_t)cache->hazard_elem,
        sizeof(WT_PAGE *), __wt_evict_hazard_compare);

    /*
     * Walk the lists in parallel and look for matches: both lists are
     * sorted ascending, so the hazard cursor never needs to move
     * backward across candidates.
     */
    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;

        /*
         * Look for the page in the hazard list until we reach the end
         * of the list or find a hazard pointer larger than the page.
         */
        for (page = ref->page;
            hazard < end_hazard && *hazard < page; ++hazard)
            ;
        /*
         * Hazard list exhausted: no remaining candidate can match
         * (their pages sort even higher), so stop entirely.
         */
        if (hazard == end_hazard)
            break;

        /*
         * If we find a matching hazard reference, the page is in use:
         * remove it from the eviction list.
         *
         * No memory flush needed, the state field is declared volatile.
         */
        if (*hazard == page) {
            WT_VERBOSE(env, WT_VERB_EVICT, (env,
                "eviction skipped page addr %lu (hazard reference)",
                page->addr));
            WT_STAT_INCR(stats, CACHE_EVICT_HAZARD);

            /*
             * A page with a low LRU and a hazard reference?
             *
             * Set the page's LRU so we don't select it again.
             * Return the page to service.
             * Discard our reference.
             */
            ref->page->read_gen = ++cache->read_gen;
            ref->state = WT_OK;
            WT_EVICT_CLR(evict);
        }
    }
}
+
/*
 * __wt_evict_state_check --
 *    Confirm these are pages we want to evict; pinned pages and pages
 *    with in-memory subtrees are returned to service.
 */
static void
__wt_evict_state_check(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;

    /*
     * We "own" the pages (we've flagged them for eviction, and there were
     * no hazard references).  Now do checks to see if these are pages we
     * can evict -- we have to wait until after we own the page because the
     * page might be updated and race with us.
     */
    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /* Ignore pinned pages. */
        if (F_ISSET(page, WT_PINNED)) {
            WT_VERBOSE(env, WT_VERB_EVICT, (env,
                "eviction skipped page addr %lu (pinned)",
                page->addr));
            goto skip;
        }

        /* Ignore pages with in-memory subtrees. */
        switch (page->dsk->type) {
        case WT_PAGE_COL_INT:
        case WT_PAGE_DUP_INT:
        case WT_PAGE_ROW_INT:
        case WT_PAGE_ROW_LEAF:
            if (__wt_evict_page_subtrees(page)) {
                WT_VERBOSE(env, WT_VERB_EVICT, (env,
                    "eviction skipped page addr %lu (subtrees)",
                    page->addr));
                goto skip;
            }
            break;
        default:
            break;
        }

        /* This page passed all checks; keep it in the list. */
        continue;

skip:        /*
         * Set the page's LRU so we don't select it again.
         * Return the page to service.
         * Discard our reference.
         */
        page->read_gen = ++cache->read_gen;
        ref->state = WT_OK;
        WT_EVICT_CLR(evict);
    }
}
+
/*
 * __wt_evict_write --
 *    Write any modified pages, making them clean for the second
 *    __wt_evict_page pass.
 */
static void
__wt_evict_write(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;

    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /*
         * Ignore clean pages, only modified pages need writing.
         * (The original comment said "Ignore dirty pages", which
         * contradicted the test below.)
         */
        if (!WT_PAGE_IS_MODIFIED(page))
            continue;

        /*
         * We're using our WT_TOC handle, it needs to reference the
         * correct DB handle.
         *
         * XXX
         * This is pretty sleazy, but I'm hesitant to try and drive
         * a separate DB/IDB handle down through the reconciliation
         * code.
         */
        toc->db = evict->idb->db;
        (void)__wt_page_reconcile(toc, page);
    }
}
+
/*
 * __wt_evict_page --
 *    Evict (discard) clean cache pages; called twice per eviction pass,
 *    before and after dirty pages are reconciled.
 */
static void
__wt_evict_page(WT_TOC *toc, int was_dirty)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    WT_STATS *stats;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;
    stats = cache->stats;

    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /*
         * The first time we're called, we get rid of the clean pages;
         * the second time we're called, we get rid of the pages that
         * were dirty but have since been cleaned.  Ignore dirty pages
         * in all cases, it's simpler.
         */
        if (WT_PAGE_IS_MODIFIED(page))
            continue;

        if (was_dirty)
            WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED);
        else
            WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED);

#ifdef HAVE_DIAGNOSTIC
        __wt_evict_hazard_validate(env, page);
#endif
        WT_VERBOSE(env, WT_VERB_EVICT, (env,
            "cache evicting page addr %lu", page->addr));

        /*
         * Copy a page reference, then make the cache entry available
         * for re-use.
         *
         * No memory flush needed, the state field is declared volatile.
         */
        ref->page = NULL;
        ref->state = WT_EMPTY;

        /* Remove the entry from the eviction list. */
        WT_EVICT_CLR(evict);

        /* We've got more space. */
        WT_CACHE_PAGE_OUT(cache, page->size);

        /* The page can no longer be found, free the memory. */
        __wt_page_discard(toc, page);
    }
}
+
/*
 * __wt_evict_page_subtrees --
 *    Return non-zero if a page has an in-memory subtree (and so must not
 *    be evicted), zero otherwise.
 */
static int
__wt_evict_page_subtrees(WT_PAGE *page)
{
    WT_REF *ref, **dupp;
    uint32_t i;

    /*
     * Return if a page has an in-memory subtree -- this array search could
     * be replaced by a reference count in the page, but (1) the eviction
     * thread isn't where I expect performance problems, (2) I hate to lose
     * more bytes on every page, (3) how often will an internal page be
     * evicted anyway?
     */
    switch (page->dsk->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        /* Internal pages: any non-empty child slot is an in-memory page. */
        WT_REF_FOREACH(page, ref, i)
            if (ref->state != WT_EMPTY)
                return (1);
        break;
    case WT_PAGE_ROW_LEAF:
        /* Row-store leaves: check off-page duplicate trees, if any. */
        if (WT_PAGE_DUP_TREES(page))
            WT_DUP_FOREACH(page, dupp, i)
                if (*dupp != NULL && (*dupp)->state != WT_EMPTY)
                    return (1);
        break;
    default:
        break;
    }

    return (0);
}
+
+/*
+ * __wt_evict_compare_page --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's address.
+ */
+static int
+__wt_evict_compare_page(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ WT_PAGE *a_page, *b_page;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the page address in ascending order. */
+ a_page = a_ref->page;
+ b_page = b_ref->page;
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+/*
+ * __wt_evict_compare_lru --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's read
+ * generation.
+ */
+static int
+__wt_evict_compare_lru(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ uint32_t a_lru, b_lru;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the LRU in ascending order. */
+ a_lru = a_ref->page->read_gen;
+ b_lru = b_ref->page->read_gen;
+ return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0));
+}
+
+/*
+ * __wt_evict_hazard_compare --
+ * Qsort function: sort hazard list based on the page's address.
+ */
+static int
+__wt_evict_hazard_compare(const void *a, const void *b)
+{
+ WT_PAGE *a_page, *b_page;
+
+ a_page = *(WT_PAGE **)a;
+ b_page = *(WT_PAGE **)b;
+
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+#ifdef HAVE_DIAGNOSTIC
/*
 * __wt_evict_hazard_validate --
 *    Diagnostic check: abort the process if a page selected for eviction
 *    appears on any thread's hazard list.  (The original comment read
 *    "Return if a page is or isn't on the hazard list", which did not
 *    match the abort behavior.)
 */
static void
__wt_evict_hazard_validate(ENV *env, WT_PAGE *page)
{
    IENV *ienv;
    WT_PAGE **hp;
    WT_TOC **tp, *toc;

    ienv = env->ienv;

    /* Scan every WT_TOC's hazard array for the page. */
    for (tp = ienv->toc; (toc = *tp) != NULL; ++tp)
        for (hp = toc->hazard;
            hp < toc->hazard + toc->env->hazard_size; ++hp)
            if (*hp == page) {
                __wt_api_env_errx(env,
                    "hazard eviction check for page %lu "
                    "failed",
                    (u_long)page->addr);
                __wt_abort(env);
            }
}
+
/*
 * __wt_evict_dump --
 *    Display the eviction list (diagnostic builds only); scans the
 *    entire candidate array, not just the WT_EVICT_GROUP head.
 */
void
__wt_evict_dump(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_MBUF mb;
    uint n;
    int sep;

    env = toc->env;
    cache = env->ienv->cache;

    __wt_mb_init(env, &mb);
    __wt_mb_add(&mb, "eviction list");

    /* ':' separates the label from the first entry, ',' thereafter. */
    for (sep = ':', n = 0; n < cache->evict_elem; ++n) {
        evict = &cache->evict[n];
        if (evict->ref == NULL)
            continue;
        __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr);
        sep = ',';
    }
    __wt_mb_discard(&mb);
}
+
/*
 * __wt_evict_cache_dump --
 *    Dump the in-memory cache, one tree at a time.  (Comment corrected:
 *    the function is __wt_evict_cache_dump, not __wt_evict_dump_cache.)
 */
int
__wt_evict_cache_dump(WT_TOC *toc)
{
    IDB *idb;
    IENV *ienv;

    ienv = toc->env->ienv;

    TAILQ_FOREACH(idb, &ienv->dbqh, q)
        WT_RET(__wt_evict_tree_dump(toc, idb));
    return (0);
}
+
/*
 * __wt_evict_tree_dump --
 *    Dump an in-memory tree (diagnostic builds only).
 */
int
__wt_evict_tree_dump(WT_TOC *toc, IDB *idb)
{
    ENV *env;
    WT_CACHE *cache;
    WT_REF *ref;
    WT_WALK walk;
    WT_MBUF mb;
    int sep;

    env = toc->env;
    cache = env->ienv->cache;

    WT_VERBOSE(env, WT_VERB_EVICT, (env,
        "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)",
        idb->name,
        __wt_cache_pages_inuse(cache),
        __wt_cache_bytes_inuse(cache),
        WT_STAT(cache->stats, CACHE_BYTES_MAX)));

    __wt_mb_init(env, &mb);
    __wt_mb_add(&mb, "in-memory page list");

    WT_CLEAR(walk);
    WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk));
    for (sep = ':';;) {
        /*
         * NOTE(review): a WT_RET failure here returns without calling
         * __wt_walk_end or __wt_mb_discard -- confirm whether the walk
         * structure leaks on the error path (diagnostic-only code).
         */
        WT_RET(__wt_walk_next(toc, &walk, &ref));
        if (ref == NULL)
            break;
        __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr);
        sep = ',';
    }
    __wt_walk_end(env, &walk);
    __wt_mb_discard(&mb);

    return (0);
}
+
/*
 * __wt_evict_cache_count --
 *    Return the count of nodes in the cache, summed across all open trees.
 */
int
__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp)
{
    IDB *idb;
    IENV *ienv;
    uint64_t nodes;

    ienv = toc->env->ienv;

    *nodesp = 0;
    TAILQ_FOREACH(idb, &ienv->dbqh, q) {
        WT_RET(__wt_evict_tree_count(toc, idb, &nodes));
        *nodesp += nodes;
    }
    return (0);
}
+
/*
 * __wt_evict_tree_count --
 *    Return a count of in-memory nodes in one tree.
 */
int
__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp)
{
    ENV *env;
    WT_REF *ref;
    WT_WALK walk;
    uint64_t nodes;

    env = toc->env;

    WT_CLEAR(walk);
    WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk));
    for (nodes = 0;;) {
        /*
         * NOTE(review): a WT_RET failure here returns without calling
         * __wt_walk_end -- confirm whether the walk structure leaks on
         * the error path (diagnostic-only code).
         */
        WT_RET(__wt_walk_next(toc, &walk, &ref));
        if (ref == NULL)
            break;
        ++nodes;
    }
    *nodesp = nodes;
    __wt_walk_end(env, &walk);

    return (0);
}
+#endif
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
new file mode 100644
index 00000000000..c0f58002522
--- /dev/null
+++ b/src/btree/bt_misc.c
@@ -0,0 +1,175 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_build_verify --
+ * Verify the Btree build itself.
+ */
+int
+__wt_bt_build_verify(void)
+{
+ static const struct {
+ char *name;
+ u_int size, expected;
+ } size_check[] = {
+ { "WT_COL", sizeof(WT_COL), WT_COL_SIZE },
+ { "WT_ITEM", sizeof(WT_ITEM), WT_ITEM_SIZE },
+ { "WT_OFF", sizeof(WT_OFF), WT_OFF_SIZE },
+ { "WT_OVFL", sizeof(WT_OVFL), WT_OVFL_SIZE },
+ { "WT_PAGE", sizeof(WT_PAGE), WT_PAGE_SIZE },
+ { "WT_PAGE_DESC", sizeof(WT_PAGE_DESC), WT_PAGE_DESC_SIZE },
+ { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), WT_PAGE_DISK_SIZE },
+ { "WT_ROW", sizeof(WT_ROW), WT_ROW_SIZE }
+ };
+ static const struct {
+ char *name;
+ u_int size, align;
+ } align_check[] = {
+ { "WT_OFF", sizeof(WT_OFF), sizeof(uint32_t) },
+ { "WT_OVFL", sizeof(WT_OVFL), sizeof(uint32_t) },
+ { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), sizeof(uint32_t) },
+ { "WT_TOC_UPDATE", sizeof(WT_TOC_UPDATE), sizeof(uint32_t) }
+ };
+ u_int i;
+
+ /*
+ * The compiler had better not have padded our structures -- make
+ * sure the page header structure is exactly what we expect.
+ */
+ for (i = 0; i < WT_ELEMENTS(size_check); ++i) {
+ if (size_check[i].size == size_check[i].expected)
+ continue;
+ __wt_api_env_errx(NULL,
+ "WiredTiger build failed, the %s header structure is not "
+ "the correct size (expected %u, got %u)",
+ size_check[i].name,
+ size_check[i].expected, size_check[i].size);
+ return (WT_ERROR);
+ }
+
+ /* There are also structures that must be aligned correctly. */
+ for (i = 0; i < WT_ELEMENTS(align_check); ++i) {
+ if (WT_ALIGN(align_check[i].size,
+ align_check[i].align) == align_check[i].size)
+ continue;
+ __wt_api_env_errx(NULL,
+ "Build verification failed, the %s structure is not"
+ " correctly aligned", align_check[i].name);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We mix-and-match 32-bit unsigned values and size_t's, mostly because
+ * we allocate and handle 32-bit objects, and lots of the underlying C
+ * library expects size_t values for the length of memory objects. We
+ * check, just to be sure.
+ */
+ if (sizeof(size_t) < sizeof(uint32_t)) {
+ __wt_api_env_errx(NULL, "%s",
+ "Build verification failed, a size_t is smaller than "
+ "4-bytes");
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
/*
 * __wt_set_ff_and_sa_from_offset --
 *    Set first-free and space-available values from an address positioned
 *    one past the last used byte on the page.
 *
 *    NOTE(review): plain `inline` on a definition in a .c file relies on
 *    pre-C99 / GNU89 inline semantics -- confirm the build's compiler
 *    flags, or a strict C99 build may fail to emit an external definition.
 */
inline void
__wt_set_ff_and_sa_from_offset(WT_PAGE *page,
    void *p, uint8_t **first_freep, uint32_t *space_availp)
{
    /* First free byte is p itself. */
    *first_freep = (uint8_t *)p;
    /* Space available is the page size minus the offset of p. */
    *space_availp =
        page->size - (uint32_t)((uint8_t *)p - (uint8_t *)page->dsk);
}
+
+/*
+ * __wt_page_write_gen_check --
+ * Confirm the page's write generation number is correct.
+ */
+inline int
+__wt_page_write_gen_check(WT_PAGE *page, uint32_t write_gen)
+{
+ return (page->write_gen == write_gen ? 0 : WT_RESTART);
+}
+
+/*
+ * __wt_page_type_string --
+ * Return a string representing the page type.
+ */
+const char *
+__wt_page_type_string(WT_PAGE_DISK *dsk)
+{
+ switch (dsk->type) {
+ case WT_PAGE_INVALID:
+ return ("invalid");
+ case WT_PAGE_COL_FIX:
+ return ("column-store fixed-length leaf");
+ case WT_PAGE_COL_INT:
+ return ("column-store internal");
+ case WT_PAGE_COL_RLE:
+ return ("column-store fixed-length run-length encoded leaf");
+ case WT_PAGE_COL_VAR:
+ return ("column-store variable-length leaf");
+ case WT_PAGE_DUP_INT:
+ return ("duplicate tree internal");
+ case WT_PAGE_DUP_LEAF:
+ return ("duplicate tree leaf");
+ case WT_PAGE_OVFL:
+ return ("overflow");
+ case WT_PAGE_ROW_INT:
+ return ("row-store internal");
+ case WT_PAGE_ROW_LEAF:
+ return ("row-store leaf");
+ default:
+ break;
+ }
+ return ("unknown");
+}
+
+/*
+ * __wt_item_type_string --
+ * Return a string representing the item type.
+ */
+const char *
+__wt_item_type_string(WT_ITEM *item)
+{
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_KEY:
+ return ("key");
+ case WT_ITEM_KEY_OVFL:
+ return ("key-overflow");
+ case WT_ITEM_KEY_DUP:
+ return ("key-duplicate");
+ case WT_ITEM_KEY_DUP_OVFL:
+ return ("key-duplicate-overflow");
+ case WT_ITEM_DATA:
+ return ("data");
+ case WT_ITEM_DATA_OVFL:
+ return ("data-overflow");
+ case WT_ITEM_DATA_DUP:
+ return ("data-duplicate");
+ case WT_ITEM_DATA_DUP_OVFL:
+ return ("data-duplicate-overflow");
+ case WT_ITEM_DEL:
+ return ("deleted");
+ case WT_ITEM_OFF:
+ return ("off-page");
+ default:
+ break;
+ }
+ return ("unknown");
+}
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
new file mode 100644
index 00000000000..c746782221e
--- /dev/null
+++ b/src/btree/bt_open.c
@@ -0,0 +1,279 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_open_verify(DB *);
+static int __wt_open_verify_page_sizes(DB *);
+
/*
 * __wt_bt_open --
 *    Open a Btree.
 */
int
__wt_bt_open(WT_TOC *toc, int ok_create)
{
    DB *db;
    ENV *env;
    IDB *idb;

    db = toc->db;
    env = toc->env;
    idb = db->idb;

    /* Check page size configuration. */
    WT_RET(__wt_open_verify(db));

    /* Open the file. */
    WT_RET(__wt_open(env, idb->name, idb->mode, ok_create, &idb->fh));

    /*
     * If the file size is 0, write a description page; if the file size
     * is non-zero, update the DB handle based on the on-disk description
     * page.  (If the file isn't empty, there must be a description page.)
     */
    if (idb->fh->file_size == 0)
        WT_RET(__wt_desc_write(toc));
    else {
        WT_RET(__wt_desc_read(toc));

        /* If there's a root page, pin it. */
        if (idb->root_off.addr != WT_ADDR_INVALID)
            WT_RET(__wt_root_pin(toc));
    }

    return (0);
}
+
/*
 * __wt_open_verify --
 *    Verify anything we can't verify before we're about to open the file;
 *    set defaults as necessary.
 */
static int
__wt_open_verify(DB *db)
{
    IDB *idb;

    idb = db->idb;

    /* Verify the page sizes. */
    WT_RET(__wt_open_verify_page_sizes(db));

    /*
     * Verify other configuration combinations: fixed-length column
     * stores have no length byte to drive the decoder, so Huffman
     * compression cannot apply.
     */
    if (db->fixed_len != 0 && (idb->huffman_key || idb->huffman_data)) {
        __wt_api_db_errx(db,
            "Fixed size column-store databases may not be Huffman "
            "compressed");
        return (WT_ERROR);
    }

    return (0);
}
+
+/*
+ * __wt_open_verify_page_sizes --
+ * Verify the page sizes.
+ */
+static int
+__wt_open_verify_page_sizes(DB *db)
+{
+ IDB *idb;
+
+ idb = db->idb;
+
+ /*
+ * The application can set lots of page sizes. It's complicated, so
+ * instead of verifying the relationships when they're set, verify
+ * then when the database is opened and we know we have the final
+ * values. (Besides, if we verify the relationships when they're set,
+ * the application has to set them in a specific order or we'd need
+ * one set function that took 10 parameters.)
+ *
+ * If the values haven't been set, set the defaults.
+ *
+ * Default to a small fragment size, so overflow items don't consume
+ * a lot of space.
+ */
+ if (db->allocsize == 0)
+ db->allocsize = WT_BTREE_ALLOCATION_SIZE;
+
+ /* Allocation sizes must be a power-of-two, nothing else makes sense. */
+ if (!__wt_ispo2(db->allocsize)) {
+ __wt_api_db_errx(db,
+ "the allocation size must be a power of two");
+ return (WT_ERROR);
+ }
+
+ /*
+ * Limit allocation units to 256MB, and page sizes to 128MB. There's
+ * no reason (other than testing) we can't support larger sizes (any
+ * sizes up to the smaller of an off_t and a size_t should work), but
+ * an application specifying larger allocation or page sizes is almost
+ * certainly making a mistake.
+ */
+ if (db->allocsize > WT_BTREE_ALLOCATION_SIZE_MAX) {
+ __wt_api_db_errx(db,
+ "the allocation size must less than or equal to %luMB",
+ (u_long)(WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE));
+ return (WT_ERROR);
+ }
+
+ /*
+ * Internal pages are also usually small, we want it to fit into the
+ * L1 cache. We try and put at least 40 keys on each internal page
+ * (40 because that results in 100M keys in a level 5 Btree). But,
+ * if it's a small page, push anything bigger than about 50 bytes
+ * off-page. Here's the table:
+ * Pagesize Largest key retained on-page:
+ * 512B 50 bytes
+ * 1K 50 bytes
+ * 2K 51 bytes
+ * 4K 102 bytes
+ * 8K 204 bytes
+ * and so on, roughly doubling for each power-of-two.
+ */
+ if (db->intlmin == 0)
+ db->intlmin = WT_BTREE_INTLMIN_DEFAULT;
+ if (db->intlmax == 0)
+ db->intlmax = WT_MAX(db->intlmin, WT_BTREE_INTLMAX_DEFAULT);
+ if (db->intlitemsize == 0) {
+ if (db->intlmin <= 1024)
+ db->intlitemsize = 50;
+ else
+ db->intlitemsize = db->intlmin / 40;
+ }
+
+ /*
+ * Leaf pages are larger to amortize I/O across a large chunk of the
+ * data space, but still minimize the chance of a broken write. We
+ * only require 20 key/data pairs fit onto a leaf page. Again, if it's
+ * a small page, push anything bigger than about 80 bytes off-page.
+ * Here's the table:
+ * Pagesize Largest key or data item retained on-page:
+ * 512B 80 bytes
+ * 1K 80 bytes
+ * 2K 80 bytes
+ * 4K 80 bytes
+ * 8K 204 bytes
+ * 16K 409 bytes
+ * and so on, roughly doubling for each power-of-two.
+ */
+ if (db->leafmin == 0)
+ db->leafmin = WT_BTREE_LEAFMIN_DEFAULT;
+ if (db->leafmax == 0)
+ db->leafmax = WT_MAX(db->leafmin, WT_BTREE_LEAFMAX_DEFAULT);
+ if (db->leafitemsize == 0) {
+ if (db->leafmin <= 4096)
+ db->leafitemsize = 80;
+ else
+ db->leafitemsize = db->leafmin / 40;
+ }
+
+ /* Final checks for safety. */
+ if (db->intlmin % db->allocsize != 0 ||
+ db->intlmax % db->allocsize != 0 ||
+ db->leafmin % db->allocsize != 0 ||
+ db->leafmax % db->allocsize != 0) {
+ __wt_api_db_errx(db,
+ "all page sizes must be a multiple of %lu bytes",
+ (u_long)db->allocsize);
+ return (WT_ERROR);
+ }
+
+ if (db->intlmin > db->intlmax || db->leafmin > db->leafmax) {
+ __wt_api_db_errx(db,
+ "minimum page sizes must be less than or equal to maximum "
+ "page sizes");
+ return (WT_ERROR);
+ }
+
+ if (db->intlmin > WT_BTREE_PAGE_SIZE_MAX ||
+ db->intlmax > WT_BTREE_PAGE_SIZE_MAX ||
+ db->leafmin > WT_BTREE_PAGE_SIZE_MAX ||
+ db->leafmax > WT_BTREE_PAGE_SIZE_MAX) {
+ __wt_api_db_errx(db,
+ "all page sizes must less than or equal to %luMB",
+ (u_long)WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We only have 3 bytes of length for on-page items, so the maximum
+ * on-page item size is limited to 16MB.
+ */
+ if (db->intlitemsize > WT_ITEM_MAX_LEN)
+ db->intlitemsize = WT_ITEM_MAX_LEN;
+ if (db->leafitemsize > WT_ITEM_MAX_LEN)
+ db->leafitemsize = WT_ITEM_MAX_LEN;
+
+ /*
+ * By default, any duplicate set that reaches 25% of a leaf page is
+ * moved into its own separate tree.
+ */
+ if (db->btree_dup_offpage == 0)
+ db->btree_dup_offpage = 4;
+
+ /*
+ * A leaf page must hold at least 2 key/data pairs, otherwise the
+ * whole btree thing breaks down because we can't split. We have
+ * to include WT_DESC_SIZE in leaf page calculations, it's not
+ * strictly necessary in internal pages because page 0 is always
+ * a leaf page. The additional 10 bytes is for slop -- Berkeley DB
+ * took roughly a decade to get the calculation correct, and that
+ * way I can skip the suspense.
+ */
+#define WT_MINIMUM_DATA_SPACE(db, s) \
+ (((s) - (WT_PAGE_DISK_SIZE + WT_PAGE_DESC_SIZE + 10)) / 4)
+ if (db->intlitemsize > WT_MINIMUM_DATA_SPACE(db, db->intlmin)) {
+ __wt_api_db_errx(db,
+ "The internal page size is too small for its maximum item "
+ "size");
+ return (WT_ERROR);
+ }
+ if (db->leafitemsize > WT_MINIMUM_DATA_SPACE(db, db->leafmin)) {
+ __wt_api_db_errx(db,
+ "The leaf page size is too small for its maximum item "
+ "size");
+ return (WT_ERROR);
+ }
+
+ /*
+ * A fixed-size column store should be able to store at least 20
+ * objects on a page, otherwise it just doesn't make sense.
+ */
+ if (F_ISSET(idb, WT_COLUMN) &&
+ db->fixed_len != 0 && db->leafmin / db->fixed_len < 20) {
+ __wt_api_db_errx(db,
+ "The leaf page size cannot store at least 20 fixed-length "
+ "objects");
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_root_pin --
+ *	Read in the root page and pin it into memory.
+ *
+ *	Returns 0 on success, or the error from the page read.  On success
+ *	the root page is flagged WT_PINNED and the hazard reference taken by
+ *	__wt_page_in is released: the pin, not the hazard reference, is what
+ *	keeps the root in memory from then on.
+ */
+int
+__wt_root_pin(WT_TOC *toc)
+{
+	IDB *idb;
+
+	idb = toc->db->idb;
+
+	/* Get the root page (read it from disk if it isn't cached). */
+	WT_RET(__wt_page_in(toc, NULL, &idb->root_page, &idb->root_off, 0));
+
+	/* Pin it, then drop the now-redundant hazard reference. */
+	F_SET(idb->root_page.page, WT_PINNED);
+	__wt_hazard_clear(toc, idb->root_page.page);
+
+	return (0);
+}
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
new file mode 100644
index 00000000000..09eac77264b
--- /dev/null
+++ b/src/btree/bt_ovfl.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ovfl_in --
+ *	Read an overflow item from the disk.
+ *
+ *	toc   -- per-thread operation context
+ *	ovfl  -- overflow reference (file address and data byte count)
+ *	store -- caller's DBT; on success holds the overflow item's bytes
+ *
+ *	Returns 0 on success, else an error from allocation or the read.
+ */
+int
+__wt_ovfl_in(WT_TOC *toc, WT_OVFL *ovfl, DBT *store)
+{
+	DB *db;
+	ENV *env;
+	WT_STATS *stats;
+	uint32_t size;
+
+	env = toc->env;
+	db = toc->db;
+	stats = env->ienv->cache->stats;
+
+	/*
+	 * Read an overflow page, using an overflow structure from a page for
+	 * which we (better) have a hazard reference.
+	 *
+	 * Overflow reads are synchronous. That may bite me at some point, but
+	 * WiredTiger supports large page sizes, and overflow items should be
+	 * rare.
+	 */
+	WT_VERBOSE(env, WT_VERB_READ, (env,
+	    "overflow read addr/size %lu/%lu",
+	    (u_long)ovfl->addr, (u_long)ovfl->size));
+	WT_STAT_INCR(stats, OVERFLOW_READ);
+
+	/*
+	 * The only caller that wants a copy of the overflow pages (as opposed
+	 * to the contents of the overflow pages), is the verify code. For that
+	 * reason, it reads its own overflow pages, it doesn't call this code.
+	 *
+	 * But, we still have to verify the checksum, which means we have to
+	 * read the entire set of pages, then copy the interesting information
+	 * to the beginning of the buffer. The copy is a shift in a single
+	 * buffer and so should be fast, but it's still not a good thing. If
+	 * it ever becomes a problem, then we either have to pass the fact that
+	 * it's a "page" back to our caller and let them deal with the offset,
+	 * or add a new field to the DBT that flags the start of the allocated
+	 * buffer, instead of using the "data" field to indicate both the start
+	 * of the data and the start of the allocated memory.
+	 *
+	 * Re-allocate memory as necessary to hold the overflow pages.
+	 */
+	size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size);
+	if (store->mem_size < size)
+		WT_RET(__wt_realloc(env, &store->mem_size, size, &store->data));
+
+	/* Read the page. */
+	WT_RET(__wt_page_disk_read(toc, store->data, ovfl->addr, size));
+
+	/* Copy the actual data in the DBT down to the start of the data. */
+	(void)memmove(store->data,
+	    (uint8_t *)store->data + sizeof(WT_PAGE_DISK), ovfl->size);
+	store->size = ovfl->size;
+
+	return (0);
+}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
new file mode 100644
index 00000000000..915d038751b
--- /dev/null
+++ b/src/btree/bt_page.c
@@ -0,0 +1,656 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_page_inmem_col_fix(DB *, WT_PAGE *);
+static void __wt_page_inmem_col_int(WT_PAGE *);
+static void __wt_page_inmem_col_rle(DB *, WT_PAGE *);
+static void __wt_page_inmem_col_var(WT_PAGE *);
+static int __wt_page_inmem_dup_leaf(DB *, WT_PAGE *);
+static int __wt_page_inmem_int_ref(WT_TOC *, uint32_t, WT_PAGE *);
+static int __wt_page_inmem_row_int(DB *, WT_PAGE *);
+static int __wt_page_inmem_row_leaf(DB *, WT_PAGE *);
+
+/*
+ * __wt_page_in --
+ *	Acquire a hazard reference to a page; if the page is not in-memory,
+ *	read it from the disk and build an in-memory version.
+ *
+ *	toc        -- per-thread operation context
+ *	parent     -- the page's parent (NULL for the root page)
+ *	ref        -- the parent's reference to the wanted page
+ *	off        -- the page's file address/size pair
+ *	dsk_verify -- non-zero to verify the disk image after reading
+ *
+ *	Returns 0 with a hazard reference held on the page, or the error
+ *	returned by the read request (including WT_RESTART when the read
+ *	server's request table is full).
+ */
+int
+__wt_page_in(
+    WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	int ret;
+
+	env = toc->env;
+	cache = env->ienv->cache;
+
+	/*
+	 * Loop until we either hold a hazard reference on an in-memory
+	 * version of the page, or the read request fails.
+	 */
+	for (;;)
+		switch (ref->state) {
+		case WT_OK:
+			/*
+			 * The page is in memory: get a hazard reference, update
+			 * the page's LRU and return.
+			 */
+			if (__wt_hazard_set(toc, ref)) {
+				ref->page->read_gen = ++cache->read_gen;
+				return (0);
+			}
+			/* FALLTHROUGH */
+		case WT_EVICT:
+			/*
+			 * The page is being considered for eviction, wait for
+			 * that to resolve.
+			 */
+			__wt_yield();
+			break;
+		case WT_EMPTY:
+			/* The page isn't in memory, request it be read. */
+			__wt_cache_read_serial(
+			    toc, parent, ref, off, dsk_verify, ret);
+			if (ret != 0)
+				return (ret);
+			break;
+		default:
+			WT_ABORT(env, "WT_REF->state invalid");
+			break;
+		}
+	/* NOTREACHED */
+}
+
+/*
+ * __wt_page_inmem --
+ *	Build in-memory page information.
+ *
+ *	Sizes, allocates and fills in the per-page index arrays for the disk
+ *	image referenced by page->dsk.  Returns 0 on success; on failure the
+ *	partially-built in-memory structures are discarded and the error is
+ *	returned.
+ */
+int
+__wt_page_inmem(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	ENV *env;
+	WT_PAGE_DISK *dsk;
+	uint32_t nindx;
+	int ret;
+
+	db = toc->db;
+	env = toc->env;
+	dsk = page->dsk;
+	ret = 0;
+
+	/* The page must not already have in-memory information. */
+	WT_ASSERT(env, page->u.indx == NULL);
+
+	/* Determine the maximum number of indexes we'll need for this page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_LEAF:
+		nindx = dsk->u.entries;
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		/* Internal row pages store key/offpage-ref pairs. */
+		nindx = dsk->u.entries / 2;
+		break;
+	case WT_PAGE_ROW_LEAF:
+		/*
+		 * Row store leaf pages support duplicates, so the real worst
+		 * case is one key plus some number of duplicate data items.
+		 * The number is configurable, that is, you can configure when
+		 * a duplicate set is big enough to be pushed off the page;
+		 * we're conservative here.
+		 */
+		nindx = dsk->u.entries - 1;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * XXX
+	 * We don't yet have a free-list on which to put empty pages -- for
+	 * now, we handle them.
+	 */
+	if (nindx == 0)
+		return (0);
+
+	/* Allocate an array of WT_{ROW,COL}_INDX structures for the page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+		WT_ERR((__wt_calloc(env,
+		    nindx, sizeof(WT_COL), &page->u.icol)));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_ERR((__wt_calloc(env,
+		    nindx, sizeof(WT_ROW), &page->u.irow)));
+		break;
+	default:
+		/* Unknown types were already rejected by the first switch. */
+		break;
+	}
+
+	/* Allocate reference array for internal pages. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		WT_ERR(__wt_page_inmem_int_ref(toc, nindx, page));
+		break;
+	default:
+		break;
+	}
+
+	/* Fill in the structures. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+		__wt_page_inmem_col_fix(db, page);
+		break;
+	case WT_PAGE_COL_INT:
+		__wt_page_inmem_col_int(page);
+		break;
+	case WT_PAGE_COL_RLE:
+		__wt_page_inmem_col_rle(db, page);
+		break;
+	case WT_PAGE_COL_VAR:
+		__wt_page_inmem_col_var(page);
+		break;
+	case WT_PAGE_DUP_LEAF:
+		WT_ERR(__wt_page_inmem_dup_leaf(db, page));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		WT_ERR(__wt_page_inmem_row_int(db, page));
+		break;
+	case WT_PAGE_ROW_LEAF:
+		WT_ERR(__wt_page_inmem_row_leaf(db, page));
+		break;
+	default:
+		break;
+	}
+	return (0);
+
+err:	__wt_page_discard(toc, page);
+	return (ret);
+}
+
+/*
+ * __wt_page_inmem_col_fix --
+ *	Build in-memory index for fixed-length column-store leaf pages.
+ */
+static void
+__wt_page_inmem_col_fix(DB *db, WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_PAGE_DISK *dsk;
+	uint32_t n;
+	uint8_t *data;
+
+	dsk = page->dsk;
+
+	/*
+	 * The page holds nothing but fixed-length objects: walk it, pointing
+	 * one index slot at each object in page order.
+	 */
+	col = page->u.icol;
+	WT_FIX_FOREACH(db, dsk, data, n)
+		(col++)->data = data;
+
+	/* One record per object, one index entry per record. */
+	page->indx_count = page->records = dsk->u.entries;
+}
+
+/*
+ * __wt_page_inmem_col_int --
+ *	Build in-memory index for column-store internal pages.
+ */
+static void
+__wt_page_inmem_col_int(WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_OFF *off;
+	WT_PAGE_DISK *dsk;
+	uint64_t total;
+	uint32_t n;
+
+	dsk = page->dsk;
+	total = 0;
+
+	/*
+	 * The page holds WT_OFF structures, one per child subtree: point an
+	 * index slot at each one while summing the subtree record counts.
+	 */
+	col = page->u.icol;
+	WT_OFF_FOREACH(dsk, off, n) {
+		(col++)->data = off;
+		total += WT_RECORDS(off);
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = total;
+}
+
+/*
+ * __wt_page_inmem_col_rle --
+ *	Build in-memory index for fixed-length, run-length encoded,
+ *	column-store leaf pages.
+ */
+static void
+__wt_page_inmem_col_rle(DB *db, WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_PAGE_DISK *dsk;
+	uint64_t total;
+	uint32_t n;
+	uint8_t *data;
+
+	dsk = page->dsk;
+	total = 0;
+
+	/*
+	 * The page holds fixed-length objects, each preceded by a repeat
+	 * count: point an index slot at each object while summing the
+	 * counts to get the page's record total.
+	 */
+	col = page->u.icol;
+	WT_RLE_REPEAT_FOREACH(db, dsk, data, n) {
+		(col++)->data = data;
+		total += WT_RLE_REPEAT_COUNT(data);
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = total;
+}
+
+/*
+ * __wt_page_inmem_col_var --
+ *	Build in-memory index for variable-length, data-only leaf pages in
+ *	column-store trees.
+ */
+static void
+__wt_page_inmem_col_var(WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	uint32_t n;
+
+	dsk = page->dsk;
+
+	/*
+	 * The page holds unsorted data items: on-page data (WT_ITEM_DATA),
+	 * overflow (WT_ITEM_DATA_OVFL) or deleted (WT_ITEM_DEL) items.
+	 * Point one index slot at each item in page order.
+	 */
+	col = page->u.icol;
+	WT_ITEM_FOREACH(dsk, item, n)
+		(col++)->data = item;
+
+	page->indx_count = page->records = dsk->u.entries;
+}
+
+/*
+ * __wt_page_inmem_dup_leaf --
+ *	Build in-memory index for variable-length, data-only leaf pages in
+ *	duplicate trees.
+ *
+ *	Returns 0 on success, or the error raised by WT_ILLEGAL_FORMAT for
+ *	an unexpected item type.
+ */
+static int
+__wt_page_inmem_dup_leaf(DB *db, WT_PAGE *page)
+{
+	WT_ROW *rip;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	uint32_t i;
+
+	dsk = page->dsk;
+
+	/*
+	 * Walk the page, building indices and finding the end of the page.
+	 * The page contains sorted data items. The data items are on-page
+	 * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DATA_DUP_OVFL) items.
+	 *
+	 * These data values are sorted, so we want to treat them as keys, and
+	 * we return them as on-page WT_ITEM values, so we want to treat them
+	 * as data. Set both the WT_ROW key and data fields.
+	 */
+	rip = page->u.irow;
+	WT_ITEM_FOREACH(dsk, item, i) {
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_DATA_DUP:
+			/* On-page item: key points at the item's bytes. */
+			__wt_key_set
+			    (rip, WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			/* Overflow item: flag the key for later processing. */
+			__wt_key_set_process(rip, item);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+		rip->data = item;
+		++rip;
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = dsk->u.entries;
+	return (0);
+}
+
+/*
+ * __wt_page_inmem_row_int --
+ *	Build in-memory index for row-store and off-page duplicate tree
+ *	internal pages.
+ *
+ *	Returns 0 on success, or the error raised by WT_ILLEGAL_FORMAT for
+ *	an unexpected item type.
+ */
+static int
+__wt_page_inmem_row_int(DB *db, WT_PAGE *page)
+{
+	IDB *idb;
+	WT_ITEM *item;
+	WT_OFF *off;
+	WT_PAGE_DISK *dsk;
+	WT_ROW *rip;
+	uint64_t records;
+	uint32_t i;
+	void *huffman;
+
+	idb = db->idb;
+	dsk = page->dsk;
+	rip = page->u.irow;
+	records = 0;
+
+	/*
+	 * Duplicate internal pages hold data values, so they're encoded with
+	 * the data Huffman table; regular internal pages hold keys.
+	 */
+	huffman =
+	    dsk->type == WT_PAGE_DUP_INT ? idb->huffman_data : idb->huffman_key;
+
+	/*
+	 * Walk the page, building indices and finding the end of the page.
+	 *
+	 * The page contains sorted key/offpage-reference pairs. Keys are row
+	 * store internal pages with on-page/overflow (WT_ITEM_KEY/KEY_OVFL)
+	 * items, or row store duplicate internal pages with on-page/overflow
+	 * (WT_ITEM_KEY_DUP/WT_ITEM_KEY_DUP_OVFL) items. In both cases,
+	 * offpage references are WT_ITEM_OFF items.
+	 */
+	WT_ITEM_FOREACH(dsk, item, i)
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_KEY:
+		case WT_ITEM_KEY_DUP:
+			if (huffman == NULL) {
+				__wt_key_set(rip,
+				    WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+				break;
+			}
+			/* FALLTHROUGH */
+		case WT_ITEM_KEY_OVFL:
+		case WT_ITEM_KEY_DUP_OVFL:
+			/* Encoded or overflow keys need later processing. */
+			__wt_key_set_process(rip, item);
+			break;
+		case WT_ITEM_OFF:
+			/* The offpage reference completes the pair. */
+			off = WT_ITEM_BYTE_OFF(item);
+			records += WT_RECORDS(off);
+			rip->data = item;
+			++rip;
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+	page->indx_count = dsk->u.entries / 2;
+	page->records = records;
+	return (0);
+}
+
+/*
+ * __wt_page_inmem_row_leaf --
+ *	Build in-memory index for row-store leaf pages.
+ *
+ *	Returns 0 on success, or an allocation error, or the error raised
+ *	by WT_ILLEGAL_FORMAT for an unexpected item type.
+ */
+static int
+__wt_page_inmem_row_leaf(DB *db, WT_PAGE *page)
+{
+	ENV *env;
+	IDB *idb;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	WT_REF *ref;
+	WT_ROW *rip;
+	uint32_t i, indx_count;
+	uint64_t records;
+
+	env = db->env;
+	idb = db->idb;
+	dsk = page->dsk;
+	records = 0;
+
+	/*
+	 * Walk a row-store page of WT_ITEMs, building indices and finding the
+	 * end of the page.
+	 *
+	 * The page contains key/data pairs. Keys are on-page (WT_ITEM_KEY) or
+	 * overflow (WT_ITEM_KEY_OVFL) items. The data sets are either: a
+	 * single on-page (WT_ITEM_DATA) or overflow (WT_ITEM_DATA_OVFL) item;
+	 * a group of duplicate data items where each duplicate is an on-page
+	 * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DATA_DUP_OVFL) item; or an
+	 * offpage reference (WT_ITEM_OFF).
+	 */
+	rip = NULL;
+	indx_count = 0;
+	WT_ITEM_FOREACH(dsk, item, i)
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_KEY:
+		case WT_ITEM_KEY_OVFL:
+			/* A key starts (or continues) the slot array. */
+			if (rip == NULL)
+				rip = page->u.irow;
+			else
+				++rip;
+			if (idb->huffman_key != NULL ||
+			    WT_ITEM_TYPE(item) == WT_ITEM_KEY_OVFL)
+				__wt_key_set_process(rip, item);
+			else
+				__wt_key_set(rip,
+				    WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+			++indx_count;
+			break;
+		case WT_ITEM_DATA_DUP:
+		case WT_ITEM_DATA_DUP_OVFL:
+			/*
+			 * If the second or subsequent duplicate, move to the
+			 * next slot and copy the previous key.
+			 */
+			if (rip->data != NULL) {
+				__wt_key_set(rip + 1, rip->key, rip->size);
+				++rip;
+				++indx_count;
+			}
+			/* FALLTHROUGH */
+		case WT_ITEM_DATA:
+		case WT_ITEM_DATA_OVFL:
+			rip->data = item;
+			++records;
+			break;
+		case WT_ITEM_OFF:
+			rip->data = item;
+			records += WT_ROW_OFF_RECORDS(rip);
+
+			/*
+			 * We need a WT_REF entry for any item referencing an
+			 * off-page duplicate tree. Create the array of WT_REF
+			 * pointers and fill in a WT_REF structure.
+			 */
+			if (page->u3.dup == NULL)
+				WT_RET(__wt_calloc(env, indx_count,
+				    sizeof(WT_REF *), &page->u3.dup));
+			WT_RET(__wt_calloc(env, 1, sizeof(WT_REF), &ref));
+			ref->state = WT_EMPTY;
+			page->u3.dup[WT_ROW_SLOT(page, rip)] = ref;
+
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+	page->indx_count = indx_count;
+	page->records = records;
+
+	return (0);
+}
+
+/*
+ * __wt_item_process --
+ *	Overflow and/or compressed on-page items need processing before
+ *	we look at them.
+ *
+ *	toc     -- per-thread operation context
+ *	item    -- the on-page WT_ITEM to process
+ *	dbt_ret -- caller's DBT; on success references the plain-text item
+ *
+ *	Returns 0 on success, else an error from the overflow read, memory
+ *	allocation or Huffman decoding.
+ */
+int
+__wt_item_process(WT_TOC *toc, WT_ITEM *item, DBT *dbt_ret)
+{
+	DB *db;
+	DBT *tmp;
+	ENV *env;
+	IDB *idb;
+	uint32_t size;
+	int ret;
+	void *huffman, *p;
+
+	db = toc->db;
+	tmp = NULL;
+	env = toc->env;
+	idb = db->idb;
+	ret = 0;
+
+	/*
+	 * 3 cases: compressed on-page item, or compressed or uncompressed
+	 * overflow item.
+	 *
+	 * The onpage/offpage labels below are jump targets inside the switch:
+	 * the key cases set the key Huffman table and jump over the data
+	 * cases, which fall through after setting the data Huffman table.
+	 */
+	switch (WT_ITEM_TYPE(item)) {
+	case WT_ITEM_KEY:
+		huffman = idb->huffman_key;
+		goto onpage;
+	case WT_ITEM_KEY_DUP:
+	case WT_ITEM_DATA:
+	case WT_ITEM_DATA_DUP:
+		huffman = idb->huffman_data;
+onpage:		p = WT_ITEM_BYTE(item);
+		size = WT_ITEM_LEN(item);
+		break;
+	case WT_ITEM_KEY_OVFL:
+		huffman = idb->huffman_key;
+		goto offpage;
+	case WT_ITEM_KEY_DUP_OVFL:
+	case WT_ITEM_DATA_OVFL:
+	case WT_ITEM_DATA_DUP_OVFL:
+		huffman = idb->huffman_data;
+offpage:	/*
+		 * It's an overflow item -- if it's not encoded, we can read
+		 * it directly into the user's return DBT, otherwise we have to
+		 * have our own buffer as temporary space, and the decode call
+		 * will put a decoded version into the user's return DBT.
+		 */
+		if (huffman == NULL)
+			tmp = dbt_ret;
+		else
+			WT_RET(__wt_scr_alloc(toc, 0, &tmp));
+		WT_RET(__wt_ovfl_in(toc, WT_ITEM_BYTE_OVFL(item), tmp));
+		p = tmp->data;
+		size = tmp->size;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * If the item is not compressed, and it's not an overflow item, copy
+	 * it into the caller's DBT. If the item is not compressed, and it's
+	 * an overflow item, it was already copied into the caller's DBT.
+	 *
+	 * If the item is compressed, pass it to the decode routines, they'll
+	 * copy a decoded version into the caller's DBT.
+	 */
+	if (huffman == NULL) {
+		if (tmp != dbt_ret) {
+			if (size > dbt_ret->mem_size)
+				WT_ERR(__wt_realloc(
+				    env, &dbt_ret->mem_size,
+				    size, &dbt_ret->data));
+			memcpy(dbt_ret->data, p, size);
+			dbt_ret->size = size;
+		}
+	} else
+		WT_ERR(__wt_huffman_decode(huffman, p, size,
+		    &dbt_ret->data, &dbt_ret->mem_size, &dbt_ret->size));
+
+	/* Release the scratch buffer, if we allocated one. */
+err:	if (tmp != NULL && tmp != dbt_ret)
+		__wt_scr_release(&tmp);
+
+	return (ret);
+}
+
+/*
+ * __wt_page_inmem_int_ref --
+ *	Allocate and initialize the reference array for internal pages.
+ */
+static int
+__wt_page_inmem_int_ref(WT_TOC *toc, uint32_t nindx, WT_PAGE *page)
+{
+	WT_REF *ref;
+	uint32_t slot;
+
+	/*
+	 * Allocate an array of WT_REF structures for internal pages. In the
+	 * case of an internal page, we know all of the slots are going to be
+	 * filled in -- every slot on the page references a subtree. In the
+	 * case of row-store leaf pages, the only slots that get filled in are
+	 * slots that reference off-page duplicate trees. So, if it's an
+	 * internal page, it's a simple one-time allocation; if a leaf page,
+	 * we'll do similar work, but lazily in the routine that fills in the
+	 * in-memory information.
+	 */
+	WT_RET(__wt_calloc(
+	    toc->env, nindx, sizeof(WT_REF), &page->u3.ref));
+
+	/* Every subtree reference starts out empty (not yet in memory). */
+	ref = page->u3.ref;
+	for (slot = 0; slot < nindx; ++slot)
+		ref[slot].state = WT_EMPTY;
+	return (0);
+}
+
+/*
+ * __wt_key_set --
+ *	Set a key/size pair, where the key does not require further processing.
+ *
+ *	NOTE(review): a non-zero size is what marks a key as fully processed
+ *	(see __wt_key_process), so callers must never pass size 0 here --
+ *	confirm zero-length keys are impossible.
+ */
+inline void
+__wt_key_set(WT_ROW *rip, void *key, uint32_t size)
+{
+	rip->key = key;
+	rip->size = size;
+}
+
+/*
+ * __wt_key_set_process --
+ *	Set a key/size pair, where the key requires further processing.
+ *
+ *	The key field holds the raw on-page WT_ITEM; size 0 is the
+ *	"needs processing" flag tested by __wt_key_process.
+ */
+inline void
+__wt_key_set_process(WT_ROW *rip, void *key)
+{
+	rip->key = key;
+	rip->size = 0;
+}
+
+/*
+ * __wt_key_process --
+ *	Return if a key requires processing.
+ */
+inline int
+__wt_key_process(WT_ROW *rip)
+{
+	/* A zero size marks a key still in its raw on-page form. */
+	if (rip->size == 0)
+		return (1);
+	return (0);
+}
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
new file mode 100644
index 00000000000..f7e594d2217
--- /dev/null
+++ b/src/btree/bt_read.c
@@ -0,0 +1,272 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_cache_read(WT_READ_REQ *);
+
+/*
+ * __wt_workq_read_server --
+ *	See if the read server thread needs to be awakened.
+ *
+ *	env   -- environment handle
+ *	force -- non-zero to wake the server even when reads are locked out
+ *		 (environment close, or a priority read is waiting)
+ */
+void
+__wt_workq_read_server(ENV *env, int force)
+{
+	WT_CACHE *cache;
+	uint64_t bytes_inuse, bytes_max;
+
+	cache = env->ienv->cache;
+
+	/*
+	 * If we're 10% over the maximum cache, shut out reads (which include
+	 * page allocations) until we evict to at least 5% under the maximum
+	 * cache. The idea is that we don't want to run on the edge all the
+	 * time -- if we're seriously out of space, get things under control
+	 * before opening up for more reads.
+	 */
+	bytes_inuse = __wt_cache_bytes_inuse(cache);
+	bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+	if (cache->read_lockout) {
+		/* Release the lockout at 95% of the maximum. */
+		if (bytes_inuse <= bytes_max - (bytes_max / 20))
+			cache->read_lockout = 0;
+	} else if (bytes_inuse > bytes_max + (bytes_max / 10)) {
+		/* Impose the lockout at 110% of the maximum. */
+		WT_VERBOSE(env, WT_VERB_READ, (env,
+		    "workQ locks out reads: bytes-inuse %llu of bytes-max %llu",
+		    (unsigned long long)bytes_inuse,
+		    (unsigned long long)bytes_max));
+		cache->read_lockout = 1;
+	}
+
+	/* If the cache read server is running, there's nothing to do. */
+	if (!cache->read_sleeping)
+		return;
+
+	/*
+	 * If reads are locked out and we're not forcing the issue (that's when
+	 * closing the environment, or if there's a priority read waiting to be
+	 * handled), we're done.
+	 */
+	if (!force && cache->read_lockout)
+		return;
+
+	/* Wake the server: clear its sleep flag and release its mutex. */
+	cache->read_sleeping = 0;
+	__wt_unlock(env, cache->mtx_read);
+}
+
+/*
+ * __wt_cache_read_serial_func --
+ *	Read/allocation serialization function called when a page-in requires
+ *	allocation or a read.
+ *
+ *	Returns 0 if the request was queued for the read server, WT_RESTART
+ *	if the request table is full and the caller should retry.
+ *
+ *	NOTE(review): the slot scan does no locking -- presumably this runs
+ *	serialized (it's invoked through __wt_cache_read_serial); confirm
+ *	against the workQ serialization code.
+ */
+int
+__wt_cache_read_serial_func(WT_TOC *toc)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	WT_OFF *off;
+	WT_PAGE *parent;
+	WT_READ_REQ *rr, *rr_end;
+	WT_REF *ref;
+	int dsk_verify;
+
+	/* Unpack the arguments packed by the serialization macro. */
+	__wt_cache_read_unpack(toc, parent, ref, off, dsk_verify);
+
+	env = toc->env;
+	cache = env->ienv->cache;
+
+	/* Find an empty slot and enter the read request. */
+	rr = cache->read_request;
+	rr_end = rr + WT_ELEMENTS(cache->read_request);
+	for (; rr < rr_end; ++rr)
+		if (WT_READ_REQ_ISEMPTY(rr)) {
+			WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify);
+			return (0);
+		}
+	__wt_api_env_errx(env, "read server request table full");
+	return (WT_RESTART);
+}
+
+/*
+ * __wt_cache_read_server --
+ *	Thread to do database reads.
+ *
+ *	arg is the ENV handle; returns NULL on thread exit.  Sleeps on the
+ *	cache's read mutex until awakened by the workQ, then drains the
+ *	read-request table, and exits when WT_SERVER_RUN is cleared or on a
+ *	serious error.
+ */
+void *
+__wt_cache_read_server(void *arg)
+{
+	ENV *env;
+	IENV *ienv;
+	WT_CACHE *cache;
+	WT_READ_REQ *rr, *rr_end;
+	WT_TOC *toc;
+	int didwork, ret;
+
+	env = arg;
+	ienv = env->ienv;
+	cache = ienv->cache;
+	ret = 0;
+
+	rr = cache->read_request;
+	rr_end = rr + WT_ELEMENTS(cache->read_request);
+
+	for (;;) {
+		WT_VERBOSE(env,
+		    WT_VERB_READ, (env, "cache read server sleeping"));
+		cache->read_sleeping = 1;
+		__wt_lock(env, cache->mtx_read);
+		WT_VERBOSE(
+		    env, WT_VERB_READ, (env, "cache read server waking"));
+
+		/*
+		 * Check for environment exit; do it here, instead of the top of
+		 * the loop because doing it here keeps us from doing a bunch of
+		 * work when simply awakened to quit.
+		 */
+		if (!F_ISSET(ienv, WT_SERVER_RUN))
+			break;
+
+		/*
+		 * Walk the read-request queue, looking for reads (defined by
+		 * a valid WT_TOC handle). If we find a read request, perform
+		 * it, flush the result and clear the request slot, then wake
+		 * up the requesting thread. The request slot clear doesn't
+		 * need to be flushed, but we have to flush the read result,
+		 * might as well include it. If we don't find any work, go to
+		 * sleep.
+		 */
+		do {
+			didwork = 0;
+			for (rr = cache->read_request; rr < rr_end; ++rr) {
+				if ((toc = rr->toc) == NULL)
+					continue;
+				/* Locked-out reads wait, priority reads run. */
+				if (cache->read_lockout &&
+				    !F_ISSET(toc, WT_READ_PRIORITY))
+					continue;
+
+				/*
+				 * The read server thread does both general file
+				 * allocation and cache page instantiation. In
+				 * a file allocation, there's no pagep field
+				 * in which to return a page.
+				 */
+				ret = __wt_cache_read(rr);
+
+				WT_READ_REQ_CLR(rr);
+				__wt_toc_serialize_wrapup(toc, NULL, ret);
+
+				didwork = 1;
+
+				/*
+				 * Any error terminates the request; a serious
+				 * error causes the read server to exit.
+				 */
+				if (ret != 0) {
+					if (ret != WT_RESTART)
+						goto err;
+					ret = 0;
+				}
+			}
+		} while (didwork);
+	}
+
+	/*
+	 * The err label sits inside the if body: normal exit skips the error
+	 * report (ret is 0), the goto jumps straight to it.
+	 */
+	if (ret != 0)
+err:		__wt_api_env_err(env, ret, "cache read server error");
+
+	WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting"));
+	return (NULL);
+}
+
+/*
+ * __wt_cache_read --
+ *	Read a page from the file.
+ *
+ *	Reads the page described by the read request, verifies it if asked,
+ *	builds its in-memory version and installs it in the parent's WT_REF.
+ *	Returns 0 on success (including the no-op case where another thread
+ *	already brought the page in), else an error with all memory freed.
+ */
+static int
+__wt_cache_read(WT_READ_REQ *rr)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	WT_OFF *off;
+	WT_PAGE *page;
+	WT_PAGE_DISK *dsk;
+	WT_REF *ref;
+	WT_TOC *toc;
+	uint32_t addr, size;
+	int ret;
+
+	toc = rr->toc;
+	ref = rr->ref;
+	off = rr->off;
+	addr = off->addr;
+	size = off->size;
+
+	env = toc->env;
+	cache = env->ienv->cache;
+	dsk = NULL;
+	ret = 0;
+
+	/*
+	 * Check to see if some other thread brought the page into the cache
+	 * while our request was in the queue. If the state is anything
+	 * other than empty, it's not our problem.
+	 */
+	if (ref->state != WT_EMPTY)
+		return (0);
+
+	/*
+	 * The page isn't in the cache, and since we're the only path for the
+	 * page to get into the cache, we don't have to worry further, and
+	 * we might as well get to it.
+	 *
+	 * Allocate memory for the in-memory page information and for the page
+	 * itself. They're two separate allocation calls so we (hopefully) get
+	 * better alignment from the underlying heap memory allocator.
+	 */
+	WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page));
+	WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &dsk));
+
+	/* Read the page. */
+	WT_VERBOSE(env, WT_VERB_READ,
+	    (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size));
+
+	WT_ERR(__wt_page_disk_read(toc, dsk, addr, size));
+	WT_CACHE_PAGE_IN(cache, size);
+
+	/* If the page needs to be verified, that's next. */
+	if (rr->dsk_verify)
+		WT_ERR(__wt_verify_dsk_page(toc, dsk, addr, size));
+
+	/*
+	 * Fill in the WT_PAGE addr, size.
+	 * Reference the parent's WT_PAGE and parent's WT_OFF structures.
+	 * Reference the underlying disk page.
+	 */
+	page->addr = addr;
+	page->size = size;
+	page->parent = rr->parent;
+	page->parent_off = off;
+	page->dsk = dsk;
+
+	/* Build the in-memory version of the page. */
+	WT_ERR(__wt_page_inmem(toc, page));
+
+	/*
+	 * The page is now available -- set the LRU so the page is not selected
+	 * for eviction.
+	 */
+	page->read_gen = ++cache->read_gen;
+	ref->page = page;
+	ref->state = WT_OK;
+
+	return (0);
+
+	/*
+	 * Free the disk buffer through the local pointer, not page->dsk:
+	 * page->dsk isn't set until after the read and verify succeed, so
+	 * freeing page->dsk here would leak the buffer if either failed.
+	 * (page is always non-NULL at this point, its allocation uses
+	 * WT_RET, not WT_ERR.)
+	 */
+err:	if (dsk != NULL)
+		__wt_free(env, dsk, size);
+	__wt_free(env, page, sizeof(WT_PAGE));
+	return (ret);
+}
diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c
new file mode 100644
index 00000000000..7a57cfe4a97
--- /dev/null
+++ b/src/btree/bt_reconcile.c
@@ -0,0 +1,982 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_rle_expand_compare(const void *, const void *);
+static int __wt_rec_col_fix(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_int(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_rle(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_var(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_page_write(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_parent_update(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_row(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_row_int(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static inline void __wt_rec_set_page_size(WT_TOC *, WT_PAGE *, uint8_t *);
+
+/*
+ * __wt_rec_set_page_size --
+ * Set the page's size to the minimum number of allocation units.
+ */
+static inline void
+__wt_rec_set_page_size(WT_TOC *toc, WT_PAGE *page, uint8_t *first_free)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /*
+ * Set the page's size to the minimum number of allocation units needed
+ * (the page size can either grow or shrink).
+ *
+ * Set the page size before verifying the page, the verification code
+ * checks for entries that extend past the end of the page, and expects
+ * the WT_PAGE->size field to be valid.
+ */
+ page->size = WT_ALIGN(first_free - (uint8_t *)page->dsk, db->allocsize);
+}
+
+/*
+ * __wt_page_reconcile --
+ * Format an in-memory page to its on-disk format, and write it.
+ */
+int
+__wt_page_reconcile(WT_TOC *toc, WT_PAGE *page)
+{
+ DB *db;
+ DBT *tmp;
+ ENV *env;
+ WT_PAGE *new, _new;
+ WT_PAGE_DISK *dsk;
+ uint32_t max;
+ int ret;
+
+ db = toc->db;
+ tmp = NULL;
+ env = toc->env;
+ dsk = page->dsk;
+
+ /* If the page isn't dirty, we should never have been called. */
+ WT_ASSERT(env, WT_PAGE_IS_MODIFIED(page));
+
+ WT_VERBOSE(env, WT_VERB_EVICT,
+ (env, "reconcile addr %lu (page %p, type %s)",
+ (u_long)page->addr, page, __wt_page_type_string(dsk)));
+
+ /*
+ * Update the disk generation before reading the page. The workQ will
+ * update the write generation after it makes a change, and if we have
+ * different disk and write generation numbers, the page may be dirty.
+ * We technically requires a flush (the eviction server might run on a
+ * different core before a flush naturally occurred).
+ */
+ WT_PAGE_DISK_WRITE(page);
+ WT_MEMORY_FLUSH;
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Fixed-width pages without run-length encoding cannot change
+ * size.
+ */
+ max = page->size;
+ break;
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Other leaf page types can grow, allocate the maximum leaf
+ * page size.
+ */
+ max = db->leafmax;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * All internal page types can grow, allocate the maximum
+ * internal page size.
+ */
+ max = db->intlmax;
+ break;
+ case WT_PAGE_OVFL:
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ /*
+ * Initialize a WT_PAGE page on the stack and allocate a scratch buffer
+ * for its contents. We use two pieces of memory because we want the
+ * page contents to be aligned for direct I/O. The WT_PAGE structure
+ * is relatively small, the stack is fine.
+ */
+ WT_CLEAR(_new);
+ new = &_new;
+ WT_ERR(__wt_scr_alloc(toc, max, &tmp));
+ memset(tmp->data, 0, max);
+ new->addr = page->addr;
+ new->size = max;
+ new->dsk = tmp->data;
+ new->dsk->start_recno = dsk->start_recno;
+ new->dsk->type = dsk->type;
+ new->dsk->level = dsk->level;
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ WT_ERR(__wt_rec_col_fix(toc, page, new));
+ break;
+ case WT_PAGE_COL_RLE:
+ WT_ERR(__wt_rec_col_rle(toc, page, new));
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_rec_col_var(toc, page, new));
+ break;
+ case WT_PAGE_COL_INT:
+ WT_ERR(__wt_rec_col_int(toc, page, new));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__wt_rec_row_int(toc, page, new));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_DUP_LEAF:
+ WT_ERR(__wt_rec_row(toc, page, new));
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ /* Write the new page to disk. */
+ WT_ERR(__wt_rec_page_write(toc, page, new));
+
+ /* Free the original page -- update the address and size. */
+ WT_ERR(__wt_file_free(toc, page->addr, page->size));
+
+ /*
+ * Update the backing address.
+ *
+ * XXX
+ * This is more for diagnostic information than anything else, that is,
+ * this will match the WT_REF->addr in the parent.
+ *
+ * The parent's WT_REF->size may be different, that is, page->size is
+ * the original page size at the original address and the size of the
+ * page's buffer in memory, NOT the size of the newly written page at
+ * the new address. We may NOT update the size here, otherwise we
+ * can no longer figure out if WT_ROW/WT_COL items reference on-page
+ * data vs. allocated data.
+ */
+ page->addr = new->addr;
+
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_rec_col_int --
+ * Reconcile a column store internal page.
+ */
+static int
+__wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ WT_COL *cip;
+ WT_OFF *from;
+ WT_PAGE_DISK *dsk;
+ WT_REPL *repl;
+ uint32_t i, space_avail;
+ uint8_t *first_free;
+
+ dsk = new->dsk;
+ __wt_set_ff_and_sa_from_offset(
+ new, WT_PAGE_BYTE(new), &first_free, &space_avail);
+
+ WT_INDX_FOREACH(page, cip, i) {
+ if ((repl = WT_COL_REPL(page, cip)) != NULL)
+ from = WT_REPL_DATA(repl);
+ else
+ from = cip->data;
+
+ /*
+ * XXX
+ * We don't yet handle splits: we allocated the maximum page
+ * size, but it still wasn't enough. We must allocate another
+ * page and split the parent.
+ */
+ if (sizeof(WT_OFF) > space_avail) {
+ fprintf(stderr,
+ "__wt_rec_col_int: page %lu split\n",
+ (u_long)page->addr);
+ __wt_abort(toc->env);
+ }
+
+ memcpy(first_free, from, sizeof(WT_OFF));
+ first_free += sizeof(WT_OFF);
+ space_avail -= sizeof(WT_OFF);
+ ++dsk->u.entries;
+ }
+
+ new->records = page->records;
+ __wt_rec_set_page_size(toc, new, first_free);
+
+ return (0);
+}
+
/*
 * __wt_rec_row_int --
 *	Reconcile a row store, or off-page duplicate tree, internal page.
 */
static int
__wt_rec_row_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	WT_ITEM *key_item, *data_item, *next;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	WT_ROW *rip;
	uint32_t i, len, space_avail;
	uint8_t *first_free;

	dsk = new->dsk;
	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We have to walk both the WT_ROW structures as well as the original
	 * page: the problem is keys that require processing.  When a page is
	 * read into memory from a simple database, the WT_ROW key/size pair
	 * is set to reference an on-page group of bytes in the key's WT_ITEM
	 * structure.  As Btree keys are immutable, that original WT_ITEM is
	 * usually what we want to write, and we can pretty easily find it by
	 * moving to immediately before the on-page key.
	 *
	 * Keys that require processing are harder (for example, a Huffman
	 * encoded key).  When we have to use a key that requires processing,
	 * we process the key and set the WT_ROW key/size pair to reference
	 * the allocated memory that holds the key.  At that point we've lost
	 * any reference to the original WT_ITEM structure, which is what we
	 * want to re-write when reconciling the page.  We don't want to make
	 * the WT_ROW structure bigger by another sizeof(void *) bytes, so we
	 * walk the original page at the same time we walk the WT_PAGE array
	 * when reconciling the page so we can find the original WT_ITEM.
	 */
	key_item = WT_PAGE_BYTE(page);
	WT_INDX_FOREACH(page, rip, i) {
		/*
		 * Copy the paired items off the old page into the new page; if
		 * the page has been replaced, update its information.
		 *
		 * XXX
		 * Internal pages can't grow, yet, so we could more easily just
		 * update the old page.  We do the copy because eventually we
		 * will have to split the internal pages, and they'll be able to
		 * grow.
		 */
		data_item = WT_ITEM_NEXT(key_item);
		/* Replacement WT_OFFs are patched over the on-page copy. */
		if ((repl = WT_ROW_REPL(page, rip)) != NULL)
			memcpy(WT_ITEM_BYTE(data_item),
			    WT_REPL_DATA(repl), sizeof(WT_OFF));
		next = WT_ITEM_NEXT(data_item);
		/* len covers the key item plus its paired data item. */
		len = (uint32_t)((uint8_t *)next - (uint8_t *)key_item);

		/*
		 * XXX
		 * We don't yet handle splits: we allocated the maximum page
		 * size, but it still wasn't enough.  We must allocate another
		 * page and split the parent.
		 */
		if (len > space_avail) {
			fprintf(stderr,
			    "__wt_rec_row_int: page %lu split\n",
			    (u_long)page->addr);
			__wt_abort(toc->env);
		}

		/* Copy the key/data item pair as one contiguous chunk. */
		memcpy(first_free, key_item, len);
		first_free += len;
		space_avail -= len;
		++dsk->u.entries;

		key_item = next;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	return (0);
}
+
/*
 * __wt_rec_col_fix --
 *	Reconcile a fixed-width, column-store leaf page (does not handle
 *	run-length encoding).
 */
static int
__wt_rec_col_fix(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	DB *db;
	DBT *tmp;
	ENV *env;
	WT_COL *cip;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	uint32_t i, len, space_avail;
	uint8_t *data, *first_free;
	int ret;

	db = toc->db;
	tmp = NULL;
	env = toc->env;
	dsk = new->dsk;
	ret = 0;

	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We need a "deleted" data item to store on the page.  Make sure the
	 * WT_TOC's scratch buffer is big enough.  Clear the buffer's contents
	 * and set the delete flag.
	 */
	len = db->fixed_len;
	WT_ERR(__wt_scr_alloc(toc, len, &tmp));
	memset(tmp->data, 0, len);
	WT_FIX_DELETE_SET(tmp->data);

	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a reference to the data, on- or off- page, and see if
		 * it's been deleted.
		 */
		if ((repl = WT_COL_REPL(page, cip)) != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				data = tmp->data;	/* Replaced deleted */
			else				/* Replaced data */
				data = WT_REPL_DATA(repl);
		} else if (WT_FIX_DELETE_ISSET(cip->data))
			data = tmp->data;		/* On-page deleted */
		else
			data = cip->data;		/* On-page data */

		/*
		 * When reconciling a fixed-width page that doesn't support
		 * run-length encoding, the on-page information can't change
		 * size -- there's no reason to ever split such a page.
		 */
		WT_ASSERT(env, len <= space_avail);

		/* Append the fixed-width entry to the new page. */
		memcpy(first_free, data, len);
		first_free += len;
		space_avail -= len;
		++dsk->u.entries;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

err:	if (tmp != NULL)
		__wt_scr_release(&tmp);
	return (ret);
}
+
/*
 * __wt_rec_col_rle --
 *	Reconcile a fixed-width, run-length encoded, column-store leaf page.
 */
static int
__wt_rec_col_rle(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	DB *db;
	DBT *tmp;
	ENV *env;
	WT_COL *cip;
	WT_PAGE_DISK *dsk;
	WT_RLE_EXPAND *exp, **expsort, **expp;
	WT_REPL *repl;
	uint64_t recno;
	uint32_t i, len, n_expsort, space_avail;
	uint16_t n, nrepeat, repeat_count;
	uint8_t *data, *first_free, *last_data;
	int from_repl, ret;

	db = toc->db;
	tmp = NULL;
	env = toc->env;
	expsort = NULL;
	dsk = new->dsk;
	n_expsort = 0;		/* Necessary for the sort function */
	last_data = NULL;	/* No previous entry to merge runs into yet */
	ret = 0;

	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We need a "deleted" data item to store on the page.  Make sure the
	 * WT_TOC's scratch buffer is big enough.  Clear the buffer's contents
	 * and set the delete flag.
	 */
	len = db->fixed_len + sizeof(uint16_t);
	WT_ERR(__wt_scr_alloc(toc, len, &tmp));
	memset(tmp->data, 0, len);
	WT_RLE_REPEAT_COUNT(tmp->data) = 1;
	WT_FIX_DELETE_SET(WT_RLE_REPEAT_DATA(tmp->data));

	/* Set recno to the first record on the page. */
	recno = page->dsk->start_recno;
	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a sorted list of any expansion entries we've created for
		 * this set of records.  The sort function returns a NULL-
		 * terminated array of references to WT_RLE_EXPAND structures,
		 * sorted by record number.
		 */
		WT_ERR(__wt_rle_expand_sort(
		    env, page, cip, &expsort, &n_expsort));

		/*
		 * Generate entries for the new page: loop through the repeat
		 * records, checking for WT_RLE_EXPAND entries that match the
		 * current record number.
		 */
		nrepeat = WT_RLE_REPEAT_COUNT(cip->data);
		for (expp = expsort, n = 1;
		    n <= nrepeat; n += repeat_count, recno += repeat_count) {
			from_repl = 0;
			if ((exp = *expp) != NULL && recno == exp->recno) {
				++expp;

				/* Use the WT_RLE_EXPAND's WT_REPL field. */
				repl = exp->repl;
				if (WT_REPL_DELETED_ISSET(repl))
					data = tmp->data;
				else {
					from_repl = 1;
					data = WT_REPL_DATA(repl);
				}
				/* An expansion entry covers one record. */
				repeat_count = 1;
			} else {
				if (WT_FIX_DELETE_ISSET(cip->data))
					data = tmp->data;
				else
					data = cip->data;
				/*
				 * The repeat count is the number of records
				 * up to the next WT_RLE_EXPAND record, or
				 * up to the end of this entry if we have no
				 * more WT_RLE_EXPAND records.
				 */
				if (exp == NULL)
					repeat_count = (nrepeat - n) + 1;
				else
					repeat_count =
					    (uint16_t)(exp->recno - recno);
			}

			/*
			 * In all cases, check the last entry written on the
			 * page to see if it's identical, and increment its
			 * repeat count where possible (the count is bounded
			 * by the uint16_t repeat field).
			 */
			if (last_data != NULL &&
			    memcmp(WT_RLE_REPEAT_DATA(last_data),
			    WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 &&
			    WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) {
				WT_RLE_REPEAT_COUNT(last_data) += repeat_count;
				continue;
			}

			/*
			 * XXX
			 * We don't yet handle splits: we allocated the maximum
			 * leaf page size, but it still wasn't enough.  We must
			 * allocate another leaf page and split the parent.
			 */
			if (len > space_avail) {
				fprintf(stderr,
				    "__wt_rec_col_rle: page %lu split\n",
				    (u_long)page->addr);
				__wt_abort(env);
			}

			/*
			 * Most of the formats already include a repeat count:
			 * specifically the deleted buffer, or any entry we're
			 * copying from the original page.  However, entries
			 * that were deleted or replaced are read from a WT_REPL
			 * structure, which has no repeat count.
			 */
			last_data = first_free;
			if (from_repl) {
				WT_RLE_REPEAT_COUNT(last_data) = repeat_count;
				memcpy(WT_RLE_REPEAT_DATA(
				    last_data), data, db->fixed_len);
			} else
				memcpy(last_data, data, len);
			first_free += len;
			space_avail -= len;
			++dsk->u.entries;
		}
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	/* Free the sort array. */
err:	if (expsort != NULL)
		__wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *));

	if (tmp != NULL)
		__wt_scr_release(&tmp);

	return (ret);
}
+
+/*
+ * __wt_rle_expand_compare --
+ * Qsort function: sort WT_RLE_EXPAND structures based on the record
+ * offset, in ascending order.
+ */
+static int
+__wt_rle_expand_compare(const void *a, const void *b)
+{
+ WT_RLE_EXPAND *a_exp, *b_exp;
+
+ a_exp = *(WT_RLE_EXPAND **)a;
+ b_exp = *(WT_RLE_EXPAND **)b;
+
+ return (a_exp->recno > b_exp->recno ? 1 : 0);
+}
+
/*
 * __wt_rle_expand_sort --
 *	Return the current on-page index's array of WT_RLE_EXPAND structures,
 *	sorted by record offset.
 */
int
__wt_rle_expand_sort(ENV *env,
    WT_PAGE *page, WT_COL *cip, WT_RLE_EXPAND ***expsortp, uint32_t *np)
{
	WT_RLE_EXPAND *exp;
	uint16_t n;

	/* Figure out how big the array needs to be. */
	for (n = 0,
	    exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n)
		;

	/*
	 * Allocate that big an array -- always allocate at least one slot,
	 * our caller expects NULL-termination.
	 *
	 * The array and its size (*np) are owned by the caller and reused
	 * across calls; grow it -- with a few slots of slop to avoid
	 * reallocating on every call -- only when the current allocation
	 * can't hold n entries plus the NULL terminator.
	 */
	if (n >= *np) {
		if (*expsortp != NULL)
			__wt_free(
			    env, *expsortp, *np * sizeof(WT_RLE_EXPAND *));
		WT_RET(__wt_calloc(
		    env, n + 10, sizeof(WT_RLE_EXPAND *), expsortp));
		*np = n + 10;
	}

	/* Enter the WT_RLE_EXPAND structures into the array. */
	for (n = 0,
	    exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n)
		(*expsortp)[n] = exp;

	/* Sort the entries. */
	if (n != 0)
		qsort(*expsortp, (size_t)n,
		    sizeof(WT_RLE_EXPAND *), __wt_rle_expand_compare);

	/* NULL-terminate the array. */
	(*expsortp)[n] = NULL;

	return (0);
}
+
/*
 * __wt_rec_col_var --
 *	Reconcile a variable-width column-store leaf page.
 */
static int
__wt_rec_col_var(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc;
	DBT *data, data_dbt;
	WT_COL *cip;
	WT_ITEM data_item;
	WT_OVFL data_ovfl;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	uint32_t i, len, space_avail;
	uint8_t *first_free;

	dsk = new->dsk;
	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	WT_CLEAR(data_dbt);
	WT_CLEAR(data_item);
	data = &data_dbt;

	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a reference to the data: it's either a replacement value
		 * or the original on-page item.
		 */
		if ((repl = WT_COL_REPL(page, cip)) != NULL) {
			/*
			 * Check for deletion, else build the data's WT_ITEM
			 * chunk from the most recent replacement value.
			 */
			if (WT_REPL_DELETED_ISSET(repl)) {
				WT_CLEAR(data_item);
				WT_ITEM_SET(&data_item, WT_ITEM_DEL, 0);
				len = WT_ITEM_SPACE_REQ(0);
			} else {
				data->data = WT_REPL_DATA(repl);
				data->size = repl->size;
				WT_RET(__wt_item_build_data(
				    toc, data, &data_item, &data_ovfl, 0));
				len = WT_ITEM_SPACE_REQ(data->size);
			}
			data_loc = DATA_OFF_PAGE;
		} else {
			/* On-page items are copied verbatim, header included. */
			data->data = cip->data;
			data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(cip->data));
			len = data->size;
			data_loc = DATA_ON_PAGE;
		}

		/*
		 * XXX
		 * We don't yet handle splits -- we allocated the maximum leaf
		 * page size, but it still wasn't enough.  We must allocate
		 * another leaf page and split the parent.
		 */
		if (len > space_avail) {
			fprintf(stderr,
			    "__wt_rec_col_var: page %lu split\n",
			    (u_long)page->addr);
			__wt_abort(toc->env);
		}

		switch (data_loc) {
		case DATA_ON_PAGE:
			/* The on-page item already includes its WT_ITEM header. */
			memcpy(first_free, data->data, data->size);
			first_free += data->size;
			space_avail -= data->size;
			break;
		case DATA_OFF_PAGE:
			/* Write the built WT_ITEM header, then the data bytes. */
			memcpy(first_free, &data_item, sizeof(data_item));
			memcpy(first_free +
			    sizeof(data_item), data->data, data->size);
			first_free += len;
			space_avail -= len;
			/* No break needed: last case in the switch. */
		}
		++dsk->u.entries;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	return (0);
}
+
+/*
+ * __wt_rec_row --
+ * Reconcile a row-store leaf page.
+ */
+static int
+__wt_rec_row(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc;
+ enum { KEY_ON_PAGE, KEY_NONE } key_loc;
+ DB *db;
+ DBT *key, key_dbt, *data, data_dbt;
+ WT_ITEM key_item, data_item, *item;
+ WT_OVFL data_ovfl;
+ WT_PAGE_DISK *dsk;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ uint32_t i, len, space_avail, type;
+ uint8_t *first_free;
+
+ db = toc->db;
+ dsk = new->dsk;
+ __wt_set_ff_and_sa_from_offset(
+ new, WT_PAGE_BYTE(new), &first_free, &space_avail);
+
+ WT_CLEAR(data_dbt);
+ WT_CLEAR(key_dbt);
+ WT_CLEAR(data_item);
+ WT_CLEAR(key_item);
+
+ key = &key_dbt;
+ data = &data_dbt;
+
+ /*
+ * Walk the page, accumulating key/data groups (groups, because a key
+ * can reference a duplicate data set).
+ *
+ * We have to walk both the WT_ROW structures as well as the original
+ * page: the problem is keys that require processing. When a page is
+ * read into memory from a simple database, the WT_ROW key/size pair
+ * is set to reference an on-page group of bytes in the key's WT_ITEM
+ * structure. As Btree keys are immutable, that original WT_ITEM is
+ * usually what we want to write, and we can pretty easily find it by
+ * moving to immediately before the on-page key.
+ *
+ * Keys that require processing are harder (for example, a Huffman
+ * encoded key). When we have to use a key that requires processing,
+ * we process the key and set the WT_ROW key/size pair to reference
+ * the allocated memory that holds the key. At that point we've lost
+ * any reference to the original WT_ITEM structure, which is what we
+ * want to re-write when reconciling the page. We don't want to make
+ * the WT_ROW structure bigger by another sizeof(void *) bytes, so we
+ * walk the original page at the same time we walk the WT_PAGE array
+ * when reconciling the page so we can find the original WT_ITEM.
+ */
+ item = NULL;
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Move to the next key on the original page. */
+ if (item == NULL)
+ item = (WT_ITEM *)WT_PAGE_BYTE(page);
+ else
+ do {
+ item = WT_ITEM_NEXT(item);
+ } while (WT_ITEM_TYPE(item) != WT_ITEM_KEY &&
+ WT_ITEM_TYPE(item) != WT_ITEM_KEY_OVFL);
+
+ /*
+ * Get a reference to the data. We get the data first because
+ * it may have been deleted, in which case we ignore the pair.
+ */
+ if ((repl = WT_ROW_REPL(page, rip)) != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /*
+ * Build the data's WT_ITEM chunk from the most recent
+ * replacement value.
+ */
+ data->data = WT_REPL_DATA(repl);
+ data->size = repl->size;
+ WT_RET(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, 0));
+ data_loc = DATA_OFF_PAGE;
+ } else {
+ /* Copy the item off the page. */
+ data->data = rip->data;
+ data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(rip->data));
+ data_loc = DATA_ON_PAGE;
+ }
+
+ /*
+ * Check if the key is a duplicate (the key preceding it on the
+ * page references the same information). We don't store the
+ * key for the second and subsequent data items in duplicated
+ * groups.
+ */
+ if (WT_ROW_INDX_IS_DUPLICATE(page, rip)) {
+ type = data_loc == DATA_ON_PAGE ?
+ WT_ITEM_TYPE(rip->data) : WT_ITEM_TYPE(&data_item);
+ switch (type) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ type = WT_ITEM_DATA_DUP;
+ break;
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ type = WT_ITEM_DATA_DUP_OVFL;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ if (data_loc == DATA_ON_PAGE)
+ WT_ITEM_SET_TYPE(rip->data, type);
+ else
+ WT_ITEM_SET_TYPE(&data_item, type);
+ key_loc = KEY_NONE;
+ } else {
+ /* Take the key's WT_ITEM from the original page. */
+ key->data = item;
+ key->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(item));
+ key_loc = KEY_ON_PAGE;
+ }
+
+ len = 0;
+ switch (key_loc) {
+ case KEY_ON_PAGE:
+ len = key->size;
+ break;
+ case KEY_NONE:
+ break;
+ }
+ switch (data_loc) {
+ case DATA_OFF_PAGE:
+ len += WT_ITEM_SPACE_REQ(data->size);
+ break;
+ case DATA_ON_PAGE:
+ len += data->size;
+ break;
+ }
+
+ /*
+ * XXX
+ * We don't yet handle splits -- we allocated the maximum leaf
+ * page size, but it still wasn't enough. We must allocate
+ * another leaf page and split the parent.
+ */
+ if (len > space_avail) {
+ fprintf(stderr, "__wt_rec_row: page %lu split\n",
+ (u_long)page->addr);
+ __wt_abort(toc->env);
+ }
+
+ switch (key_loc) {
+ case KEY_ON_PAGE:
+ memcpy(first_free, key->data, key->size);
+ first_free += key->size;
+ space_avail -= key->size;
+ ++dsk->u.entries;
+ break;
+ case KEY_NONE:
+ break;
+ }
+ switch (data_loc) {
+ case DATA_ON_PAGE:
+ memcpy(first_free, data->data, data->size);
+ first_free += data->size;
+ space_avail -= data->size;
+ ++dsk->u.entries;
+ break;
+ case DATA_OFF_PAGE:
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free +
+ sizeof(WT_ITEM), data->data, data->size);
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+ ++dsk->u.entries;
+ break;
+ }
+ }
+
+ __wt_rec_set_page_size(toc, new, first_free);
+
+ return (0);
+}
+
+/*
+ * __wt_rec_page_write --
+ * Write a newly reconciled page.
+ */
+static int
+__wt_rec_page_write(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ ENV *env;
+ int ret;
+
+ env = toc->env;
+
+ /*
+ * XXX
+ * We fail if the page gets emptied -- we'll need to do some kind of
+ * reverse split where the internal page disappears. That shouldn't
+ * be difficult, but I haven't written it yet.
+ */
+ if (new->dsk->u.entries == 0) {
+ new->addr = WT_ADDR_INVALID;
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "reconcile removing empty page %lu", (u_long)page->addr));
+ fprintf(stderr, "PAGE %lu EMPTIED\n", (u_long)page->addr);
+ __wt_abort(env);
+ } else {
+ /*
+ * Allocate file space for the page.
+ *
+ * The cache eviction server is the only thread allocating space
+ * from the file, so there's no need to do any serialization.
+ */
+ WT_RET(__wt_file_alloc(toc, &new->addr, new->size));
+
+ /*
+ * Write the page to disk.
+ *
+ * !!!
+ * This is safe for now, but it's a problem when we switch to
+ * asynchronous I/O: the scenario is (1) schedule the write,
+ * (2) discard the newly-clean in-memory version, (3) another
+ * thread tries to read down the tree before the write finishes.
+ */
+ WT_RET(__wt_page_write(toc, new));
+
+ WT_VERBOSE(env, WT_VERB_EVICT,
+ (env, "reconcile move %lu to %lu, resize %lu to %lu",
+ (u_long)page->addr, (u_long)new->addr,
+ (u_long)page->size, (u_long)new->size));
+ }
+
+ /* Update the page's parent. */
+ if ((ret = __wt_rec_parent_update(toc, page, new)) != 0) {
+ (void)__wt_file_free(toc, new->addr, new->size);
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_rec_parent_update --
+ * Update a parent page's reference when a page is reconciled.
+ */
+static int
+__wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ IDB *idb;
+ WT_OFF *parent_off;
+
+ idb = toc->db->idb;
+
+ /*
+ * If we're writing the root of the tree, then we have to update the
+ * descriptor record, there's no parent to update.
+ */
+ if (page->addr == idb->root_off.addr) {
+ idb->root_off.addr = new->addr;
+ idb->root_off.size = new->size;
+ return (__wt_desc_write(toc));
+ }
+
+ /*
+ * Update the relevant WT_OFF structure. There are two memory locations
+ * that change (address and size), and we could race, but that's not a
+ * problem. Only a single thread ever reconciles a page at a time, and
+ * pages cannot leave memory while they have children.
+ */
+ parent_off = page->parent_off;
+ WT_RECORDS(parent_off) = new->records;
+ parent_off->addr = new->addr;
+ parent_off->size = new->size;
+
+ /*
+ * Mark the parent page as dirty.
+ *
+ * There's no chance we need to flush this write -- the eviction thread
+ * is the only thread that eventually cares if the page is dirty or not,
+ * and it's our update that's making it dirty. (The workQ thread does
+ * have to flush its set-modified update, of course).
+ *
+ * We don't care if we race with the workQ; if the workQ thread races
+ * with us, the page will still be marked dirty and that's all we care
+ * about.
+ */
+ WT_PAGE_SET_MODIFIED(page->parent);
+
+ return (0);
+}
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
new file mode 100644
index 00000000000..8cdf8d90ce1
--- /dev/null
+++ b/src/btree/bt_ret.c
@@ -0,0 +1,179 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dbt_return --
+ * Retrun a WT_PAGE/WT_{ROW,COL}_INDX pair to the application.
+ */
+int
+__wt_dbt_return(WT_TOC *toc, DBT *key, DBT *data, int key_return)
+{
+ DB *db;
+ DBT local_key, local_data;
+ ENV *env;
+ IDB *idb;
+ WT_COL *cip;
+ WT_ITEM *item;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ void *data_ret;
+ uint32_t size_ret;
+ int (*callback)(DB *, DBT *, DBT *), ret;
+
+ db = toc->db;
+ env = toc->env;
+ idb = db->idb;
+ callback = data->callback;
+ ret = 0;
+
+ page = toc->srch_page;
+ dsk = page->dsk;
+ cip = toc->srch_ip;
+ rip = toc->srch_ip;
+ repl = toc->srch_repl;
+
+ /*
+ * Handle the key item -- the key may be unchanged, in which case we
+ * don't touch it, it's already correct.
+ *
+ * If the key/data items are being passed to a callback routine and
+ * there's nothing special about them (they aren't uninstantiated
+ * overflow or compressed items), then give the callback a pointer to
+ * the on-page data. (We use a local DBT in this case, so we don't
+ * touch potentially allocated application DBT memory.) Else, copy
+ * the items into the application's DBTs.
+ *
+ * If the key/data item are uninstantiated overflow and/or compressed
+ * items, they require processing before being copied into the DBTs.
+ * Don't allocate WT_INDX memory for key/data items here. (We never
+ * allocate WT_INDX memory for data items. We do allocate WT_INDX
+ * memory for keys, but if we are looking at a key only to return it,
+ * it's not that likely to be accessed again (think of a cursor moving
+ * through the tree). Use memory in the application's DBT instead, it
+ * is discarded when the WT_TOC is discarded.
+ *
+ * Key return implies a reference to a WT_ROW index (we don't return
+ * record number keys yet, that will probably change when I add cursor
+ * support).
+ */
+ if (key_return) {
+ if (__wt_key_process(rip)) {
+ WT_RET(__wt_item_process(toc, rip->key, &toc->key));
+
+ key->data = toc->key.data;
+ key->size = toc->key.size;
+ } else if (callback == NULL) {
+ if (toc->key.mem_size < rip->size)
+ WT_RET(__wt_realloc(env,
+ &toc->key.mem_size,
+ rip->size, &toc->key.data));
+ memcpy(toc->key.data, rip->key, rip->size);
+ toc->key.size = rip->size;
+
+ key->data = toc->key.data;
+ key->size = toc->key.size;
+ } else {
+ WT_CLEAR(local_key);
+ key = &local_key;
+ key->data = rip->key;
+ key->size = rip->size;
+ }
+ }
+
+ /*
+ * Handle the data item.
+ *
+ * If the item was ever replaced, it's easy, take the last replacement
+ * data item, it's just a byte string.
+ */
+ if (repl != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ return (WT_NOTFOUND);
+ data->data = WT_REPL_DATA(repl);
+ data->size = repl->size;
+ return (callback == NULL ? 0 : callback(db, key, data));
+ }
+
+ /* Otherwise, take the item from the original page. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ data_ret = cip->data;
+ size_ret = db->fixed_len;
+ break;
+ case WT_PAGE_COL_RLE:
+ data_ret = WT_RLE_REPEAT_DATA(cip->data);
+ size_ret = db->fixed_len;
+ break;
+ case WT_PAGE_COL_VAR:
+ item = cip->data;
+ goto item_set;
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_DUP_LEAF:
+ item = rip->data;
+item_set: switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ if (idb->huffman_data == NULL) {
+ data_ret = WT_ITEM_BYTE(item);
+ size_ret = WT_ITEM_LEN(item);
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_RET(__wt_item_process(toc, item, &toc->data));
+ data_ret = toc->data.data;
+ size_ret = toc->data.size;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+ * When we get here, data_ret and size_ret are set to the byte string
+ * and the length we're going to return. That byte string has been
+ * decoded, we called __wt_item_process above in all cases where the
+ * item could be encoded.
+ */
+ if (callback == NULL) {
+ /*
+ * We're copying the key/data pair out to the caller. If we
+ * haven't yet copied the data_ret/size_ret pair into the return
+ * DBT (potentially done by __wt_item_process), do so now.
+ */
+ if (data_ret != toc->data.data) {
+ if (toc->data.mem_size < size_ret)
+ WT_RET(__wt_realloc(env,
+ &toc->data.mem_size,
+ size_ret, &toc->data.data));
+ memcpy(toc->data.data, data_ret, size_ret);
+ toc->data.size = size_ret;
+ }
+
+ data->data = toc->data.data;
+ data->size = toc->data.size;
+ } else {
+ /*
+ * If we're given a callback function, use the data_ret/size_ret
+ * fields as set.
+ */
+ WT_CLEAR(local_data);
+ data = &local_data;
+ data->data = data_ret;
+ data->size = size_ret;
+ ret = callback(db, key, data);
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_rw.c b/src/btree/bt_rw.c
new file mode 100644
index 00000000000..ad8f12482b1
--- /dev/null
+++ b/src/btree/bt_rw.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_disk_read --
+ *	Read a file page.
+ */
+int
+__wt_page_disk_read(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+	ENV *env;
+	off_t offset;
+	uint32_t disk_cksum;
+
+	db = toc->db;
+	env = toc->env;
+
+	WT_STAT_INCR(env->ienv->cache->stats, PAGE_READ);
+
+	/* Read the page from the underlying file. */
+	offset = WT_ADDR_TO_OFF(db, addr);
+	WT_RET(__wt_read(env, db->idb->fh, offset, size, dsk));
+
+	/*
+	 * The checksum was computed with the on-page checksum field zeroed
+	 * out: save the on-disk value, clear the field, then compare.
+	 */
+	disk_cksum = dsk->checksum;
+	dsk->checksum = 0;
+	if (disk_cksum == __wt_cksum(dsk, size))
+		return (0);
+
+	__wt_api_env_errx(env,
+	    "read checksum error: addr/size %lu/%lu at offset %llu",
+	    (u_long)addr, (u_long)size, (unsigned long long)offset);
+	return (WT_ERROR);
+}
+
+/*
+ * __wt_page_write --
+ *	Write a file page.
+ */
+inline int
+__wt_page_write(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_PAGE_DISK *dsk;
+
+	/* The in-memory page carries its own disk image, address and size. */
+	dsk = page->dsk;
+	return (__wt_page_disk_write(toc, dsk, page->addr, page->size));
+}
+
+/*
+ * __wt_page_disk_write --
+ *	Write a file page.
+ */
+int
+__wt_page_disk_write(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+	ENV *env;
+
+	db = toc->db;
+	env = toc->env;
+
+	/* In diagnostic builds, confirm the page is well-formed before I/O. */
+	WT_ASSERT(env, __wt_verify_dsk_page(toc, dsk, addr, size) == 0);
+
+	WT_STAT_INCR(env->ienv->cache->stats, PAGE_WRITE);
+
+	/* Checksum the page with the checksum field itself zeroed out. */
+	dsk->checksum = 0;
+	dsk->checksum = __wt_cksum(dsk, size);
+
+	return (
+	    __wt_write(env, db->idb->fh, WT_ADDR_TO_OFF(db, addr), size, dsk));
+}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
new file mode 100644
index 00000000000..5beb931f578
--- /dev/null
+++ b/src/btree/bt_stat.c
@@ -0,0 +1,348 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_stat_page_col_fix(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_col_rle(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_col_var(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_dup_leaf(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_row_leaf(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_page_stat --
+ *	Stat any Btree page.
+ *
+ * The arg pointer is forwarded unchanged to the row-leaf walker, which in
+ * turn passes it when recursively walking off-page duplicate trees.
+ */
+int
+__wt_page_stat(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	DB *db;
+	IDB *idb;
+	WT_PAGE_DISK *dsk;
+	WT_STATS *stats;
+
+	db = toc->db;
+	idb = db->idb;
+	dsk = page->dsk;
+	stats = idb->dstats;
+
+	/*
+	 * All internal pages and overflow pages are trivial, all we track is
+	 * a count of the page type.  Leaf pages are handed to a per-type
+	 * helper that also counts the items they hold.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+		WT_STAT_INCR(stats, PAGE_COL_FIX);
+		WT_RET(__wt_stat_page_col_fix(toc, page));
+		break;
+	case WT_PAGE_COL_INT:
+		WT_STAT_INCR(stats, PAGE_COL_INTERNAL);
+		break;
+	case WT_PAGE_COL_RLE:
+		WT_STAT_INCR(stats, PAGE_COL_RLE);
+		WT_RET(__wt_stat_page_col_rle(toc, page));
+		break;
+	case WT_PAGE_COL_VAR:
+		WT_STAT_INCR(stats, PAGE_COL_VARIABLE);
+		WT_RET(__wt_stat_page_col_var(toc, page));
+		break;
+	case WT_PAGE_DUP_INT:
+		WT_STAT_INCR(stats, PAGE_DUP_INTERNAL);
+		break;
+	case WT_PAGE_DUP_LEAF:
+		WT_STAT_INCR(stats, PAGE_DUP_LEAF);
+		WT_RET(__wt_stat_page_dup_leaf(toc, page));
+		break;
+	case WT_PAGE_OVFL:
+		WT_STAT_INCR(stats, PAGE_OVERFLOW);
+		break;
+	case WT_PAGE_ROW_INT:
+		WT_STAT_INCR(stats, PAGE_ROW_INTERNAL);
+		break;
+	case WT_PAGE_ROW_LEAF:
+		WT_STAT_INCR(stats, PAGE_ROW_LEAF);
+		WT_RET(__wt_stat_page_row_leaf(toc, page, arg));
+		break;
+	WT_ILLEGAL_FORMAT(db);		/* unknown page type: error return */
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_fix --
+ *	Stat a WT_PAGE_COL_FIX page.
+ */
+static int
+__wt_stat_page_col_fix(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_COL *cip;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+	int deleted;
+
+	stats = toc->db->idb->dstats;
+
+	/*
+	 * Walk the page, counting data items.  A slot is deleted if its most
+	 * recent replacement is a delete, or, lacking any replacement, if the
+	 * on-page item carries the fixed-width delete flag.
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		if ((repl = WT_COL_REPL(page, cip)) != NULL)
+			deleted = WT_REPL_DELETED_ISSET(repl);
+		else
+			deleted = WT_FIX_DELETE_ISSET(cip->data);
+
+		if (deleted)
+			WT_STAT_INCR(stats, ITEM_COL_DELETED);
+		else
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_rle --
+ *	Stat a WT_PAGE_COL_RLE page.
+ */
+static int
+__wt_stat_page_col_rle(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_COL *cip;
+	WT_RLE_EXPAND *exp;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+
+	stats = toc->db->idb->dstats;
+
+	/*
+	 * Walk the page, counting data items; each run-length encoded cell
+	 * counts for its full repeat count, not as a single item.
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
+			WT_STAT_INCRV(stats,
+			    ITEM_COL_DELETED, WT_RLE_REPEAT_COUNT(cip->data));
+		else
+			WT_STAT_INCRV(stats,
+			    ITEM_TOTAL_DATA, WT_RLE_REPEAT_COUNT(cip->data));
+
+		/*
+		 * Check for corrections.
+		 *
+		 * XXX
+		 * This gets the count wrong if an application changes existing
+		 * records, or updates a deleted record two times in a row --
+		 * we'll incorrectly count the records as unique, when they are
+		 * changes to the same record.  I'm not fixing it as I don't
+		 * expect the WT_COL_RLEEXP data structure to be permanent, it's
+		 * too likely to become a linked list in bad cases.
+		 */
+		for (exp =
+		    WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next) {
+			repl = exp->repl;
+			if (WT_REPL_DELETED_ISSET(repl))
+				WT_STAT_INCR(stats, ITEM_COL_DELETED);
+			else
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_var --
+ *	Stat a WT_PAGE_COL_VAR page.
+ */
+static int
+__wt_stat_page_col_var(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	WT_COL *cip;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		switch (WT_ITEM_TYPE(cip->data)) {
+		case WT_ITEM_DATA:
+			repl = WT_COL_REPL(page, cip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl))
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_OVFL:
+			repl = WT_COL_REPL(page, cip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		case WT_ITEM_DEL:
+			/* On-page deleted item: count it as deleted. */
+			WT_STAT_INCR(stats, ITEM_COL_DELETED);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_dup_leaf --
+ *	Stat a WT_PAGE_DUP_LEAF page.
+ */
+static int
+__wt_stat_page_dup_leaf(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	WT_STATS *stats;
+	uint32_t i;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, rip, i) {
+		switch (WT_ITEM_TYPE(rip->data)) {
+		case WT_ITEM_DATA_DUP:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DUP_DATA);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DUP_DATA);
+				WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_row_leaf --
+ *	Stat a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	DB *db;
+	WT_OFF *off;
+	WT_REF *ref;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	WT_STATS *stats;
+	uint32_t i;
+	int ret;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, rip, i) {
+		switch (WT_ITEM_TYPE(rip->data)) {
+		case WT_ITEM_DATA:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;	/* deleted: skip key count too */
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_DUP:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DUP_DATA);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DUP_DATA);
+			WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_OFF:
+			/*
+			 * Recursively call the tree-walk code for any off-page
+			 * duplicate trees.  (Check for any off-page duplicate
+			 * trees locally because we already have to walk the
+			 * page, so it's faster than walking the page both here
+			 * and in the tree-walk function.)
+			 *
+			 * NOTE(review): unlike the data-item cases above, this
+			 * case does not check WT_ROW_REPL for a deletion before
+			 * counting -- confirm off-page duplicate trees cannot
+			 * be deleted via a replacement.
+			 */
+			ref = WT_ROW_REF(page, rip);
+			off = WT_ROW_OFF(rip);
+			WT_RET(__wt_page_in(toc, page, ref, off, 0));
+			ret = __wt_tree_walk(toc, ref, 0, __wt_page_stat, arg);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				return (ret);
+			WT_STAT_INCR(stats, DUP_TREE);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+		/*
+		 * If the data item wasn't deleted, count the key.
+		 *
+		 * If we have processed the key, we have lost the information as
+		 * to whether or not it's an overflow key -- we can figure out
+		 * if it's Huffman encoded by looking at the huffman key, but
+		 * that doesn't tell us if it's an overflow key or not.  To fix
+		 * this we'd have to maintain a reference to the on-page key and
+		 * check it, and I'm not willing to spend the additional pointer
+		 * in the WT_ROW structure.
+		 */
+		if (__wt_key_process(rip))
+			switch (WT_ITEM_TYPE(rip->key)) {
+			case WT_ITEM_KEY_OVFL:
+				WT_STAT_INCR(stats, ITEM_KEY_OVFL);
+				/* FALLTHROUGH */
+			case WT_ITEM_KEY:
+				WT_STAT_INCR(stats, ITEM_TOTAL_KEY);
+				break;
+			WT_ILLEGAL_FORMAT(db);
+			}
+		else
+			WT_STAT_INCR(stats, ITEM_TOTAL_KEY);
+
+	}
+	return (0);
+}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
new file mode 100644
index 00000000000..af5a9d65258
--- /dev/null
+++ b/src/btree/bt_sync.c
@@ -0,0 +1,61 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_bt_tree_sync(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_bt_sync --
+ *	Sync the tree.
+ */
+int
+__wt_bt_sync(WT_TOC *toc)
+{
+	ENV *env;
+	IDB *idb;
+	WT_CACHE *cache;
+	int ret;
+
+	env = toc->env;
+	idb = toc->db->idb;
+	cache = env->ienv->cache;
+
+	/* A never-opened database has nothing to flush. */
+	if (WT_UNOPENED_DATABASE(idb))
+		return (0);
+
+	/*
+	 * The tree walk is depth first, that is, the worker function is not
+	 * called on internal pages until all children have been visited; so,
+	 * we don't have to worry about a page being dirtied after the visit.
+	 *
+	 * Lock out the cache eviction thread, though, we don't want it trying
+	 * to reconcile pages we're flushing.
+	 */
+	__wt_lock(env, cache->mtx_reconcile);
+	ret = __wt_tree_walk(toc, NULL,
+	    WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_tree_sync, NULL);
+	__wt_unlock(env, cache->mtx_reconcile);
+	return (ret);
+}
+
+/*
+ * __wt_bt_tree_sync --
+ *	Sync a page: reconcile it if it's dirty, otherwise do nothing.
+ */
+static int
+__wt_bt_tree_sync(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	WT_CC_QUIET(arg, NULL);
+
+	/* Clean pages need no work. */
+	if (!WT_PAGE_IS_MODIFIED(page))
+		return (0);
+
+	return (__wt_page_reconcile(toc, page));
+}
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
new file mode 100644
index 00000000000..19e9fccb82a
--- /dev/null
+++ b/src/btree/bt_vrfy.c
@@ -0,0 +1,1346 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's a bunch of stuff we pass around during verification, group it
+ * together to make the code prettier.
+ */
+typedef struct {
+	uint32_t frags;				/* Total frags */
+	bitstr_t *fragbits;			/* Frag tracking bit list */
+
+	FILE *stream;				/* Dump file stream */
+
+	void (*f)(const char *, uint64_t);	/* Progress callback */
+	uint64_t fcnt;				/* Progress counter */
+
+	WT_PAGE *leaf;				/* Last leaf-page seen; a
+						   hazard reference is held
+						   until it's compared against
+						   the next internal key */
+} WT_VSTUFF;
+
+static int __wt_verify_addfrag(WT_TOC *, uint32_t, uint32_t, WT_VSTUFF *);
+static int __wt_verify_checkfrag(DB *, WT_VSTUFF *);
+static int __wt_verify_delfmt(DB *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_fix(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_int(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_rle(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_item(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_ovfl(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_eof(DB *, uint32_t, uint32_t);
+static int __wt_verify_eop(DB *, uint32_t, uint32_t);
+static int __wt_verify_key_order(WT_TOC *, WT_PAGE *);
+static int __wt_verify_overflow_col(WT_TOC *, WT_PAGE *, WT_VSTUFF *);
+static int __wt_verify_overflow_common(
+ WT_TOC *, WT_OVFL *, uint32_t, uint32_t, WT_VSTUFF *);
+static int __wt_verify_overflow_row(WT_TOC *, WT_PAGE *, WT_VSTUFF *);
+static int __wt_verify_pc(WT_TOC *, WT_ROW *, WT_PAGE *, int);
+static int __wt_verify_tree(WT_TOC *,
+ WT_ROW *, uint64_t, uint64_t, uint32_t, WT_REF *, WT_VSTUFF *);
+
+/*
+ * __wt_db_verify --
+ *	Verify a Btree.
+ */
+int
+__wt_db_verify(WT_TOC *toc, void (*f)(const char *, uint64_t))
+{
+	/* Public API entry point: verify without a debug-dump stream. */
+	return (__wt_verify(toc, f, NULL));
+}
+
+/*
+ * __wt_verify --
+ *	Verify a Btree, optionally dumping each page in debugging mode.
+ *
+ * The progress callback f (if non-NULL) is invoked with a page counter as
+ * verification proceeds; stream (if non-NULL) gets a debug dump of each
+ * page in diagnostic builds.
+ */
+int
+__wt_verify(
+    WT_TOC *toc, void (*f)(const char *, uint64_t), FILE *stream)
+{
+	DB *db;
+	ENV *env;
+	IDB *idb;
+	WT_VSTUFF vstuff;
+	int ret;
+
+	env = toc->env;
+	db = toc->db;
+	idb = db->idb;
+	ret = 0;
+
+	memset(&vstuff, 0, sizeof(vstuff));
+	vstuff.stream = stream;
+	vstuff.f = f;
+
+	/*
+	 * Allocate a bit array, where each bit represents a single allocation
+	 * size piece of the file.  This is how we track the parts of the file
+	 * we've verified.  Storing this on the heap seems reasonable: with a
+	 * minimum allocation size of 512B, we would allocate 4MB to verify a
+	 * 16GB file.  To verify larger files than we can handle this way, we'd
+	 * have to write parts of the bit array into a disk file.
+	 *
+	 * !!!
+	 * There's one portability issue -- the bitstring package uses "ints",
+	 * not unsigned ints, or any fixed size.  If an "int" can't hold a
+	 * big enough value, we could lose.  There's a check here to make we
+	 * don't overflow.  I don't ever expect to see this error message, but
+	 * better safe than sorry.
+	 */
+	vstuff.frags = WT_OFF_TO_ADDR(db, idb->fh->file_size);
+	if (vstuff.frags > INT_MAX) {
+		__wt_api_db_errx(db, "file is too large to verify");
+		/*
+		 * Set the error return explicitly: a bare goto here would
+		 * return success, ret is still 0 at this point.
+		 */
+		ret = WT_ERROR;
+		goto err;
+	}
+	WT_ERR(bit_alloc(env, vstuff.frags, &vstuff.fragbits));
+
+	/*
+	 * The first sector of the file is the description record -- ignore
+	 * it for now.
+	 */
+	bit_nset(vstuff.fragbits, 0, 0);
+
+	/* Verify the tree, starting at the root. */
+	WT_ERR(__wt_verify_tree(toc, NULL, WT_RECORDS(&idb->root_off),
+	    (uint64_t)1, WT_NOLEVEL, &idb->root_page, &vstuff));
+
+	WT_ERR(__wt_verify_checkfrag(db, &vstuff));
+
+err:	/* Wrap up reporting and free allocated memory. */
+	if (vstuff.f != NULL)
+		vstuff.f(toc->name, vstuff.fcnt);
+	if (vstuff.fragbits != NULL)
+		__wt_free(env, vstuff.fragbits, 0);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_tree --
+ *	Verify a tree, recursively descending through it in depth-first fashion.
+ * The page argument was physically verified (so we know it's correctly formed),
+ * and the in-memory version built.  Our job is to check logical relationships
+ * in the page and in the tree.
+ *
+ * Fix: the level/record-count/start-recno failure branches previously did a
+ * bare goto err with ret still 0, silently returning success; they now set
+ * ret to WT_ERROR first.
+ */
+static int
+__wt_verify_tree(
+    WT_TOC *toc,		/* Thread of control */
+    WT_ROW *parent_rip,		/* Internal key referencing this page, if any */
+    uint64_t parent_records,	/* Parent's count of records in this tree */
+    uint64_t start_recno,	/* First record on this page */
+    uint32_t level,		/* Page's tree level */
+    WT_REF *ref,		/* Already verified page reference */
+    WT_VSTUFF *vs)		/* The verify package */
+{
+	DB *db;
+	WT_COL *cip;
+	WT_ITEM *item;
+	WT_OFF *off;
+	WT_PAGE *page;
+	WT_PAGE_DISK *dsk;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	uint64_t records;
+	uint32_t i;
+	int is_root, ret;
+
+	db = toc->db;
+	page = ref->page;
+	dsk = page->dsk;
+	ret = 0;
+
+	/* Report progress every 10 pages. */
+	if (vs->f != NULL && ++vs->fcnt % 10 == 0)
+		vs->f(toc->name, vs->fcnt);
+
+	/* Update frags list. */
+	WT_ERR(__wt_verify_addfrag(toc, page->addr, page->size, vs));
+
+#ifdef DIAGNOSTIC
+	/* Optionally dump the page in debugging mode. */
+	if (vs->stream != NULL)
+		return (__wt_debug_page(toc, page, NULL, vs->stream));
+#endif
+
+	/*
+	 * The page's physical structure was verified when it was read into
+	 * memory by the read server thread, and then the in-memory version
+	 * of the page was built.  Now we make sure the page and tree are
+	 * logically consistent.
+	 *
+	 * !!!
+	 * The problem: (1) the read server has to build the in-memory version
+	 * of the page because the read server is the thread that flags when
+	 * any thread can access the page in the tree; (2) we can't build the
+	 * in-memory version of the page until the physical structure is known
+	 * to be OK, so the read server has to verify at least the physical
+	 * structure of the page; (3) doing complete page verification requires
+	 * reading additional pages (for example, overflow keys imply reading
+	 * overflow pages in order to test the key's order in the page); (4)
+	 * the read server cannot read additional pages because it will hang
+	 * waiting on itself.  For this reason, we split page verification
+	 * into a physical verification, which allows the in-memory version
+	 * of the page to be built, and then a subsequent logical verification
+	 * which happens here.
+	 */
+
+	/*
+	 * If passed a level of WT_NOLEVEL, that is, the only level that can't
+	 * possibly be a valid database page level, this is the root page of
+	 * the tree.
+	 *
+	 * If it's the root, use this page's level to initialize expected the
+	 * values for the rest of the tree.
+	 */
+	is_root = level == WT_NOLEVEL ? 1 : 0;
+	if (is_root)
+		level = dsk->level;
+
+	/* Check that tree levels and record counts match up. */
+	if (dsk->level != level) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has a tree level of %lu where the "
+		    "expected level was %lu",
+		    (u_long)page->addr, (u_long)dsk->level, (u_long)level);
+		ret = WT_ERROR;		/* don't return success on failure */
+		goto err;
+	}
+
+	/*
+	 * Check the record counts.
+	 *
+	 * Confirm the number of records found on this page (by summing the
+	 * WT_OFF structure record counts) matches the WT_OFF structure record
+	 * count in our parent.  Use the in-memory record count for internal
+	 * pages -- we could sum the record counts as we walk the page below,
+	 * but we did that when building the in-memory version of the page,
+	 * there's no reason to do it again.
+	 */
+	if (page->records != parent_records) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has a record count of %llu where the "
+		    "expected record count was %llu",
+		    (u_long)page->addr, (unsigned long long)page->records,
+		    (unsigned long long)parent_records);
+		ret = WT_ERROR;		/* don't return success on failure */
+		goto err;
+	}
+
+	/* Check the starting record number. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+		if (dsk->start_recno != start_recno) {
+			__wt_api_db_errx(db,
+			    "page at addr %lu has a starting record of %llu "
+			    "where the expected starting record was %llu",
+			    (u_long)page->addr,
+			    (unsigned long long)dsk->start_recno,
+			    (unsigned long long)start_recno);
+			ret = WT_ERROR;	/* don't return success on failure */
+			goto err;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Check on-page overflow page references.
+	 *
+	 * There's a potential performance problem here: we read key overflow
+	 * pages twice, once when checking the overflow page itself, and again
+	 * when checking the key ordering.  It's a pain to combine the two
+	 * tests (the page types with overflow items aren't exactly the same
+	 * as the page types with ordered keys, and the underlying functions
+	 * that instantiate (and decompress) overflow pages don't want to know
+	 * anything about verification), and I don't want to keep the overflow
+	 * keys in the cache, it's likely to be wasted space.  Until it's a
+	 * problem, I'm going to assume the second read of the overflow key is
+	 * satisfied in the operating system buffer cache, and not worry about
+	 * it.  Table verify isn't likely to be a performance path anyway.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_VAR:
+		WT_RET(__wt_verify_overflow_col(toc, page, vs));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_overflow_row(toc, page, vs));
+		break;
+	default:
+		break;
+	}
+
+	/* Check on-page key ordering. */
+	switch (dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_key_order(toc, page));
+		break;
+	default:
+		break;
+	}
+
+	/* Check tree connections and recursively descend the tree. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+		/* For each entry in an internal page, verify the subtree. */
+		start_recno = dsk->start_recno;
+		WT_INDX_FOREACH(page, cip, i) {
+			/* cip references the subtree containing the record */
+			ref = WT_COL_REF(page, cip);
+			off = WT_COL_OFF(cip);
+			records = WT_COL_OFF_RECORDS(cip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, NULL,
+			    records, start_recno, level - 1, ref, vs);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+			start_recno += records;
+		}
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		/*
+		 * There are two row-store, logical connection checks:
+		 *
+		 * First, compare the internal node key leading to the current
+		 * page against the first entry on the current page.  The
+		 * internal node key must compare less than or equal to the
+		 * first entry on the current page.
+		 *
+		 * Second, compare the largest key we've seen on any leaf page
+		 * against the next internal node key we find.  This check is
+		 * a little tricky: every time we find a leaf page, we save a
+		 * reference in the vs->leaf field.  The next time we're about
+		 * to indirect through an entry on an internal node, we compare
+		 * the last entry on that saved page against the internal node
+		 * entry's key.  In that comparison, the leaf page's key must
+		 * be less than the internal node entry's key.
+		 */
+		if (parent_rip != NULL)
+			WT_ERR(__wt_verify_pc(toc, parent_rip, page, 1));
+
+		/* For each entry in an internal page, verify the subtree. */
+		WT_INDX_FOREACH(page, rip, i) {
+			/*
+			 * At each off-page entry, we compare the current entry
+			 * against the largest key in the subtree rooted to the
+			 * immediate left of the current item; this key must
+			 * compare less than or equal to the current item.  The
+			 * trick here is we need the last leaf key, not the last
+			 * internal node key.  It's returned to us in the leaf
+			 * field of the vs structure, whenever we verify a leaf
+			 * page.  Discard the leaf node as soon as we've used it
+			 * in a comparison.
+			 */
+			if (vs->leaf != NULL) {
+				WT_ERR(
+				    __wt_verify_pc(toc, rip, vs->leaf, 0));
+				__wt_hazard_clear(toc, vs->leaf);
+				vs->leaf = NULL;
+			}
+			/* rip references the subtree containing the record */
+			ref = WT_ROW_REF(page, rip);
+			off = WT_ROW_OFF(rip);
+			records = WT_ROW_OFF_RECORDS(rip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, rip,
+			    records, (uint64_t)0, level - 1, ref, vs);
+
+			/*
+			 * Remaining special handling of the last verified leaf
+			 * page: if we kept a reference to that page, don't
+			 * release the hazard reference until after comparing
+			 * the last key on that page against the next key in the
+			 * tree.
+			 */
+			if (vs->leaf != ref->page)
+				__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+		}
+		break;
+	case WT_PAGE_ROW_LEAF:
+		/*
+		 * For each entry in a row-store leaf page, verify any off-page
+		 * duplicates tree.
+		 */
+		WT_INDX_FOREACH(page, rip, i) {
+			/* Ignore anything except off-page duplicate trees. */
+			if ((repl = WT_ROW_REPL(
+			    page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			item = rip->data;
+			if (WT_ITEM_TYPE(item) != WT_ITEM_OFF)
+				continue;
+
+			/* Verify the off-page duplicate tree. */
+			ref = WT_ROW_DUP(page, rip);
+			off = WT_ROW_OFF(rip);
+			records = WT_ROW_OFF_RECORDS(rip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, NULL,
+			    records, (uint64_t)0, WT_NOLEVEL, ref, vs);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+		}
+		/* FALLTHROUGH */
+	case WT_PAGE_DUP_LEAF:
+		/*
+		 * Retain a reference to all row-store leaf pages, we need them
+		 * to check their last entry against the next internal key in
+		 * the tree.
+		 */
+		vs->leaf = page;
+		return (0);
+	default:
+		break;
+	}
+
+	/*
+	 * The largest key on the last leaf page in the tree is never needed,
+	 * there aren't any internal pages after it.  So, we get here with
+	 * vs->leaf needing to be released.
+	 */
+err:	if (vs->leaf != NULL) {
+		__wt_hazard_clear(toc, vs->leaf);
+		vs->leaf = NULL;
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_pc --
+ *	Compare a key on a parent page to a designated entry on a child page.
+ *
+ * If first_entry is non-zero, compare against the child's first key (the
+ * parent key must sort less than or equal to it); otherwise, compare
+ * against the child's last key (which must sort before the parent key).
+ */
+static int
+__wt_verify_pc(WT_TOC *toc, WT_ROW *parent_rip, WT_PAGE *child, int first_entry)
+{
+	DB *db;
+	DBT *cd_ref, *pd_ref, *scratch1, *scratch2;
+	WT_ROW *child_rip;
+	int cmp, ret, (*func)(DB *, const DBT *, const DBT *);
+
+	db = toc->db;
+	scratch1 = scratch2 = NULL;
+	ret = 0;
+
+	/* Set the comparison function. */
+	switch (child->dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+		func = db->btree_compare_dup;
+		break;
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		func = db->btree_compare;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * The two keys we're going to compare may be overflow keys -- don't
+	 * bother instantiating the keys in the tree, there's no reason to
+	 * believe we're going to be working in this database.
+	 */
+	child_rip = first_entry ?
+	    child->u.irow : child->u.irow + (child->indx_count - 1);
+	if (__wt_key_process(child_rip)) {
+		WT_ERR(__wt_scr_alloc(toc, 0, &scratch1));
+		WT_ERR(__wt_item_process(toc, child_rip->key, scratch1));
+		cd_ref = scratch1;
+	} else
+		cd_ref = (DBT *)child_rip;
+	if (__wt_key_process(parent_rip)) {
+		WT_ERR(__wt_scr_alloc(toc, 0, &scratch2));
+		/*
+		 * Use WT_ERR, not WT_RET: a direct return here leaked the
+		 * scratch buffers, they must be released at the err label.
+		 */
+		WT_ERR(__wt_item_process(toc, parent_rip->key, scratch2));
+		pd_ref = scratch2;
+	} else
+		pd_ref = (DBT *)parent_rip;
+
+	/* Compare the parent's key against the child's key. */
+	cmp = func(db, cd_ref, pd_ref);
+
+	if (first_entry && cmp < 0) {
+		__wt_api_db_errx(db,
+		    "the first key on page at addr %lu sorts before its "
+		    "reference key on its parent's page",
+		    (u_long)child->addr);
+		ret = WT_ERROR;
+	}
+	if (!first_entry && cmp >= 0) {
+		__wt_api_db_errx(db,
+		    "the last key on the page at addr %lu sorts after a parent "
+		    "page's key for the subsequent page",
+		    (u_long)child->addr);
+		ret = WT_ERROR;
+	}
+
+err:	if (scratch1 != NULL)
+		__wt_scr_release(&scratch1);
+	if (scratch2 != NULL)
+		__wt_scr_release(&scratch2);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_key_order --
+ *	Check on-page key ordering.
+ *
+ * Fix: the current/last buffers were never swapped at the bottom of the
+ * loop, so last->dbt remained NULL and no comparison was ever performed;
+ * also use WT_ERR instead of WT_RET so the scratch buffers aren't leaked
+ * on error.
+ */
+static int
+__wt_verify_key_order(WT_TOC *toc, WT_PAGE *page)
+{
+	struct {
+		DBT *dbt;			/* DBT to compare */
+		DBT *scratch;			/* scratch buffer */
+	} *current, *last, *tswap, _a, _b;
+	DB *db;
+	WT_PAGE_DISK *dsk;
+	WT_ROW *rip;
+	uint32_t i;
+	int (*func)(DB *, const DBT *, const DBT *), ret;
+
+	db = toc->db;
+	dsk = page->dsk;
+	ret = 0;
+
+	WT_CLEAR(_a);
+	WT_CLEAR(_b);
+	current = &_a;
+	WT_ERR(__wt_scr_alloc(toc, 0, &current->scratch));
+	last = &_b;
+	WT_ERR(__wt_scr_alloc(toc, 0, &last->scratch));
+
+	/* Set the comparison function. */
+	switch (dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+		func = db->btree_compare_dup;
+		break;
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		func = db->btree_compare;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/* Walk the page, comparing each key against the previous key. */
+	WT_INDX_FOREACH(page, rip, i) {
+		/* Skip duplicates */
+		if (WT_ROW_INDX_IS_DUPLICATE(page, rip))
+			continue;
+
+		/*
+		 * The two keys we're going to compare may be overflow keys --
+		 * don't bother instantiating the keys in the tree, there's no
+		 * reason to believe we're going to be working in this database.
+		 */
+		if (__wt_key_process(rip)) {
+			WT_ERR(__wt_item_process(
+			    toc, rip->key, current->scratch));
+			current->dbt = current->scratch;
+		} else
+			current->dbt = (DBT *)rip;
+
+		/* Compare the current key against the last key. */
+		if (last->dbt != NULL &&
+		    func(db, last->dbt, current->dbt) >= 0) {
+			__wt_api_db_errx(db,
+			    "the %lu and %lu keys on page at addr %lu are "
+			    "incorrectly sorted",
+			    (u_long)WT_ROW_SLOT(page, rip) - 1,
+			    (u_long)WT_ROW_SLOT(page, rip),
+			    (u_long)page->addr);
+			ret = WT_ERROR;
+			goto err;
+		}
+
+		/* Swap the buffers: the current key becomes the last key. */
+		tswap = last;
+		last = current;
+		current = tswap;
+	}
+
+err:	if (_a.scratch != NULL)
+		__wt_scr_release(&_a.scratch);
+	if (_b.scratch != NULL)
+		__wt_scr_release(&_b.scratch);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_dsk_page --
+ *	Verify a single Btree page as read from disk.
+ *
+ * Checks the page header fields (type, LSN, level, unused bytes), then
+ * dispatches to a per-page-type item verifier.
+ */
+int
+__wt_verify_dsk_page(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+
+	db = toc->db;
+
+	/* Check the page type. */
+	switch (dsk->type) {
+	case WT_PAGE_FREE:
+		/*
+		 * Free pages are only written in diagnostic mode, and the
+		 * type is the only thing that can be verified about them.
+		 */
+		return (0);
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		break;
+	case WT_PAGE_INVALID:
+	default:
+		__wt_api_db_errx(db,
+		    "page at addr %lu has an invalid type of %lu",
+		    (u_long)addr, (u_long)dsk->type);
+		return (WT_ERROR);
+	}
+
+	/*
+	 * FUTURE:
+	 * Check the LSN against the existing log files.
+	 */
+	if (dsk->lsn[0] != 0 || dsk->lsn[1] != 0) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has non-zero lsn header fields",
+		    (u_long)addr);
+		return (WT_ERROR);
+	}
+
+	/*
+	 * Ignore the checksum -- it was verified when we first read the
+	 * page.
+	 */
+
+	/* Check the page level: leaf types must be WT_LLEAF, internal above. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_LEAF:
+		if (dsk->level != WT_LLEAF)
+			goto err_level;
+		break;
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		if (dsk->level <= WT_LLEAF) {
+err_level:		__wt_api_db_errx(db,
+			    "%s page at addr %lu has incorrect tree level "
+			    "of %lu",
+			    __wt_page_type_string(dsk),
+			    (u_long)addr, (u_long)dsk->level);
+			return (WT_ERROR);
+		}
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	if (dsk->unused[0] != '\0' || dsk->unused[1] != '\0') {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has non-zero unused header fields",
+		    (u_long)addr);
+		return (WT_ERROR);
+	}
+
+	/* Verify the items on the page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_dsk_item(toc, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_INT:
+		WT_RET(__wt_verify_dsk_col_int(db, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_FIX:
+		WT_RET(__wt_verify_dsk_col_fix(db, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_RLE:
+		WT_RET(__wt_verify_dsk_col_rle(db, dsk, addr, size));
+		break;
+	case WT_PAGE_OVFL:
+		WT_RET(__wt_verify_dsk_ovfl(toc, dsk, addr, size));
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_verify_dsk_item --
+ * Walk a disk page of WT_ITEMs, and verify them.
+ */
+static int
+__wt_verify_dsk_item(
+ WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ enum { IS_FIRST, WAS_KEY, WAS_DATA, WAS_DUP_DATA } last_item_type;
+ DB *db;
+ WT_ITEM *item;
+ WT_OVFL *ovfl;
+ WT_OFF *off;
+ off_t file_size;
+ uint8_t *end;
+ uint32_t i, item_num, item_len, item_type;
+
+ db = toc->db;
+ file_size = db->idb->fh->file_size;
+
+ end = (uint8_t *)dsk + size;
+
+ last_item_type = IS_FIRST;
+ item_num = 0;
+ WT_ITEM_FOREACH(dsk, item, i) {
+ ++item_num;
+
+ /* Check if this item is entirely on the page. */
+ if ((uint8_t *)item + sizeof(WT_ITEM) > end)
+ goto eop;
+
+ item_type = WT_ITEM_TYPE(item);
+ item_len = WT_ITEM_LEN(item);
+
+ /* Check the item's type. */
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_OVFL:
+ if (dsk->type != WT_PAGE_ROW_INT &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_KEY_DUP_OVFL:
+ if (dsk->type != WT_PAGE_DUP_INT)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_OVFL:
+ if (dsk->type != WT_PAGE_COL_VAR &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DATA_DUP_OVFL:
+ if (dsk->type != WT_PAGE_DUP_LEAF &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DEL:
+ /* Deleted items only appear on column-store pages. */
+ if (dsk->type != WT_PAGE_COL_VAR)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_OFF:
+ if (dsk->type != WT_PAGE_DUP_INT &&
+ dsk->type != WT_PAGE_ROW_INT &&
+ dsk->type != WT_PAGE_ROW_LEAF) {
+item_vs_page: __wt_api_db_errx(db,
+ "illegal item and page type combination "
+ "(item %lu on page at addr %lu is a %s "
+ "item on a %s page)",
+ (u_long)item_num, (u_long)addr,
+ __wt_item_type_string(item),
+ __wt_page_type_string(dsk));
+ return (WT_ERROR);
+ }
+ break;
+ default:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu has an illegal type "
+ "of %lu",
+ (u_long)item_num, (u_long)addr, (u_long)item_type);
+ return (WT_ERROR);
+ }
+
+ /*
+ * Check the item type ordering. For row-stores, check for:
+ * two keys in a row,
+ * two non-dup data items in a row,
+ * a non-dup data item followed by a dup data item
+ * a data item as the first item on a page.
+ *
+ * Column-stores only have data items, and we already checked
+ * to see if there was anything else on the page. Skip the
+ * order check.
+ */
+ if (dsk->type == WT_PAGE_COL_VAR)
+ goto skip_order_check;
+
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_KEY_DUP_OVFL:
+ switch (last_item_type) {
+ case IS_FIRST:
+ case WAS_DATA:
+ case WAS_DUP_DATA:
+ last_item_type = WAS_KEY;
+ break;
+ case WAS_KEY:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is first of "
+ "two adjacent keys",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ }
+ break;
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DEL:
+ case WT_ITEM_OFF:
+ if (last_item_type == IS_FIRST) {
+ __wt_api_db_errx(db,
+ "page at addr %lu begins with a data item",
+ (u_long)addr);
+ return (WT_ERROR);
+ }
+ switch (item_type) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DEL:
+ case WT_ITEM_OFF:
+ switch (last_item_type) {
+ case IS_FIRST:
+ case WAS_DATA:
+ case WAS_DUP_DATA:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is "
+ "the first of two adjacent data "
+ "items",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ case WAS_KEY:
+ last_item_type = WAS_DATA;
+ break;
+ }
+ break;
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ switch (last_item_type) {
+ case WAS_DATA:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is "
+ "a non-duplicate data item "
+ "followed by a duplicate data item",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ case IS_FIRST:
+ case WAS_DUP_DATA:
+ case WAS_KEY:
+ last_item_type = WAS_DUP_DATA;
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+skip_order_check:
+ /* Check the item's length. */
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ /* The length is variable, we can't check it. */
+ break;
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ if (item_len != sizeof(WT_OVFL))
+ goto item_len;
+ break;
+ case WT_ITEM_DEL:
+ if (item_len != 0)
+ goto item_len;
+ break;
+ case WT_ITEM_OFF:
+ if (item_len != sizeof(WT_OFF)) {
+item_len: __wt_api_db_errx(db,
+ "item %lu on page at addr %lu has an "
+ "incorrect length",
+ (u_long)item_num, (u_long)addr);
+ return (WT_ERROR);
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* Check if the item is entirely on the page. */
+ if ((uint8_t *)WT_ITEM_NEXT(item) > end)
+ goto eop;
+
+ /* Check if the referenced item is entirely in the file. */
+ switch (item_type) {
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ ovfl = WT_ITEM_BYTE_OVFL(item);
+ if (WT_ADDR_TO_OFF(db, ovfl->addr) +
+ WT_HDR_BYTES_TO_ALLOC(db, ovfl->size) > file_size)
+ goto eof;
+ break;
+ case WT_ITEM_OFF:
+ off = WT_ITEM_BYTE_OFF(item);
+ if (WT_ADDR_TO_OFF(db, off->addr) +
+ off->size > file_size)
+ goto eof;
+ break;
+ default:
+ break;
+ }
+ }
+ return (0);
+
+eof: return (__wt_verify_eof(db, item_num, addr));
+eop: return (__wt_verify_eop(db, item_num, addr));
+}
+
+/*
+ * __wt_verify_dsk_col_int --
+ * Walk a WT_PAGE_COL_INT disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ IDB *idb;
+ WT_OFF *off;
+ uint8_t *end;
+ uint32_t i, entry_num;
+
+ idb = db->idb;
+ end = (uint8_t *)dsk + size;
+
+ entry_num = 0;
+ WT_OFF_FOREACH(dsk, off, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if ((uint8_t *)off + sizeof(WT_OFF) > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Check if the reference is past the end-of-file. */
+ if (WT_ADDR_TO_OFF(
+ db, off->addr) + off->size > idb->fh->file_size)
+ return (__wt_verify_eof(db, entry_num, addr));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_dsk_col_fix --
+ * Walk a WT_PAGE_COL_FIX disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ u_int len;
+ uint32_t i, j, entry_num;
+ uint8_t *data, *end, *p;
+
+ len = db->fixed_len;
+ end = (uint8_t *)dsk + size;
+
+ entry_num = 0;
+ WT_FIX_FOREACH(db, dsk, data, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if (data + len > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Deleted items are entirely nul bytes. */
+ p = data;
+ if (WT_FIX_DELETE_ISSET(data)) {
+ if (*p != WT_FIX_DELETE_BYTE)
+ goto delfmt;
+ for (j = 1; j < db->fixed_len; ++j)
+ if (*++p != '\0')
+ goto delfmt;
+ }
+ }
+
+ return (0);
+
+delfmt: return (__wt_verify_delfmt(db, entry_num, addr));
+}
+
+/*
+ * __wt_verify_dsk_col_rle --
+ * Walk a WT_PAGE_COL_RLE disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ u_int len;
+ uint32_t i, j, entry_num;
+ uint8_t *data, *end, *last_data, *p;
+
+ end = (uint8_t *)dsk + size;
+
+ last_data = NULL;
+ len = db->fixed_len + sizeof(uint16_t);
+
+ entry_num = 0;
+ WT_RLE_REPEAT_FOREACH(db, dsk, data, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if (data + len > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Count must be non-zero. */
+ if (WT_RLE_REPEAT_COUNT(data) == 0) {
+ __wt_api_db_errx(db,
+ "fixed-length entry %lu on page at addr "
+ "%lu has a repeat count of 0",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ /* Deleted items are entirely nul bytes. */
+ p = WT_RLE_REPEAT_DATA(data);
+ if (WT_FIX_DELETE_ISSET(p)) {
+ if (*p != WT_FIX_DELETE_BYTE)
+ goto delfmt;
+ for (j = 1; j < db->fixed_len; ++j)
+ if (*++p != '\0')
+ goto delfmt;
+ }
+
+ /*
+ * If the previous data is the same as this data, we
+ * missed an opportunity for compression -- complain.
+ */
+ if (last_data != NULL &&
+ memcmp(WT_RLE_REPEAT_DATA(last_data),
+ WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 &&
+ WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) {
+ __wt_api_db_errx(db,
+ "fixed-length entries %lu and %lu on page "
+ "at addr %lu are identical and should have "
+ "been compressed",
+ (u_long)entry_num,
+ (u_long)entry_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ }
+ last_data = data;
+ }
+
+ return (0);
+
+delfmt: return (__wt_verify_delfmt(db, entry_num, addr));
+}
+
+/*
+ * __wt_verify_overflow_col --
+ * Check on-page column-store overflow references.
+ */
+static int
+__wt_verify_overflow_col(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs)
+{
+ WT_COL *cip;
+ WT_ITEM *item;
+ uint32_t i;
+
+ /* Walk the in-memory page, verifying overflow items. */
+ WT_INDX_FOREACH(page, cip, i) {
+ item = cip->data;
+ if (WT_ITEM_TYPE(item) == WT_ITEM_DATA_OVFL)
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_COL_SLOT(page, cip) + 1, page->addr, vs));
+ }
+ return (0);
+}
+
+/*
+ * __wt_verify_overflow_row --
+ * Check on-page row-store overflow references.
+ */
+static int
+__wt_verify_overflow_row(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs)
+{
+ WT_ITEM *item;
+ WT_ROW *rip;
+ uint32_t i;
+ int check_data;
+
+ /*
+ * Walk the in-memory page, verifying overflow items. We service 4
+ * page types here: DUP_INT, DUP_LEAF, ROW_INT and ROW_LEAF. In the
+ * case of DUP_INT, DUP_LEAF and ROW_INT, we only check the key, as
+ * there is either no data item, or the data item is known to not be
+ * an overflow page. In the case of ROW_LEAF, we have to check both
+ * the key and the data item.
+ */
+ check_data = page->dsk->type == WT_PAGE_ROW_LEAF ? 1 : 0;
+
+ /* Walk the in-memory page, verifying overflow items. */
+ WT_INDX_FOREACH(page, rip, i) {
+ item = rip->key;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_ROW_SLOT(page, rip) + 1, page->addr, vs));
+ break;
+ default:
+ break;
+ }
+
+ if (!check_data)
+ continue;
+
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_ROW_SLOT(page, rip) + 1, page->addr, vs));
+ break;
+ default:
+ break;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __wt_verify_overflow_common --
+ * Common code that reads in an overflow page and checks it.
+ */
+static int
+__wt_verify_overflow_common(WT_TOC *toc,
+ WT_OVFL *ovfl, uint32_t entry_num, uint32_t page_ref_addr, WT_VSTUFF *vs)
+{
+ DB *db;
+ DBT *scratch1;
+ WT_PAGE_DISK *dsk;
+ uint32_t addr, size;
+ int ret;
+
+ db = toc->db;
+ scratch1 = NULL;
+ ret = 0;
+
+ addr = ovfl->addr;
+ size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size);
+
+ /* Allocate enough memory to hold the overflow pages. */
+ WT_RET(__wt_scr_alloc(toc, size, &scratch1));
+
+ /* Read the page. */
+ dsk = scratch1->data;
+ WT_ERR(__wt_page_disk_read(toc, dsk, addr, size));
+
+ /*
+ * Verify the disk image -- this function would normally be called
+ * from the asynchronous read server, but overflow pages are read
+ * synchronously. Regardless, we break the overflow verification code
+ * into two parts, on-disk format checking and internal checking,
+ * just so it looks like all of the other page type checking.
+ */
+ WT_ERR(__wt_verify_dsk_ovfl(toc, dsk, addr, size));
+
+ /* Add the fragments. */
+ WT_ERR(__wt_verify_addfrag(toc, addr, size, vs));
+
+ /*
+ * The only other thing to check is that the size we have in the page
+ * matches the size on the underlying overflow page.
+ */
+ if (ovfl->size != dsk->u.datalen) {
+ __wt_api_db_errx(db,
+ "overflow page reference in item %lu on page at addr %lu "
+ "does not match the data size on the overflow page",
+ (u_long)entry_num, (u_long)page_ref_addr);
+ ret = WT_ERROR;
+ }
+
+err: __wt_scr_release(&scratch1);
+
+ return (ret);
+}
+
+/*
+ * __wt_verify_dsk_ovfl --
+ * Verify a WT_PAGE_OVFL disk page.
+ */
+static int
+__wt_verify_dsk_ovfl(
+ WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ DB *db;
+ uint32_t len;
+ uint8_t *p;
+
+ db = toc->db;
+
+ if (dsk->u.datalen == 0) {
+ __wt_api_db_errx(db,
+ "overflow page at addr %lu has no data", (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ /* Any page data after the overflow record should be nul bytes. */
+ p = (uint8_t *)dsk + (sizeof(WT_PAGE_DISK) + dsk->u.datalen);
+ len = size - (sizeof(WT_PAGE_DISK) + dsk->u.datalen);
+ for (; len > 0; ++p, --len)
+ if (*p != '\0') {
+ __wt_api_db_errx(db,
+ "overflow page at addr %lu has non-zero trailing "
+ "bytes",
+ (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_eop --
+ * Generic item extends past the end-of-page error.
+ */
+static int
+__wt_verify_eop(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu extends past the end of the page",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_eof --
+ * Generic item references non-existent file pages error.
+ */
+static int
+__wt_verify_eof(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "off-page item %lu on page at addr %lu references non-existent "
+ "file pages",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_delfmt --
+ * WT_PAGE_COL_FIX and WT_PAGE_COL_RLE error where a deleted item has
+ * non-nul bytes.
+ */
+static int
+__wt_verify_delfmt(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "deleted fixed-length entry %lu on page at addr %lu has non-nul "
+ "bytes",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_addfrag --
+ * Add the WT_PAGE's fragments to the list, and complain if we've already
+ * verified this chunk of the file.
+ */
+static int
+__wt_verify_addfrag(WT_TOC *toc, uint32_t addr, uint32_t size, WT_VSTUFF *vs)
+{
+ DB *db;
+ uint32_t frags, i;
+
+ db = toc->db;
+
+ frags = WT_OFF_TO_ADDR(db, size);
+ for (i = 0; i < frags; ++i)
+ if (bit_test(vs->fragbits, addr + i)) {
+ __wt_api_db_errx(db,
+ "page fragment at addr %lu already verified",
+ (u_long)addr);
+ return (0);
+ }
+ bit_nset(vs->fragbits, addr, addr + (frags - 1));
+ return (0);
+}
+
+/*
+ * __wt_verify_checkfrag --
+ * Verify we've checked all the fragments in the file.
+ */
+static int
+__wt_verify_checkfrag(DB *db, WT_VSTUFF *vs)
+{
+ int ffc, ffc_start, ffc_end, frags, ret;
+
+ frags = (int)vs->frags; /* XXX: bitstring.h wants "ints" */
+ ret = 0;
+
+ /* Check for page fragments we haven't verified. */
+ for (ffc_start = ffc_end = -1;;) {
+ bit_ffc(vs->fragbits, frags, &ffc);
+ if (ffc != -1) {
+ bit_set(vs->fragbits, ffc);
+ if (ffc_start == -1) {
+ ffc_start = ffc_end = ffc;
+ continue;
+ }
+ if (ffc_end == ffc - 1) {
+ ffc_end = ffc;
+ continue;
+ }
+ }
+ if (ffc_start != -1) {
+ if (ffc_start == ffc_end)
+ __wt_api_db_errx(db,
+ "fragment %d was never verified",
+ ffc_start);
+ else
+ __wt_api_db_errx(db,
+ "fragments %d to %d were never verified",
+ ffc_start, ffc_end);
+ ret = WT_ERROR;
+ }
+ ffc_start = ffc_end = ffc;
+ if (ffc == -1)
+ break;
+ }
+ return (ret);
+}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
new file mode 100644
index 00000000000..f5ef9674f9b
--- /dev/null
+++ b/src/btree/bt_walk.c
@@ -0,0 +1,306 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There are two tree-walk implementations: a textbook, depth-first recursive
+ * tree walk in __wt_tree_walk(), and a non-recursive, depth-first tree walk
+ * in __wt_walk_{begin,end,next}().
+ *
+ * The simple recursive walk is sufficient in most cases -- a hazard reference
+ * is obtained on each page in turn, a worker function is called on the page,
+ * then the hazard reference is released.
+ *
+ * The complicated tree walk routine was added because the cache eviction code
+ * needs:
+ * + to walk the tree a few pages at a time, that is, periodically wake,
+ * visit some pages, then go back to sleep, which requires enough state
+ * to restart the traversal at any point,
+ * + to only visit pages that currently appear in the cache,
+ * + to return the WT_REF structure (not the WT_PAGE structure),
+ * + to walk files not associated with the current WT_TOC's DB handle,
+ * + and finally, it doesn't require a hazard reference.
+ *
+ * My guess is we'll generalize a more complicated walk at some point, which
+ * means some or all of those behaviors will become configurable, and that's
+ * why the code lives here instead of in the eviction code.
+ */
+
+/*
+ * __wt_tree_walk --
+ * Depth-first recursive walk of a btree, calling a worker function on
+ * each page.
+ */
+int
+__wt_tree_walk(WT_TOC *toc, WT_REF *ref,
+ uint32_t flags, int (*work)(WT_TOC *, WT_PAGE *, void *), void *arg)
+{
+ IDB *idb;
+ WT_COL *cip;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+
+ WT_ENV_FCHK(
+ toc->env, "__wt_tree_walk", flags, WT_APIMASK_BT_TREE_WALK);
+
+ idb = toc->db->idb;
+
+ /*
+ * A NULL WT_REF means to start at the top of the tree -- it's just
+ * a convenience.
+ */
+ page = ref == NULL ? idb->root_page.page : ref->page;
+
+ /*
+ * Walk any internal pages, descending through any off-page references.
+ *
+ * Descending into row-store off-page duplicate trees is optional for
+ * two reasons. (1) it may be faster to call this function recursively
+ * from the worker function, which is already walking the page, and (2)
+ * information for off-page dup trees is split (the key is on the
+ * row-leaf page, and the data is obviously in the off-page dup tree):
+ * we need the key when we dump the data, and that would be a hard
+ * special case in this code. Functions where it's both possible and
+ * no slower to walk off-page dupliate trees in this code can request
+ * it be done here.
+ */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_INT:
+ WT_INDX_FOREACH(page, cip, i) {
+ /* cip references the subtree containing the record */
+ ref = WT_COL_REF(page, cip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_COL_OFF(cip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_INDX_FOREACH(page, rip, i) {
+ /* rip references the subtree containing the record */
+ ref = WT_ROW_REF(page, rip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (!LF_ISSET(WT_WALK_OFFDUP))
+ break;
+ WT_INDX_FOREACH(page, rip, i) {
+ if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF)
+ break;
+
+ /*
+ * Recursively call the tree-walk function for the
+ * off-page duplicate tree.
+ */
+ ref = WT_ROW_REF(page, rip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Don't call the worker function for any page until all of its children
+ * have been visited. This allows the walker function to be used for
+ * the sync method, where reconciling a modified child page modifies the
+ * parent.
+ */
+ WT_RET(work(toc, page, arg));
+
+ return (0);
+}
+
+/*
+ * __wt_walk_begin --
+ * Start a tree walk.
+ */
+int
+__wt_walk_begin(WT_TOC *toc, WT_REF *ref, WT_WALK *walk)
+{
+ ENV *env;
+
+ env = toc->env;
+
+ /*
+ * The caller may be restarting a walk, so the structure may already
+ * be allocated. Allocate 20 slots: it's always going to be enough.
+ */
+ if (walk->tree_len == 0)
+ WT_RET(__wt_realloc(env, &walk->tree_len,
+ 20 * sizeof(WT_WALK_ENTRY), &walk->tree));
+ walk->tree_slot = 0;
+
+ walk->tree[0].ref = ref;
+ walk->tree[0].indx = 0;
+ walk->tree[0].visited = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_walk_end --
+ * End a tree walk.
+ */
+void
+__wt_walk_end(ENV *env, WT_WALK *walk)
+{
+ __wt_free(env, walk->tree, walk->tree_len);
+}
+
+/*
+ * __wt_walk_next --
+ * Return the next WT_REF/WT_PAGE in the tree, in a non-recursive way.
+ */
+int
+__wt_walk_next(WT_TOC *toc, WT_WALK *walk, WT_REF **refp)
+{
+ DB *db;
+ ENV *env;
+ WT_PAGE *page, *child;
+ WT_REF *ref;
+ WT_WALK_ENTRY *e;
+ uint elem;
+
+ env = toc->env;
+ db = toc->db;
+
+ e = &walk->tree[walk->tree_slot];
+ page = e->ref->page;
+
+ /*
+ * Coming into this function we have either a tree internal page (and
+ * we're walking the array of children), or a row-leaf page (and we're
+ * walking the array of off-page duplicate trees).
+ *
+ * If we've reached the end of this page, and haven't yet returned it,
+ * do that now. If the page has been returned, traversal is finished:
+ * pop the stack and call ourselve recursively, unless the entire tree
+ * has been traversed, in which case we return NULL.
+ */
+ if (e->visited) {
+ if (walk->tree_slot == 0) {
+ *refp = NULL;
+ return (0);
+ } else {
+ --walk->tree_slot;
+ return (__wt_walk_next(toc, walk, refp));
+ }
+ } else
+ if (e->indx == page->indx_count) {
+eop: e->visited = 1;
+ *refp = e->ref;
+ return (0);
+ }
+
+ /* Find the next WT_REF/WT_PAGE pair present in the cache. */
+ for (;;) {
+ switch (page->dsk->type) {
+ case WT_PAGE_ROW_LEAF:
+ ref = page->u3.dup[e->indx];
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ ref = &page->u3.ref[e->indx];
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+ * The row-leaf page off-page duplicates tree array has empty
+ * slots (unlike col/row internal pages), so check for a NULL
+ * ref.
+ *
+ * We only care about pages in the cache.
+ */
+ if (ref != NULL && ref->state == WT_OK)
+ break;
+
+ /*
+ * If we don't find another WT_REF/WT_OFF pair, do the
+ * post-order visit.
+ */
+ if (++e->indx == page->indx_count)
+ goto eop;
+ }
+
+ /*
+ * Check to see if the page has sub-trees associated with it, in which
+ * case we traverse those pages.
+ */
+ child = ref->page;
+ switch (child->dsk->type) {
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Check for off-page duplicates -- if there are any, push them
+ * onto the stack and recursively call ourselves to descend the
+ * tree.
+ */
+ if (!WT_PAGE_DUP_TREES(child))
+ break;
+ /* FALLTHROUGH */
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * The page has children.
+ *
+ * First, move past this child, then push the child onto our
+ * stack, and recursively descend the tree.
+ */
+ ++e->indx;
+
+ /* Check to see if we grew past the end of our stack. */
+ elem = walk->tree_len / sizeof(WT_WALK_ENTRY);
+ if (walk->tree_slot >= elem)
+ WT_RET(__wt_realloc(env, &walk->tree_len,
+ (elem + 20) * sizeof(WT_WALK_ENTRY), &walk->tree));
+
+ e = &walk->tree[++walk->tree_slot];
+ e->ref = ref;
+ e->indx = 0;
+ e->visited = 0;
+ return (__wt_walk_next(toc, walk, refp));
+ default:
+ break;
+ }
+
+ /* Return the child page, it's not interesting for further traversal. */
+ ++e->indx;
+ *refp = ref;
+ return (0);
+}
diff --git a/src/btree/c_drain.c b/src/btree/c_drain.c
new file mode 100644
index 00000000000..c213f652e75
--- /dev/null
+++ b/src/btree/c_drain.c
@@ -0,0 +1,940 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_evict(WT_TOC *);
+static int __wt_evict_compare_lru(const void *a, const void *b);
+static int __wt_evict_compare_page(const void *a, const void *b);
+static void __wt_evict_hazard_check(WT_TOC *);
+static int __wt_evict_hazard_compare(const void *a, const void *b);
+static void __wt_evict_page(WT_TOC *, int);
+static int __wt_evict_page_subtrees(WT_PAGE *);
+static void __wt_evict_set(WT_TOC *);
+static void __wt_evict_state_check(WT_TOC *);
+static int __wt_evict_walk(WT_TOC *);
+static int __wt_evict_walk_single(WT_TOC *, IDB *, uint);
+static void __wt_evict_write(WT_TOC *);
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_evict_hazard_validate(ENV *, WT_PAGE *);
+#endif
+
+/*
+ * Tuning constants -- I hesitate to call this tuning, but we should review some
+ * number of pages from each file's in-memory tree for each page we evict, and
+ * we should amortize the comparison of the hazard references across some number
+ * of eviction candidates.
+ */
+#define WT_EVICT_GROUP 10 /* Evict N pages at a time */
+#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */
+
+/*
+ * WT_EVICT_FOREACH --
+ * Walk a list of eviction candidates.
+ */
+#define WT_EVICT_FOREACH(cache, p, i) \
+ for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p))
+
+/*
+ * WT_EVICT_REF_CLR --
+ * Clear an eviction list entry.
+ */
+#define WT_EVICT_CLR(p) do { \
+ (p)->ref = NULL; \
+ (p)->idb = WT_DEBUG_POINT; \
+} while (0)
+
+/*
+ * __wt_workq_evict_server --
+ * See if the eviction server thread needs to be awakened.
+ */
+void
+__wt_workq_evict_server(ENV *env, int force)
+{
+ WT_CACHE *cache;
+ uint64_t bytes_inuse, bytes_max;
+
+ cache = env->ienv->cache;
+
+ /* If the eviction server is running, there's nothing to do. */
+ if (!cache->evict_sleeping)
+ return;
+
+ /*
+ * If we're locking out reads, or over our cache limit, or forcing the
+ * issue (when closing the environment), run the eviction server.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (!force && !cache->read_lockout && bytes_inuse < bytes_max)
+ return;
+
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "waking eviction server: force %sset, read lockout %sset, "
+ "bytes inuse %s max (%lluMB %s %lluMB), ",
+ force ? "" : "not ", cache->read_lockout ? "" : "not ",
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ (unsigned long long)(bytes_inuse / WT_MEGABYTE),
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ (unsigned long long)(bytes_max / WT_MEGABYTE)));
+
+ cache->evict_sleeping = 0;
+ __wt_unlock(env, cache->mtx_evict);
+}
+
+/*
+ * __wt_cache_evict_server --
+ * Thread to evict pages from the cache.
+ */
+void *
+__wt_cache_evict_server(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_TOC *toc;
+ uint64_t bytes_inuse, bytes_max;
+ int ret;
+
+ env = arg;
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ /* We need a thread of control because we're reading/writing pages. */
+ toc = NULL;
+ WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc));
+
+ /*
+ * Allocate memory for a copy of the hazard references -- it's a fixed
+ * size so doesn't need run-time adjustments.
+ */
+ cache->hazard_elem = env->toc_size * env->hazard_size;
+ WT_ERR(__wt_calloc(
+ env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard));
+ cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *);
+
+ for (;;) {
+ WT_VERBOSE(env,
+ WT_VERB_EVICT, (env, "eviction server sleeping"));
+ cache->evict_sleeping = 1;
+ __wt_lock(env, cache->mtx_evict);
+ WT_VERBOSE(env,
+ WT_VERB_EVICT, (env, "eviction server waking"));
+
+ /*
+ * Check for environment exit; do it here, instead of the top of
+ * the loop because doing it here keeps us from doing a bunch of
+ * worked when simply awakened to quit.
+ */
+ if (!F_ISSET(ienv, WT_SERVER_RUN))
+ break;
+
+ for (;;) {
+ /*
+ * The cache eviction server is a long-running thread;
+ * its TOC must "enter" and "leave" the library
+ * periodically in order to be a good thread citizen.
+ */
+ WT_TOC_GEN_SET(toc);
+
+ /* Single-thread reconciliation. */
+ __wt_lock(env, cache->mtx_reconcile);
+ ret = __wt_evict(toc);
+ __wt_unlock(env, cache->mtx_reconcile);
+ if (ret != 0)
+ goto err;
+
+ WT_TOC_GEN_CLR(toc);
+
+ /*
+ * If we've locked out reads, keep evicting until we
+ * get to at least 5% under the maximum cache. Else,
+ * quit evicting as soon as we get under the maximum
+ * cache.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (cache->read_lockout) {
+ if (bytes_inuse <= bytes_max - (bytes_max / 20))
+ break;
+ } else if (bytes_inuse < bytes_max)
+ break;
+ }
+ }
+
+err: if (cache->evict != NULL)
+ __wt_free(env, cache->evict, cache->evict_len);
+ if (cache->hazard != NULL)
+ __wt_free(env, cache->hazard, cache->hazard_len);
+ if (toc != NULL)
+ WT_TRET(toc->close(toc, 0));
+
+ if (ret != 0)
+ __wt_api_env_err(env, ret, "cache eviction server error");
+
+ WT_VERBOSE(
+ env, WT_VERB_EVICT, (env, "cache eviction server exiting"));
+
+ return (NULL);
+}
+
+/*
+ * __wt_evict --
+ * Evict pages from the cache.
+ */
+static int
+__wt_evict(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint elem, i, j;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /* Get some more pages to consider for eviction. */
+ WT_RET(__wt_evict_walk(toc));
+
+ /*
+ * We have an array of page eviction references that may contain NULLs,
+ * as well as duplicate entries.
+ *
+ * First, sort the array by WT_REF address, then delete any duplicates.
+ * The reason is because we might evict the page but leave a duplicate
+ * entry in the "saved" area of the array, and that would be a NULL
+ * dereference on the next run. (If someone ever tries to remove this
+ * duplicate cleanup for better performance, you can't fix it just by
+ * checking the WT_REF state -- that only works if you are discarding
+ * a page from a single level of the tree; if you are discarding a
+ * page and its parent, the duplicate of the page's WT_REF might have
+ * been free'd before a subsequent review of the eviction array.)
+ */
+ evict = cache->evict;
+ elem = cache->evict_elem;
+ qsort(evict,
+ (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
+ for (i = 0; i < elem; i = j)
+ for (j = i + 1; j < elem; ++j) {
+ /*
+ * If the leading pointer hits a NULL, we're done, the
+ * NULLs all sorted to the top of the array.
+ */
+ if (evict[j].ref == NULL)
+ goto done_duplicates;
+
+ /* Delete the second and any subsequent duplicates. */
+ if (evict[i].ref == evict[j].ref)
+ WT_EVICT_CLR(&evict[j]);
+ else
+ break;
+ }
+done_duplicates:
+
+ /* Second, sort the array by LRU. */
+ qsort(evict,
+ (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru);
+
+ /*
+ * Discarding pages is done in 5 steps:
+ * Set the WT_EVICT state
+ * Check for any hazard references
+ * Discard clean pages
+ * Reconcile dirty pages (making them clean)
+ * Discard clean pages
+ *
+ * The reason we release clean pages, then reconcile dirty pages, then
+ * release clean pages again is because reconciling a dirty page is a
+ * slow operation, and this releases space sooner. (Arguably, we are
+ * going to discard all of the pages anyway, so what does it matter if
+ * we make clean pages wait for the dirty page writes? On the other
+ * hand, it's a small change and benefits any thread waiting to read a
+ * clean page we picked for discarding, unlikely though that may be.)
+ */
+ __wt_evict_set(toc);
+ __wt_evict_hazard_check(toc);
+ __wt_evict_state_check(toc);
+ __wt_evict_page(toc, 0);
+ __wt_evict_write(toc);
+ __wt_evict_page(toc, 1);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_walk --
+ * Fill in the eviction array by walking the next set of pages from
+ * each open file.
+ */
+static int
+__wt_evict_walk(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ uint elem, i;
+ int ret;
+
+ env = toc->env;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Resize the array in which we're tracking pages, as necessary, then
+ * get some pages from each underlying file. We hold a mutex for the
+ * entire time -- it's slow, but (1) how often do new files get added
+ * or removed to/from the system, and (2) it's all in-memory stuff, so
+ * it's not that slow.
+ */
+ ret = 0;
+ __wt_lock(env, ienv->mtx);
+ /* Reserve WT_EVICT_WALK_PER_TABLE slots for each open file. */
+ elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE);
+ if (elem <= cache->evict_elem || (ret = __wt_realloc(env,
+ &cache->evict_len,
+ elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) {
+ cache->evict_elem = elem;
+
+ /* Fill each file's block of slots; stop on the first error. */
+ i = WT_EVICT_WALK_BASE;
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0)
+ break;
+ i += WT_EVICT_WALK_PER_TABLE;
+ }
+ }
+ __wt_unlock(env, ienv->mtx);
+ return (ret);
+}
+
+/*
+ * __wt_evict_walk_single --
+ * Get a few page eviction candidates from a single underlying file,
+ * filling slots [slot, slot + WT_EVICT_WALK_PER_TABLE) of the array.
+ */
+static int
+__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot)
+{
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ int i, restarted_once;
+
+ cache = toc->env->ienv->cache;
+
+ /*
+ * Tricky little loop that restarts the walk as necessary, without
+ * resetting the count of pages retrieved.
+ */
+ i = restarted_once = 0;
+
+ /*
+ * If we haven't yet opened a tree-walk structure, do so. (The restart
+ * label deliberately re-enters here from inside the loop below,
+ * bypassing the NULL check, so the walk begins again from the root.)
+ */
+ if (idb->evict_walk.tree == NULL)
+restart: WT_RET(
+ __wt_bt_walk_begin(toc, &idb->root_page, &idb->evict_walk));
+
+ /* Get the next WT_EVICT_WALK_PER_TABLE entries. */
+ do {
+ evict = &cache->evict[slot];
+ WT_RET(__wt_bt_walk_next(toc, &idb->evict_walk, &evict->ref));
+
+ /*
+ * Restart the walk as necessary, but only once (after one
+ * restart we've already acquired all of the pages, and we
+ * could loop infinitely on a tree with a single, pinned, page).
+ */
+ if (evict->ref == NULL) {
+ if (restarted_once++)
+ break;
+ goto restart;
+ }
+
+ evict->idb = idb;
+ ++slot;
+ } while (++i < WT_EVICT_WALK_PER_TABLE);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_db_clear --
+ * Remove any entries for a file from the eviction list.
+ */
+void
+__wt_evict_db_clear(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint i;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Discard any entries in the eviction list to a file we're closing
+ * (the caller better have locked out the eviction thread).
+ */
+ if (cache->evict == NULL)
+ return;
+ /* Clear every slot that references the closing file's IDB. */
+ WT_EVICT_FOREACH(cache, evict, i)
+ if (evict->ref != NULL && evict->idb == idb)
+ WT_EVICT_CLR(evict);
+}
+
+/*
+ * __wt_evict_set --
+ * Set the WT_EVICT state on every page currently in the eviction list,
+ * shutting out new readers.
+ */
+static void
+__wt_evict_set(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * Set the entry state so readers don't try and use the pages. Once
+ * that's done, any thread searching for a page will either see our
+ * state value, or will have already set a hazard reference to the page.
+ * We don't evict a page with a hazard reference set, so we can't race.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ ref->state = WT_EVICT;
+ }
+}
+
+/*
+ * __wt_evict_hazard_check --
+ * Compare the list of hazard references to the list of pages to be
+ * discarded; any page found on the hazard list is removed from the
+ * eviction list and returned to service.
+ */
+static void
+__wt_evict_hazard_check(WT_TOC *toc)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE **hazard, **end_hazard, *page;
+ WT_REF *ref;
+ WT_STATS *stats;
+ uint i;
+
+ env = toc->env;
+ ienv = env->ienv;
+ cache = ienv->cache;
+ stats = cache->stats;
+
+ /* Sort the eviction candidates by WT_PAGE address. */
+ qsort(cache->evict, (size_t)WT_EVICT_GROUP,
+ sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
+
+ /* Copy the hazard reference array and sort it by WT_PAGE address. */
+ hazard = cache->hazard;
+ end_hazard = hazard + cache->hazard_elem;
+ memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *));
+ qsort(hazard, (size_t)cache->hazard_elem,
+ sizeof(WT_PAGE *), __wt_evict_hazard_compare);
+
+ /*
+ * Walk the lists in parallel and look for matches: both lists are
+ * sorted ascending by page address, so the hazard cursor only ever
+ * moves forward across eviction entries.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+
+ /*
+ * Look for the page in the hazard list until we reach the end
+ * of the list or find a hazard pointer larger than the page.
+ */
+ for (page = ref->page;
+ hazard < end_hazard && *hazard < page; ++hazard)
+ ;
+ /* Hazard list exhausted: no later entry can match either. */
+ if (hazard == end_hazard)
+ break;
+
+ /*
+ * If we find a matching hazard reference, the page is in use:
+ * remove it from the eviction list.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ if (*hazard == page) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (hazard reference)",
+ page->addr));
+ WT_STAT_INCR(stats, CACHE_EVICT_HAZARD);
+
+ /*
+ * A page with a low LRU and a hazard reference?
+ *
+ * Set the page's LRU so we don't select it again.
+ * Return the page to service.
+ * Discard our reference.
+ */
+ ref->page->read_gen = ++cache->read_gen;
+ ref->state = WT_OK;
+ WT_EVICT_CLR(evict);
+ }
+ }
+}
+
+/*
+ * __wt_evict_state_check --
+ * Confirm these are pages we want to evict: drop pinned pages and
+ * pages with in-memory subtrees from the eviction list.
+ */
+static void
+__wt_evict_state_check(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * We "own" the pages (we've flagged them for eviction, and there were
+ * no hazard references). Now do checks to see if these are pages we
+ * can evict -- we have to wait until after we own the page because the
+ * page might be updated and race with us.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /* Ignore pinned pages. */
+ if (F_ISSET(page, WT_PINNED)) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (pinned)",
+ page->addr));
+ goto skip;
+ }
+
+ /* Ignore pages with in-memory subtrees. */
+ switch (page->hdr->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if (__wt_evict_page_subtrees(page)) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (subtrees)",
+ page->addr));
+ goto skip;
+ }
+ break;
+ default:
+ break;
+ }
+
+ continue;
+
+skip: /*
+ * Set the page's LRU so we don't select it again.
+ * Return the page to service.
+ * Discard our reference.
+ */
+ page->read_gen = ++cache->read_gen;
+ ref->state = WT_OK;
+ WT_EVICT_CLR(evict);
+ }
+}
+
+/*
+ * __wt_evict_write --
+ * Write any modified pages on the eviction list, making them clean.
+ */
+static void
+__wt_evict_write(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /* Skip clean pages; only modified pages need writing. */
+ if (!WT_PAGE_IS_MODIFIED(page))
+ continue;
+
+ /*
+ * We're using our WT_TOC handle, it needs to reference the
+ * correct DB handle.
+ *
+ * XXX
+ * This is pretty sleazy, but I'm hesitant to try and drive
+ * a separate DB/IDB handle down through the reconciliation
+ * code.
+ */
+ toc->db = evict->idb->db;
+ (void)__wt_bt_rec_page(toc, page);
+ }
+}
+
+/*
+ * __wt_evict_page --
+ * Evict cache pages: discard every clean page on the eviction list,
+ * releasing its cache space and freeing its memory.
+ */
+static void
+__wt_evict_page(WT_TOC *toc, int was_dirty)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ WT_STATS *stats;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /*
+ * The first time we're called, we get rid of the clean pages;
+ * the second time we're called, we get rid of the pages that
+ * were dirty but have since been cleaned. Ignore dirty pages
+ * in all cases, it's simpler.
+ */
+ if (WT_PAGE_IS_MODIFIED(page))
+ continue;
+
+ /* was_dirty only selects which statistic is incremented. */
+ if (was_dirty)
+ WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED);
+ else
+ WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED);
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_evict_hazard_validate(env, page);
+#endif
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "cache evicting page addr %lu", page->addr));
+
+ /*
+ * Copy a page reference, then make the cache entry available
+ * for re-use.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ ref->page = NULL;
+ ref->state = WT_EMPTY;
+
+ /* Remove the entry from the eviction list. */
+ WT_EVICT_CLR(evict);
+
+ /* We've got more space. */
+ WT_CACHE_PAGE_OUT(cache, page->size);
+
+ /* The page can no longer be found, free the memory. */
+ __wt_bt_page_discard(toc, page);
+ }
+}
+
+/*
+ * __wt_evict_page_subtrees --
+ * Return non-zero if a page has an in-memory subtree (i.e., any child
+ * reference whose state is not WT_EMPTY).
+ */
+static int
+__wt_evict_page_subtrees(WT_PAGE *page)
+{
+ WT_REF *ref, **dupp;
+ uint32_t i;
+
+ /*
+ * Return if a page has an in-memory subtree -- this array search could
+ * be replaced by a reference count in the page, but (1) the eviction
+ * thread isn't where I expect performance problems, (2) I hate to lose
+ * more bytes on every page, (3) how often will an internal page be
+ * evicted anyway?
+ */
+ switch (page->hdr->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_REF_FOREACH(page, ref, i)
+ if (ref->state != WT_EMPTY)
+ return (1);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /* Row-store leaves may root off-page duplicate trees. */
+ if (WT_PAGE_DUP_TREES(page))
+ WT_DUP_FOREACH(page, dupp, i)
+ if (*dupp != NULL && (*dupp)->state != WT_EMPTY)
+ return (1);
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_evict_compare_page --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's address.
+ */
+static int
+__wt_evict_compare_page(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ WT_PAGE *a_page, *b_page;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the page address in ascending order. */
+ a_page = a_ref->page;
+ b_page = b_ref->page;
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+/*
+ * __wt_evict_compare_lru --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's read
+ * generation (lowest read generation -- least recently used -- first).
+ */
+static int
+__wt_evict_compare_lru(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ uint32_t a_lru, b_lru;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the LRU in ascending order. */
+ a_lru = a_ref->page->read_gen;
+ b_lru = b_ref->page->read_gen;
+ return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0));
+}
+
+/*
+ * __wt_evict_hazard_compare --
+ * Qsort function: sort hazard list based on the page's address
+ * (elements are WT_PAGE pointers, compared directly).
+ */
+static int
+__wt_evict_hazard_compare(const void *a, const void *b)
+{
+ WT_PAGE *a_page, *b_page;
+
+ a_page = *(WT_PAGE **)a;
+ b_page = *(WT_PAGE **)b;
+
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_evict_hazard_validate --
+ * Diagnostic check: abort the process if the page appears on any
+ * thread's hazard list (a page being evicted must have none).
+ */
+static void
+__wt_evict_hazard_validate(ENV *env, WT_PAGE *page)
+{
+ IENV *ienv;
+ WT_PAGE **hp;
+ WT_TOC **tp, *toc;
+
+ ienv = env->ienv;
+
+ /* Scan every WT_TOC's fixed-size hazard slot array. */
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp)
+ for (hp = toc->hazard;
+ hp < toc->hazard + toc->env->hazard_size; ++hp)
+ if (*hp == page) {
+ __wt_api_env_errx(env,
+ "hazard eviction check for page %lu "
+ "failed",
+ (u_long)page->addr);
+ __wt_abort(env);
+ }
+}
+
+/*
+ * __wt_evict_dump --
+ * Display the eviction list (diagnostic only; skips empty slots).
+ */
+void
+__wt_evict_dump(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_MBUF mb;
+ uint n;
+ int sep;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ __wt_mb_init(env, &mb);
+ __wt_mb_add(&mb, "eviction list");
+
+ /* ':' before the first address, ',' before the rest. */
+ for (sep = ':', n = 0; n < cache->evict_elem; ++n) {
+ evict = &cache->evict[n];
+ if (evict->ref == NULL)
+ continue;
+ __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr);
+ sep = ',';
+ }
+ __wt_mb_discard(&mb);
+}
+
+/*
+ * __wt_evict_cache_dump --
+ * Dump the in-memory cache, one tree per open file.
+ */
+int
+__wt_evict_cache_dump(WT_TOC *toc)
+{
+ IDB *idb;
+ IENV *ienv;
+
+ ienv = toc->env->ienv;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_RET(__wt_evict_tree_dump(toc, idb));
+ return (0);
+}
+
+/*
+ * __wt_evict_tree_dump --
+ * Dump an in-memory tree: cache usage summary plus the address of
+ * every in-memory page.
+ */
+int
+__wt_evict_tree_dump(WT_TOC *toc, IDB *idb)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_REF *ref;
+ WT_WALK walk;
+ WT_MBUF mb;
+ int sep;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)",
+ idb->name,
+ __wt_cache_pages_inuse(cache),
+ __wt_cache_bytes_inuse(cache),
+ WT_STAT(cache->stats, CACHE_BYTES_MAX)));
+
+ __wt_mb_init(env, &mb);
+ __wt_mb_add(&mb, "in-memory page list");
+
+ /* Walk the tree from the root, listing each page's address. */
+ WT_CLEAR(walk);
+ WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk));
+ for (sep = ':';;) {
+ WT_RET(__wt_bt_walk_next(toc, &walk, &ref));
+ if (ref == NULL)
+ break;
+ __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr);
+ sep = ',';
+ }
+ __wt_bt_walk_end(env, &walk);
+ __wt_mb_discard(&mb);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_cache_count --
+ * Return the count of nodes in the cache, summed over all open files.
+ */
+int
+__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp)
+{
+ IDB *idb;
+ IENV *ienv;
+ uint64_t nodes;
+
+ ienv = toc->env->ienv;
+
+ *nodesp = 0;
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ WT_RET(__wt_evict_tree_count(toc, idb, &nodes));
+ *nodesp += nodes;
+ }
+ return (0);
+}
+
+/*
+ * __wt_evict_tree_count --
+ * Return a count of in-memory nodes in the tree, via a full walk.
+ */
+int
+__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_REF *ref;
+ WT_WALK walk;
+ uint64_t nodes;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_CLEAR(walk);
+ WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk));
+ for (nodes = 0;;) {
+ WT_RET(__wt_bt_walk_next(toc, &walk, &ref));
+ if (ref == NULL)
+ break;
+ ++nodes;
+ }
+ *nodesp = nodes;
+ __wt_bt_walk_end(env, &walk);
+
+ return (0);
+}
+#endif
diff --git a/src/btree/c_init.c b/src/btree/c_init.c
new file mode 100644
index 00000000000..641f90d9a56
--- /dev/null
+++ b/src/btree/c_init.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache: allocate the WT_CACHE structure, its
+ * mutexes and statistics, and set the configured maximum cache size.
+ */
+int
+__wt_cache_create(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache));
+ cache = ienv->cache;
+
+ WT_ERR(
+ __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict));
+ WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read));
+ WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile));
+
+ WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats));
+
+ /* env->cache_size is configured in megabytes. */
+ WT_STAT_SET(
+ cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE);
+
+ return (0);
+
+ /* On any allocation failure, tear down whatever was built. */
+err: (void)__wt_cache_destroy(env);
+ return (ret);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use (pages-in minus pages-out, clamped
+ * at zero).
+ */
+inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ uint64_t pages_in, pages_out;
+
+ /*
+ * Reading 64-bit fields, potentially on 32-bit machines, and other
+ * threads of control may be modifying them. Check them for sanity
+ * (although "interesting" corruption is vanishingly unlikely, these
+ * values just increment over time).
+ */
+ pages_in = cache->stat_pages_in;
+ pages_out = cache->stat_pages_out;
+ return (pages_in > pages_out ? pages_in - pages_out : 0);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use (bytes-in minus bytes-out, clamped
+ * at zero).
+ */
+inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ uint64_t bytes_in, bytes_out;
+
+ /*
+ * Reading 64-bit fields, potentially on 32-bit machines, and other
+ * threads of control may be modifying them. Check them for sanity
+ * (although "interesting" corruption is vanishingly unlikely, these
+ * values just increment over time).
+ */
+ bytes_in = cache->stat_bytes_in;
+ bytes_out = cache->stat_bytes_out;
+ return (bytes_in > bytes_out ? bytes_in - bytes_out : 0);
+}
+
+/*
+ * __wt_cache_stats --
+ * Update the cache statistics for return to the application (snapshot
+ * of current bytes/pages in use).
+ */
+void
+__wt_cache_stats(ENV *env)
+{
+ WT_CACHE *cache;
+ WT_STATS *stats;
+
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache));
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache: mutexes, statistics, and the WT_CACHE
+ * structure itself. Safe to call on a partially-created cache.
+ */
+int
+__wt_cache_destroy(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ if (cache == NULL)
+ return (0);
+
+ /* Discard mutexes. */
+ if (cache->mtx_evict != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_evict);
+ if (cache->mtx_read != NULL)
+ __wt_mtx_destroy(env, cache->mtx_read);
+ if (cache->mtx_reconcile != NULL)
+ __wt_mtx_destroy(env, cache->mtx_reconcile);
+
+ /* Discard allocated memory, and clear. */
+ __wt_free(env, cache->stats, 0);
+ __wt_free(env, ienv->cache, sizeof(WT_CACHE));
+
+ return (ret);
+}
diff --git a/src/btree/c_page.c b/src/btree/c_page.c
new file mode 100644
index 00000000000..cd71c0b4ebf
--- /dev/null
+++ b/src/btree/c_page.c
@@ -0,0 +1,69 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_read --
+ * Read a database page (same as read, but verify the checksum).
+ */
+int
+__wt_page_read(DB *db, WT_PAGE *page)
+{
+ ENV *env;
+ WT_FH *fh;
+ WT_PAGE_HDR *hdr;
+ off_t offset;
+ uint32_t checksum;
+
+ env = db->env;
+ fh = db->idb->fh;
+ hdr = page->hdr;
+
+ offset = WT_ADDR_TO_OFF(db, page->addr);
+ WT_RET(__wt_read(env, fh, offset, page->size, hdr));
+
+ /*
+ * The checksum was computed with the header's checksum field zeroed
+ * (see __wt_page_write); save and zero it before re-computing.
+ */
+ checksum = hdr->checksum;
+ hdr->checksum = 0;
+ if (checksum != __wt_cksum(hdr, page->size)) {
+ __wt_api_env_errx(env,
+ "read checksum error: addr/size %lu/%lu at offset %llu",
+ (u_long)page->addr,
+ (u_long)page->size, (unsigned long long)offset);
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_page_write --
+ * Write a database page, computing its checksum with the checksum
+ * field itself zeroed.
+ */
+int
+__wt_page_write(WT_TOC *toc, WT_PAGE *page)
+{
+ DB *db;
+ ENV *env;
+ WT_FH *fh;
+ WT_PAGE_HDR *hdr;
+
+ db = toc->db;
+ env = toc->env;
+ fh = db->idb->fh;
+
+ /* Diagnostic builds verify the on-disk format before writing. */
+ WT_ASSERT(env, __wt_bt_verify_dsk_page(toc, page) == 0);
+
+ hdr = page->hdr;
+ hdr->checksum = 0;
+ hdr->checksum = __wt_cksum(hdr, page->size);
+
+ return (__wt_write(
+ env, fh, WT_ADDR_TO_OFF(db, page->addr), page->size, hdr));
+}
diff --git a/src/btree/c_read.c b/src/btree/c_read.c
new file mode 100644
index 00000000000..1578b5ee642
--- /dev/null
+++ b/src/btree/c_read.c
@@ -0,0 +1,273 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_cache_read(WT_READ_REQ *);
+
+/*
+ * __wt_workq_read_server --
+ * See if the read server thread needs to be awakened, maintaining the
+ * read-lockout hysteresis (lock out above 110% of max, release below
+ * 95% of max).
+ */
+void
+__wt_workq_read_server(ENV *env, int force)
+{
+ WT_CACHE *cache;
+ uint64_t bytes_inuse, bytes_max;
+
+ cache = env->ienv->cache;
+
+ /*
+ * If we're 10% over the maximum cache, shut out reads (which include
+ * page allocations) until we evict to at least 5% under the maximum
+ * cache. The idea is that we don't want to run on the edge all the
+ * time -- if we're seriously out of space, get things under control
+ * before opening up for more reads.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (cache->read_lockout) {
+ if (bytes_inuse <= bytes_max - (bytes_max / 20))
+ cache->read_lockout = 0;
+ } else if (bytes_inuse > bytes_max + (bytes_max / 10)) {
+ WT_VERBOSE(env, WT_VERB_READ, (env,
+ "workQ locks out reads: bytes-inuse %llu of bytes-max %llu",
+ (unsigned long long)bytes_inuse,
+ (unsigned long long)bytes_max));
+ cache->read_lockout = 1;
+ }
+
+ /* If the cache read server is running, there's nothing to do. */
+ if (!cache->read_sleeping)
+ return;
+
+ /*
+ * If reads are locked out and we're not forcing the issue (that's when
+ * closing the environment, or if there's a priority read waiting to be
+ * handled), we're done.
+ */
+ if (!force && cache->read_lockout)
+ return;
+
+ /* Wake the read server by releasing its mutex. */
+ cache->read_sleeping = 0;
+ __wt_unlock(env, cache->mtx_read);
+}
+
+/*
+ * __wt_cache_read_serial_func --
+ * Read/allocation serialization function called when a page-in requires
+ * allocation or a read; queues the request in the first empty slot of
+ * the read-request table.
+ */
+int
+__wt_cache_read_serial_func(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_OFF *off;
+ WT_PAGE *parent;
+ WT_READ_REQ *rr, *rr_end;
+ WT_REF *ref;
+ int dsk_verify;
+
+ __wt_cache_read_unpack(toc, parent, ref, off, dsk_verify);
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /* Find an empty slot and enter the read request. */
+ rr = cache->read_request;
+ rr_end = rr + WT_ELEMENTS(cache->read_request);
+ for (; rr < rr_end; ++rr)
+ if (WT_READ_REQ_ISEMPTY(rr)) {
+ WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify);
+ return (0);
+ }
+ /* Table full: the caller must retry the operation. */
+ __wt_api_env_errx(env, "read server request table full");
+ return (WT_RESTART);
+}
+
+/*
+ * __wt_cache_read_server --
+ * Thread to do database reads: sleeps on mtx_read until awakened,
+ * then drains the read-request table.
+ */
+void *
+__wt_cache_read_server(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_READ_REQ *rr, *rr_end;
+ WT_TOC *toc;
+ int didwork, ret;
+
+ env = arg;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Initialize ret: if the server is awakened only to exit, the final
+ * "if (ret != 0)" check would otherwise read an uninitialized value.
+ */
+ ret = 0;
+
+ rr = cache->read_request;
+ rr_end = rr + WT_ELEMENTS(cache->read_request);
+
+ for (;;) {
+ WT_VERBOSE(env,
+ WT_VERB_READ, (env, "cache read server sleeping"));
+ cache->read_sleeping = 1;
+ __wt_lock(env, cache->mtx_read);
+ WT_VERBOSE(
+ env, WT_VERB_READ, (env, "cache read server waking"));
+
+ /*
+ * Check for environment exit; do it here, instead of the top of
+ * the loop because doing it here keeps us from doing a bunch of
+ * work when simply awakened to quit.
+ */
+ if (!F_ISSET(ienv, WT_SERVER_RUN))
+ break;
+
+ /*
+ * Walk the read-request queue, looking for reads (defined by
+ * a valid WT_TOC handle). If we find a read request, perform
+ * it, flush the result and clear the request slot, then wake
+ * up the requesting thread. The request slot clear doesn't
+ * need to be flushed, but we have to flush the read result,
+ * might as well include it. If we don't find any work, go to
+ * sleep.
+ */
+ do {
+ didwork = 0;
+ for (rr = cache->read_request; rr < rr_end; ++rr) {
+ if ((toc = rr->toc) == NULL)
+ continue;
+ if (cache->read_lockout &&
+ !F_ISSET(toc, WT_READ_PRIORITY))
+ continue;
+
+ /*
+ * The read server thread does both general file
+ * allocation and cache page instantiation. In
+ * a file allocation, there's no pagep field in
+ * in which to return a page.
+ */
+ ret = __wt_cache_read(rr);
+
+ WT_READ_REQ_CLR(rr);
+ __wt_toc_serialize_wrapup(toc, NULL, ret);
+
+ didwork = 1;
+
+ /*
+ * Any error terminates the request; a serious
+ * error causes the read server to exit.
+ */
+ if (ret != 0) {
+ if (ret != WT_RESTART)
+ goto err;
+ ret = 0;
+ }
+ }
+ } while (didwork);
+ }
+
+ if (ret != 0)
+err: __wt_api_env_err(env, ret, "cache read server error");
+
+ WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting"));
+ return (NULL);
+}
+
+/*
+ * __wt_cache_read --
+ * Read a page from the file and instantiate its in-memory form,
+ * leaving the WT_REF in the WT_OK state on success.
+ */
+static int
+__wt_cache_read(WT_READ_REQ *rr)
+{
+ DB *db;
+ ENV *env;
+ WT_CACHE *cache;
+ WT_FH *fh;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_REF *ref;
+ WT_TOC *toc;
+ uint32_t addr, size;
+ int ret;
+
+ toc = rr->toc;
+ ref = rr->ref;
+ off = rr->off;
+ addr = off->addr;
+ size = off->size;
+
+ db = toc->db;
+ env = toc->env;
+ cache = env->ienv->cache;
+ fh = db->idb->fh;
+ ret = 0;
+
+ /*
+ * Check to see if some other thread brought the page into the cache
+ * while our request was in the queue. If the state is anything
+ * other than empty, it's not our problem.
+ */
+ if (ref->state != WT_EMPTY)
+ return (0);
+
+ /*
+ * The page isn't in the cache, and since we're the only path for the
+ * page to get into the cache, we don't have to worry further, and
+ * we might as well get to it.
+ *
+ * Allocate memory for the in-memory page information and for the page
+ * itself. They're two separate allocation calls so we (hopefully) get
+ * better alignment from the underlying heap memory allocator.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page));
+ WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &page->hdr));
+
+ /* Read the page. */
+ WT_VERBOSE(env, WT_VERB_READ,
+ (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size));
+ WT_STAT_INCR(cache->stats, PAGE_READ);
+
+ page->addr = addr;
+ page->size = size;
+ WT_ERR(__wt_page_read(db, page));
+ /*
+ * NOTE(review): the bytes are counted in here, but the error path
+ * below frees the page without a matching WT_CACHE_PAGE_OUT --
+ * presumably a small accounting leak on verify/inmem failure;
+ * confirm against the cache accounting macros.
+ */
+ WT_CACHE_PAGE_IN(cache, size);
+
+ /* If the page needs to be verified, that's next. */
+ if (rr->dsk_verify)
+ WT_ERR(__wt_bt_verify_dsk_page(toc, page));
+
+ /* Build the in-memory version of the page. */
+ WT_ERR(__wt_bt_page_inmem(toc, page));
+
+ /*
+ * Reference the parent's WT_PAGE and parent's WT_OFF structure that
+ * read the page.
+ */
+ page->parent = rr->parent;
+ page->parent_off = off;
+
+ /*
+ * The page is now available -- set the LRU so the page is not selected
+ * for eviction.
+ */
+ page->read_gen = ++cache->read_gen;
+ ref->page = page;
+ ref->state = WT_OK;
+
+ return (0);
+
+err: if (page != NULL) {
+ if (page->hdr != NULL)
+ __wt_free(env, page->hdr, size);
+ __wt_free(env, page, sizeof(WT_PAGE));
+ }
+ return (ret);
+}
diff --git a/src/btree/col_get.c b/src/btree/col_get.c
new file mode 100644
index 00000000000..7ab2f242a35
--- /dev/null
+++ b/src/btree/col_get.c
@@ -0,0 +1,40 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_col_get --
+ * Db.col_get method: retrieve a record by record number from a
+ * column-store database.
+ */
+int
+__wt_db_col_get(WT_TOC *toc, uint64_t recno, DBT *data)
+{
+ DB *db;
+ IDB *idb;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+
+ /* Record-number access only applies to column stores. */
+ if (!F_ISSET(idb, WT_COLUMN)) {
+ __wt_api_db_errx(db,
+ "row database records cannot be retrieved by record "
+ "number");
+ return (WT_ERROR);
+ }
+
+ WT_ERR(__wt_col_search(toc, recno, WT_NOLEVEL, 0));
+ ret = __wt_dbt_return(toc, NULL, data, 0);
+
+ /*
+ * NOTE(review): if the search fails, toc->srch_page may still be NULL
+ * here (it's cleared at the start of __wt_col_search) -- presumably
+ * __wt_hazard_clear tolerates NULL; confirm.
+ */
+err: if (toc->srch_page != idb->root_page.page)
+ __wt_hazard_clear(toc, toc->srch_page);
+ return (ret);
+}
diff --git a/src/btree/col_put.c b/src/btree/col_put.c
new file mode 100644
index 00000000000..e7e76778fe3
--- /dev/null
+++ b/src/btree/col_put.c
@@ -0,0 +1,229 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_col_update(WT_TOC *, uint64_t, DBT *, int);
+
+/*
+ * __wt_db_col_del --
+ * Db.col_del method: delete a record (a delete is an update with NULL
+ * data and no overwrite flag).
+ */
+inline int
+__wt_db_col_del(WT_TOC *toc, uint64_t recno)
+{
+ return (__wt_col_update(toc, recno, NULL, 0));
+}
+
+/*
+ * __wt_db_col_put --
+ * Db.put method: store a record, enforcing the fixed record length when
+ * the database is configured with one.
+ */
+inline int
+__wt_db_col_put(WT_TOC *toc, uint64_t recno, DBT *data)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /* Fixed-length stores require every record be exactly that size. */
+ if (db->fixed_len != 0 && data->size != db->fixed_len)
+ WT_RET(__wt_database_wrong_fixed_size(toc, data->size));
+
+ return (__wt_col_update(toc, recno, data, 1));
+}
+
+/*
+ * __wt_col_update --
+ * Column store delete and update: data == NULL deletes the record,
+ * otherwise the record is replaced with the new data.
+ */
+static int
+__wt_col_update(WT_TOC *toc, uint64_t recno, DBT *data, int data_overwrite)
+{
+ DB *db;
+ ENV *env;
+ WT_PAGE *page;
+ WT_RLE_EXPAND *exp, **new_rleexp;
+ WT_REPL **new_repl, *repl;
+ int ret;
+
+ env = toc->env;
+ db = toc->db;
+
+ page = NULL;
+ exp = NULL;
+ new_rleexp = NULL;
+ new_repl = NULL;
+ repl = NULL;
+
+ /* Search the btree for the key. */
+ WT_RET(__wt_col_search(
+ toc, recno, WT_NOLEVEL, data_overwrite ? WT_DATA_OVERWRITE : 0));
+ page = toc->srch_page;
+
+ /*
+ * Run-length encoded (RLE) column store operations are hard because
+ * each original on-disk index for an RLE can represent large numbers
+ * of records, and we're only deleting a single one of those records,
+ * which means working in the WT_RLE_EXPAND array. All other column
+ * store deletes are simple changes where a new WT_REPL entry is added
+ * to the page's modification array. There are three code paths:
+ *
+ * 1: column store deletes other than RLE column stores: delete an entry
+ * from the on-disk page by creating a new WT_REPL entry, and linking it
+ * into the WT_REPL array.
+ *
+ * 2: an RLE column store delete of an already modified record: create
+ * a new WT_REPL entry, and link it to the WT_RLE_EXPAND entry's WT_REPL
+ * list.
+ *
+ * 3: an RLE column store delete of a record not yet modified: create
+ * a new WT_RLE_EXPAND/WT_REPL pair, and link it into the WT_RLE_EXPAND
+ * array.
+ */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_FIX: /* #1 */
+ case WT_PAGE_COL_VAR:
+ /* Allocate a page replacement array if necessary. */
+ if (page->u2.repl == NULL)
+ WT_ERR(__wt_calloc(env,
+ page->indx_count, sizeof(WT_REPL *), &new_repl));
+
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* workQ: schedule insert of the WT_REPL structure. */
+ __wt_item_update_serial(toc, page, toc->srch_write_gen,
+ WT_COL_SLOT(page, toc->srch_ip), new_repl, repl, ret);
+ break;
+ case WT_PAGE_COL_RLE:
+ if (toc->srch_repl != NULL) { /* #2 */
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* workQ: schedule insert of the WT_REPL structure. */
+ __wt_rle_expand_repl_serial(toc, page,
+ toc->srch_write_gen, toc->srch_exp, repl, ret);
+ break;
+ }
+ /* #3 */
+ /* Allocate a page expansion array as necessary. */
+ if (page->u2.rleexp == NULL)
+ WT_ERR(__wt_calloc(env, page->indx_count,
+ sizeof(WT_RLE_EXPAND *), &new_rleexp));
+
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* Allocate a WT_RLE_EXPAND structure and fill it in. */
+ WT_ERR(__wt_calloc(env, 1, sizeof(WT_RLE_EXPAND), &exp));
+ exp->recno = recno;
+ exp->repl = repl;
+
+ /* Schedule the workQ to link in the WT_RLE_EXPAND structure. */
+ __wt_rle_expand_serial(toc, page, toc->srch_write_gen,
+ WT_COL_SLOT(page, toc->srch_ip), new_rleexp, exp, ret);
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ if (ret != 0) {
+err: if (exp != NULL)
+ __wt_free(env, exp, sizeof(WT_RLE_EXPAND));
+ if (repl != NULL)
+ __wt_repl_free(toc, repl);
+ }
+
+ /* Free any allocated page expansion array unless the workQ used it. */
+ if (new_rleexp != NULL && new_rleexp != page->u2.rleexp)
+ __wt_free(env,
+ new_rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *));
+
+ /* Free any page replacement array unless the workQ used it. */
+ if (new_repl != NULL && new_repl != page->u2.repl)
+ __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *));
+
+ WT_PAGE_OUT(toc, page);
+
+ /*
+ * Return the operation's result: the original "return (0)" silently
+ * discarded allocation and serialization failures that reached err.
+ */
+ return (ret);
+}
+
+/*
+ * __wt_rle_expand_serial_func --
+ * Server function to expand a run-length encoded column store during a
+ * delete: links a new WT_RLE_EXPAND entry into the page's expansion
+ * array.
+ */
+int
+__wt_rle_expand_serial_func(WT_TOC *toc)
+{
+ WT_PAGE *page;
+ WT_RLE_EXPAND **new_rleexp, *exp;
+ uint32_t slot, write_gen;
+ int ret;
+
+ ret = 0;
+
+ __wt_rle_expand_unpack(toc, page, write_gen, slot, new_rleexp, exp);
+
+ /* Check the page's write-generation. */
+ WT_ERR(__wt_page_write_gen_check(page, write_gen));
+
+ /*
+ * If the page does not yet have an expansion array, our caller passed
+ * us one of the correct size. (It's the caller's responsibility to
+ * detect & free the passed-in expansion array if we don't use it.)
+ */
+ if (page->u2.rleexp == NULL)
+ page->u2.rleexp = new_rleexp;
+
+ /*
+ * Insert the new WT_RLE_EXPAND as the first item in the forward-linked
+ * list of expansion structures. Flush memory to ensure the list is
+ * never broken.
+ */
+ exp->next = page->u2.rleexp[slot];
+ WT_MEMORY_FLUSH;
+ page->u2.rleexp[slot] = exp;
+
+err: __wt_toc_serialize_wrapup(toc, page, ret);
+ return (0);
+}
+
+/*
+ * __wt_rle_expand_repl_serial_func --
+ * Server function to update a WT_REPL entry in an already expanded
+ * run-length encoded column store during a delete: prepends a new
+ * WT_REPL to the WT_RLE_EXPAND entry's replacement list.
+ */
+int
+__wt_rle_expand_repl_serial_func(WT_TOC *toc)
+{
+ WT_PAGE *page;
+ WT_RLE_EXPAND *exp;
+ WT_REPL *repl;
+ uint32_t write_gen;
+ int ret;
+
+ ret = 0;
+
+ __wt_rle_expand_repl_unpack(toc, page, write_gen, exp, repl);
+
+ /* Check the page's write-generation. */
+ WT_ERR(__wt_page_write_gen_check(page, write_gen));
+
+ /*
+ * Insert the new WT_REPL as the first item in the forward-linked list
+ * of replacement structures from the WT_RLE_EXPAND structure. Flush
+ * memory to ensure the list is never broken.
+ */
+ repl->next = exp->repl;
+ WT_MEMORY_FLUSH;
+ exp->repl = repl;
+
+err: __wt_toc_serialize_wrapup(toc, page, ret);
+ return (0);
+}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
new file mode 100644
index 00000000000..81c24e3d54f
--- /dev/null
+++ b/src/btree/col_srch.c
@@ -0,0 +1,211 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_col_search --
 *	Search a column-store tree for a specific record-based key.
 *
 *	On success, results are returned in the WT_TOC srch_* fields: the
 *	pinned page, the matching WT_COL slot, any WT_REPL/WT_RLE_EXPAND
 *	modification found, and the page's write generation at read time.
 */
int
__wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags)
{
	DB *db;
	IDB *idb;
	WT_COL *cip;
	WT_OFF *off;
	WT_PAGE *page;
	WT_PAGE_DISK *dsk;
	WT_RLE_EXPAND *exp;
	WT_REF *ref;
	WT_REPL *repl;
	uint64_t record_cnt;
	uint32_t i, write_gen;
	int ret;

	toc->srch_page = NULL;			/* Return values. */
	toc->srch_ip = NULL;
	toc->srch_repl = NULL;
	toc->srch_exp = NULL;
	toc->srch_write_gen = 0;

	db = toc->db;
	idb = db->idb;

	WT_DB_FCHK(db, "__wt_col_search", flags, WT_APIMASK_BT_SEARCH_COL);

	/* Check for a record past the end of the database. */
	page = idb->root_page.page;
	if (page->records < recno)
		return (WT_NOTFOUND);

	/* Search the tree. */
	for (;;) {
		/* Save the write generation value before the read. */
		write_gen = page->write_gen;

		/* Walk the page looking for the record. */
		dsk = page->dsk;
		switch (dsk->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			/* Fixed-count leaf entries: index directly by recno. */
			cip = page->u.icol + (recno - dsk->start_recno);
			goto done;
		case WT_PAGE_COL_RLE:
			/*
			 * Walk the page, counting records -- do the record
			 * count calculation in a funny way to avoid overflow.
			 */
			record_cnt = recno - dsk->start_recno;
			WT_INDX_FOREACH(page, cip, i) {
				if (record_cnt < WT_RLE_REPEAT_COUNT(cip->data))
					break;
				record_cnt -= WT_RLE_REPEAT_COUNT(cip->data);
			}
			goto done;
		case WT_PAGE_COL_INT:
		default:
			/*
			 * Walk the page, counting records -- do the record
			 * count calculation in a funny way to avoid overflow.
			 */
			record_cnt = recno - dsk->start_recno;
			WT_INDX_FOREACH(page, cip, i) {
				if (record_cnt < WT_COL_OFF_RECORDS(cip))
					break;
				record_cnt -= WT_COL_OFF_RECORDS(cip);
			}
			break;
		}

		/* If a level was set, see if we found the asked-for page. */
		if (level == dsk->level)
			goto done;

		/* cip references the subtree containing the record. */
		ref = WT_COL_REF(page, cip);
		off = WT_COL_OFF(cip);
		WT_ERR(__wt_page_in(toc, page, ref, off, 0));

		/* Swap the parent page for the child page. */
		if (page != idb->root_page.page)
			__wt_hazard_clear(toc, page);
		page = ref->page;
	}

done:	/*
	 * We've found the right on-page WT_COL structure, but that's only the
	 * first step; the record may have been updated since reading the page
	 * into the cache.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		/* Find the item's WT_REPL slot if it exists. */
		repl = WT_COL_REPL(page, cip);

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			toc->srch_repl = repl;
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * or in the original data.
		 */
		if (repl != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				goto notfound;
			toc->srch_repl = repl;
		} else
			if (WT_FIX_DELETE_ISSET(cip->data))
				goto notfound;
		break;
	case WT_PAGE_COL_RLE:
		/* Find the item's WT_COL_EXP slot if it exists. */
		for (exp =
		    WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next)
			if (exp->recno == recno)
				break;

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			if (exp != NULL) {
				toc->srch_exp = exp;
				toc->srch_repl = exp->repl;
			}
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * (referenced by the WT_COL_EXP slot), or in the original data.
		 */
		if (exp != NULL) {
			if (WT_REPL_DELETED_ISSET(exp->repl))
				goto notfound;
			toc->srch_exp = exp;
			toc->srch_repl = exp->repl;
		} else
			if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
				goto notfound;
		break;
	case WT_PAGE_COL_VAR:
		/* Find the item's WT_REPL slot if it exists. */
		repl = WT_COL_REPL(page, cip);

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			toc->srch_repl = repl;
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * or in the original data.
		 */
		if (repl != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				goto notfound;
			toc->srch_repl = repl;
			break;
		} else
			if (WT_ITEM_TYPE(cip->data) == WT_ITEM_DEL)
				goto notfound;
		break;
	case WT_PAGE_COL_INT:
		/*
		 * When returning internal pages, set the item's WT_REPL slot
		 * if it exists, otherwise we're done.
		 */
		toc->srch_repl = WT_COL_REPL(page, cip);
		break;
	WT_ILLEGAL_FORMAT(db);
	}

	toc->srch_page = page;
	toc->srch_ip = cip;
	toc->srch_write_gen = write_gen;
	return (0);

notfound:
	ret = WT_NOTFOUND;

	/* ret was set above or by WT_ERR; release the page reference. */
err:	WT_PAGE_OUT(toc, page);
	return (ret);
}
diff --git a/src/btree/row_get.c b/src/btree/row_get.c
new file mode 100644
index 00000000000..03f2cce44bc
--- /dev/null
+++ b/src/btree/row_get.c
@@ -0,0 +1,61 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_row_get --
+ * Db.row_get method.
+ */
+int
+__wt_db_row_get(WT_TOC *toc, DBT *key, DBT *data)
+{
+ DB *db;
+ IDB *idb;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint32_t type;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+ page = NULL;
+
+ /* Search the btree for the key. */
+ WT_ERR(__wt_row_search(toc, key, WT_NOLEVEL, 0));
+ page = toc->srch_page;
+ rip = toc->srch_ip;
+
+ /*
+ * The Db.get method can only return single key/data pairs.
+ * If that's not what we found, we're done.
+ *
+ * XXX
+ * Checking if page_data is NULL isn't the right thing to do
+ * here. Re-visit this when we figure out how we handle
+ * dup inserts into the tree. Maybe pass NO-DUP flag into the
+ * search function?
+ */
+ if (rip->data != NULL) {
+ type = WT_ITEM_TYPE(rip->data);
+ if (type != WT_ITEM_DATA && type != WT_ITEM_DATA_OVFL) {
+ __wt_api_db_errx(db,
+ "the Db.get method cannot return keys with "
+ "duplicate data items; use the Db.cursor method "
+ "instead");
+ ret = WT_ERROR;
+ goto err;
+ }
+ }
+ ret = __wt_dbt_return(toc, key, data, 0);
+
+err: if (page != idb->root_page.page)
+ __wt_hazard_clear(toc, page);
+ return (ret);
+}
diff --git a/src/btree/row_put.c b/src/btree/row_put.c
new file mode 100644
index 00000000000..3ac4304ccec
--- /dev/null
+++ b/src/btree/row_put.c
@@ -0,0 +1,288 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_row_update(WT_TOC *, DBT *, DBT *, int);
+
+/*
+ * __wt_db_row_del --
+ * Db.row_del method.
+ */
+inline int
+__wt_db_row_del(WT_TOC *toc, DBT *key)
+{
+ return (__wt_row_update(toc, key, NULL, 0));
+}
+
+/*
+ * __wt_db_row_put --
+ * Db.row_put method.
+ */
+inline int
+__wt_db_row_put(WT_TOC *toc, DBT *key, DBT *data)
+{
+ return (__wt_row_update(toc, key, data, 1));
+}
+
+/*
+ * __wt_row_update --
+ * Row store delete and update.
+ */
+static int
+__wt_row_update(WT_TOC *toc, DBT *key, DBT *data, int insert)
+{
+ ENV *env;
+ WT_PAGE *page;
+ WT_REPL **new_repl, *repl;
+ int ret;
+
+ env = toc->env;
+ new_repl = NULL;
+ repl = NULL;
+
+ /* Search the btree for the key. */
+ WT_RET(__wt_row_search(toc, key, WT_NOLEVEL, insert ? WT_INSERT : 0));
+ page = toc->srch_page;
+
+ /* Allocate a page replacement array as necessary. */
+ if (page->u2.repl == NULL)
+ WT_ERR(__wt_calloc(
+ env, page->indx_count, sizeof(WT_REPL *), &new_repl));
+
+ /* Allocate room for the new data item from per-thread memory. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* Schedule the workQ to insert the WT_REPL structure. */
+ __wt_item_update_serial(toc, page, toc->srch_write_gen,
+ WT_ROW_SLOT(page, toc->srch_ip), new_repl, repl, ret);
+
+ if (ret != 0) {
+err: if (repl != NULL)
+ __wt_repl_free(toc, repl);
+ }
+
+ /* Free any replacement array unless the workQ used it. */
+ if (new_repl != NULL && new_repl != page->u2.repl)
+ __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *));
+
+ WT_PAGE_OUT(toc, page);
+
+ return (0);
+}
+
/*
 * __wt_item_update_serial_func --
 *	Server function to update a WT_REPL entry in the modification array.
 */
int
__wt_item_update_serial_func(WT_TOC *toc)
{
	WT_PAGE *page;
	WT_REPL **new_repl, *repl;
	uint32_t slot, write_gen;
	int ret;

	/* Unpack the arguments the requesting thread packed into the TOC. */
	__wt_item_update_unpack(toc, page, write_gen, slot, new_repl, repl);

	ret = 0;

	/* Check the page's write-generation. */
	WT_ERR(__wt_page_write_gen_check(page, write_gen));

	/*
	 * If the page does not yet have a replacement array, our caller passed
	 * us one of the correct size.   (It's the caller's responsibility to
	 * detect & free the passed-in expansion array if we don't use it.)
	 */
	if (page->u2.repl == NULL)
		page->u2.repl = new_repl;

	/*
	 * Insert the new WT_REPL as the first item in the forward-linked list
	 * of replacement structures.  Flush memory to ensure the list is never
	 * broken.
	 */
	repl->next = page->u2.repl[slot];
	WT_MEMORY_FLUSH;
	page->u2.repl[slot] = repl;

	/* The requesting thread retrieves ret via the wrapup call, not here. */
err:	__wt_toc_serialize_wrapup(toc, page, ret);
	return (0);
}
+
/*
 * __wt_repl_alloc --
 *	Allocate a WT_REPL structure and associated data from the TOC's update
 *	memory, and fill it in.  A NULL data DBT marks the entry deleted.
 */
int
__wt_repl_alloc(WT_TOC *toc, WT_REPL **replp, DBT *data)
{
	DB *db;
	ENV *env;
	WT_REPL *repl;
	WT_TOC_UPDATE *update;
	uint32_t align_size, alloc_size, size;
	int single_use;

	env = toc->env;
	db = toc->db;

	/*
	 * Allocate memory for a data insert or change; there's a buffer in the
	 * WT_TOC structure for allocation of chunks of memory to hold changed
	 * or inserted data items.
	 *
	 * We align each allocation because we directly access WT_REPL structure
	 * fields in the memory (the x86 handles unaligned accesses, but I don't
	 * want to have to find and fix this code for a port to a system that
	 * doesn't handle unaligned accesses).  It wastes space, but this memory
	 * is never written to disk and there are fewer concerns about memory
	 * than with on-disk structures.  Any other code allocating memory from
	 * this buffer needs to align its allocations as well.
	 *
	 * The first thing in each chunk of memory is WT_TOC_UPDATE structure
	 * (which we check is a multiple of 4B during initialization); then
	 * there are one or more WT_REPL structure plus data chunk pairs.
	 *
	 * XXX
	 * Figure out how much space we need: this code limits the maximum size
	 * of a data item stored in the database.  In summary, for a big item we
	 * have to store a WT_TOC_UPDATE structure, the WT_REPL structure and
	 * the data, all in an allocated buffer.   We only pass a 32-bit value
	 * to our allocation routine, so we can't store an item bigger than the
	 * maximum 32-bit value minus the sizes of those two structures, where
	 * the WT_REPL structure and data item are aligned to a 32-bit boundary.
	 * We could fix this, but it's unclear it's worth the effort -- document
	 * you can store a (4GB - 20B) item max, and you're done, because it's
	 * insane to store a 4GB item in the database anyway.
	 *
	 * Check first we won't overflow when calculating an aligned size, then
	 * check the total required space for this item.
	 */
	size = data == NULL ? 0 : data->size;
	if (UINT32_MAX - size < sizeof(WT_REPL) + sizeof(uint32_t))
		return (__wt_database_item_too_big(db));
	align_size = WT_ALIGN(size + sizeof(WT_REPL), sizeof(uint32_t));
	if (UINT32_MAX - align_size < sizeof(WT_TOC_UPDATE))
		return (__wt_database_item_too_big(db));

	/*
	 * If we already have a buffer and the data fits, just copy the WT_REPL
	 * structure and data into place, we're done.
	 */
	update = toc->update;
	if (update != NULL && align_size <= update->space_avail)
		goto no_allocation;

	/*
	 * Decide how much memory to allocate: if it's a one-off (that is, the
	 * data is bigger than anything we'll aggregate into these buffers, it's
	 * a one-off.  Otherwise, allocate the next power-of-two larger than 4
	 * times the requested size, and at least the default buffer size.
	 *
	 * XXX
	 * I have no reason for the 4x the request size, I just hate to allocate
	 * a buffer for every change to the database.  A better approach would
	 * be to grow the allocation buffer as the thread makes more changes; if
	 * a thread is doing lots of work, give it lots of memory, otherwise
	 * only allocate as it's necessary.
	 */
	if (align_size > env->data_update_max) {
		alloc_size = sizeof(WT_TOC_UPDATE) + align_size;
		single_use = 1;
	} else {
		alloc_size = __wt_nlpo2(
		    WT_MAX(align_size * 4, env->data_update_initial));
		single_use = 0;
	}
	WT_RET(__wt_calloc(env, 1, alloc_size, &update));

	/* Initialize the buffer: the usable region follows the header. */
	update->len = alloc_size;
	update->space_avail = alloc_size - sizeof(WT_TOC_UPDATE);
	update->first_free = (uint8_t *)update + sizeof(WT_TOC_UPDATE);

	/*
	 * If it's a single use allocation, ignore any current update buffer.
	 * Else, release the old update buffer and replace it with the new one.
	 */
	if (!single_use) {
		/*
		 * The "in" reference count is artificially incremented by 1 as
		 * long as an update buffer is referenced by the WT_TOC thread;
		 * we don't want them freed because a page was evicted and the
		 * count went to 0.  Decrement the reference count on the buffer
		 * as part of releasing it.  There's a similar reference count
		 * decrement when the WT_TOC structure is discarded.
		 *
		 * XXX
		 * There's a race here: if this code, or the WT_TOC structure
		 * close code, and the page discard code race, it's possible
		 * neither will realize the buffer is no longer needed and free
		 * it.  The fix is to involve the eviction or workQ threads:
		 * they may need a linked list of buffers they review to ensure
		 * it never happens.  I'm living with this now: it's unlikely
		 * and it's a memory leak if it ever happens.
		 */
		if (toc->update != NULL)
			--toc->update->in;
		toc->update = update;

		update->in = 1;
	}

no_allocation:
	/* Copy the WT_REPL structure into place. */
	repl = (WT_REPL *)update->first_free;
	repl->update = update;
	if (data == NULL)
		WT_REPL_DELETED_SET(repl);
	else {
		repl->size = data->size;
		memcpy(WT_REPL_DATA(repl), data->data, data->size);
	}

	/* Consume the aligned chunk and count the new entry in the buffer. */
	update->first_free += align_size;
	update->space_avail -= align_size;
	++update->in;

	*replp = repl;
	return (0);
}
+
/*
 * __wt_repl_free --
 *	Free a WT_REPL structure and associated data from the TOC's update
 *	memory.  Called only when the workQ never linked the WT_REPL into a
 *	page (see __wt_row_update's error path).
 */
void
__wt_repl_free(WT_TOC *toc, WT_REPL *repl)
{
	ENV *env;

	env = toc->env;

	/*
	 * It's possible we allocated a WT_REPL structure and associated item
	 * memory from the WT_TOC update buffer, but then an error occurred.
	 * Don't try and clean up the update buffer, it's simpler to decrement
	 * the use count and let the page discard code deal with it during the
	 * page reconciliation process.  (Note we're still in the allocation
	 * path, so we decrement the "in" field, not the "out" field.)
	 */
	--repl->update->in;

	/*
	 * One other thing: if the update buffer was a one-off, we have to free
	 * it here, it's not linked to any WT_PAGE in the system.
	 */
	if (repl->update->in == 0)
		__wt_free(env, repl->update, repl->update->len);
}
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
new file mode 100644
index 00000000000..a8ff78dc380
--- /dev/null
+++ b/src/btree/row_srch.c
@@ -0,0 +1,196 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_key_build(WT_TOC *, WT_PAGE *, WT_ROW *);
+
+/*
+ * __wt_row_search --
+ * Search a row-store tree for a specific key.
+ */
+int
+__wt_row_search(WT_TOC *toc, DBT *key, uint32_t level, uint32_t flags)
+{
+ DB *db;
+ IDB *idb;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_REF *ref;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ uint32_t base, indx, limit, write_gen;
+ int cmp, isleaf, ret;
+
+ toc->srch_page = NULL; /* Return values. */
+ toc->srch_ip = NULL;
+ toc->srch_repl = NULL;
+ toc->srch_exp = NULL;
+ toc->srch_write_gen = 0;
+
+ db = toc->db;
+ idb = db->idb;
+
+ WT_DB_FCHK(db, "__wt_row_search", flags, WT_APIMASK_BT_SEARCH_KEY_ROW);
+
+ /* Search the tree. */
+ for (page = idb->root_page.page;;) {
+ /* Copy the write generation value before the read. */
+ write_gen = page->write_gen;
+
+ dsk = page->dsk;
+ isleaf =
+ dsk->type == WT_PAGE_DUP_LEAF ||
+ dsk->type == WT_PAGE_ROW_LEAF;
+ for (base = 0,
+ limit = page->indx_count; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ /*
+ * If the key is compressed or an overflow, it may not
+ * have been instantiated yet.
+ */
+ rip = page->u.irow + indx;
+ if (__wt_key_process(rip))
+ WT_ERR(__wt_key_build(toc, page, rip));
+
+ /*
+ * If we're about to compare an application key with the
+ * 0th index on an internal page, pretend the 0th index
+ * sorts less than any application key. This test is so
+ * we don't have to update internal pages if the
+ * application stores a new, "smallest" key in the tree.
+ *
+ * For the record, we still maintain the key at the 0th
+ * location because it means tree verification and other
+ * code that processes a level of the tree doesn't need
+ * to know about this hack.
+ */
+ if (indx != 0 || isleaf) {
+ cmp = db->btree_compare(db, key, (DBT *)rip);
+ if (cmp == 0)
+ break;
+ if (cmp < 0)
+ continue;
+ }
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+ * Reference the slot used for next step down the tree. We do
+ * this on leaf pages too, because it's simpler to code, and we
+ * only care if there's an exact match on leaf pages; setting
+ * rip doesn't matter for leaf pages because we always return
+ * WT_NOTFOUND if there's no match.
+ *
+ * Base is the smallest index greater than key and may be the
+ * 0th index or the (last + 1) indx. If base is not the 0th
+ * index (remember, the 0th index always sorts less than any
+ * application key), decrement it to the smallest index less
+ * than or equal to key.
+ */
+ if (cmp != 0)
+ rip = page->u.irow + (base == 0 ? 0 : base - 1);
+
+ /*
+ * If we've reached the leaf page, or we've reached the level
+ * requested by our caller, we're done.
+ */
+ if (isleaf || level == dsk->level)
+ break;
+
+ /* rip references the subtree containing the record. */
+ ref = WT_ROW_REF(page, rip);
+ off = WT_ROW_OFF(rip);
+ WT_ERR(__wt_page_in(toc, page, ref, off, 0));
+
+ /* Swap the parent page for the child page. */
+ if (page != idb->root_page.page)
+ __wt_hazard_clear(toc, page);
+ page = ref->page;
+ }
+
+ /*
+ * We've got the right on-page WT_ROW structure (an exact match in the
+ * case of a lookup, or the smallest key on the page less than or equal
+ * to the specified key in the case of an insert). If it's an insert,
+ * we're done, return the information. Otherwise, check to see if the
+ * item was modified/deleted.
+ */
+ switch (dsk->type) {
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ if (LF_ISSET(WT_INSERT))
+ break;
+ if (cmp != 0) /* No match */
+ goto notfound;
+ /* Deleted match. */
+ if ((repl = WT_ROW_REPL(page, rip)) != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ goto notfound;
+ toc->srch_repl = repl;
+ }
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * When returning internal pages, set the item's WT_REPL slot
+ * if it exists, otherwise we're done.
+ */
+ toc->srch_repl = WT_ROW_REPL(page, rip);
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ toc->srch_page = page;
+ toc->srch_ip = rip;
+ toc->srch_write_gen = write_gen;
+ return (0);
+
+notfound:
+ ret = WT_NOTFOUND;
+
+err: WT_PAGE_OUT(toc, page);
+ return (ret);
+}
+
/*
 * __wt_key_build --
 *	Instantiate an overflow or compressed key into a WT_ROW structure.
 */
static int
__wt_key_build(WT_TOC *toc, WT_PAGE *page, WT_ROW *rip_arg)
{
	DBT *dbt, _dbt;
	WT_ROW *rip;
	WT_ITEM *item;
	uint32_t i;

	WT_CLEAR(_dbt);
	dbt = &_dbt;

	/* Decode (decompress or read in) the on-page key. */
	item = rip_arg->key;
	WT_RET(__wt_item_process(toc, item, dbt));
	/*
	 * NOTE(review): dbt->data is assumed to reference memory that stays
	 * valid for the life of the in-memory page -- confirm against
	 * __wt_item_process's allocation contract.
	 */

	/*
	 * Update the WT_ROW reference with the processed key.  If there are
	 * any duplicates of this item, update them as well.
	 */
	__wt_key_set(rip_arg, dbt->data, dbt->size);
	if (WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP ||
	    WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP_OVFL) {
		/* Duplicates share the key; patch every slot referencing it. */
		WT_INDX_FOREACH(page, rip, i)
			if (rip->key == item)
				__wt_key_set(rip, dbt->data, dbt->size);
	}

	return (0);
}
diff --git a/src/db/db_err.c b/src/db/db_err.c
new file mode 100644
index 00000000000..1ba46e06a69
--- /dev/null
+++ b/src/db/db_err.c
@@ -0,0 +1,64 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * WT_DB_ERR --
 *	Report an error on behalf of a DB handle: route the message to the
 *	application's error callback (if configured), then to the error
 *	stream.  The varargs list is started twice because it is consumed
 *	once per destination.
 */
#define	WT_DB_ERR(db, error, fmt) {					\
	va_list __ap;							\
									\
	/* Application-specified callback function. */			\
	va_start(__ap, fmt);						\
	if ((db)->errcall != NULL)					\
		__wt_msg_call((void *)((db)->errcall),			\
		    (void *)(db), (db)->errpfx,				\
		    (db)->idb == NULL ? NULL : (db)->idb->name,		\
		    error, fmt, __ap);					\
	va_end(__ap);							\
									\
	/*								\
	 * If the application set an error callback function but not an	\
	 * error stream, we're done.  Otherwise, write an error stream.	\
	 */								\
	if ((db)->errcall != NULL && (db)->errfile == NULL)		\
		return;							\
									\
	va_start(__ap, fmt);						\
	__wt_msg_stream((db)->errfile, (db)->errpfx,			\
	    (db)->idb == NULL ? NULL : (db)->idb->name,			\
	    error, fmt, __ap);						\
	va_end(__ap);							\
}
+
/*
 * __wt_api_db_err --
 *	Db.err method: report an error number plus a printf-style message.
 */
void
__wt_api_db_err(DB *db, int error, const char *fmt, ...)
{
	/*
	 * This function may be called before/after the statistics memory
	 * has been allocated/freed; don't increment method statistics here.
	 */
	WT_DB_ERR(db, error, fmt);
}
+
/*
 * __wt_api_db_errx --
 *	Db.errx method: report a printf-style message with no error number.
 */
void
__wt_api_db_errx(DB *db, const char *fmt, ...)
{
	/*
	 * This function may be called before/after the statistics memory
	 * has been allocated/freed; don't increment method statistics here.
	 */
	WT_DB_ERR(db, 0, fmt);
}
diff --git a/src/db/db_getset.c b/src/db/db_getset.c
new file mode 100644
index 00000000000..6c133a0a3fb
--- /dev/null
+++ b/src/db/db_getset.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_db_btree_compare_int_set_verify --
 *	Verify arguments to the Db.btree_compare_int_set method.
 */
int
__wt_db_btree_compare_int_set_verify(DB *db, int btree_compare_int)
{
	/*
	 * NOTE(review): the test accepts 0 but the error message documents a
	 * range of 1 to 8 -- confirm whether 0 (presumably "disabled") is a
	 * legal setting and make the test and the message agree.
	 */
	if (btree_compare_int >= 0 && btree_compare_int <= 8)
		return (0);

	__wt_api_db_errx(db,
	    "The number of bytes must be an integral value between 1 and 8");
	return (WT_ERROR);
}
+
+/*
+ * __wt_db_btree_dup_offpage_set_verify --
+ * Verify arguments to the Db.btree_dup_offpage_set method.
+ */
+int
+__wt_db_btree_dup_offpage_set_verify(DB *db, uint32_t dup_offpage)
+{
+ /*
+ * Limiting this value to something between 10 and 50 is a sanity test,
+ * not a hard constraint (although a value of 100 might fail hard).
+ *
+ * If the value is too large, pages can end up being empty because it
+ * isn't possible for duplicate sets to span pages. So, if you set
+ * the value to 50%, and you have two sequential, large duplicate sets,
+ * you end up with two, half-empty pages.
+ */
+ if (dup_offpage > 10 && dup_offpage <= 50)
+ return (0);
+
+ __wt_api_db_errx(db,
+ "The percent of the page taken up by duplicate entries before "
+ "being moved off-page must must be between 10 and 50");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_db_column_set_verify --
+ * Verify arguments to the Db.column_set method.
+ */
+int
+__wt_db_column_set_verify(
+ DB *db, uint32_t fixed_len, const char *dictionary, uint32_t flags)
+{
+ ENV *env;
+ IDB *idb;
+
+ env = db->env;
+ idb = db->idb;
+
+ /*
+ * The fixed-length number of bytes is stored in a single byte, which
+ * limits the size to 255 bytes.
+ */
+ WT_RET(__wt_api_arg_max(
+ env, "DB.column_set", "fixed_len", fixed_len, 255));
+
+ /* Run-length encoding is incompatible with variable length records. */
+ if (fixed_len == 0 && LF_ISSET(WT_RLE)) {
+ __wt_api_db_errx(db,
+ "Run-length encoding is incompatible with variable length "
+ "column-store records");
+ return (WT_ERROR);
+ }
+
+ if (LF_ISSET(WT_RLE))
+ F_SET(idb, WT_RLE);
+ F_SET(idb, WT_COLUMN);
+ return (0);
+}
diff --git a/src/db/db_handle.c b/src/db/db_handle.c
new file mode 100644
index 00000000000..b9e244d5ea9
--- /dev/null
+++ b/src/db/db_handle.c
@@ -0,0 +1,184 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_db_config(DB *);
+static int __wt_idb_config(DB *);
+static int __wt_idb_destroy(DB *);
+
/*
 * __wt_env_db --
 *	DB constructor: allocate and configure a DB/IDB handle pair.
 */
int
__wt_env_db(ENV *env, DB **dbp)
{
	DB *db;
	IDB *idb;
	int ret;

	db = NULL;
	idb = NULL;

	/* Create the DB and IDB structures. */
	WT_ERR(__wt_calloc(env, 1, sizeof(DB), &db));
	WT_ERR(__wt_calloc(env, 1, sizeof(IDB), &idb));

	/* Connect everything together. */
	db->idb = idb;
	idb->db = db;
	db->env = env;

	/* Configure the DB and the IDB. */
	WT_ERR(__wt_db_config(db));
	WT_ERR(__wt_idb_config(db));

	*dbp = db;
	return (0);

	/*
	 * NOTE(review): if the first allocation fails, db is still NULL here
	 * -- confirm __wt_db_destroy tolerates a NULL handle.
	 */
err:	(void)__wt_db_destroy(db);
	return (ret);
}
+
/*
 * __wt_db_config --
 *	Set configuration for a just-created DB handle: install the default
 *	method tables, lock out methods not legal before open, and move the
 *	handle into its initial state.
 */
static int
__wt_db_config(DB *db)
{
	__wt_methods_db_config_default(db);
	__wt_methods_db_lockout(db);
	__wt_methods_db_init_transition(db);

	return (0);
}
+
/*
 * __wt_idb_config --
 *	Set configuration for a just-created IDB handle: initialize its
 *	fields, link it onto the environment's handle list and allocate its
 *	statistics structures.
 */
static int
__wt_idb_config(DB *db)
{
	ENV *env;
	IDB *idb;
	IENV *ienv;

	env = db->env;
	idb = db->idb;
	ienv = env->ienv;

	idb->db = db;		/* Also set by __wt_env_db; harmless. */
	/* Mark the root and free-list addresses not-yet-allocated. */
	idb->root_off.addr = idb->free_addr = WT_ADDR_INVALID;

	__wt_lock(env, ienv->mtx);		/* Add to the ENV's list */
	TAILQ_INSERT_TAIL(&ienv->dbqh, idb, q);
	++ienv->dbqcnt;
	__wt_unlock(env, ienv->mtx);

	/* Allocate the method- and database-level statistics structures. */
	WT_RET(__wt_stat_alloc_db_stats(env, &idb->stats));
	WT_RET(__wt_stat_alloc_database_stats(env, &idb->dstats));

	return (0);
}
+
+/*
+ * __wt_db_destroy --
+ * DB handle destructor.
+ */
+int
+__wt_db_destroy(DB *db)
+{
+ ENV *env;
+ int ret;
+
+ env = db->env;
+
+ /* Discard the underlying IDB object. */
+ ret = __wt_idb_destroy(db);
+
+ /* Discard the DB object. */
+ __wt_free(env, db, sizeof(DB));
+
+ return (ret);
+}
+
/*
 * __wt_idb_destroy --
 *	IDB handle destructor: release per-handle resources and remove the
 *	handle from the environment's list.
 */
static int
__wt_idb_destroy(DB *db)
{
	ENV *env;
	IDB *idb;
	IENV *ienv;
	int ret;

	env = db->env;
	idb = db->idb;
	ienv = env->ienv;
	ret = 0;

	/* Check that there's something to close. */
	if (idb == NULL)
		return (0);

	/* Diagnostic check: check flags against approved list. */
	WT_ENV_FCHK_RET(env, "Db.close", idb->flags, WT_APIMASK_IDB, ret);

	__wt_free(env, idb->name, 0);

	if (idb->huffman_key != NULL) {
		/* Key and data may use the same table, only close it once. */
		if (idb->huffman_data == idb->huffman_key)
			idb->huffman_data = NULL;
		__wt_huffman_close(env, idb->huffman_key);
		idb->huffman_key = NULL;
	}
	if (idb->huffman_data != NULL) {
		__wt_huffman_close(env, idb->huffman_data);
		idb->huffman_data = NULL;
	}

	/* Discard any in-progress eviction walk for this handle. */
	__wt_walk_end(env, &idb->evict_walk);

	__wt_free(env, idb->stats, 0);
	__wt_free(env, idb->dstats, 0);

	__wt_lock(env, ienv->mtx);		/* Delete from the ENV's list */
	TAILQ_REMOVE(&ienv->dbqh, idb, q);
	--ienv->dbqcnt;
	__wt_unlock(env, ienv->mtx);

	__wt_free(env, idb, sizeof(IDB));
	db->idb = NULL;
	return (ret);
}
+
/*
 * __wt_db_lockout_err --
 *	Method-table stub installed after a fatal handle error: any method
 *	other than Db.close reports the handle is dead.
 */
int
__wt_db_lockout_err(DB *db)
{
	__wt_api_db_errx(db,
	    "This Db handle has failed for some reason, and can no longer "
	    "be used; the only method permitted on it is Db.close which "
	    "discards the handle permanently");
	return (WT_ERROR);
}
+
/*
 * __wt_db_lockout_open --
 *	Method-table stub for methods that are illegal before Db.open.
 */
int
__wt_db_lockout_open(DB *db)
{
	__wt_api_db_errx(db,
	    "This method may not be called until after the Db.open method has "
	    "been called");
	return (WT_ERROR);
}
diff --git a/src/db/db_huffman.c b/src/db/db_huffman.c
new file mode 100644
index 00000000000..ae9fe7fccde
--- /dev/null
+++ b/src/db/db_huffman.c
@@ -0,0 +1,233 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * 7-bit ASCII, with English language frequencies.
 *
 * Based on "Case-sensitive letter and bigram frequency counts from large-scale
 * English corpora"
 *	Michael N. Jones and D.J.K. Mewhort
 *	Queen's University, Kingston, Ontario, Canada
 *	Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
 *
 * Additionally supports space and tab characters; space is the most common
 * character in text where it occurs, and tab appears about as frequently as
 * 'a' and 'n' in text where it occurs.
 *
 * Each byte is a relative frequency weight: 255 is the most common byte
 * (space), 1 marks bytes not expected to occur.  Only the 7-bit range is
 * initialized; entries 128-255 default to 0 -- NOTE(review): confirm
 * __wt_huffman_open treats a 0 weight the same as 1 (effectively absent).
 */
static uint8_t const __wt_huffman_ascii_english[256] = {
	1,	/* 000 nul */
	1,	/* 001 soh */
	1,	/* 002 stx */
	1,	/* 003 etx */
	1,	/* 004 eot */
	1,	/* 005 enq */
	1,	/* 006 ack */
	1,	/* 007 bel */
	1,	/* 010 bs  */
	251,	/* 011 ht  */
	1,	/* 012 nl  */
	1,	/* 013 vt  */
	1,	/* 014 np  */
	1,	/* 015 cr  */
	1,	/* 016 so  */
	1,	/* 017 si  */
	1,	/* 020 dle */
	1,	/* 021 dc1 */
	1,	/* 022 dc2 */
	1,	/* 023 dc3 */
	1,	/* 024 dc4 */
	1,	/* 025 nak */
	1,	/* 026 syn */
	1,	/* 027 etb */
	1,	/* 030 can */
	1,	/* 031 em  */
	1,	/* 032 sub */
	1,	/* 033 esc */
	1,	/* 034 fs  */
	1,	/* 035 gs  */
	1,	/* 036 rs  */
	1,	/* 037 us  */
	255,	/* 040 sp  */
	177,	/* 041  !  */
	223,	/* 042  "  */
	171,	/* 043  #  */
	188,	/* 044  $  */
	176,	/* 045  %  */
	179,	/* 046  &  */
	215,	/* 047  '  */
	189,	/* 050  (  */
	190,	/* 051  )  */
	184,	/* 052  *  */
	175,	/* 053  +  */
	234,	/* 054  ,  */
	219,	/* 055  -  */
	233,	/* 056  .  */
	181,	/* 057  /  */
	230,	/* 060  0  */
	229,	/* 061  1  */
	226,	/* 062  2  */
	213,	/* 063  3  */
	214,	/* 064  4  */
	227,	/* 065  5  */
	210,	/* 066  6  */
	203,	/* 067  7  */
	212,	/* 070  8  */
	222,	/* 071  9  */
	191,	/* 072  :  */
	186,	/* 073  ;  */
	173,	/* 074  <  */
	172,	/* 075  =  */
	174,	/* 076  >  */
	183,	/* 077  ?  */
	170,	/* 100  @  */
	221,	/* 101  A  */
	211,	/* 102  B  */
	218,	/* 103  C  */
	206,	/* 104  D  */
	207,	/* 105  E  */
	199,	/* 106  F  */
	197,	/* 107  G  */
	205,	/* 110  H  */
	217,	/* 111  I  */
	196,	/* 112  J  */
	187,	/* 113  K  */
	201,	/* 114  L  */
	220,	/* 115  M  */
	216,	/* 116  N  */
	200,	/* 117  O  */
	208,	/* 120  P  */
	182,	/* 121  Q  */
	209,	/* 122  R  */
	224,	/* 123  S  */
	225,	/* 124  T  */
	193,	/* 125  U  */
	185,	/* 126  V  */
	202,	/* 127  W  */
	180,	/* 130  X  */
	198,	/* 131  Y  */
	178,	/* 132  Z  */
	1,	/* 133  [  */
	1,	/* 134  \  */
	1,	/* 135  ]  */
	1,	/* 136  ^  */
	1,	/* 137  _  */
	1,	/* 140  `  */
	252,	/* 141  a  */
	232,	/* 142  b  */
	242,	/* 143  c  */
	243,	/* 144  d  */
	254,	/* 145  e  */
	239,	/* 146  f  */
	237,	/* 147  g  */
	245,	/* 150  h  */
	248,	/* 151  i  */
	194,	/* 152  j  */
	228,	/* 153  k  */
	244,	/* 154  l  */
	240,	/* 155  m  */
	249,	/* 156  n  */
	250,	/* 157  o  */
	238,	/* 160  p  */
	192,	/* 161  q  */
	246,	/* 162  r  */
	247,	/* 163  s  */
	253,	/* 164  t  */
	241,	/* 165  u  */
	231,	/* 166  v  */
	235,	/* 167  w  */
	204,	/* 170  x  */
	236,	/* 171  y  */
	195,	/* 172  z  */
	1,	/* 173  {  */
	1,	/* 174  |  */
	1,	/* 175  }  */
	1,	/* 176  ~  */
	1,	/* 177 del */
};
+
/*
 * __wt_db_huffman_set --
 *	DB huffman configuration setter.  Installs a Huffman table for keys
 *	and/or data (WT_HUFFMAN_KEY/WT_HUFFMAN_DATA flags), either from a
 *	built-in frequency table (WT_ASCII_ENGLISH or WT_TELEPHONE -- in
 *	which case huffman_table must be NULL) or elsewhere.
 */
int
__wt_db_huffman_set(DB *db,
    uint8_t const *huffman_table, u_int huffman_table_size, uint32_t flags)
{
	ENV *env;
	IDB *idb;
	uint8_t phone[256];

	env = db->env;
	idb = db->idb;

	switch (LF_ISSET(WT_ASCII_ENGLISH | WT_TELEPHONE)) {
	case WT_ASCII_ENGLISH:
		if (huffman_table != NULL)
			goto err;
		huffman_table = __wt_huffman_ascii_english;
		huffman_table_size = sizeof(__wt_huffman_ascii_english);
		break;
	case WT_TELEPHONE:
		if (huffman_table != NULL)
			goto err;
		/* Phone numbers: only digits and punctuation occur. */
		memset(phone, 0, sizeof(phone));
		phone['('] = 2;
		phone[')'] = 2;
		phone['+'] = 1;
		phone['-'] = 3;
		phone['0'] = 1;
		phone['1'] = 1;
		phone['2'] = 1;
		phone['3'] = 1;
		phone['4'] = 1;
		phone['5'] = 1;
		phone['6'] = 1;
		phone['7'] = 1;
		phone['8'] = 1;
		phone['9'] = 1;
		/*
		 * NOTE(review): phone is a stack buffer -- this is safe only
		 * if __wt_huffman_open copies the table; verify.
		 */
		huffman_table = phone;
		huffman_table_size = sizeof(phone);
		break;
	default:
err:		return (__wt_api_args(env, "Db.huffman_set"));
	}

	/*
	 * If we're using an already-specified table, close it.  It's probably
	 * an application error to set the Huffman table twice, but hey, I just
	 * work here.
	 */
	if (LF_ISSET(WT_HUFFMAN_KEY) && idb->huffman_key != NULL) {
		/* Key and data may use the same table, only close it once. */
		if (idb->huffman_data == idb->huffman_key)
			idb->huffman_data = NULL;
		__wt_huffman_close(env, idb->huffman_key);
		idb->huffman_key = NULL;
	}
	if (LF_ISSET(WT_HUFFMAN_DATA) && idb->huffman_data != NULL) {
		__wt_huffman_close(env, idb->huffman_data);
		idb->huffman_data = NULL;
	}
	if (LF_ISSET(WT_HUFFMAN_KEY)) {
		WT_RET(__wt_huffman_open(env,
		    huffman_table, huffman_table_size, &idb->huffman_key));
		/* Key and data may use the same table. */
		if (LF_ISSET(WT_HUFFMAN_DATA)) {
			idb->huffman_data = idb->huffman_key;
			/* Clear the flag so we don't open a second table. */
			LF_CLR(WT_HUFFMAN_DATA);
		}
	}
	if (LF_ISSET(WT_HUFFMAN_DATA))
		WT_RET(__wt_huffman_open(env,
		    huffman_table, huffman_table_size, &idb->huffman_data));

	return (0);
}
diff --git a/src/db/db_open.c b/src/db/db_open.c
new file mode 100644
index 00000000000..1cdf04c1288
--- /dev/null
+++ b/src/db/db_open.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_db_idb_open(DB *, const char *, mode_t, uint32_t);
+
+/*
+ * __wt_db_open --
+ * Open a DB handle.
+ */
+int
+__wt_db_open(WT_TOC *toc, const char *name, mode_t mode, uint32_t flags)
+{
+ DB *db;
+ ENV *env;
+
+ env = toc->env;
+ db = toc->db;
+
+ WT_STAT_INCR(env->ienv->stats, DATABASE_OPEN);
+
+ /* Initialize the IDB structure. */
+ WT_RET(__wt_db_idb_open(db, name, mode, flags));
+
+ /* Open the underlying Btree. */
+ WT_RET(__wt_bt_open(toc, LF_ISSET(WT_CREATE) ? 1 : 0));
+
+ /* Turn on the methods that require open. */
+ __wt_methods_db_open_transition(db);
+
+ return (0);
+}
+
+/*
+ * __wt_db_idb_open --
+ * Routine to initialize any IDB values based on a DB value during open.
+ */
+static int
+__wt_db_idb_open(DB *db, const char *name, mode_t mode, uint32_t flags)
+{
+ ENV *env;
+ IENV *ienv;
+ IDB *idb;
+
+ env = db->env;
+ ienv = env->ienv;
+ idb = db->idb;
+
+ WT_RET(__wt_strdup(env, name, &idb->name));
+ idb->mode = mode;
+
+ __wt_lock(env, ienv->mtx);
+ idb->file_id = ++ienv->next_file_id;
+ __wt_unlock(env, ienv->mtx);
+
+ /*
+ * XXX
+ * Initialize the root WT_REF/WT_OFF pair to point to the start of
+ * the file. This is all wrong, and we'll get the information from
+ * somewhere else, eventually.
+ */
+ WT_CLEAR(idb->root_page);
+ idb->root_page.state = WT_EMPTY;
+ WT_CLEAR(idb->root_off);
+ idb->root_off.addr = 0;
+ idb->root_off.size = 0;
+
+ if (LF_ISSET(WT_RDONLY))
+ F_SET(idb, WT_RDONLY);
+
+ return (0);
+}
+
+/*
+ * __wt_db_close --
+ * Db.close method (DB close & handle destructor).
+ */
+int
+__wt_db_close(WT_TOC *toc, uint32_t flags)
+{
+ DB *db;
+ int ret;
+
+ db = toc->db;
+ ret = 0;
+
+ /* Flush the underlying Btree. */
+ if (!LF_ISSET(WT_NOWRITE))
+ WT_TRET(__wt_bt_sync(toc));
+
+ /* Close the underlying Btree. */
+ ret = __wt_bt_close(toc);
+
+ WT_TRET(__wt_db_destroy(db));
+
+ return (ret);
+}
diff --git a/src/db/db_stat.c b/src/db/db_stat.c
new file mode 100644
index 00000000000..84ac9960860
--- /dev/null
+++ b/src/db/db_stat.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_stat_print --
+ * Print DB handle statistics to a stream.
+ */
+int
+__wt_db_stat_print(WT_TOC *toc, FILE *stream)
+{
+ DB *db;
+ ENV *env;
+ IDB *idb;
+
+ db = toc->db;
+ env = toc->env;
+ idb = db->idb;
+
+ fprintf(stream, "Database handle statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->stats, stream);
+
+ /* Clear the database stats, then call Btree stat to fill them in. */
+ __wt_stat_clear_database_stats(idb->dstats);
+ WT_STAT_SET(idb->dstats, TREE_LEVEL, idb->root_page.page->dsk->level);
+ WT_RET(__wt_desc_stat(toc));
+
+ /*
+ * Note we do not have a hazard reference for the root page, and that's
+ * safe -- root pages are pinned into memory when a database is opened,
+ * and never re-written until the database is closed.
+ */
+ WT_RET(__wt_tree_walk(toc, NULL, 0, __wt_page_stat, NULL));
+
+ fprintf(stream, "Database statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->dstats, stream);
+
+ /* Underlying file handle statistics. */
+ if (idb->fh != NULL) {
+ fprintf(stream,
+ "Underlying file I/O statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->fh->stats, stream);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_db_stat_clear --
+ * Clear DB handle statistics.
+ */
+int
+__wt_db_stat_clear(DB *db)
+{
+ IDB *idb;
+
+ idb = db->idb;
+
+ __wt_stat_clear_db_stats(idb->stats);
+ __wt_stat_clear_database_stats(idb->dstats);
+ if (idb->fh != NULL)
+ __wt_stat_clear_fh_stats(idb->fh->stats);
+
+ return (0);
+}
diff --git a/src/db/db_sync.c b/src/db/db_sync.c
new file mode 100644
index 00000000000..eec5026f0c2
--- /dev/null
+++ b/src/db/db_sync.c
@@ -0,0 +1,20 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_sync --
+ * Flush a database to the backing file.
+ */
+int
+__wt_db_sync(WT_TOC *toc, void (*f)(const char *, uint64_t), uint32_t flags)
+{
+ return (__wt_bt_sync(toc));
+}
diff --git a/src/env/env_err.c b/src/env/env_err.c
new file mode 100644
index 00000000000..b5bc0ca5966
--- /dev/null
+++ b/src/env/env_err.c
@@ -0,0 +1,83 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+void
+wiredtiger_err_stream(FILE *stream)
+{
+ extern FILE *__wt_err_stream;
+
+ __wt_err_stream = stream;
+}
+
+#define WT_ENV_ERR(env, error, fmt) { \
+ extern FILE *__wt_err_stream; \
+ va_list __ap; \
+ /* \
+ * Support error messages even when we don't yet have an ENV \
+ * handle. \
+ */ \
+ if ((env) == NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_stream( \
+ __wt_err_stream, NULL, NULL, error, fmt, __ap); \
+ va_end(__ap); \
+ return; \
+ } \
+ \
+ /* Application-specified callback function. */ \
+ if ((env)->errcall != NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_call((void *)((env)->errcall), \
+ (void *)(env), env->errpfx, \
+ NULL, error, fmt, __ap); \
+ va_end(__ap); \
+ } \
+ \
+ /* \
+ * If the application set an error callback function but not an \
+ * error stream, we're done. Otherwise, write the stream. \
+ */ \
+ if ((env)->errcall != NULL && (env)->errfile == NULL) \
+ return; \
+ \
+ va_start(__ap, fmt); \
+ __wt_msg_stream((env)->errfile, \
+ (env)->errpfx, NULL, error, fmt, __ap); \
+ va_end(__ap); \
+}
+
+/*
+ * __wt_api_env_err --
+ * Env.err method.
+ */
+void
+__wt_api_env_err(ENV *env, int error, const char *fmt, ...)
+{
+ /*
+ * This function may be called before/after the statistics memory
+ * has been allocated/freed; don't increment method statistics here.
+ */
+ WT_ENV_ERR(env, error, fmt);
+}
+
+/*
+ * __wt_api_env_errx --
+ * Env.errx method.
+ */
+void
+__wt_api_env_errx(ENV *env, const char *fmt, ...)
+{
+ /*
+ * This function may be called before/after the statistics memory
+ * has been allocated/freed; don't increment method statistics here.
+ */
+ WT_ENV_ERR(env, 0, fmt);
+}
diff --git a/src/env/env_getset.c b/src/env/env_getset.c
new file mode 100644
index 00000000000..6786c87b41d
--- /dev/null
+++ b/src/env/env_getset.c
@@ -0,0 +1,70 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_cache_size_set_verify --
+ * Verify an argument to the Env.cache_size_set method.
+ */
+int
+__wt_env_cache_size_set_verify(ENV *env, uint32_t cache_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.cache_size_set", "cache size", cache_size, 1));
+}
+
+/*
+ * __wt_env_cache_hash_size_set_verify --
+ * Verify an argument to the Env.hash_size_set method.
+ */
+int
+__wt_env_cache_hash_size_set_verify(ENV *env, uint32_t hash_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.hash_size_set", "hash size", hash_size, 1));
+}
+
+/*
+ * __wt_env_hazard_size_set_verify --
+ * Verify an argument to the Env.hazard_size_set method.
+ */
+int
+__wt_env_hazard_size_set_verify(ENV *env, uint32_t hazard_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.hazard_size_set", "hazard size", hazard_size, 1));
+}
+
+/*
+ * __wt_env_toc_size_set_verify --
+ * Verify an argument to the Env.toc_size_set method.
+ */
+int
+__wt_env_toc_size_set_verify(ENV *env, uint32_t toc_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.toc_size_set", "toc size", toc_size, 1));
+}
+
+/*
+ * __wt_env_verbose_set_verify --
+ * Verify an argument to the Env.verbose_set method.
+ */
+int
+__wt_env_verbose_set_verify(ENV *env, uint32_t verbose)
+{
+#ifdef HAVE_VERBOSE
+ WT_ENV_FCHK(env,
+ "Env.verbose_set", verbose, WT_APIMASK_ENV_VERBOSE_SET);
+ return (0);
+#else
+ return (__wt_api_config(env, "Env.verbose_set", "--enable-verbose"));
+#endif
+}
diff --git a/src/env/env_global.c b/src/env/env_global.c
new file mode 100644
index 00000000000..e41a7bccfad
--- /dev/null
+++ b/src/env/env_global.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+void *__wt_addr; /* Memory flush address. */
+FILE *__wt_err_stream; /* Error stream from init. */
+
+/*
+ * __wt_library_init --
+ * Some things to do, before we do anything else.
+ */
+int
+__wt_library_init(void)
+{
+ /*
+ * We need an address for memory flushing -- it doesn't matter which
+ * one we choose.
+ */
+ __wt_addr = &__wt_addr;
+
+ /*
+ * We want to be able to redirect error messages from the very first
+ * instruction.
+ */
+ __wt_err_stream = stderr;
+
+ /*
+ * Check the build & compiler itself before going further.
+ */
+ WT_RET(__wt_bt_build_verify());
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Load debug code the compiler might optimize out. */
+ WT_RET(__wt_breakpoint());
+#endif
+
+ return (0);
+}
+
+/*
+ * __wt_breakpoint --
+ * A simple place to put a breakpoint, if you need one.
+ */
+int
+__wt_breakpoint(void)
+{
+ return (0);
+}
+
+int __wt_debugger_attach;
+
+/*
+ * __wt_attach --
+ * A routine to wait for a debugger to attach.
+ */
+void
+__wt_attach(ENV *env)
+{
+#ifdef HAVE_ATTACH
+ __wt_api_env_errx(env,
+ "process ID %lld: waiting for debugger...", (long long)getpid());
+ while (__wt_debugger_attach == 0)
+ __wt_sleep(10, 0);
+#endif
+}
diff --git a/src/env/env_handle.c b/src/env/env_handle.c
new file mode 100644
index 00000000000..1c02675041f
--- /dev/null
+++ b/src/env/env_handle.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_env_config(ENV *);
+static int __wt_ienv_config(ENV *);
+
+/*
+ * __wt_env_create --
+ * ENV constructor.
+ */
+int
+__wt_env_create(uint32_t flags, ENV **envp)
+{
+ ENV *env;
+ IENV *ienv;
+ int ret;
+
+ /*
+ * !!!
+ * We don't yet have valid ENV/IENV structures to use to call other
+ * functions. The only functions that can handle NULL ENV handles
+ * are the memory allocation and free functions, no other functions
+ * may be called.
+ */
+ WT_RET(__wt_calloc(NULL, 1, sizeof(ENV), &env));
+ WT_ERR(__wt_calloc(NULL, 1, sizeof(IENV), &ienv));
+
+ /* Connect everything together. */
+ env->ienv = ienv;
+
+ /* Set flags. */
+ if (LF_ISSET(WT_MEMORY_CHECK))
+ F_SET(env, WT_MEMORY_CHECK);
+
+ /* Configure the ENV and the IENV. */
+ WT_ERR(__wt_env_config(env));
+ WT_ERR(__wt_ienv_config(env));
+
+ *envp = env;
+ return (0);
+
+err: (void)__wt_env_close(env);
+ return (ret);
+}
+
+/*
+ * __wt_env_config --
+ * Set configuration for a just-created ENV handle.
+ */
+static int
+__wt_env_config(ENV *env)
+{
+ __wt_methods_env_config_default(env);
+ __wt_methods_env_lockout(env);
+ __wt_methods_env_init_transition(env);
+ return (0);
+}
+
+/*
+ * __wt_ienv_config --
+ * Set configuration for a just-created IENV handle.
+ */
+static int
+__wt_ienv_config(ENV *env)
+{
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* If we're tracking memory, initialize those structures first. */
+ if (F_ISSET(env, WT_MEMORY_CHECK))
+ WT_RET(__wt_mtrack_alloc(env));
+#endif
+ /* Global mutex */
+ WT_RET(__wt_mtx_alloc(env, "IENV", 0, &ienv->mtx));
+
+ TAILQ_INIT(&ienv->dbqh); /* DB list */
+ TAILQ_INIT(&ienv->fhqh); /* File list */
+
+ /* Statistics. */
+ WT_RET(__wt_stat_alloc_env_stats(env, &ienv->stats));
+ WT_RET(__wt_stat_alloc_method_stats(env, &ienv->method_stats));
+
+ /* Diagnostic output separator. */
+ ienv->sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=";
+
+ return (0);
+}
+
+/*
+ * __wt_ienv_destroy --
+ * Destroy the ENV's underlying IENV structure.
+ */
+int
+__wt_ienv_destroy(ENV *env)
+{
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* Check there's something to destroy. */
+ if (ienv == NULL)
+ return (0);
+
+ /* Diagnostic check: check flags against approved list. */
+ WT_ENV_FCHK_RET(env, "Env.close", ienv->flags, WT_APIMASK_IENV, ret);
+
+ (void)__wt_mtx_destroy(env, ienv->mtx);
+
+ /* Free allocated memory. */
+ __wt_free(env, ienv->toc, 0);
+ __wt_free(env, ienv->toc_array, 0);
+ __wt_free(env, ienv->hazard, 0);
+ __wt_free(env, ienv->stats, 0);
+ __wt_free(env, ienv->method_stats, 0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /* If we're tracking memory, check to see if everything was free'd. */
+ __wt_mtrack_dump(env);
+ __wt_mtrack_free(env);
+#endif
+
+ __wt_free(NULL, ienv, sizeof(IENV));
+ env->ienv = NULL;
+ return (ret);
+}
diff --git a/src/env/env_init.c b/src/env/env_init.c
new file mode 100644
index 00000000000..26c7062d63f
--- /dev/null
+++ b/src/env/env_init.c
@@ -0,0 +1,41 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_env_init --
+ * Initialize the library, creating an ENV handle.
+ */
+int
+wiredtiger_env_init(ENV **envp, uint32_t flags)
+{
+ static int library_init = 0;
+ ENV *env;
+
+ *envp = NULL;
+
+ /*
+ * We end up here before we do any real work. Check the build itself,
+ * and do some global stuff.
+ */
+ if (library_init == 0) {
+ WT_RET(__wt_library_init());
+ library_init = 1;
+ }
+
+ WT_ENV_FCHK(NULL,
+ "wiredtiger_env_init", flags, WT_APIMASK_WIREDTIGER_ENV_INIT);
+
+ /* Create the ENV handle. */
+ WT_RET(__wt_env_create(flags, &env));
+
+ *envp = env;
+ return (0);
+}
diff --git a/src/env/env_msg.c b/src/env/env_msg.c
new file mode 100644
index 00000000000..9dcfdec9514
--- /dev/null
+++ b/src/env/env_msg.c
@@ -0,0 +1,138 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#define WT_MSG(env, fmt) { \
+ extern FILE *__wt_err_stream; \
+ va_list __ap; \
+ /* \
+ * Support messages even when we don't yet have an ENV handle, \
+ * using the error stream. \
+ */ \
+ if ((env) == NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_stream( \
+ __wt_err_stream, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+ return; \
+ } \
+ \
+ /* Application-specified callback function. */ \
+ if ((env)->msgcall != NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_call((void *)((env)->msgcall), \
+ (void *)env, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+ } \
+ \
+ /* \
+ * If the application set a message callback function but not a\
+ * message stream, we're done. Otherwise, write the stream. \
+ */ \
+ if ((env)->msgcall != NULL && (env)->msgfile == NULL) \
+ return; \
+ \
+ va_start(__ap, fmt); \
+ __wt_msg_stream((env)->msgfile, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+}
+
+/*
+ * __wt_msg --
+ * Write a message.
+ */
+void
+__wt_msg(ENV *env, const char *fmt, ...)
+{
+ WT_MSG(env, fmt);
+}
+
+/*
+ * __wt_mb_init --
+ * Initialize a WT_MBUF structure for message aggregation.
+ */
+void
+__wt_mb_init(ENV *env, WT_MBUF *mbp)
+{
+ mbp->env = env;
+ mbp->first = mbp->next = NULL;
+ mbp->len = 0;
+}
+
+/*
+ * __wt_mb_discard --
+ * Discard a WT_MBUF structure.
+ */
+void
+__wt_mb_discard(WT_MBUF *mbp)
+{
+ if (mbp->first == NULL)
+ return;
+
+ /* Write any remaining message. */
+ if (mbp->next != mbp->first)
+ __wt_mb_write(mbp);
+
+ __wt_free(mbp->env, mbp->first, mbp->len);
+}
+
+/*
+ * __wt_mb_add --
+ * Append log messages into a WT_MBUF structure.
+ */
+void
+__wt_mb_add(WT_MBUF *mbp, const char *fmt, ...)
+{
+ va_list ap;
+ size_t current, len, remain;
+
+ current = (size_t)(mbp->next - mbp->first);
+ remain = mbp->len - current;
+ len = 64;
+ for (;;) {
+ /*
+ * If we don't have at least "len" bytes, allocate 2x len bytes
+ * more memory.
+ */
+ if (remain <= len) {
+ if (__wt_realloc(mbp->env,
+ &mbp->len, mbp->len + len * 2, &mbp->first))
+ return;
+ mbp->next = mbp->first + current;
+ remain = mbp->len - current;
+ }
+ /*
+ * Format the user's information. If it doesn't fit into the
+ * buffer we have, re-allocate enough memory and try again.
+ *
+ * A va_list may only be consumed once: restart it for each
+ * formatting attempt, and close it when done.
+ */
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(mbp->next, remain, fmt, ap);
+ va_end(ap);
+ if (len < remain) {
+ mbp->next += len;
+ break;
+ }
+ }
+}
+
+/*
+ * __wt_mb_write --
+ * Write the messages from a WT_MBUF structure.
+ */
+void
+__wt_mb_write(WT_MBUF *mbp)
+{
+ if (mbp->first == NULL || mbp->next == mbp->first)
+ return;
+
+ __wt_msg(mbp->env, "%s", mbp->first);
+
+ mbp->next = mbp->first;
+}
diff --git a/src/env/env_open.c b/src/env/env_open.c
new file mode 100644
index 00000000000..a6f95838ede
--- /dev/null
+++ b/src/env/env_open.c
@@ -0,0 +1,132 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_open --
+ * Open an Env handle.
+ */
+int
+__wt_env_open(ENV *env, const char *home, mode_t mode)
+{
+ IENV *ienv;
+ int ret;
+
+ WT_CC_QUIET(home, NULL);
+ WT_CC_QUIET(mode, 0);
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* WT_TOC and hazard arrays. */
+ WT_RET(__wt_calloc(env, env->toc_size, sizeof(WT_TOC *), &ienv->toc));
+ WT_RET(
+ __wt_calloc(env, env->toc_size, sizeof(WT_TOC), &ienv->toc_array));
+ WT_RET(__wt_calloc(env,
+ env->toc_size * env->hazard_size, sizeof(WT_PAGE *), &ienv->hazard));
+
+ /* Create the cache. */
+ WT_RET(__wt_cache_create(env));
+
+ /* Transition to the open state. */
+ __wt_methods_env_open_transition(env);
+
+ /* Start worker threads. */
+ F_SET(ienv, WT_WORKQ_RUN | WT_SERVER_RUN);
+ WT_MEMORY_FLUSH;
+
+ WT_ERR(__wt_thread_create(
+ &ienv->cache_evict_tid, __wt_cache_evict_server, env));
+ WT_ERR(__wt_thread_create(
+ &ienv->cache_read_tid, __wt_cache_read_server, env));
+ WT_ERR(__wt_thread_create(&ienv->workq_tid, __wt_workq_srvr, env));
+
+ return (0);
+
+err: (void)__wt_env_close(env);
+ return (ret);
+}
+
+/*
+ * __wt_env_close --
+ * Close an Env handle.
+ */
+int
+__wt_env_close(ENV *env)
+{
+ IDB *idb;
+ IENV *ienv;
+ WT_FH *fh;
+ int ret, secondary_err;
+
+ WT_ENV_FCHK_RET(env, "Env.close", env->flags, WT_APIMASK_ENV, ret);
+
+ ienv = env->ienv;
+ ret = secondary_err = 0;
+
+ /* Complain if DB handles weren't closed. */
+ if (TAILQ_FIRST(&ienv->dbqh) != NULL) {
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ __wt_api_env_errx(env,
+ "Env handle has open Db handles: %s",
+ idb->name);
+ WT_TRET(idb->db->close(idb->db, 0));
+ }
+ secondary_err = WT_ERROR;
+ }
+
+ /* Complain if files weren't closed. */
+ if (TAILQ_FIRST(&ienv->fhqh) != NULL) {
+ TAILQ_FOREACH(fh, &ienv->fhqh, q) {
+ __wt_api_env_errx(env,
+ "Env handle has open file handles: %s",
+ fh->name);
+ WT_TRET(__wt_close(env, fh));
+ }
+ secondary_err = WT_ERROR;
+ }
+
+ /* Shut down the server threads. */
+ F_CLR(ienv, WT_SERVER_RUN);
+ WT_MEMORY_FLUSH;
+
+ /*
+ * Force the cache server threads to run and wait for them to exit.
+ * Wait for the cache eviction server first, it potentially schedules
+ * work for the read thread.
+ */
+ __wt_workq_evict_server(env, 1);
+ __wt_thread_join(ienv->cache_evict_tid);
+ __wt_workq_read_server(env, 1);
+ __wt_thread_join(ienv->cache_read_tid);
+
+ /*
+ * Close down and wait for the workQ thread; this only happens after
+ * all other server threads have exited, as they may be waiting on a
+ * request from the workQ, or vice-versa.
+ */
+ F_CLR(ienv, WT_WORKQ_RUN);
+ WT_MEMORY_FLUSH;
+ __wt_thread_join(ienv->workq_tid);
+
+ /* Discard the cache. */
+ WT_TRET(__wt_cache_destroy(env));
+
+ /* Re-cycle the underlying ENV/IENV structures. */
+ WT_TRET(__wt_ienv_destroy(env));
+
+ /* Free the Env structure. */
+ __wt_free(NULL, env, sizeof(ENV));
+
+ /* The primary error, if any, wins over a secondary error. */
+ if (ret == 0)
+ ret = secondary_err;
+ return (ret);
+}
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
new file mode 100644
index 00000000000..997d9080f31
--- /dev/null
+++ b/src/env/env_stat.c
@@ -0,0 +1,86 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_stat_print --
+ * Print ENV handle statistics to a stream.
+ */
+int
+__wt_env_stat_print(ENV *env, FILE *stream)
+{
+ IDB *idb;
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+ fprintf(stream, "Environment handle statistics:\n");
+ __wt_stat_print(env, ienv->stats, stream);
+
+ fprintf(stream, "Environment cache statistics:\n");
+ __wt_cache_stats(env);
+ __wt_stat_print(env, ienv->cache->stats, stream);
+ fprintf(stream, "Environment method statistics:\n");
+ __wt_stat_print(env, ienv->method_stats, stream);
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_RET(idb->db->stat_print(idb->db, stream, 0));
+ return (0);
+}
+
+/*
+ * __wt_env_stat_clear --
+ * Clear ENV handle statistics.
+ */
+int
+__wt_env_stat_clear(ENV *env)
+{
+ IDB *idb;
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_TRET(__wt_db_stat_clear(idb->db));
+
+ __wt_stat_clear_env_stats(ienv->stats);
+ __wt_stat_clear_cache_stats(ienv->cache->stats);
+ __wt_stat_clear_method_stats(ienv->method_stats);
+
+ return (ret);
+}
+
+/*
+ * __wt_stat_print --
+ * Print out a statistics table.
+ */
+void
+__wt_stat_print(ENV *env, WT_STATS *s, FILE *stream)
+{
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+ for (; s->desc != NULL; ++s)
+ if (s->v >= WT_BILLION)
+ fprintf(stream, "%lluB\t%s (%llu bytes)\n",
+ (unsigned long long)s->v / WT_BILLION,
+ s->desc, (unsigned long long)s->v);
+ else if (s->v >= WT_MILLION)
+ fprintf(stream, "%lluM\t%s (%llu bytes)\n",
+ (unsigned long long)s->v / WT_MILLION,
+ s->desc, (unsigned long long)s->v);
+ else
+ fprintf(stream,
+ "%llu\t%s\n", (unsigned long long)s->v, s->desc);
+ fprintf(stream, "%s\n", ienv->sep);
+}
diff --git a/src/env/env_sync.c b/src/env/env_sync.c
new file mode 100644
index 00000000000..4c40b52ad1c
--- /dev/null
+++ b/src/env/env_sync.c
@@ -0,0 +1,30 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_sync --
+ * Flush the environment's cache.
+ */
+int
+__wt_env_sync(ENV *env, void (*f)(const char *, uint64_t))
+{
+ IDB *idb;
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_TRET(idb->db->sync(idb->db, f, 0));
+
+ return (ret);
+}
diff --git a/src/env/env_toc.c b/src/env/env_toc.c
new file mode 100644
index 00000000000..46d132707b5
--- /dev/null
+++ b/src/env/env_toc.c
@@ -0,0 +1,238 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_toc --
+ * ENV.toc method.
+ */
+int
+__wt_env_toc(ENV *env, WT_TOC **tocp)
+{
+ IENV *ienv;
+ WT_TOC *toc;
+ uint32_t slot;
+
+ ienv = env->ienv;
+ *tocp = NULL;
+
+ /* Check to see if there's an available WT_TOC slot. */
+ if (ienv->toc_cnt == env->toc_size - 1) {
+ __wt_api_env_errx(env,
+ "WiredTiger only configured to support %lu thread contexts",
+ (u_long)env->toc_size);
+ return (WT_ERROR);
+ }
+
+ /*
+ * The WT_TOC reference list is compact, the WT_TOC array is not. Find
+ * the first empty WT_TOC slot.
+ */
+ for (slot = 0, toc = ienv->toc_array; toc->env != NULL; ++toc, ++slot)
+ ;
+
+ /* Clear previous contents of the WT_TOC entry, they get re-used. */
+ memset(toc, 0, sizeof(WT_TOC));
+
+ toc->env = env;
+ toc->hazard = ienv->hazard + slot * env->hazard_size;
+
+ WT_RET(__wt_mtx_alloc(env, "toc", 1, &toc->mtx));
+
+ __wt_methods_wt_toc_lockout(toc);
+ __wt_methods_wt_toc_init_transition(toc);
+
+ /* Make the entry visible to the workQ. */
+ ienv->toc[ienv->toc_cnt++] = toc;
+ WT_MEMORY_FLUSH;
+
+ *tocp = toc;
+ return (0);
+}
+
+/*
+ * __wt_wt_toc_close --
+ * WT_TOC.close method.
+ */
+int
+__wt_wt_toc_close(WT_TOC *toc)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_TOC **tp;
+ WT_TOC_UPDATE *update;
+ int ret;
+
+ env = toc->env;
+ ienv = env->ienv;
+ ret = 0;
+
+ WT_ENV_FCHK_RET(
+ env, "WT_TOC.close", toc->flags, WT_APIMASK_WT_TOC, ret);
+
+ /*
+ * The "in" reference count is artificially incremented by 1 as
+ * long as an update buffer is referenced by the WT_TOC thread;
+ * we don't want them freed because a page was evicted and their
+ * count went to 0. Decrement the reference count on the buffer
+ * as part of releasing it. There's a similar reference count
+ * decrement when the WT_TOC structure is discarded.
+ *
+ * XXX
+ * There's a race here: if this code, or the WT_TOC structure
+ * close code, and the page discard code race, it's possible
+ * neither will realize the buffer is no longer needed and free
+ * it. The fix is to involve the eviction or workQ threads:
+ * they may need a linked list of buffers they review to ensure
+ * it never happens. I'm living with this now: it's unlikely
+ * and it's a memory leak if it ever happens.
+ */
+ update = toc->update;
+ if (update != NULL && --update->in == update->out)
+ __wt_free(env, update, update->len);
+
+ /* Discard DBT memory. */
+ __wt_free(env, toc->key.data, toc->key.mem_size);
+ __wt_free(env, toc->data.data, toc->data.mem_size);
+ __wt_scr_free(toc);
+
+ /* Unlock and destroy the thread's mutex. */
+ if (toc->mtx != NULL) {
+ __wt_unlock(env, toc->mtx);
+ (void)__wt_mtx_destroy(env, toc->mtx);
+ }
+
+ /*
+ * Replace the WT_TOC reference we're closing with the last entry in
+ * the table, then clear the last entry. As far as the walk of the
+ * workQ is concerned, it's OK if the WT_TOC appears twice, or if it
+ * doesn't appear at all, so these lines can race all they want.
+ */
+ for (tp = ienv->toc; *tp != toc; ++tp)
+ ;
+ --ienv->toc_cnt;
+ *tp = ienv->toc[ienv->toc_cnt];
+ ienv->toc[ienv->toc_cnt] = NULL;
+
+ /* Make the WT_TOC array entry available for re-use. */
+ toc->env = NULL;
+ WT_MEMORY_FLUSH;
+
+ return (ret);
+}
+
+/*
+ * __wt_toc_api_set --
+ * Pair WT_TOC and DB handle, allocating the WT_TOC as necessary.
+ */
+int
+__wt_toc_api_set(ENV *env, const char *name, DB *db, WT_TOC **tocp)
+{
+ WT_TOC *toc;
+
+ /*
+ * We pass around WT_TOCs internally in the Btree, (rather than a DB),
+ * because the DB's are free-threaded, and the WT_TOCs are per-thread.
+ * Lots of the API calls don't require the application to allocate and
+ * manage the WT_TOC, which means we have to do it for them.
+ *
+ * WT_TOCs always reference a DB handle, and we do that here, as well.
+ */
+ if ((toc = *tocp) == NULL) {
+ WT_RET(env->toc(env, 0, tocp));
+ toc = *tocp;
+ }
+ toc->db = db;
+ toc->name = name;
+ return (0);
+}
+
+/*
+ * __wt_toc_api_clr --
+ * Clear the WT_TOC, freeing it if it was allocated by the library.
+ */
+int
+__wt_toc_api_clr(WT_TOC *toc, const char *name, int islocal)
+{
+ /*
+ * The WT_TOC should hold no more hazard references; this is a
+ * diagnostic check, but it's cheap so we do it all the time.
+ */
+ __wt_hazard_empty(toc, name);
+
+ if (islocal)
+ return (toc->close(toc, 0));
+
+ toc->db = NULL;
+ toc->name = NULL;
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+static const char *__wt_toc_print_state(WT_TOC *);
+
+int
+__wt_toc_dump(ENV *env)
+{
+ IENV *ienv;
+ WT_MBUF mb;
+ WT_TOC *toc, **tp;
+ WT_PAGE **hp;
+
+ ienv = env->ienv;
+ __wt_mb_init(env, &mb);
+
+ __wt_mb_add(&mb, "%s\n", ienv->sep);
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) {
+ __wt_mb_add(&mb,
+ "toc: %p {\n\tworkq func: ", toc);
+ if (toc->wq_func == NULL)
+ __wt_mb_add(&mb, "none");
+ else
+ __wt_mb_add(&mb, "%p", toc->wq_func);
+
+ __wt_mb_add(&mb, " state: %s", __wt_toc_print_state(toc));
+
+ __wt_mb_add(&mb, "\n\thazard: ");
+ for (hp = toc->hazard;
+ hp < toc->hazard + env->hazard_size; ++hp)
+ __wt_mb_add(&mb, "%p ", *hp);
+
+ __wt_mb_add(&mb, "\n}");
+ if (toc->name != NULL)
+ __wt_mb_add(&mb, " %s", toc->name);
+ __wt_mb_write(&mb);
+ }
+
+ __wt_mb_discard(&mb);
+ return (0);
+}
+
+/*
+ * __wt_toc_print_state --
+ * Return the WT_TOC state as a string.
+ */
+static const char *
+__wt_toc_print_state(WT_TOC *toc)
+{
+ switch (toc->wq_state) {
+ case WT_WORKQ_READ:
+ return ("read");
+ case WT_WORKQ_READ_SCHED:
+ return ("read scheduled");
+ case WT_WORKQ_FUNC:
+ return ("function");
+ case WT_WORKQ_NONE:
+ return ("none");
+ }
+ return ("unknown");
+ /* NOTREACHED */
+}
+#endif
diff --git a/src/env/env_workq.c b/src/env/env_workq.c
new file mode 100644
index 00000000000..76a00b0dce5
--- /dev/null
+++ b/src/env/env_workq.c
@@ -0,0 +1,94 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_workq_srvr --
+ * Routine to process the WT_TOC work queue.
+ */
+void *
+__wt_workq_srvr(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_TOC **tp, *toc;
+ int chk_read, read_force, request;
+
+ env = (ENV *)arg;
+ ienv = env->ienv;
+
+ /* Walk the WT_TOC list and execute requests. */
+ while (F_ISSET(ienv, WT_WORKQ_RUN)) {
+ ++ienv->api_gen;
+ WT_STAT_INCR(ienv->stats, WORKQ_PASSES);
+
+ chk_read = read_force = request = 0;
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) {
+ switch (toc->wq_state) {
+ case WT_WORKQ_NONE:
+ break;
+ case WT_WORKQ_FUNC:
+ request = 1;
+ (void)toc->wq_func(toc);
+ break;
+ case WT_WORKQ_READ:
+ request = 1;
+
+ /*
+ * Call a function which makes a request of the
+ * read server. There are two read states: READ
+ * (the initial request), and READ_SCHED (the
+ * function has been called and we're waiting on
+ * the read to complete). There are two states
+ * because we can race with the server: if the
+ * called function adds itself to the queue just
+ * as the server is going to sleep, the server
+ * might not see the request. So, READ_SCHED
+ * means we don't have to call the function, but
+ * we do have check if the server is running.
+ *
+ * The read state is eventually reset by the
+ * read server, so we set it before we call the
+ * function that will contact the server, so we
+ * can't race on that update.
+ */
+ toc->wq_state = WT_WORKQ_READ_SCHED;
+
+ /*
+ * Call the function (which contacts the read
+ * server). If that call fails, we're done.
+ */
+ if (toc->wq_func(toc) != 0)
+ break;
+
+ /* FALLTHROUGH */
+ case WT_WORKQ_READ_SCHED:
+ chk_read = 1;
+ if (F_ISSET(toc, WT_READ_PRIORITY))
+ read_force = 1;
+ break;
+ }
+ }
+
+ /* If a read is scheduled, check on the read server. */
+ if (chk_read)
+ __wt_workq_read_server(env, read_force);
+
+ /* Check on the cache eviction server. */
+ __wt_workq_evict_server(env, 0);
+
+ /* If we didn't find work, yield the processor. */
+ if (!request) {
+ WT_STAT_INCR(ienv->stats, WORKQ_YIELD);
+ __wt_yield();
+ }
+ }
+ return (NULL);
+}
diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..68106636831
--- /dev/null
+++ b/src/os_posix/os_abort.c
@@ -0,0 +1,25 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_abort --
+ * Abort the process, dropping core.
+ */
+void
+__wt_abort(ENV *env)
+{
+ __wt_msg(env, "aborting WiredTiger library");
+
+ __wt_attach(env);
+
+ abort();
+ /* NOTREACHED */
+}
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..dbbb915822a
--- /dev/null
+++ b/src/os_posix/os_alloc.c
@@ -0,0 +1,359 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_mtrack(
+ ENV *env, const void *, const void *, const char *, int);
+#endif
+
+/*
+ * There's no malloc interface; WiredTiger never calls malloc. The problem is
+ * an application might: allocate memory, write secret stuff into it, free the
+ * memory, we allocate the memory, and then use it for a database page or log
+ * record and write it to disk. That would result in the secret stuff being
+ * protected by the WiredTiger permission mechanisms, potentially inappropriate
+ * for the secret stuff.
+ */
+
+/*
+ * __wt_calloc_func --
+ * ANSI calloc function.
+ */
+int
+__wt_calloc_func(ENV *env, uint32_t number, uint32_t size, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ WT_ASSERT(env, number != 0 && size != 0);
+
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ if ((p = calloc(number, (size_t)size)) == NULL) {
+ __wt_api_env_err(env, errno, "memory allocation");
+ return (WT_ERROR);
+ }
+ *(void **)retp = p;
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_mtrack(env, NULL, p, file, line);
+#endif
+ return (0);
+}
+
+/*
+ * __wt_realloc_func --
+ * ANSI realloc function.
+ */
+int
+__wt_realloc_func(ENV *env,
+ uint32_t *bytes_allocated_ret, uint32_t bytes_to_allocate, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ void *p;
+ uint32_t bytes_allocated;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ WT_ASSERT(env, bytes_to_allocate != 0);
+
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ p = *(void **)retp;
+
+ /*
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ bytes_allocated =
+ bytes_allocated_ret == NULL ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(env, bytes_allocated < bytes_to_allocate);
+
+ if ((p = realloc(p, (size_t)bytes_to_allocate)) == NULL) {
+ __wt_api_env_err(env, errno, "memory allocation");
+ return (WT_ERROR);
+ }
+
+ /*
+ * Clear the allocated memory -- an application might: allocate memory,
+ * write secret stuff into it, free the memory, we re-allocate the
+ * memory, then use it for a database page or log record and write it
+ * to disk. That would result in the secret stuff being protected by
+ * the WiredTiger permission mechanisms, potentially inappropriate for
+ * the secret stuff.
+ */
+ memset((uint8_t *)
+ p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_mtrack(env, *(void **)retp, p, file, line);
+#endif
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_strdup_func --
+ * ANSI strdup function.
+ */
+int
+__wt_strdup_func(ENV *env, const char *str, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ size_t len;
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ len = strlen(str) + 1;
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_calloc_func(env, len, 1, &p, file, line));
+#else
+ WT_RET(__wt_calloc_func(env, len, 1, &p));
+#endif
+
+ memcpy(p, str, len);
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_free_func --
+ * ANSI free function.
+ */
+void
+__wt_free_func(ENV *env, void *p_arg
+#ifdef HAVE_DIAGNOSTIC
+ , uint32_t len
+#endif
+ )
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMFREE);
+
+ /*
+ * If there's a serialization bug we might race with another thread.
+ * We can't avoid the race (and we aren't willing to flush memory),
+ * but we minimize the window by clearing the free address atomically,
+ * hoping a racing thread will see, and won't free, a NULL pointer.
+ */
+ p = *(void **)p_arg;
+ *(void **)p_arg = NULL;
+
+ if (p == NULL) /* ANSI C free semantics */
+ return;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * If we know how long the object is, overwrite it with an easily
+ * recognizable value for debugging.
+ */
+ if (len != 0)
+ memset(p, WT_DEBUG_BYTE, len);
+
+ __wt_mtrack(env, p, NULL, NULL, 0);
+#endif
+
+ free(p);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_mtrack_alloc --
+ * Allocate memory tracking structures.
+ */
+int
+__wt_mtrack_alloc(ENV *env)
+{
+ IENV *ienv;
+ WT_MTRACK *p;
+
+ ienv = env->ienv;
+
+ /*
+ * Use a temporary variable -- assigning memory to ienv->mtrack turns
+ * on memory object tracking, and we need to set up the rest of the
+ * structure first.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_MTRACK), &p));
+ WT_RET(__wt_calloc(env, 1000, sizeof(WT_MEM), &p->list));
+ p->next = p->list;
+ p->slots = 1000;
+ ienv->mtrack = p;
+ return (0);
+}
+
+/*
+ * __wt_mtrack_free --
+ * Free memory tracking structures.
+ */
+void
+__wt_mtrack_free(ENV *env)
+{
+ IENV *ienv;
+ WT_MTRACK *p;
+
+ ienv = env->ienv;
+
+ /*
+ * Clear ienv->mtrack (to turn off memory object tracking) before the
+ * free.
+ */
+ if ((p = ienv->mtrack) == NULL)
+ return;
+ ienv->mtrack = NULL;
+
+ __wt_free(env, p->list, 0);
+ __wt_free(env, p, 0);
+}
+
+/*
+ * __wt_mtrack --
+ * Track memory allocations and frees.
+ */
+static void
+__wt_mtrack(ENV *env, const void *f, const void *a, const char *file, int line)
+{
+ WT_MEM *mp, *t, *mp_end;
+ WT_MTRACK *mtrack;
+ int slot_check;
+
+ if (env == NULL ||
+ env->ienv == NULL || (mtrack = env->ienv->mtrack) == NULL)
+ return;
+
+ /*
+ * Remove freed memory from the list. If it's a free/alloc pair (that
+ * is, if __wt_realloc was called), re-use the slot.
+ */
+ if (f != NULL) {
+ if ((mp = mtrack->next) > mtrack->list)
+ do {
+ if ((--mp)->addr == f)
+ goto enter;
+ } while (mp > mtrack->list);
+
+ __wt_api_env_errx(env, "mtrack: %p: not found", f);
+ __wt_attach(env);
+ }
+
+ if (a == NULL)
+ return;
+
+ /*
+ * Add allocated memory to the list.
+ *
+ * First, see if there's a slot close by we can re-use (the assumption
+ * is that when memory is allocated and quickly freed we re-use the
+ * slots instead of leaving lots of free spots in the array).
+ */
+ if ((mp = mtrack->next) > mtrack->list)
+ for (slot_check = 0; slot_check < 10; ++slot_check) {
+ if ((--mp)->addr == NULL)
+ goto enter;
+ if (mp == mtrack->list)
+ break;
+ }
+
+ mp_end = mtrack->list + mtrack->slots;
+
+ /* If there's an empty slot, use it. */
+ if (mtrack->next < mp_end)
+ goto next;
+
+ /* Try to compress the array. */
+ for (mp = mtrack->list, t = NULL;; ++mp, ++t) {
+ while (mp < mp_end && mp->addr != NULL)
+ ++mp;
+ if (mp == mp_end)
+ break;
+ if (t == NULL)
+ t = mp + 1;
+ while (t < mp_end && t->addr == NULL)
+ ++t;
+ if (t == mp_end)
+ break;
+ *mp++ = *t;
+ t->addr = NULL;
+ }
+ mtrack->next = mp;
+
+ /* If there's an empty slot, use it. */
+ if (mtrack->next < mp_end)
+ goto next;
+
+ /* Re-allocate the array and use the next empty slot. */
+ if ((mtrack->list = realloc(mtrack->list,
+ mtrack->slots * 2 * sizeof(WT_MEM))) == NULL)
+ return;
+ mtrack->next = mtrack->list + mtrack->slots;
+ mtrack->slots *= 2;
+
+next: mp = mtrack->next++;
+enter: mp->addr = a;
+ mp->file = file;
+ mp->line = line;
+}
+
+/*
+ * __wt_mtrack_dump --
+ * Complain about any memory allocated but never freed.
+ */
+void
+__wt_mtrack_dump(ENV *env)
+{
+ WT_MTRACK *mtrack;
+ WT_MEM *mp;
+
+ if ((mtrack = env->ienv->mtrack) == NULL)
+ return;
+
+ for (mp = mtrack->list; mp < mtrack->next; ++mp)
+ if (mp->addr != NULL)
+ __wt_api_env_errx(env,
+ "mtrack: %p {%s/%d}: never freed",
+ mp->addr, mp->file, mp->line);
+}
+#endif
diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c
new file mode 100644
index 00000000000..604d963f8e6
--- /dev/null
+++ b/src/os_posix/os_filesize.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+int
+__wt_filesize(ENV *env, WT_FH *fh, off_t *sizep)
+{
+ struct stat sb;
+
+ WT_VERBOSE(env,
+ WT_VERB_FILEOPS, (env, "fileops: %s: fstat", fh->name));
+
+ if (fstat(fh->fd, &sb) == -1) {
+ __wt_api_env_err(env, errno, "%s: fstat", fh->name);
+ return (WT_ERROR);
+ }
+
+ *sizep = sb.st_size; /* Return size in bytes. */
+ return (0);
+}
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
new file mode 100644
index 00000000000..e6ecfd95a21
--- /dev/null
+++ b/src/os_posix/os_fsync.c
@@ -0,0 +1,29 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(ENV *env, WT_FH *fh)
+{
+
+ WT_STAT_INCR(fh->stats, FSYNC);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: fsync", fh->name));
+
+ if (fsync(fh->fd) == 0)
+ return (0);
+
+ __wt_api_env_err(env, errno, "%s fsync error", fh->name);
+ return (WT_ERROR);
+}
diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c
new file mode 100644
index 00000000000..fb58784ec2d
--- /dev/null
+++ b/src/os_posix/os_mtx.c
@@ -0,0 +1,148 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mtx_alloc --
+ * Allocate and initialize a pthread mutex.
+ */
+int
+__wt_mtx_alloc(ENV *env, const char *name, int is_locked, WT_MTX **mtxp)
+{
+ WT_MTX *mtx;
+ pthread_condattr_t condattr;
+ pthread_mutexattr_t mutexattr;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_MTX), &mtx));
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ *
+ * Initialize the mutex.
+ * Mutexes are shared between processes.
+ */
+ if (pthread_mutexattr_init(&mutexattr) != 0)
+ goto err;
+#if 0
+ if (pthread_mutexattr_setpshared(
+ &mutexattr, PTHREAD_PROCESS_SHARED) != 0)
+ goto err;
+#endif
+ if (pthread_mutex_init(&mtx->mtx, &mutexattr) != 0)
+ goto err;
+ (void)pthread_mutexattr_destroy(&mutexattr);
+
+ /* Initialize the condition variable (mutexes are self-blocking). */
+ if (pthread_condattr_init(&condattr) != 0)
+ goto err;
+#if 0
+ if (pthread_condattr_setpshared(
+ &condattr, PTHREAD_PROCESS_SHARED) != 0)
+ goto err;
+#endif
+ if (pthread_cond_init(&mtx->cond, &condattr) != 0)
+ goto err;
+ (void)pthread_condattr_destroy(&condattr);
+
+ mtx->name = name;
+
+ /* If the normal state of the mutex is locked, lock it immediately. */
+ if (is_locked)
+ __wt_lock(env, mtx);
+
+ *mtxp = mtx;
+ return (0);
+
+err: __wt_free(env, mtx, sizeof(WT_MTX));
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_lock --
+ * Lock a mutex.
+ */
+void
+__wt_lock(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ WT_VERBOSE(env,
+ WT_VERB_MUTEX, (env, "lock %s mutex (%p)", mtx->name, mtx));
+
+ WT_ERR(pthread_mutex_lock(&mtx->mtx));
+
+ /*
+ * Check pthread_cond_wait() return for EINTR, ETIME and ETIMEDOUT,
+ * it's known to return these errors on some systems.
+ */
+ while (mtx->locked) {
+ ret = pthread_cond_wait(&mtx->cond, &mtx->mtx);
+ if (ret != 0 &&
+ ret != EINTR &&
+#ifdef ETIME
+ ret != ETIME &&
+#endif
+ ret != ETIMEDOUT) {
+ (void)pthread_mutex_unlock(&mtx->mtx);
+ goto err;
+ }
+ }
+
+ mtx->locked = 1;
+ WT_STAT_INCR(env->ienv->stats, MTX_LOCK);
+
+ WT_ERR(pthread_mutex_unlock(&mtx->mtx));
+ return;
+
+err: __wt_api_env_err(env, ret, "mutex lock failed");
+ __wt_abort(env);
+}
+
+/*
+ * __wt_unlock --
+ * Release a mutex.
+ */
+void
+__wt_unlock(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ WT_VERBOSE(env,
+ WT_VERB_MUTEX, (env, "unlock %s mutex (%p)", mtx->name, mtx));
+
+ ret = 0;
+ WT_ERR(pthread_mutex_lock(&mtx->mtx));
+ mtx->locked = 0;
+ WT_ERR(pthread_cond_signal(&mtx->cond));
+
+ WT_ERR(pthread_mutex_unlock(&mtx->mtx));
+ return;
+
+err: __wt_api_env_err(env, ret, "mutex unlock failed");
+ __wt_abort(NULL);
+}
+
+/*
+ * __wt_mtx_destroy --
+ * Destroy a mutex.
+ */
+int
+__wt_mtx_destroy(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ ret = pthread_cond_destroy(&mtx->cond);
+ WT_TRET(pthread_mutex_destroy(&mtx->mtx));
+
+ __wt_free(env, mtx, sizeof(WT_MTX));
+
+ return (ret == 0 ? 0 : WT_ERROR);
+}
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
new file mode 100644
index 00000000000..971fe47f11b
--- /dev/null
+++ b/src/os_posix/os_open.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(ENV *env, const char *name, mode_t mode, int ok_create, WT_FH **fhp)
+{
+ IDB *idb;
+ IENV *ienv;
+ WT_FH *fh;
+ int f, fd, ret;
+
+ fh = NULL;
+ ienv = env->ienv;
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ if ((fh = idb->fh) == NULL)
+ continue;
+ if (strcmp(name, idb->name) == 0) {
+ ++fh->refcnt;
+ *fhp = fh;
+ break;
+ }
+ }
+ __wt_unlock(env, ienv->mtx);
+ if (fh != NULL)
+ return (0);
+
+ f = O_RDWR;
+#ifdef O_BINARY
+ /* Windows clones: we always want to treat the file as a binary. */
+ f |= O_BINARY;
+#endif
+ if (ok_create)
+ f |= O_CREAT;
+
+ if ((fd = open(name, f, mode)) == -1) {
+ __wt_api_env_err(env, errno, "%s", name);
+ return (WT_ERROR);
+ }
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_stat_alloc_fh_stats(env, &fh->stats));
+ WT_ERR(__wt_strdup(env, name, &fh->name));
+
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC)
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles. There's an obvious
+ * race here...
+ */
+ if ((f = fcntl(fd, F_GETFD)) == -1 ||
+ fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) {
+ __wt_api_env_err(env, errno, "%s: fcntl", name);
+ goto err;
+ }
+#endif
+
+ fh->fd = fd;
+ fh->refcnt = 1;
+ *fhp = fh;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(env, fh, &fh->file_size));
+
+ /* Link onto the environment's list of files. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_INSERT_TAIL(&ienv->fhqh, fh, q);
+ __wt_unlock(env, ienv->mtx);
+
+ return (0);
+
+err: if (fh != NULL) {
+ if (fh->name != NULL)
+ __wt_free(env, fh->name, 0);
+ __wt_free(env, fh, sizeof(WT_FH));
+ }
+ (void)close(fd);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(ENV *env, WT_FH *fh)
+{
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ if (fh == NULL || fh->refcnt == 0 || --fh->refcnt > 0)
+ return (0);
+
+ /* Remove from the list and discard the memory. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_REMOVE(&ienv->fhqh, fh, q);
+ __wt_unlock(env, ienv->mtx);
+
+ if (close(fh->fd) != 0) {
+ __wt_api_env_err(env, errno, "%s", fh->name);
+ ret = WT_ERROR;
+ }
+
+ __wt_free(env, fh->name, 0);
+ __wt_free(env, fh->stats, 0);
+ __wt_free(env, fh, sizeof(WT_FH));
+ return (ret);
+}
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..1ce48f3ec56
--- /dev/null
+++ b/src/os_posix/os_rw.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+ WT_STAT_INCR(fh->stats, READ_IO);
+ WT_STAT_INCR(env->ienv->stats, TOTAL_READ_IO);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS,
+ (env, "fileops: %s: read %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset));
+
+ if (pread(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+ return (0);
+
+ __wt_api_env_err(env, errno,
+ "%s read error: attempt to read %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+ WT_STAT_INCR(fh->stats, WRITE_IO);
+ WT_STAT_INCR(env->ienv->stats, TOTAL_WRITE_IO);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS,
+ (env, "fileops: %s: write %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset));
+
+ if (pwrite(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+ return (0);
+
+ __wt_api_env_err(env, errno,
+ "%s write error: attempt to write %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset);
+ return (WT_ERROR);
+}
diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..74b86a30d42
--- /dev/null
+++ b/src/os_posix/os_sleep.c
@@ -0,0 +1,25 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ struct timeval t;
+
+ t.tv_sec = (long)seconds + micro_seconds / 1000000;
+ t.tv_usec = (long)micro_seconds % 1000000;
+
+ (void)select(0, NULL, NULL, NULL, &t);
+}
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..3fb62a482d8
--- /dev/null
+++ b/src/os_posix/os_thread.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(pthread_t *tidret, void *(*func)(void *), void *arg)
+{
+ /* Spawn a new thread of control. */
+ return (pthread_create(tidret, NULL, func, arg) == 0 ? 0 : WT_ERROR);
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+void
+__wt_thread_join(pthread_t tid)
+{
+ (void)pthread_join(tid, NULL);
+}
diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c
new file mode 100644
index 00000000000..a13b407150d
--- /dev/null
+++ b/src/os_posix/os_yield.c
@@ -0,0 +1,24 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+#ifdef HAVE_PTHREAD_YIELD
+ pthread_yield();
+#else
+ sched_yield();
+#endif
+}
diff --git a/src/support/api.c b/src/support/api.c
new file mode 100644
index 00000000000..cb7b48a7d69
--- /dev/null
+++ b/src/support/api.c
@@ -0,0 +1,1597 @@
+/* DO NOT EDIT: automatically built by dist/api.py. */
+
+#include "wt_internal.h"
+
+static int __wt_api_db_btree_compare_dup_get(
+ DB *db,
+ int (**btree_compare_dup)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_dup_get(
+ DB *db,
+ int (**btree_compare_dup)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_GET);
+ *btree_compare_dup = db->btree_compare_dup;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_dup_set(
+ DB *db,
+ int (*btree_compare_dup)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_dup_set(
+ DB *db,
+ int (*btree_compare_dup)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_SET);
+ db->btree_compare_dup = btree_compare_dup;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_get(
+ DB *db,
+ int (**btree_compare)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_get(
+ DB *db,
+ int (**btree_compare)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_GET);
+ *btree_compare = db->btree_compare;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_int_get(
+ DB *db,
+ int *btree_compare_int);
+static int __wt_api_db_btree_compare_int_get(
+ DB *db,
+ int *btree_compare_int)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_GET);
+ *btree_compare_int = db->btree_compare_int;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_int_set(
+ DB *db,
+ int btree_compare_int);
+static int __wt_api_db_btree_compare_int_set(
+ DB *db,
+ int btree_compare_int)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_RET((__wt_db_btree_compare_int_set_verify(
+ db, btree_compare_int)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_SET);
+ db->btree_compare_int = btree_compare_int;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_set(
+ DB *db,
+ int (*btree_compare)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_set(
+ DB *db,
+ int (*btree_compare)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_SET);
+ db->btree_compare = btree_compare;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_dup_offpage_get(
+ DB *db,
+ uint32_t *btree_dup_offpage);
+static int __wt_api_db_btree_dup_offpage_get(
+ DB *db,
+ uint32_t *btree_dup_offpage)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_GET);
+ *btree_dup_offpage = db->btree_dup_offpage;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_dup_offpage_set(
+ DB *db,
+ uint32_t btree_dup_offpage);
+static int __wt_api_db_btree_dup_offpage_set(
+ DB *db,
+ uint32_t btree_dup_offpage)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_RET((__wt_db_btree_dup_offpage_set_verify(
+ db, btree_dup_offpage)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_SET);
+ db->btree_dup_offpage = btree_dup_offpage;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_itemsize_get(
+ DB *db,
+ uint32_t *intlitemsize,
+ uint32_t *leafitemsize);
+static int __wt_api_db_btree_itemsize_get(
+ DB *db,
+ uint32_t *intlitemsize,
+ uint32_t *leafitemsize)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_GET);
+ *intlitemsize = db->intlitemsize;
+ *leafitemsize = db->leafitemsize;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_itemsize_set(
+ DB *db,
+ uint32_t intlitemsize,
+ uint32_t leafitemsize);
+static int __wt_api_db_btree_itemsize_set(
+ DB *db,
+ uint32_t intlitemsize,
+ uint32_t leafitemsize)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_SET);
+ db->intlitemsize = intlitemsize;
+ db->leafitemsize = leafitemsize;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_pagesize_get(
+ DB *db,
+ uint32_t *allocsize,
+ uint32_t *intlmin,
+ uint32_t *intlmax,
+ uint32_t *leafmin,
+ uint32_t *leafmax);
+static int __wt_api_db_btree_pagesize_get(
+ DB *db,
+ uint32_t *allocsize,
+ uint32_t *intlmin,
+ uint32_t *intlmax,
+ uint32_t *leafmin,
+ uint32_t *leafmax)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_GET);
+ *allocsize = db->allocsize;
+ *intlmin = db->intlmin;
+ *intlmax = db->intlmax;
+ *leafmin = db->leafmin;
+ *leafmax = db->leafmax;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_pagesize_set(
+ DB *db,
+ uint32_t allocsize,
+ uint32_t intlmin,
+ uint32_t intlmax,
+ uint32_t leafmin,
+ uint32_t leafmax);
+static int __wt_api_db_btree_pagesize_set(
+ DB *db,
+ uint32_t allocsize,
+ uint32_t intlmin,
+ uint32_t intlmax,
+ uint32_t leafmin,
+ uint32_t leafmax)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_SET);
+ db->allocsize = allocsize;
+ db->intlmin = intlmin;
+ db->intlmax = intlmax;
+ db->leafmin = leafmin;
+ db->leafmax = leafmax;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_bulk_load(
+ DB *db,
+ uint32_t flags,
+ void (*progress)(const char *, uint64_t),
+ int (*cb)(DB *, DBT **, DBT **));
+static int __wt_api_db_bulk_load(
+ DB *db,
+ uint32_t flags,
+ void (*progress)(const char *, uint64_t),
+ int (*cb)(DB *, DBT **, DBT **))
+{
+ const char *method_name = "DB.bulk_load";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_BULK_LOAD);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_BULK_LOAD);
+ ret = __wt_db_bulk_load(toc, flags, progress, cb);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_close(
+ DB *db,
+ uint32_t flags);
+static int __wt_api_db_close(
+ DB *db,
+ uint32_t flags)
+{
+ const char *method_name = "DB.close";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_CLOSE);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_CLOSE);
+ ret = __wt_db_close(toc, flags);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_col_del(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ uint32_t flags);
+static int __wt_api_db_col_del(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_del";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_DEL);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_DEL);
+ while ((ret = __wt_db_col_del(toc, recno)) == WT_RESTART)
+ WT_STAT_INCR(ienv->method_stats, DB_COL_DEL_RESTART);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_col_get(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags);
+static int __wt_api_db_col_get(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_get";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_GET);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_GET);
+ ret = __wt_db_col_get(toc, recno, data);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_col_put(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags);
+static int __wt_api_db_col_put(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_put";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_PUT);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_PUT);
+ while ((ret = __wt_db_col_put(toc, recno, data)) == WT_RESTART)
+ WT_STAT_INCR(ienv->method_stats, DB_COL_PUT_RESTART);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_column_set(
+ DB *db,
+ uint32_t fixed_len,
+ const char *dictionary,
+ uint32_t flags);
+static int __wt_api_db_column_set(
+ DB *db,
+ uint32_t fixed_len,
+ const char *dictionary,
+ uint32_t flags)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_ENV_FCHK(env, "DB.column_set",
+ flags, WT_APIMASK_DB_COLUMN_SET);
+
+ WT_RET((__wt_db_column_set_verify(
+ db, fixed_len, dictionary, flags)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_COLUMN_SET);
+ db->fixed_len = fixed_len;
+ db->dictionary = dictionary;
+ db->flags = flags;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_dump(
+ DB *db,
+ FILE *stream,
+ void (*progress)(const char *, uint64_t),
+ uint32_t flags);
+static int __wt_api_db_dump(
+ DB *db,
+ FILE *stream,
+ void (*progress)(const char *, uint64_t),
+ uint32_t flags)
+{
+ const char *method_name = "DB.dump";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_DUMP);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_DUMP);
+ ret = __wt_db_dump(toc, stream, progress, flags);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_errcall_get(
+ DB *db,
+ void (**errcall)(const DB *, const char *));
+static int __wt_api_db_errcall_get(
+ DB *db,
+ void (**errcall)(const DB *, const char *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_GET);
+ *errcall = db->errcall;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
/*
 * __wt_api_db_errcall_set --
 *	DB.errcall_set method: install the DB handle's error-message callback.
 */
static int __wt_api_db_errcall_set(
	DB *db,
	void (*errcall)(const DB *, const char *));
static int __wt_api_db_errcall_set(
	DB *db,
	void (*errcall)(const DB *, const char *))
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	/* Field reads/writes are serialized on the global IENV mutex. */
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_SET);
	db->errcall = errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errfile_get --
 *	DB.errfile_get method: return the DB handle's error stream.
 */
static int __wt_api_db_errfile_get(
	DB *db,
	FILE **errfile);
static int __wt_api_db_errfile_get(
	DB *db,
	FILE **errfile)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_GET);
	*errfile = db->errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errfile_set --
 *	DB.errfile_set method: set the DB handle's error stream.
 */
static int __wt_api_db_errfile_set(
	DB *db,
	FILE *errfile);
static int __wt_api_db_errfile_set(
	DB *db,
	FILE *errfile)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_SET);
	db->errfile = errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errpfx_get --
 *	DB.errpfx_get method: return the DB handle's error-message prefix.
 */
static int __wt_api_db_errpfx_get(
	DB *db,
	const char **errpfx);
static int __wt_api_db_errpfx_get(
	DB *db,
	const char **errpfx)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_GET);
	*errpfx = db->errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errpfx_set --
 *	DB.errpfx_set method: set the DB handle's error-message prefix.
 *	NOTE(review): the string is stored by reference, not copied — the
 *	caller's buffer must presumably outlive the handle; confirm.
 */
static int __wt_api_db_errpfx_set(
	DB *db,
	const char *errpfx);
static int __wt_api_db_errpfx_set(
	DB *db,
	const char *errpfx)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_SET);
	db->errpfx = errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_db_huffman_set --
 *	DB.huffman_set method: validate the flags, then hand the Huffman
 *	table to the underlying implementation while holding the IENV mutex.
 */
static int __wt_api_db_huffman_set(
	DB *db,
	uint8_t const *huffman_table,
	u_int huffman_table_size,
	uint32_t huffman_flags);
static int __wt_api_db_huffman_set(
	DB *db,
	uint8_t const *huffman_table,
	u_int huffman_table_size,
	uint32_t huffman_flags)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	/* Reject flag bits outside the method's mask before taking the lock. */
	WT_ENV_FCHK(env, "DB.huffman_set",
	    huffman_flags, WT_APIMASK_DB_HUFFMAN_SET);

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_HUFFMAN_SET);
	ret = __wt_db_huffman_set(
	    db, huffman_table, huffman_table_size, huffman_flags);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
/*
 * __wt_api_db_open --
 *	DB.open method: acquire a WT_TOC for the call, run the real open,
 *	then release the WT_TOC (the trailing 1 argument's semantics are
 *	defined by __wt_toc_api_clr — confirm there).
 */
static int __wt_api_db_open(
	DB *db,
	const char *name,
	mode_t mode,
	uint32_t flags);
static int __wt_api_db_open(
	DB *db,
	const char *name,
	mode_t mode,
	uint32_t flags)
{
	const char *method_name = "DB.open";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_OPEN);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_OPEN);
	ret = __wt_db_open(toc, name, mode, flags);
	/* WT_TRET preserves the first error while still clearing the TOC. */
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}
+
/*
 * __wt_api_db_row_del --
 *	DB.row_del method: delete a key from a row-store database,
 *	retrying (and counting) WT_RESTART returns.
 */
static int __wt_api_db_row_del(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	uint32_t flags);
static int __wt_api_db_row_del(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	uint32_t flags)
{
	const char *method_name = "DB.row_del";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);	/* row-store handles only */
	WT_DB_RDONLY(db, method_name);		/* fail on read-only handles */
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_DEL);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL);
	/* WT_RESTART means "try again"; keep a statistic of the restarts. */
	while ((ret = __wt_db_row_del(toc, key)) == WT_RESTART)
		WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL_RESTART);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}

/*
 * __wt_api_db_row_get --
 *	DB.row_get method: search a row-store database for a key.
 */
static int __wt_api_db_row_get(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags);
static int __wt_api_db_row_get(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags)
{
	const char *method_name = "DB.row_get";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_GET);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_GET);
	ret = __wt_db_row_get(toc, key, data);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}

/*
 * __wt_api_db_row_put --
 *	DB.row_put method: store a key/data pair in a row-store database,
 *	retrying (and counting) WT_RESTART returns.
 */
static int __wt_api_db_row_put(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags);
static int __wt_api_db_row_put(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags)
{
	const char *method_name = "DB.row_put";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);
	WT_DB_RDONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_PUT);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT);
	while ((ret = __wt_db_row_put(toc, key, data)) == WT_RESTART)
		WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT_RESTART);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}
+
/*
 * __wt_api_db_stat_clear --
 *	DB.stat_clear method: clear the handle's statistics.  Unlike the
 *	other DB methods, no lock or WT_TOC is taken here.
 */
static int __wt_api_db_stat_clear(
	DB *db,
	uint32_t flags);
static int __wt_api_db_stat_clear(
	DB *db,
	uint32_t flags)
{
	const char *method_name = "DB.stat_clear";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_CLEAR);
	WT_STAT_INCR(ienv->method_stats, DB_STAT_CLEAR);
	ret = __wt_db_stat_clear(db);
	return (ret);
}

/*
 * __wt_api_db_stat_print --
 *	DB.stat_print method: print the handle's statistics to a stream.
 */
static int __wt_api_db_stat_print(
	DB *db,
	FILE *stream,
	uint32_t flags);
static int __wt_api_db_stat_print(
	DB *db,
	FILE *stream,
	uint32_t flags)
{
	const char *method_name = "DB.stat_print";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_PRINT);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_STAT_PRINT);
	ret = __wt_db_stat_print(toc, stream);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}

/*
 * __wt_api_db_sync --
 *	DB.sync method: flush the database, reporting progress through
 *	the optional callback.
 */
static int __wt_api_db_sync(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_db_sync(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "DB.sync";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_DB_RDONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_SYNC);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_SYNC);
	ret = __wt_db_sync(toc, progress, flags);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}

/*
 * __wt_api_db_verify --
 *	DB.verify method: verify the database, reporting progress through
 *	the optional callback.
 */
static int __wt_api_db_verify(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_db_verify(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "DB.verify";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_VERIFY);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_VERIFY);
	ret = __wt_db_verify(toc, progress);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}
+
/*
 * __wt_api_env_cache_size_get --
 *	ENV.cache_size_get method: return the configured cache size.
 */
static int __wt_api_env_cache_size_get(
	ENV *env,
	uint32_t *cache_size);
static int __wt_api_env_cache_size_get(
	ENV *env,
	uint32_t *cache_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_GET);
	*cache_size = env->cache_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_cache_size_set --
 *	ENV.cache_size_set method: validate, then set the cache size.
 */
static int __wt_api_env_cache_size_set(
	ENV *env,
	uint32_t cache_size);
static int __wt_api_env_cache_size_set(
	ENV *env,
	uint32_t cache_size)
{
	IENV *ienv = env->ienv;
	/* Reject out-of-range values before taking the lock. */
	WT_RET((__wt_env_cache_size_set_verify(
	    env, cache_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_SET);
	env->cache_size = cache_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_close --
 *	ENV.close method: close the environment handle.
 */
static int __wt_api_env_close(
	ENV *env,
	uint32_t flags);
static int __wt_api_env_close(
	ENV *env,
	uint32_t flags)
{
	const char *method_name = "ENV.close";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_CLOSE);
	WT_STAT_INCR(ienv->method_stats, ENV_CLOSE);
	ret = __wt_env_close(env);
	return (ret);
}

/*
 * __wt_api_env_data_update_initial_get --
 *	ENV.data_update_initial_get method: return the initial update
 *	buffer size.
 */
static int __wt_api_env_data_update_initial_get(
	ENV *env,
	uint32_t *data_update_initial);
static int __wt_api_env_data_update_initial_get(
	ENV *env,
	uint32_t *data_update_initial)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_GET);
	*data_update_initial = env->data_update_initial;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_initial_set --
 *	ENV.data_update_initial_set method: set the initial update
 *	buffer size (no verify step, unlike cache_size_set).
 */
static int __wt_api_env_data_update_initial_set(
	ENV *env,
	uint32_t data_update_initial);
static int __wt_api_env_data_update_initial_set(
	ENV *env,
	uint32_t data_update_initial)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_SET);
	env->data_update_initial = data_update_initial;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_max_get --
 *	ENV.data_update_max_get method: return the maximum update
 *	buffer size.
 */
static int __wt_api_env_data_update_max_get(
	ENV *env,
	uint32_t *data_update_max);
static int __wt_api_env_data_update_max_get(
	ENV *env,
	uint32_t *data_update_max)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_GET);
	*data_update_max = env->data_update_max;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_max_set --
 *	ENV.data_update_max_set method: set the maximum update buffer size.
 */
static int __wt_api_env_data_update_max_set(
	ENV *env,
	uint32_t data_update_max);
static int __wt_api_env_data_update_max_set(
	ENV *env,
	uint32_t data_update_max)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_SET);
	env->data_update_max = data_update_max;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_db --
 *	ENV.db method: create a DB handle in this environment.
 */
static int __wt_api_env_db(
	ENV *env,
	uint32_t flags,
	DB **dbp);
static int __wt_api_env_db(
	ENV *env,
	uint32_t flags,
	DB **dbp)
{
	const char *method_name = "ENV.db";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_DB);
	WT_STAT_INCR(ienv->method_stats, ENV_DB);
	ret = __wt_env_db(env, dbp);
	return (ret);
}

/*
 * __wt_api_env_errcall_get --
 *	ENV.errcall_get method: return the error-message callback.
 */
static int __wt_api_env_errcall_get(
	ENV *env,
	void (**errcall)(const ENV *, const char *));
static int __wt_api_env_errcall_get(
	ENV *env,
	void (**errcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_GET);
	*errcall = env->errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errcall_set --
 *	ENV.errcall_set method: set the error-message callback.
 */
static int __wt_api_env_errcall_set(
	ENV *env,
	void (*errcall)(const ENV *, const char *));
static int __wt_api_env_errcall_set(
	ENV *env,
	void (*errcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_SET);
	env->errcall = errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errfile_get --
 *	ENV.errfile_get method: return the error stream.
 */
static int __wt_api_env_errfile_get(
	ENV *env,
	FILE **errfile);
static int __wt_api_env_errfile_get(
	ENV *env,
	FILE **errfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_GET);
	*errfile = env->errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errfile_set --
 *	ENV.errfile_set method: set the error stream.
 */
static int __wt_api_env_errfile_set(
	ENV *env,
	FILE *errfile);
static int __wt_api_env_errfile_set(
	ENV *env,
	FILE *errfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_SET);
	env->errfile = errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errpfx_get --
 *	ENV.errpfx_get method: return the error-message prefix.
 */
static int __wt_api_env_errpfx_get(
	ENV *env,
	const char **errpfx);
static int __wt_api_env_errpfx_get(
	ENV *env,
	const char **errpfx)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_GET);
	*errpfx = env->errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errpfx_set --
 *	ENV.errpfx_set method: set the error-message prefix (stored by
 *	reference, not copied).
 */
static int __wt_api_env_errpfx_set(
	ENV *env,
	const char *errpfx);
static int __wt_api_env_errpfx_set(
	ENV *env,
	const char *errpfx)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_SET);
	env->errpfx = errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_env_hazard_size_get --
 *	ENV.hazard_size_get method: return the hazard-reference array size.
 */
static int __wt_api_env_hazard_size_get(
	ENV *env,
	uint32_t *hazard_size);
static int __wt_api_env_hazard_size_get(
	ENV *env,
	uint32_t *hazard_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_GET);
	*hazard_size = env->hazard_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_hazard_size_set --
 *	ENV.hazard_size_set method: validate, then set the hazard-reference
 *	array size.
 */
static int __wt_api_env_hazard_size_set(
	ENV *env,
	uint32_t hazard_size);
static int __wt_api_env_hazard_size_set(
	ENV *env,
	uint32_t hazard_size)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_hazard_size_set_verify(
	    env, hazard_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_SET);
	env->hazard_size = hazard_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgcall_get --
 *	ENV.msgcall_get method: return the informational-message callback.
 */
static int __wt_api_env_msgcall_get(
	ENV *env,
	void (**msgcall)(const ENV *, const char *));
static int __wt_api_env_msgcall_get(
	ENV *env,
	void (**msgcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_GET);
	*msgcall = env->msgcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgcall_set --
 *	ENV.msgcall_set method: set the informational-message callback.
 */
static int __wt_api_env_msgcall_set(
	ENV *env,
	void (*msgcall)(const ENV *, const char *));
static int __wt_api_env_msgcall_set(
	ENV *env,
	void (*msgcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_SET);
	env->msgcall = msgcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgfile_get --
 *	ENV.msgfile_get method: return the informational-message stream.
 */
static int __wt_api_env_msgfile_get(
	ENV *env,
	FILE **msgfile);
static int __wt_api_env_msgfile_get(
	ENV *env,
	FILE **msgfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_GET);
	*msgfile = env->msgfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgfile_set --
 *	ENV.msgfile_set method: set the informational-message stream.
 */
static int __wt_api_env_msgfile_set(
	ENV *env,
	FILE *msgfile);
static int __wt_api_env_msgfile_set(
	ENV *env,
	FILE *msgfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_SET);
	env->msgfile = msgfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_env_open --
 *	ENV.open method: open the environment.
 */
static int __wt_api_env_open(
	ENV *env,
	const char *home,
	mode_t mode,
	uint32_t flags);
static int __wt_api_env_open(
	ENV *env,
	const char *home,
	mode_t mode,
	uint32_t flags)
{
	const char *method_name = "ENV.open";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_OPEN);
	WT_STAT_INCR(ienv->method_stats, ENV_OPEN);
	/* Note: flags are validated above but not passed to __wt_env_open. */
	ret = __wt_env_open(env, home, mode);
	return (ret);
}

/*
 * __wt_api_env_stat_clear --
 *	ENV.stat_clear method: clear the environment's statistics.
 */
static int __wt_api_env_stat_clear(
	ENV *env,
	uint32_t flags);
static int __wt_api_env_stat_clear(
	ENV *env,
	uint32_t flags)
{
	const char *method_name = "ENV.stat_clear";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_CLEAR);
	WT_STAT_INCR(ienv->method_stats, ENV_STAT_CLEAR);
	ret = __wt_env_stat_clear(env);
	return (ret);
}

/*
 * __wt_api_env_stat_print --
 *	ENV.stat_print method: print the environment's statistics.
 */
static int __wt_api_env_stat_print(
	ENV *env,
	FILE *stream,
	uint32_t flags);
static int __wt_api_env_stat_print(
	ENV *env,
	FILE *stream,
	uint32_t flags)
{
	const char *method_name = "ENV.stat_print";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_PRINT);
	WT_STAT_INCR(ienv->method_stats, ENV_STAT_PRINT);
	ret = __wt_env_stat_print(env, stream);
	return (ret);
}

/*
 * __wt_api_env_sync --
 *	ENV.sync method: flush the environment, reporting progress through
 *	the optional callback.
 */
static int __wt_api_env_sync(
	ENV *env,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_env_sync(
	ENV *env,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "ENV.sync";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_SYNC);
	WT_STAT_INCR(ienv->method_stats, ENV_SYNC);
	ret = __wt_env_sync(env, progress);
	return (ret);
}

/*
 * __wt_api_env_toc --
 *	ENV.toc method: allocate a WT_TOC (thread-of-control) handle,
 *	holding the IENV mutex across the allocation.
 */
static int __wt_api_env_toc(
	ENV *env,
	uint32_t flags,
	WT_TOC **tocp);
static int __wt_api_env_toc(
	ENV *env,
	uint32_t flags,
	WT_TOC **tocp)
{
	const char *method_name = "ENV.toc";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_TOC);
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC);
	ret = __wt_env_toc(env, tocp);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
/*
 * __wt_api_env_toc_size_get --
 *	ENV.toc_size_get method: return the maximum number of WT_TOC handles.
 */
static int __wt_api_env_toc_size_get(
	ENV *env,
	uint32_t *toc_size);
static int __wt_api_env_toc_size_get(
	ENV *env,
	uint32_t *toc_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_GET);
	*toc_size = env->toc_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_toc_size_set --
 *	ENV.toc_size_set method: validate, then set the maximum number of
 *	WT_TOC handles.
 */
static int __wt_api_env_toc_size_set(
	ENV *env,
	uint32_t toc_size);
static int __wt_api_env_toc_size_set(
	ENV *env,
	uint32_t toc_size)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_toc_size_set_verify(
	    env, toc_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_SET);
	env->toc_size = toc_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_verbose_get --
 *	ENV.verbose_get method: return the verbosity configuration.
 */
static int __wt_api_env_verbose_get(
	ENV *env,
	uint32_t *verbose);
static int __wt_api_env_verbose_get(
	ENV *env,
	uint32_t *verbose)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_GET);
	*verbose = env->verbose;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_verbose_set --
 *	ENV.verbose_set method: validate, then set the verbosity
 *	configuration.
 */
static int __wt_api_env_verbose_set(
	ENV *env,
	uint32_t verbose);
static int __wt_api_env_verbose_set(
	ENV *env,
	uint32_t verbose)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_verbose_set_verify(
	    env, verbose)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_SET);
	env->verbose = verbose;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_wt_toc_close --
 *	WT_TOC.close method: discard a thread-of-control handle, holding
 *	the IENV mutex across the teardown.
 */
static int __wt_api_wt_toc_close(
	WT_TOC *wt_toc,
	uint32_t flags);
static int __wt_api_wt_toc_close(
	WT_TOC *wt_toc,
	uint32_t flags)
{
	const char *method_name = "WT_TOC.close";
	ENV *env = wt_toc->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_WT_TOC_CLOSE);
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, WT_TOC_CLOSE);
	ret = __wt_wt_toc_close(wt_toc);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
+void
+__wt_methods_db_config_default(DB *db)
+{
+ db->btree_compare_dup = __wt_bt_lex_compare;
+ db->btree_compare = __wt_bt_lex_compare;
+}
+
/*
 * __wt_methods_db_lockout --
 *	Point every configurable DB method at __wt_db_lockout (cast to the
 *	method's exact signature), locking the handle's API down.
 *	NOTE(review): __wt_db_lockout's behavior isn't visible here —
 *	presumably it reports an error; confirm in its definition.  The
 *	casts rely on calling __wt_db_lockout through mismatched function
 *	types, a long-standing convention of this generated file.
 */
void
__wt_methods_db_lockout(DB *db)
{
	db->btree_compare_dup_get = (int (*)
	    (DB *, int (**)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_dup_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_get = (int (*)
	    (DB *, int (**)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_int_get = (int (*)
	    (DB *, int *))
	    __wt_db_lockout;
	db->btree_compare_int_set = (int (*)
	    (DB *, int ))
	    __wt_db_lockout;
	db->btree_compare_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_dup_offpage_get = (int (*)
	    (DB *, uint32_t *))
	    __wt_db_lockout;
	db->btree_dup_offpage_set = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->btree_itemsize_get = (int (*)
	    (DB *, uint32_t *, uint32_t *))
	    __wt_db_lockout;
	db->btree_itemsize_set = (int (*)
	    (DB *, uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->btree_pagesize_get = (int (*)
	    (DB *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *))
	    __wt_db_lockout;
	db->btree_pagesize_set = (int (*)
	    (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->bulk_load = (int (*)
	    (DB *, uint32_t , void (*)(const char *, uint64_t), int (*)(DB *, DBT **, DBT **)))
	    __wt_db_lockout;
	db->col_del = (int (*)
	    (DB *, WT_TOC *, uint64_t , uint32_t ))
	    __wt_db_lockout;
	db->col_get = (int (*)
	    (DB *, WT_TOC *, uint64_t , DBT *, uint32_t ))
	    __wt_db_lockout;
	db->col_put = (int (*)
	    (DB *, WT_TOC *, uint64_t , DBT *, uint32_t ))
	    __wt_db_lockout;
	db->column_set = (int (*)
	    (DB *, uint32_t , const char *, uint32_t ))
	    __wt_db_lockout;
	db->dump = (int (*)
	    (DB *, FILE *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
	db->err = (void (*)
	    (DB *, int , const char *, ...))
	    __wt_db_lockout;
	db->errcall_get = (int (*)
	    (DB *, void (**)(const DB *, const char *)))
	    __wt_db_lockout;
	db->errcall_set = (int (*)
	    (DB *, void (*)(const DB *, const char *)))
	    __wt_db_lockout;
	db->errfile_get = (int (*)
	    (DB *, FILE **))
	    __wt_db_lockout;
	db->errfile_set = (int (*)
	    (DB *, FILE *))
	    __wt_db_lockout;
	db->errpfx_get = (int (*)
	    (DB *, const char **))
	    __wt_db_lockout;
	db->errpfx_set = (int (*)
	    (DB *, const char *))
	    __wt_db_lockout;
	db->errx = (void (*)
	    (DB *, const char *, ...))
	    __wt_db_lockout;
	db->huffman_set = (int (*)
	    (DB *, uint8_t const *, u_int , uint32_t ))
	    __wt_db_lockout;
	db->open = (int (*)
	    (DB *, const char *, mode_t , uint32_t ))
	    __wt_db_lockout;
	db->row_del = (int (*)
	    (DB *, WT_TOC *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->row_get = (int (*)
	    (DB *, WT_TOC *, DBT *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->row_put = (int (*)
	    (DB *, WT_TOC *, DBT *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->stat_clear = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->stat_print = (int (*)
	    (DB *, FILE *, uint32_t ))
	    __wt_db_lockout;
	db->sync = (int (*)
	    (DB *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
	db->verify = (int (*)
	    (DB *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
}
+
/*
 * __wt_methods_db_init_transition --
 *	Install the DB methods that are legal as soon as the handle is
 *	created (configuration, error reporting, open/close).  Data-access
 *	methods are installed later by __wt_methods_db_open_transition.
 */
void
__wt_methods_db_init_transition(DB *db)
{
	db->btree_compare_dup_get = __wt_api_db_btree_compare_dup_get;
	db->btree_compare_dup_set = __wt_api_db_btree_compare_dup_set;
	db->btree_compare_get = __wt_api_db_btree_compare_get;
	db->btree_compare_int_get = __wt_api_db_btree_compare_int_get;
	db->btree_compare_int_set = __wt_api_db_btree_compare_int_set;
	db->btree_compare_set = __wt_api_db_btree_compare_set;
	db->btree_dup_offpage_get = __wt_api_db_btree_dup_offpage_get;
	db->btree_dup_offpage_set = __wt_api_db_btree_dup_offpage_set;
	db->btree_itemsize_get = __wt_api_db_btree_itemsize_get;
	db->btree_itemsize_set = __wt_api_db_btree_itemsize_set;
	db->btree_pagesize_get = __wt_api_db_btree_pagesize_get;
	db->btree_pagesize_set = __wt_api_db_btree_pagesize_set;
	db->close = __wt_api_db_close;
	db->column_set = __wt_api_db_column_set;
	db->err = __wt_api_db_err;
	db->errcall_get = __wt_api_db_errcall_get;
	db->errcall_set = __wt_api_db_errcall_set;
	db->errfile_get = __wt_api_db_errfile_get;
	db->errfile_set = __wt_api_db_errfile_set;
	db->errpfx_get = __wt_api_db_errpfx_get;
	db->errpfx_set = __wt_api_db_errpfx_set;
	db->errx = __wt_api_db_errx;
	db->huffman_set = __wt_api_db_huffman_set;
	db->open = __wt_api_db_open;
}
+
/*
 * __wt_methods_db_open_transition --
 *	Method switch performed when a DB handle is opened: lock out the
 *	configuration methods that may no longer be changed, and install
 *	the data-access methods that are now legal.
 */
void
__wt_methods_db_open_transition(DB *db)
{
	/* Configuration that's fixed once the file is open: lock it out. */
	db->btree_compare_dup_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_int_set = (int (*)
	    (DB *, int ))
	    __wt_db_lockout;
	db->btree_compare_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_dup_offpage_set = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->btree_itemsize_set = (int (*)
	    (DB *, uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->btree_pagesize_set = (int (*)
	    (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->column_set = (int (*)
	    (DB *, uint32_t , const char *, uint32_t ))
	    __wt_db_lockout;
	db->huffman_set = (int (*)
	    (DB *, uint8_t const *, u_int , uint32_t ))
	    __wt_db_lockout;
	/* Data-access methods become legal once the file is open. */
	db->bulk_load = __wt_api_db_bulk_load;
	db->col_del = __wt_api_db_col_del;
	db->col_get = __wt_api_db_col_get;
	db->col_put = __wt_api_db_col_put;
	db->dump = __wt_api_db_dump;
	db->row_del = __wt_api_db_row_del;
	db->row_get = __wt_api_db_row_get;
	db->row_put = __wt_api_db_row_put;
	db->stat_clear = __wt_api_db_stat_clear;
	db->stat_print = __wt_api_db_stat_print;
	db->sync = __wt_api_db_sync;
	db->verify = __wt_api_db_verify;
}
+
/*
 * __wt_methods_env_config_default --
 *	Set the ENV handle's default configuration values.
 *	NOTE(review): units aren't visible here — cache_size is presumably
 *	in MB and the data_update values in bytes; confirm against the
 *	setters' documentation.
 */
void
__wt_methods_env_config_default(ENV *env)
{
	env->cache_size = 20;
	env->data_update_initial = 8 * 1024;
	env->data_update_max = 32 * 1024;
	env->hazard_size = 15;
	env->toc_size = 50;
}
+
/*
 * __wt_methods_env_lockout --
 *	Point every configurable ENV method at __wt_env_lockout (cast to
 *	the method's exact signature), locking the handle's API down.
 *	NOTE(review): __wt_env_lockout's behavior isn't visible here —
 *	presumably it reports an error; confirm in its definition.
 */
void
__wt_methods_env_lockout(ENV *env)
{
	env->cache_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->cache_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->data_update_initial_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->data_update_initial_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->data_update_max_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->data_update_max_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->db = (int (*)
	    (ENV *, uint32_t , DB **))
	    __wt_env_lockout;
	env->err = (void (*)
	    (ENV *, int , const char *, ...))
	    __wt_env_lockout;
	env->errcall_get = (int (*)
	    (ENV *, void (**)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->errcall_set = (int (*)
	    (ENV *, void (*)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->errfile_get = (int (*)
	    (ENV *, FILE **))
	    __wt_env_lockout;
	env->errfile_set = (int (*)
	    (ENV *, FILE *))
	    __wt_env_lockout;
	env->errpfx_get = (int (*)
	    (ENV *, const char **))
	    __wt_env_lockout;
	env->errpfx_set = (int (*)
	    (ENV *, const char *))
	    __wt_env_lockout;
	env->errx = (void (*)
	    (ENV *, const char *, ...))
	    __wt_env_lockout;
	env->hazard_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->hazard_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->msgcall_get = (int (*)
	    (ENV *, void (**)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->msgcall_set = (int (*)
	    (ENV *, void (*)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->msgfile_get = (int (*)
	    (ENV *, FILE **))
	    __wt_env_lockout;
	env->msgfile_set = (int (*)
	    (ENV *, FILE *))
	    __wt_env_lockout;
	env->open = (int (*)
	    (ENV *, const char *, mode_t , uint32_t ))
	    __wt_env_lockout;
	env->stat_clear = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->stat_print = (int (*)
	    (ENV *, FILE *, uint32_t ))
	    __wt_env_lockout;
	env->sync = (int (*)
	    (ENV *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_env_lockout;
	env->toc = (int (*)
	    (ENV *, uint32_t , WT_TOC **))
	    __wt_env_lockout;
	env->toc_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->toc_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->verbose_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->verbose_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
}
+
/*
 * __wt_methods_env_init_transition --
 *	Install the ENV methods that are legal as soon as the handle is
 *	created (configuration, error reporting, open/close).  db/sync/toc
 *	are installed later by __wt_methods_env_open_transition.
 */
void
__wt_methods_env_init_transition(ENV *env)
{
	env->cache_size_get = __wt_api_env_cache_size_get;
	env->cache_size_set = __wt_api_env_cache_size_set;
	env->close = __wt_api_env_close;
	env->data_update_initial_get = __wt_api_env_data_update_initial_get;
	env->data_update_initial_set = __wt_api_env_data_update_initial_set;
	env->data_update_max_get = __wt_api_env_data_update_max_get;
	env->data_update_max_set = __wt_api_env_data_update_max_set;
	env->err = __wt_api_env_err;
	env->errcall_get = __wt_api_env_errcall_get;
	env->errcall_set = __wt_api_env_errcall_set;
	env->errfile_get = __wt_api_env_errfile_get;
	env->errfile_set = __wt_api_env_errfile_set;
	env->errpfx_get = __wt_api_env_errpfx_get;
	env->errpfx_set = __wt_api_env_errpfx_set;
	env->errx = __wt_api_env_errx;
	env->hazard_size_get = __wt_api_env_hazard_size_get;
	env->hazard_size_set = __wt_api_env_hazard_size_set;
	env->msgcall_get = __wt_api_env_msgcall_get;
	env->msgcall_set = __wt_api_env_msgcall_set;
	env->msgfile_get = __wt_api_env_msgfile_get;
	env->msgfile_set = __wt_api_env_msgfile_set;
	env->open = __wt_api_env_open;
	env->stat_clear = __wt_api_env_stat_clear;
	env->stat_print = __wt_api_env_stat_print;
	env->toc_size_get = __wt_api_env_toc_size_get;
	env->toc_size_set = __wt_api_env_toc_size_set;
	env->verbose_get = __wt_api_env_verbose_get;
	env->verbose_set = __wt_api_env_verbose_set;
}
+
/*
 * __wt_methods_env_open_transition --
 *	Method switch performed when the ENV handle is opened: lock out
 *	the configuration methods fixed at open time, and install the
 *	handle-creation methods that are now legal.
 */
void
__wt_methods_env_open_transition(ENV *env)
{
	/* Configuration fixed once the environment is open: lock it out. */
	env->cache_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->hazard_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->open = (int (*)
	    (ENV *, const char *, mode_t , uint32_t ))
	    __wt_env_lockout;
	env->toc_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	/* Handle creation becomes legal once the environment is open. */
	env->db = __wt_api_env_db;
	env->sync = __wt_api_env_sync;
	env->toc = __wt_api_env_toc;
}
+
/*
 * __wt_methods_wt_toc_lockout --
 *	WT_TOC method lockout: nothing to lock out, the macro only quiets
 *	the "unused parameter" compiler warning.
 */
void
__wt_methods_wt_toc_lockout(WT_TOC *wt_toc)
{
	WT_CC_QUIET(wt_toc, NULL);
}

/*
 * __wt_methods_wt_toc_init_transition --
 *	Install the single WT_TOC method legal after handle creation.
 */
void
__wt_methods_wt_toc_init_transition(WT_TOC *wt_toc)
{
	wt_toc->close = __wt_api_wt_toc_close;
}
+
diff --git a/src/support/cksum.c b/src/support/cksum.c
new file mode 100644
index 00000000000..06b0e625b0d
--- /dev/null
+++ b/src/support/cksum.c
@@ -0,0 +1,134 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_cksum --
 *	Return a checksum for a chunk of memory.
 *
 * Algorithm 3 from Richard Black's discussion of CRC32.
 * http://www.cl.cam.ac.uk/research/srg/
 *	bluebook/21/crc/node6.html#SECTION00060000000000000000
 *
 * The crctab below was generated from the polynomial 0x04c11db7 by
 * the small program from that page (for each i in 0..255, shift i<<24
 * left 8 times, XORing in the polynomial whenever the top bit is set).
 */
uint32_t
__wt_cksum(void *chunk, uint32_t bytes)
{
	static const uint32_t crctab[256] = {
	0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
	0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
	0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
	0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
	0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
	0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
	0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
	0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
	0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
	0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
	0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
	0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
	0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
	0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
	0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
	0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
	0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
	0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
	0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
	0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
	0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
	0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
	0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
	0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
	0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
	0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
	0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
	0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
	0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
	0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
	0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
	0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
	0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
	0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
	0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
	0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
	0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
	0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
	0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
	0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
	0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
	0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
	0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
	0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
	0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
	0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
	0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
	0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
	0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
	0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
	0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
	0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
	0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
	0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
	0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
	0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
	0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
	0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
	0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
	0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
	0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
	0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
	0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
	0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
	};
	uint32_t i, result;
	uint8_t *data;

	data = chunk;

	/*
	 * The historic seed reads the first 4 bytes of the chunk.  For a
	 * chunk shorter than that, "bytes -= 4" would wrap the unsigned
	 * count and the loop below would read far past the buffer; fold
	 * short chunks byte-at-a-time from an all-ones seed instead (an
	 * empty chunk checksums to 0).
	 */
	if (bytes < 4) {
		result = 0xffffffff;
		for (i = 0; i < bytes; ++i)
			result =
			    (result << 8 | *data++) ^ crctab[result >> 24];
		return (~result);
	}

	/*
	 * Seed from the first 4 bytes, big-endian.  Cast each byte to
	 * uint32_t before shifting: a promoted int shifted left by 24 is
	 * undefined behavior when the byte's top bit is set.
	 */
	result = (uint32_t)*data++ << 24;
	result |= (uint32_t)*data++ << 16;
	result |= (uint32_t)*data++ << 8;
	result |= *data++;
	result = ~result;
	bytes -= 4;

	for (i = 0; i < bytes; ++i)
		result = (result << 8 | *data++) ^ crctab[result >> 24];

	return (~result);
}
diff --git a/src/support/err.c b/src/support/err.c
new file mode 100644
index 00000000000..dc8eac01189
--- /dev/null
+++ b/src/support/err.c
@@ -0,0 +1,247 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_msg_call --
+ * Pass a message to a callback function.
+ *
+ * Formats "pfx1: pfx2: <message>[: error string]" into a stack buffer
+ * and hands the result to the callback, which takes (handle, string).
+ */
+void
+__wt_msg_call(void *cb, void *handle,
+ const char *pfx1, const char *pfx2,
+ int error, const char *fmt, va_list ap)
+{
+ size_t len;
+ int separator;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ len = 0;
+ separator = 0;
+ s[0] = '\0';
+ if (pfx1 != NULL) {
+ len += (size_t)snprintf(s + len, sizeof(s) - len, "%s", pfx1);
+ separator = 1;
+ }
+ /*
+ * snprintf returns the would-be length, so on truncation len can
+ * exceed sizeof(s); the len < sizeof(s) - 1 guards below keep the
+ * size argument from wrapping in that case.
+ */
+ if (pfx2 != NULL && len < sizeof(s) - 1) {
+ len += (size_t)snprintf(s + len, sizeof(s) - len,
+ "%s%s", separator ? ": " : "", pfx2);
+ separator = 1;
+ }
+ if (separator && len < sizeof(s) - 1)
+ len += (size_t)snprintf(s + len, sizeof(s) - len, ": ");
+ if (len < sizeof(s) - 1)
+ len += (size_t)vsnprintf(s + len, sizeof(s) - len, fmt, ap);
+ if (error != 0 && len < sizeof(s) - 1)
+ (void)snprintf(s + len,
+ sizeof(s) - len, ": %s", wiredtiger_strerror(error));
+
+ /* Hand the assembled message to the caller's callback. */
+ ((void (*)(void *, const char *))cb)(handle, s);
+}
+
+/*
+ * __wt_msg_stream --
+ * Write a message to a FILE stream.
+ *
+ * A NULL stream means stderr. Output is "pfx1: pfx2: <message>",
+ * optionally followed by the error string, then a newline and a flush.
+ */
+void
+__wt_msg_stream(FILE *fp,
+ const char *pfx1, const char *pfx2, int error, const char *fmt, va_list ap)
+{
+ FILE *stream;
+ const char *prefixes[2];
+ int i;
+
+ /* Default to stderr when the caller doesn't name a stream. */
+ stream = fp == NULL ? stderr : fp;
+
+ /* Each optional prefix is followed by ": ". */
+ prefixes[0] = pfx1;
+ prefixes[1] = pfx2;
+ for (i = 0; i < 2; ++i)
+ if (prefixes[i] != NULL)
+ (void)fprintf(stream, "%s: ", prefixes[i]);
+
+ (void)vfprintf(stream, fmt, ap);
+ if (error != 0)
+ (void)fprintf(stream, ": %s", wiredtiger_strerror(error));
+ (void)fprintf(stream, "\n");
+ (void)fflush(stream);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_assert --
+ * Internal version of assert function: report the failed expression
+ * and its source location, then abort the process. Does not return.
+ */
+void
+__wt_assert(ENV *env, const char *check, const char *file_name, int line_number)
+{
+ __wt_api_env_errx(env,
+ "assertion failure: %s/%d: \"%s\"", file_name, line_number, check);
+
+ __wt_abort(env);
+ /* NOTREACHED */
+}
+#endif
+
+/*
+ * __wt_api_args --
+ * Report that an API function was passed illegal arguments or flag
+ * values; always returns WT_ERROR so callers can return the result
+ * directly.
+ */
+int
+__wt_api_args(ENV *env, const char *name)
+{
+ __wt_api_env_errx(
+ env, "%s: illegal API arguments or flag values specified", name);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_api_arg_min --
+ * Print a standard error message when an API function is passed a
+ * too-small argument. Returns 0 when the value is acceptable,
+ * WT_ERROR otherwise.
+ */
+int
+__wt_api_arg_min(ENV *env,
+ const char *name, const char *arg_name, uint32_t v, uint32_t min)
+{
+ /* Complain only when the value falls below the minimum. */
+ if (v < min) {
+ __wt_api_env_errx(env,
+ "%s: %s argument less than minimum value of %lu",
+ name, arg_name, (u_long)min);
+ return (WT_ERROR);
+ }
+ return (0);
+}
+
+/*
+ * __wt_api_arg_max --
+ * Print a standard error message when an API function is passed a
+ * too-large argument. Returns 0 when the value is acceptable,
+ * WT_ERROR otherwise.
+ */
+int
+__wt_api_arg_max(ENV *env,
+ const char *name, const char *arg_name, uint32_t v, uint32_t max)
+{
+ /* Complain only when the value exceeds the maximum. */
+ if (v > max) {
+ __wt_api_env_errx(env,
+ "%s: %s argument larger than maximum value of %lu",
+ name, arg_name, (u_long)max);
+ return (WT_ERROR);
+ }
+ return (0);
+}
+
+/*
+ * __wt_database_method_type --
+ * Print a standard error message on attempts to call methods
+ * inappropriate for a database type; always returns WT_ERROR.
+ */
+int
+__wt_database_method_type(DB *db, const char *name, int column_err)
+{
+ const char *type;
+
+ /* Name the store type the method is incompatible with. */
+ type = column_err ? "column store" : "row store";
+ __wt_api_db_errx(db,
+ "%s: this method is not supported for a %s database", name, type);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_wrong_fixed_size --
+ * Print a standard error message on attempts to put the wrong size element
+ * into a fixed-size database. Always returns WT_ERROR.
+ */
+int
+__wt_database_wrong_fixed_size(WT_TOC *toc, uint32_t len)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /* Report both the offered length and the configured fixed length. */
+ __wt_api_db_errx(db,
+ "%s: length of %lu does not match fixed-length database "
+ "configuration of %lu",
+ toc->name, (u_long)len, (u_long)db->fixed_len);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_readonly --
+ * Print a standard error message on attempts to modify a read-only
+ * database. Returns WT_READONLY (not WT_ERROR) so callers can
+ * distinguish this case.
+ */
+int
+__wt_database_readonly(DB *db, const char *name)
+{
+ __wt_api_db_errx(db,
+ "%s: the database was opened read-only and may not be modified",
+ name);
+ return (WT_READONLY);
+}
+
+/*
+ * __wt_database_format --
+ * Print a standard error message when a database format error is
+ * suddenly discovered. Always returns WT_ERROR.
+ */
+int
+__wt_database_format(DB *db)
+{
+ __wt_api_db_errx(db, "the database is corrupted; use the Db.salvage"
+ " method or the db_salvage utility to repair the database");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_item_too_big --
+ * Print a standard error message when an element is too large to store.
+ * Always returns WT_ERROR.
+ */
+int
+__wt_database_item_too_big(DB *db)
+{
+ __wt_api_db_errx(db, "the item is too large for the database to store");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_wt_toc_lockout --
+ * Standard WT_TOC handle lockout error message: convenience wrapper
+ * that reports against the WT_TOC's enclosing environment.
+ */
+int
+__wt_wt_toc_lockout(WT_TOC *toc)
+{
+ return (__wt_env_lockout(toc->env));
+}
+
+/*
+ * __wt_db_lockout --
+ * Standard DB handle lockout error message: convenience wrapper that
+ * reports against the DB's enclosing environment.
+ */
+int
+__wt_db_lockout(DB *db)
+{
+ return (__wt_env_lockout(db->env));
+}
+
+/*
+ * __wt_env_lockout --
+ * Standard ENV handle lockout error message. Always returns WT_ERROR;
+ * the other handle lockout functions funnel here.
+ */
+int
+__wt_env_lockout(ENV *env)
+{
+ __wt_api_env_errx(env,
+ "An unavailable handle method was called; the handle method is "
+ "not available for some reason, for example, handle methods are "
+ "restricted after an error, or configuration methods may be "
+ "restricted after the database or environment have been opened, "
+ "or operational methods may be restricted until the database or "
+ "environment has been opened.");
+ return (WT_ERROR);
+}
diff --git a/src/support/hazard.c b/src/support/hazard.c
new file mode 100644
index 00000000000..5bef0731aa5
--- /dev/null
+++ b/src/support/hazard.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_hazard_set --
+ * Set a hazard reference.
+ *
+ * Returns 1 if the hazard reference was set and the page may be used;
+ * returns 0 when the eviction server owns the page or when no hazard
+ * slot is available.
+ */
+int
+__wt_hazard_set(WT_TOC *toc, WT_REF *ref)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ /*
+ * Do the dance:
+ *
+ * The memory location making a page "real" is the WT_REF's state which
+ * can be reset from WT_OK to WT_EVICT at any time by the page eviction
+ * server.
+ *
+ * Add the WT_REF reference to the WT_TOC's hazard list and flush the
+ * write, then see if the state field is still WT_OK. If it's still
+ * WT_OK, we know we can use the page because the page eviction server
+ * will see our hazard reference before it discards the buffer (the
+ * eviction server sets the WT_REF state to WT_EVICT, flushes memory,
+ * and then checks the hazard references).
+ */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp) {
+ /* Find the first empty slot in the hazard array. */
+ if (*hp != NULL)
+ continue;
+
+ /*
+ * Memory flush needed; the hazard array isn't declared volatile
+ * and an explicit memory flush is necessary.
+ */
+ *hp = ref->page;
+ WT_MEMORY_FLUSH;
+
+ /*
+ * If the cache entry is set, check to see if it's still valid.
+ * Valid means the state is WT_OK, or the state is WT_EVICT and
+ * this thread is allowed to see pages flagged for eviction.
+ */
+ if (ref->state == WT_OK ||
+ (ref->state == WT_EVICT && F_ISSET(toc, WT_READ_EVICT))) {
+ WT_VERBOSE(env, WT_VERB_HAZARD,
+ (env, "toc %p hazard %p: set", toc, ref->page));
+ return (1);
+ }
+
+ /* The cache eviction server owns the page, we can't have it. */
+ *hp = NULL;
+ return (0);
+ }
+
+ /*
+ * No free slot: report it. The assertion below is intentionally
+ * false at this point (hp == end), so diagnostic builds abort.
+ */
+ __wt_api_env_errx(env, "WT_TOC has no more hazard reference slots");
+ WT_ASSERT(env, hp < toc->hazard + env->hazard_size);
+ return (0);
+}
+
+/*
+ * __wt_hazard_clear --
+ * Clear a hazard reference.
+ */
+void
+__wt_hazard_clear(WT_TOC *toc, WT_PAGE *page)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ WT_VERBOSE(env,
+ WT_VERB_HAZARD, (env, "toc %p hazard %p: clr", toc, page));
+
+ /* Clear the caller's hazard pointer. */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp)
+ if (*hp == page) {
+ *hp = NULL;
+ /*
+ * We don't have to flush memory here for correctness;
+ * it would give the page server thread faster access
+ * to the block were the block selected to be evicted,
+ * but the generation number was just set which makes
+ * it unlikely to be selected for eviction.
+ */
+ return;
+ }
+ /*
+ * Falling out of the loop means the reference wasn't found, which
+ * is a caller bug; the assertion is intentionally false here so
+ * diagnostic builds abort.
+ */
+ __wt_api_env_errx(env, "WT_TOC hazard reference not found");
+ WT_ASSERT(env, hp < toc->hazard + env->hazard_size);
+}
+
+/*
+ * __wt_hazard_empty --
+ * Verify that no hazard references are set; complain about and clear
+ * any that are found.
+ */
+void
+__wt_hazard_empty(WT_TOC *toc, const char *name)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ /*
+ * Check for a set hazard reference and complain if we find one. Clear
+ * any we find because it's not a correctness problem (any hazard ref
+ * we find can't be real because the WT_TOC is being closed when we're
+ * called). We do this work because it's not expensive, and we don't
+ * want to let a hazard reference lie around, keeping a page from being
+ * flushed. The flush isn't necessary for correctness, but gives the
+ * cache eviction thread immediate access to any page our reference
+ * blocks.
+ */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp)
+ if (*hp != NULL) {
+ __wt_api_env_errx(env,
+ "%s: returned with a hazard reference set (%p)",
+ name, *hp);
+ *hp = NULL;
+ WT_MEMORY_FLUSH;
+ }
+}
diff --git a/src/support/huffman.c b/src/support/huffman.c
new file mode 100644
index 00000000000..2a0fcfde218
--- /dev/null
+++ b/src/support/huffman.c
@@ -0,0 +1,692 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ *
+ * Huffman Encoder/Decoder v1.0
+ * Author Brian Pollack <brian@brians.com>
+ */
+
+#include "wt_internal.h"
+
+typedef struct __wt_freqtree_node {
+ /*
+ * Data structure representing a node of the huffman tree. It holds a
+ * 32-bit weight and pointers to the left and right child nodes.
+ * The node either has two child nodes or none.
+ */
+ uint16_t symbol; /* only used in leaf nodes */
+ uint32_t weight;
+ uint16_t codeword_length; /* set only for leaves, 0 for inner nodes */
+ struct __wt_freqtree_node *left; /* bit 0 */
+ struct __wt_freqtree_node *right; /* bit 1 */
+} WT_FREQTREE_NODE;
+
+typedef struct __wt_static_huffman_node {
+ /*
+ * This data structure is used to represent the huffman tree in a
+ * static array, after it has been created (using a dynamic tree
+ * representation with WT_FREQTREE_NODE nodes).
+ *
+ * In the binary tree's array representation if a node's index is i,
+ * then its left child node is 2i+1 and its right child node is 2i+2.
+ */
+ uint8_t valid; /* 0 means this array slot is unused */
+ uint16_t symbol;
+ uint16_t codeword_length;
+} WT_STATIC_HUFFMAN_NODE;
+
+typedef struct __wt_huffman_obj {
+ ENV *env; /* Enclosing environment */
+ /*
+ * Data structure here defines specific instance of the encoder/decoder.
+ * This contains the frequency table (tree) used to produce optimal
+ * results. This version of the encoder supports 1- and 2-byte symbols.
+ */
+ uint32_t numSymbols;
+ uint8_t numBytes; /* 1 or 2 */
+ /* The tree in static array representation; has 2^max_depth slots. */
+ WT_STATIC_HUFFMAN_NODE *nodes;
+ uint16_t max_depth; /* tree levels; deepest code is max_depth - 1 bits */
+} WT_HUFFMAN_OBJ;
+
+/*
+ * Queue element data structure.
+ *
+ * Consists of a pointer to a huffman tree node, and a pointer to the next
+ * element in the queue.
+ */
+typedef struct node_queue_elem {
+ WT_FREQTREE_NODE *node;
+ struct node_queue_elem *next;
+} NODE_QUEUE_ELEM;
+
+/*
+ * Queue of huffman tree nodes.
+ *
+ * Contains a pointer to the beginning and the end of the queue, which is
+ * implemented as a linked list.
+ */
+typedef struct node_queue {
+ NODE_QUEUE_ELEM *first;
+ NODE_QUEUE_ELEM *last;
+} NODE_QUEUE;
+
+/* A queue is empty when it's NULL or its linked list has no head. */
+#define node_queue_is_empty(queue) \
+ (((queue) == NULL || (queue)->first == NULL) ? 1 : 0)
+
+static void node_queue_close(ENV *, NODE_QUEUE *);
+static void node_queue_dequeue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE **);
+static int node_queue_enqueue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static void recursive_free_node(ENV *env, WT_FREQTREE_NODE *node);
+
+/*
+ * The following macros are used by the encoder to write the buffer with bit
+ * addressing.
+ */
+#undef SET_BIT
+#define SET_BIT(ptr, pos) \
+ *((ptr) + ((pos) / 8)) |= 1 << (7 - ((pos) % 8))
+#undef CLEAR_BIT
+#define CLEAR_BIT(ptr, pos) \
+ *((ptr) + ((pos) / 8)) &= ~(uint8_t)(1 << (7 - ((pos) % 8)))
+#undef MODIFY_BIT
+#define MODIFY_BIT(ptr, pos, bit) \
+ if (bit) \
+ SET_BIT(ptr, pos); \
+ else \
+ CLEAR_BIT(ptr, pos);
+
+/*
+ * Internal data structure used to preserve the symbol when rearranging the
+ * frequency array.
+ */
+typedef struct __indexed_byte {
+ uint8_t frequency; /* rank 1-255; 0 means the symbol never occurs */
+ uint16_t symbol; /* original index in the caller's table */
+} INDEXED_BYTE;
+
+/*
+ * indexed_byte_comparator --
+ * qsort comparator ordering the frequency table by ascending frequency
+ * (most frequent symbols end up at the end of the array).
+ */
+static int
+indexed_byte_comparator(const void *elem1, const void *elem2)
+{
+ const INDEXED_BYTE *a, *b;
+
+ a = elem1;
+ b = elem2;
+ return ((int)a->frequency - (int)b->frequency);
+}
+
+/*
+ * traverse_tree --
+ * Recursive function with dual functionality:
+ * - It sets the codeword_length field of each leaf node to the
+ * appropriate value.
+ * - It finds the maximum depth of the tree.
+ */
+static void
+traverse_tree(
+ WT_FREQTREE_NODE *node, uint16_t current_length, uint16_t *max_depth)
+{
+ /* Recursively traverse the tree */
+ if (node->left != NULL)
+ traverse_tree(node->left, current_length + 1, max_depth);
+ if (node->right != NULL)
+ traverse_tree(node->right, current_length + 1, max_depth);
+
+ /* If this is a leaf: */
+ if (node->left == NULL && node->right == NULL) {
+ /*
+ * Setting the leaf's codeword length (for inner nodes, it
+ * is always 0!)
+ */
+ node->codeword_length = current_length;
+
+ /*
+ * Store the new maximal depth: depth is counted in levels,
+ * one more than the leaf's codeword length.
+ */
+ if (*max_depth < current_length + 1)
+ *max_depth = current_length + 1;
+ }
+}
+
+/*
+ * fill_static_representation --
+ * Recursive function that converts the huffman tree from its dynamic
+ * representation to static tree representation, to a preallocated array.
+ *
+ * To know the required size of the array the traverse_tree function can be
+ * used, determining the maximum depth N. Then the required array size is 2^N.
+ */
+static void
+fill_static_representation(
+ WT_STATIC_HUFFMAN_NODE *target, WT_FREQTREE_NODE *node, int idx)
+{
+ WT_STATIC_HUFFMAN_NODE *current_target;
+
+ /* Copy this node into its array slot and mark the slot used. */
+ current_target = &target[idx];
+ current_target->symbol = node->symbol;
+ current_target->codeword_length = node->codeword_length;
+ current_target->valid = 1;
+
+ /* Children go to slots 2i+1 and 2i+2 (binary-heap layout). */
+ if (node->left != NULL)
+ fill_static_representation(target, node->left, idx * 2 + 1);
+ if (node->right != NULL)
+ fill_static_representation(target, node->right, idx * 2 + 2);
+}
+
+/*
+ * recursive_free_node --
+ * Recursively free the huffman frequency tree's nodes (post-order, so
+ * children are released before their parent). A NULL node is a no-op.
+ */
+static void
+recursive_free_node(ENV *env, WT_FREQTREE_NODE *node)
+{
+ if (node != NULL) {
+ recursive_free_node(env, node->left);
+ recursive_free_node(env, node->right);
+ __wt_free(env, node, sizeof(WT_FREQTREE_NODE));
+ }
+}
+
+/*
+ * __wt_huffman_open --
+ * Take a frequency table and return a pointer to a descriptor object.
+ *
+ * The frequency table must be the full range of valid values. For 1 byte
+ * tables there are 256 values in 8 bits. The highest rank is 255, and the
+ * lowest rank is 1 (0 means the byte never appears in the input), so 1 byte
+ * is needed to hold the rank and the input table must be 1 byte x 256 values.
+ *
+ * For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535.
+ * The table should be 2 bytes x 65536 values.
+ */
+int
+__wt_huffman_open(ENV *env,
+ uint8_t const *byte_frequency_array, u_int nbytes, void *retp)
+{
+ INDEXED_BYTE *indexed_freqs;
+ NODE_QUEUE *combined_nodes, *leaves;
+ WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+ WT_HUFFMAN_OBJ *huffman;
+ uint32_t w1, w2;
+ u_int i; /* BUG FIX: was uint16_t, an infinite loop for nbytes == 65536 */
+ int ret;
+
+ indexed_freqs = NULL;
+ combined_nodes = leaves = NULL;
+ node = node2 = tempnode = NULL;
+ ret = 0;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_HUFFMAN_OBJ), &huffman));
+ WT_ERR(__wt_calloc(env, nbytes, sizeof(INDEXED_BYTE), &indexed_freqs));
+ huffman->env = env;
+
+ /*
+ * The frequency array must be sorted to be able to use linear time
+ * construction algorithm.
+ */
+ for (i = 0; i < nbytes; ++i) {
+ indexed_freqs[i].frequency = byte_frequency_array[i];
+ indexed_freqs[i].symbol = (uint16_t)i;
+ }
+
+ qsort(indexed_freqs,
+ nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator);
+
+ /* We need two node queues to build the tree. */
+ WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &leaves));
+ WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &combined_nodes));
+
+ /* Adding the leaves to the queue */
+ for (i = 0; i < nbytes; ++i) {
+ /*
+ * We are leaving out symbols with a frequency of 0. This
+ * assumes these symbols will NEVER occur in the source stream,
+ * and the purpose is to reduce the huffman tree's size.
+ *
+ * NOTE: Even if this behavior is not desired, the frequencies
+ * should have a range between 1 - 255, otherwise the algorithm
+ * cannot produce well balanced tree; so this can be treated as
+ * an optional feature.
+ */
+ if (indexed_freqs[i].frequency > 0) {
+ WT_ERR(__wt_calloc(
+ env, 1, sizeof(WT_FREQTREE_NODE), &tempnode));
+ tempnode->symbol = indexed_freqs[i].symbol;
+ tempnode->weight = indexed_freqs[i].frequency;
+ WT_ERR(node_queue_enqueue(env, leaves, tempnode));
+ tempnode = NULL;
+ }
+ }
+
+ while (!node_queue_is_empty(leaves) ||
+ !node_queue_is_empty(combined_nodes)) {
+ /*
+ * We have to get the node with the smaller weight, examining
+ * both queues first element. We are collecting pairs of these
+ * items, by alternating between node and node2:
+ */
+ refnode = !node ? &node : &node2;
+
+ /*
+ * To decide which queue must be used, we get the weights of
+ * the first items from both:
+ */
+ w1 = node_queue_is_empty(leaves) ?
+ UINT32_MAX : leaves->first->node->weight;
+ w2 = node_queue_is_empty(combined_nodes) ?
+ UINT32_MAX : combined_nodes->first->node->weight;
+
+ /*
+ * Based on the two weights we finally can dequeue the smaller
+ * element and place it to the alternating target node pointer:
+ */
+ if (w1 < w2)
+ node_queue_dequeue(env, leaves, refnode);
+ else
+ node_queue_dequeue(env, combined_nodes, refnode);
+
+ /*
+ * In every second run, we have both node and node2 initialized.
+ */
+ if (node != NULL && node2 != NULL) {
+ WT_ERR(__wt_calloc(
+ env, 1, sizeof(WT_FREQTREE_NODE), &tempnode));
+
+ /* The new weight is the sum of the two weights. */
+ tempnode->weight = node->weight + node2->weight;
+ tempnode->left = node;
+ tempnode->right = node2;
+
+ /* Enqueue it to the combined nodes queue */
+ WT_ERR(
+ node_queue_enqueue(env, combined_nodes, tempnode));
+ tempnode = NULL;
+
+ /* Reset the state pointers */
+ node = node2 = NULL;
+ }
+ }
+
+ /*
+ * BUG FIX: an all-zero frequency table builds no tree at all; fail
+ * instead of dereferencing a NULL root below.
+ */
+ if (node == NULL) {
+ ret = WT_ERROR;
+ goto err;
+ }
+
+ /*
+ * The remaining node is in the node variable, this is the root of the
+ * tree. Calculate the number of bytes it takes to hold nbytes bits.
+ */
+ huffman->numSymbols = nbytes;
+ huffman->numBytes = nbytes > 256 ? 2 : 1;
+
+ /* Traverse the tree and set the code word length for each node. */
+ traverse_tree(node, 0, &huffman->max_depth);
+
+ /*
+ * Converting the tree to a static array representation.
+ * NOTE(review): 1 << max_depth assumes max_depth < 31 — a severely
+ * unbalanced tree would overflow; confirm frequencies are ranks.
+ */
+ WT_ERR(__wt_calloc(env, 1 << huffman->max_depth,
+ sizeof(WT_STATIC_HUFFMAN_NODE), &huffman->nodes));
+ fill_static_representation(huffman->nodes, node, 0);
+
+ *(void **)retp = huffman;
+
+err: if (leaves != NULL)
+ node_queue_close(env, leaves);
+ if (combined_nodes != NULL)
+ node_queue_close(env, combined_nodes);
+ if (indexed_freqs != NULL)
+ __wt_free(env, indexed_freqs, 0);
+ if (node != NULL)
+ recursive_free_node(env, node);
+ if (node2 != NULL)
+ recursive_free_node(env, node2);
+ if (tempnode != NULL)
+ __wt_free(env, tempnode, sizeof(WT_FREQTREE_NODE));
+ if (ret != 0) {
+ if (huffman->nodes != NULL)
+ __wt_free(env, huffman->nodes, 0);
+ __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ * Discard a Huffman descriptor object: frees the static node array and
+ * the descriptor itself (the ENV is not owned and is left alone).
+ */
+void
+__wt_huffman_close(ENV *env, void *huffman_arg)
+{
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ __wt_free(env, huffman->nodes, 0);
+ __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_print_huffman_code --
+ * Prints a symbol's huffman code. Can be used for debugging purposes.
+ */
+int
+__wt_print_huffman_code(ENV *env, void *huffman_arg, uint16_t symbol)
+{
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE *node;
+ u_int i, n;
+ int p;
+ char *buffer;
+
+ huffman = huffman_arg;
+
+ /* Check if the symbol is in valid range. */
+ if (symbol >= huffman->numSymbols) {
+ (void)printf("Symbol out of range: %lu >= %lu\n",
+ (u_long)symbol, (u_long)huffman->numSymbols);
+ return (0);
+ }
+
+ /*
+ * The deepest codeword is max_depth - 1 bits, so max_depth zeroed
+ * bytes leave room for the terminating nul.
+ */
+ WT_RET(__wt_calloc(env, huffman->max_depth, 1, &buffer));
+
+ /* Search the static tree for the symbol's leaf node. */
+ node = NULL;
+ for (i = 0, n = 1 << huffman->max_depth; i < n; ++i) {
+ node = &huffman->nodes[i];
+ if (node->valid &&
+ node->symbol == symbol && node->codeword_length > 0)
+ break;
+ }
+
+ /*
+ * BUG FIX: the original tested node != NULL, which is always true
+ * once the loop has run; test whether the search actually found a
+ * leaf. Also free the buffer on the error path (it leaked).
+ */
+ if (i == n) {
+ (void)printf(
+ "Symbol is not in the huffman tree: %x\n", symbol);
+ __wt_free(env, buffer, 0);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We've got the leaf node, at index 'i'. Now we fill the output
+ * buffer in back order; odd indexes are left children, i.e. bit 0.
+ */
+ for (p = node->codeword_length - 1; p >= 0; --p) {
+ buffer[p] = (i % 2) == 1 ? '0' : '1';
+ i = (i - 1) / 2;
+ }
+
+ (void)printf("%s\n", buffer);
+ __wt_free(env, buffer, 0);
+ return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ * Take a byte string, encode it into the target.
+ */
+int
+__wt_huffman_encode(void *huffman_arg,
+ uint8_t *from, uint32_t from_len,
+ void *top, uint32_t *to_len, uint32_t *out_bytes_used)
+{
+ ENV *env;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE *node;
+ uint32_t bitpos, i, n, j;
+ uint16_t symbol;
+ uint8_t padding_info, *to;
+ int p;
+
+ huffman = huffman_arg;
+ env = huffman->env;
+
+ /*
+ * We need N+1 bytes to encode N bytes, re-allocate as necessary.
+ *
+ * If the initial target pointer, or the initial target buffer length,
+ * aren't set, it's an allocation. Clear the initial target pointer,
+ * our caller may have only set the initial target buffer length, not
+ * the initial pointer value.
+ */
+ if (to_len == NULL || *to_len < from_len + 1) {
+ if (to_len == NULL)
+ *(void **)top = NULL;
+ WT_RET(__wt_realloc(env, to_len, from_len + 1, top));
+ }
+
+ to = *(uint8_t **)top;
+ memset(to, 0, from_len + 1);
+
+ /*
+ * Leave the first 3 bits of the encoded value empty, it holds the
+ * number of bits actually used in the last byte of the encoded value.
+ */
+ bitpos = 3;
+ n = 1 << huffman->max_depth;
+ for (i = 0; i < from_len; i += huffman->numBytes) {
+ /* Getting the next symbol, either 1 or 2 bytes */
+ if (huffman->numBytes == 1)
+ symbol = *from++;
+ else {
+ symbol = ((uint16_t)(*from++)) << 8;
+ symbol |= *from++;
+ }
+
+ /* Getting the symbol's huffman code from the table */
+ node = NULL;
+ for (j = 0; j < n; ++j) {
+ node = &huffman->nodes[j];
+ if (node->valid &&
+ node->symbol == symbol && node->codeword_length > 0)
+ break;
+ }
+
+ /*
+ * BUG FIX: the original tested node != NULL, which is always
+ * true once the loop has run, so the undefined-symbol error
+ * below was unreachable and such symbols silently corrupted
+ * the output; test whether the search actually found a leaf.
+ * Also report through env rather than a NULL handle.
+ */
+ if (j == n) {
+ __wt_api_env_errx(env,
+ "Huffman compression: there was a symbol in the "
+ "source originally declared with zero frequency; "
+ "undefined source symbol: %lu", (u_long)symbol);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We've got the leaf node, at index 'j'. Now we fill the
+ * output buffer in back order; odd indexes are left children,
+ * i.e. bit 0.
+ */
+ for (p = node->codeword_length - 1; p >= 0; --p) {
+ MODIFY_BIT(to, bitpos + (u_int)p, (j % 2) ^ 1);
+ j = (j - 1) / 2;
+ }
+
+ bitpos += node->codeword_length;
+ }
+
+ /*
+ * At this point, bitpos is the total number of used bits (including
+ * the 3 bits at the beginning of the buffer, which we'll set now to
+ * the number of bits used in the last byte). Note if the number of
+ * bits used in the last byte is 8, we set the 3 bits to 0, in other
+ * words, the first 3 bits of the encoded value are the number of bits
+ * used in the last byte, unless they're 0, in which case there are 8
+ * bits used in the last byte.
+ */
+ padding_info = (bitpos % 8) << 5;
+ *to |= padding_info;
+
+ *out_bytes_used = bitpos / 8 + ((bitpos % 8) ? 1 : 0);
+
+ return (0);
+}
+
+/*
+ * __wt_huffman_decode --
+ * Take a byte string, decode it into the target.
+ */
+int
+__wt_huffman_decode(void *huffman_arg,
+ uint8_t *from, uint32_t from_len,
+ void *top, uint32_t *to_len, uint32_t *out_bytes_used)
+{
+ ENV *env;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE* node;
+ uint32_t bytes, i, from_len_bits, node_idx;
+ uint8_t bitpos, mask, bit, padding_info, *to;
+
+ huffman = huffman_arg;
+ env = huffman->env;
+
+ /*
+ * We need 2N+1 bytes to decode N bytes, re-allocate as necessary.
+ *
+ * If the initial target pointer, or the initial target buffer length,
+ * aren't set, it's an allocation. Clear the initial target pointer,
+ * our caller may have only set the initial target buffer length, not
+ * the initial pointer value.
+ */
+ if (to_len == NULL || *to_len < 2 * from_len + 1) {
+ if (to_len == NULL)
+ *(void **)top = NULL;
+ WT_RET(__wt_realloc(env, to_len, 2 * from_len + 1, top));
+ }
+
+ to = *(uint8_t **)top;
+
+ /*
+ * Bits are read via mask = 1 << bitpos, so within a byte we walk
+ * from bit 7 down to bit 0; starting at 4 skips the 3 header bits
+ * (bits 7, 6 and 5) of the first byte.
+ */
+ bitpos = 4; /* Skipping the first 3 bits. */
+ bytes = 0;
+ node_idx = 0;
+
+ /*
+ * The first 3 bits are the number of used bits in the last byte, unless
+ * they're 0, in which case there are 8 bits used in the last byte.
+ */
+ padding_info = (*from & 0xE0) >> 5;
+ from_len_bits = from_len * 8;
+ if (padding_info != 0)
+ from_len_bits -= 8 - padding_info;
+
+ /*
+ * The loop will go through each bit of the source stream, its length
+ * is given in BITS! (i starts at 3 because the 3 header bits are
+ * counted in from_len_bits.)
+ */
+ for (i = 3; i < from_len_bits; i++) {
+ /* Extracting the current bit */
+ mask = (uint8_t)(1 << bitpos);
+ bit = (*from & mask);
+
+ /*
+ * As we go through the bits, we also make steps in the huffman
+ * tree, originated from the root, toward the leaves.
+ */
+ if (bit)
+ node_idx = (node_idx * 2) + 2;
+ else
+ node_idx = (node_idx * 2) + 1;
+
+ node = &huffman->nodes[node_idx];
+
+ /* If this is a leaf, we've found a complete symbol. */
+ if (node->valid && node->codeword_length > 0) {
+ /* Emit the symbol, high byte first for 2-byte mode. */
+ if (huffman->numBytes == 1)
+ *to++ = (uint8_t)node->symbol;
+ else {
+ *to++ = (node->symbol & 0xFF00) >> 8;
+ *to++ = node->symbol & 0xFF;
+ }
+
+ bytes += huffman->numBytes;
+ node_idx = 0; /* restart at the root */
+ }
+
+ /* Moving forward one bit in the source stream. */
+ if (bitpos > 0)
+ bitpos--;
+ else {
+ bitpos = 7;
+ from++;
+ }
+ }
+
+ /* Return the number of bytes used. */
+ *out_bytes_used = bytes;
+
+ return (0);
+}
+
+/*
+ * node_queue_close --
+ * Delete a queue from memory.
+ *
+ * It does not delete the pointed huffman tree nodes! The queue pointer
+ * itself must be non-NULL (callers check before calling).
+ */
+static void
+node_queue_close(ENV *env, NODE_QUEUE *queue)
+{
+ NODE_QUEUE_ELEM *elem, *next_elem;
+
+ /* Freeing each element of the queue's linked list. */
+ for (elem = queue->first; elem != NULL; elem = next_elem) {
+ next_elem = elem->next;
+ __wt_free(env, elem, sizeof(NODE_QUEUE_ELEM));
+ }
+
+ /* Freeing the queue record itself. */
+ __wt_free(env, queue, sizeof(NODE_QUEUE));
+}
+
+/*
+ * node_queue_enqueue --
+ * Push a tree node to the end of the queue. Returns 0 on success or
+ * the allocation error.
+ */
+static int
+node_queue_enqueue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+ NODE_QUEUE_ELEM *elem;
+
+ /*
+ * Allocate the list element holding the tree node; calloc leaves the
+ * next pointer NULL, which is what a new tail needs.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(NODE_QUEUE_ELEM), &elem));
+ elem->node = node;
+
+ /*
+ * Link it in: an empty queue (no tail) gets a new head, otherwise
+ * the current tail points at the new element; either way the new
+ * element becomes the tail.
+ */
+ if (queue->last == NULL)
+ queue->first = elem;
+ else
+ queue->last->next = elem;
+ queue->last = elem;
+
+ return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ * Removes a node from the beginning of the queue and copies the node's
+ * pointer to the location referred by the retp parameter.
+ *
+ * NOTE(review): assumes the queue is non-empty; callers gate on
+ * node_queue_is_empty before dequeuing.
+ */
+static void
+node_queue_dequeue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+ NODE_QUEUE_ELEM *first_elem;
+
+ /*
+ * Getting the first element of the queue and updating it to point to
+ * the next element as first.
+ */
+ first_elem = queue->first;
+ *retp = first_elem->node;
+ queue->first = first_elem->next;
+
+ /*
+ * If the last element was the dequeued element, we have to update it
+ * to NULL.
+ */
+ if (queue->last == first_elem)
+ queue->last = NULL;
+
+ /* Freeing the linked list element that has been dequeued */
+ __wt_free(env, first_elem, sizeof(NODE_QUEUE_ELEM));
+}
diff --git a/src/support/pow.c b/src/support/pow.c
new file mode 100644
index 00000000000..3a6b6b1d686
--- /dev/null
+++ b/src/support/pow.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_nlpo2 --
+ * Return the next-largest power-of-two for a 32-bit unsigned value.
+ *
+ * Classic bit trick: decrement, then OR the value with itself shifted
+ * right by every power of two up to half the word width, which smears
+ * the highest set bit into all lower positions; incrementing then
+ * carries up to the next power of two. A value that was already a
+ * power of two is returned unchanged (the decrement compensates), and
+ * the edge case v == 0 returns 0, which isn't a power of two.
+ * Devised by Sean Anderson, September 14, 2001; see
+ * http://graphics.stanford.edu/~seander/bithacks.html
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+ u_int shift;
+
+ v--;
+ for (shift = 1; shift < 32; shift <<= 1)
+ v |= v >> shift;
+ v++;
+ return (v);
+}
+
+/*
+ * __wt_ispo2 --
+ * Return 1 if a number is a power-of-two, else 0. A power of two has
+ * a single bit set, so clearing the lowest set bit (v & (v - 1))
+ * leaves zero; note zero itself also yields 1 here.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+ return ((v & (v - 1)) == 0);
+}
diff --git a/src/support/prime.c b/src/support/prime.c
new file mode 100644
index 00000000000..8abe43158b2
--- /dev/null
+++ b/src/support/prime.c
@@ -0,0 +1,75 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_prime --
+ * Return a prime number relatively close to a value: the prime paired
+ * with the first table value strictly greater than n. Values below 32
+ * return 37; values at or above the last entry return 1073741827.
+ */
+uint32_t
+__wt_prime(uint32_t n)
+{
+ /*
+ * Ref: the hash functions section of "Algorithms in C", by Sedgewick.
+ *
+ * The table is the same as the one in Berkeley DB -- check at each
+ * power-of-two up to 2^18, then mid-points between each power-of-two
+ * to a maximum of 2^30.
+ */
+ static const struct {
+ uint32_t value;
+ uint32_t prime;
+ } t[] = {
+ { 32, 37 }, /* 2^5 */
+ { 64, 67 }, /* 2^6 */
+ { 128, 131 }, /* 2^7 */
+ { 256, 257 }, /* 2^8 */
+ { 512, 521 }, /* 2^9 */
+ { 1024, 1031 }, /* 2^10 */
+ { 2048, 2053 }, /* 2^11 */
+ { 4096, 4099 }, /* 2^12 */
+ { 8192, 8191 }, /* 2^13 */
+ { 16384, 16381 }, /* 2^14 */
+ { 32768, 32771 }, /* 2^15 */
+ { 65536, 65537 }, /* 2^16 */
+ { 131072, 131071 }, /* 2^17 */
+ { 262144, 262147 }, /* 2^18 */
+ { 393216, 393209 }, /* 2^18 + 2^18/2 */
+ { 524288, 524287 }, /* 2^19 */
+ { 786432, 786431 }, /* 2^19 + 2^19/2 */
+ { 1048576, 1048573 }, /* 2^20 */
+ { 1572864, 1572869 }, /* 2^20 + 2^20/2 */
+ { 2097152, 2097169 }, /* 2^21 */
+ { 3145728, 3145721 }, /* 2^21 + 2^21/2 */
+ { 4194304, 4194301 }, /* 2^22 */
+ { 6291456, 6291449 }, /* 2^22 + 2^22/2 */
+ { 8388608, 8388617 }, /* 2^23 */
+ { 12582912, 12582917 }, /* 2^23 + 2^23/2 */
+ { 16777216, 16777213 }, /* 2^24 */
+ { 25165824, 25165813 }, /* 2^24 + 2^24/2 */
+ { 33554432, 33554393 }, /* 2^25 */
+ { 50331648, 50331653 }, /* 2^25 + 2^25/2 */
+ { 67108864, 67108859 }, /* 2^26 */
+ { 100663296, 100663291 }, /* 2^26 + 2^26/2 */
+ { 134217728, 134217757 }, /* 2^27 */
+ { 201326592, 201326611 }, /* 2^27 + 2^27/2 */
+ { 268435456, 268435459 }, /* 2^28 */
+ { 402653184, 402653189 }, /* 2^28 + 2^28/2 */
+ { 536870912, 536870909 }, /* 2^29 */
+ { 805306368, 805306357 }, /* 2^29 + 2^29/2 */
+ { 1073741824, 1073741827 }, /* 2^30 */
+ };
+ u_int i;
+
+ /* First entry whose value exceeds n wins; clamp at the table end. */
+ for (i = 0; i < WT_ELEMENTS(t); ++i)
+ if (t[i].value > n)
+ return (t[i].prime);
+ return (t[WT_ELEMENTS(t) - 1].prime);
+}
diff --git a/src/support/progress.c b/src/support/progress.c
new file mode 100644
index 00000000000..480699cbdd1
--- /dev/null
+++ b/src/support/progress.c
@@ -0,0 +1,17 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_progress --
+ *	Display a running progress counter: a carriage return (no newline)
+ *	rewrites the same terminal line each call, and the explicit flush
+ *	makes the update visible immediately.
+ */
+void
+__wt_progress(const char *s, uint64_t v)
+{
+	(void)fprintf(stdout, "\r\t%s: %llu", s, (unsigned long long)v);
+	(void)fflush(stdout);
+}
diff --git a/src/support/scratch.c b/src/support/scratch.c
new file mode 100644
index 00000000000..9b20ea963f3
--- /dev/null
+++ b/src/support/scratch.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_scr_alloc --
+ *	Scratch buffer allocation function.
+ *
+ *	Hand back the first free scratch buffer from the WT_TOC's array,
+ *	optionally grown to at least "size" bytes; if every buffer is in
+ *	use, grow the array by 10 slots and retry.  On success *dbtp
+ *	references a buffer flagged WT_SCRATCH_INUSE -- the caller must
+ *	return it with __wt_scr_release.
+ */
+int
+__wt_scr_alloc(WT_TOC *toc, uint32_t size, DBT **dbtp)
+{
+	DBT *scratch;
+	ENV *env;
+	uint32_t allocated;
+	u_int i;
+	int ret;
+
+	env = toc->env;
+
+	*dbtp = NULL;	/* Don't risk the caller not catching the error. */
+
+	/*
+	 * There's an array of scratch buffers in each WT_TOC that can be used
+	 * by any function.  We use DBTs for scratch buffers because we already
+	 * have to have functions that do variable-length allocation on DBTs.
+	 * Scratch buffers are allocated only by a single thread of control, so
+	 * no locking is necessary.
+	 */
+	for (i = 0,
+	    scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch)
+		if (!F_ISSET(scratch, WT_SCRATCH_INUSE)) {
+			*dbtp = scratch;
+			F_SET(scratch, WT_SCRATCH_INUSE);
+
+			/*
+			 * If the caller has a minimum size, grow the scratch
+			 * buffer as necessary.
+			 */
+			if (size != 0 && scratch->mem_size < size)
+				WT_RET(__wt_realloc(env,
+				    &scratch->mem_size, size, &scratch->data));
+			return (0);
+		}
+
+	/*
+	 * Resize the array, we need more scratch buffers.  The recursive call
+	 * below is guaranteed to find a free slot in the new, larger array.
+	 */
+	allocated = toc->scratch_alloc * sizeof(DBT);
+	WT_ERR(__wt_realloc(env, &allocated,
+	    (toc->scratch_alloc + 10) * sizeof(DBT), &toc->scratch));
+	toc->scratch_alloc += 10;
+	return (__wt_scr_alloc(toc, size, dbtp));
+
+	/* NOTE(review): ret is assigned inside the WT_ERR macro -- confirm. */
+err:	__wt_api_env_errx(env,
+	    "WT_TOC unable to allocate more scratch buffers");
+	return (ret);
+}
+
+/*
+ * __wt_scr_release --
+ *	Return a scratch buffer to the WT_TOC's free pool by clearing its
+ *	in-use flag, and NULL out the caller's reference so it can't be
+ *	used again by accident.
+ */
+void
+__wt_scr_release(DBT **dbt)
+{
+	DBT *buf = *dbt;
+
+	F_CLR(buf, WT_SCRATCH_INUSE);
+	*dbt = NULL;
+}
+
+/*
+ * __wt_scr_free --
+ *	Free all memory associated with the scratch buffers.
+ *
+ *	Releases each buffer's data allocation and then the array itself;
+ *	called when the WT_TOC is being discarded.  The allocation size is
+ *	passed back to __wt_free, matching this allocator's free protocol.
+ */
+void
+__wt_scr_free(WT_TOC *toc)
+{
+	DBT *scratch;
+	ENV *env;
+	u_int i;
+
+	env = toc->env;
+
+	for (i = 0,
+	    scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch)
+		if (scratch->data != NULL)
+			__wt_free(env, scratch->data, scratch->mem_size);
+
+	__wt_free(env, toc->scratch, toc->scratch_alloc * sizeof(DBT));
+}
diff --git a/src/support/serial.c b/src/support/serial.c
new file mode 100644
index 00000000000..9974f1f6b38
--- /dev/null
+++ b/src/support/serial.c
@@ -0,0 +1,123 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Serialization:
+ *
+ * Serialization support allows scheduling operations that require serialized
+ * access to a piece of data, where the data (1) is accessed only by serialized
+ * code, or where the data, when accessed by non-serialized code, can either
+ * (2) be read atomically, or (3) it doesn't matter if it's read incorrectly.
+ * In other words, the readers are key, and they are known to be indifferent
+ * to the serialization code modifying the data.
+ *
+ * An example of #1 is updating the size of a database file. The size is only
+ * changed in serialized code, and never read by anything else. An example of
+ * #2 is updating a 32-bit value, because readers by definition get consistent
+ * views of 32-bit memory locations. An example of #3 is updating a 64-bit
+ * value (such as the bytes allocated in the cache). While there is a small
+ * possibility a reader will see a corrupted value, the value is only used for
+ * advisory actions, such as waking the cache thread to see if there's work to
+ * do.
+ */
+
+/*
+ * __wt_toc_serialize_func --
+ *	Schedule a serialization request, and block or spin until it completes.
+ *
+ *	Returns the serialized function's result (toc->wq_ret, set by
+ *	__wt_toc_serialize_wrapup).
+ */
+int
+__wt_toc_serialize_func(
+    WT_TOC *toc, wq_state_t op, int spin, int (*func)(WT_TOC *), void *args)
+{
+	int done;
+
+	/*
+	 * Threads serializing access to data using a function:
+	 *	set a function/argument pair in the WT_TOC handle,
+	 *	flush memory,
+	 *	update the WT_TOC workq state, and
+	 *	spin or block.
+	 *
+	 * The workQ thread notices the state change and calls the serialization
+	 * function.
+	 *
+	 * The first memory flush ensures all supporting information is written
+	 * before the wq_state field (which makes the entry visible to the workQ
+	 * thread).  No second memory flush is required, the wq_state field is
+	 * declared volatile.
+	 */
+	toc->wq_args = args;
+	toc->wq_func = func;
+	toc->wq_sleeping = spin ? 0 : 1;
+	WT_MEMORY_FLUSH;
+	toc->wq_state = op;	/* Publication point: workQ may act from here. */
+
+	/*
+	 * Callers can spin on the WT_TOC state (implying the call is quickly
+	 * satisfied), or block until its mutex is unlocked by another thread
+	 * when the operation has completed.
+	 */
+	if (spin) {
+		/*
+		 * !!!
+		 * Don't do arithmetic comparisons (even equality) on enum's,
+		 * it makes some compilers/lint tools angry.
+		 */
+		for (done = 0; !done;) {
+			switch (toc->wq_state) {
+			case WT_WORKQ_NONE:	/* Request has completed. */
+				done = 1;
+				break;
+			case WT_WORKQ_FUNC:	/* Still pending; yield. */
+			case WT_WORKQ_READ:
+			case WT_WORKQ_READ_SCHED:
+				__wt_yield();
+				break;
+			}
+		}
+	} else
+		__wt_lock(toc->env, toc->mtx);	/* Sleep until unlocked. */
+
+	return (toc->wq_ret);
+}
+
+/*
+ * __wt_toc_serialize_wrapup --
+ *	Server function cleanup.
+ *
+ *	Called by the workQ when a serialized function finishes: record
+ *	the result, mark the request complete, and wake the requester if
+ *	it is blocked on its mutex.
+ */
+void
+__wt_toc_serialize_wrapup(WT_TOC *toc, WT_PAGE *page, int ret)
+{
+	ENV *env;
+
+	env = toc->env;
+
+	/*
+	 * If passed a page and the return value is good, we modified the page;
+	 * no need for a memory flush, we'll use the one below.
+	 */
+	if (page != NULL && ret == 0)
+		WT_PAGE_SET_MODIFIED(page);
+
+	/*
+	 * Set the return value and reset the state -- the workQ no longer needs
+	 * to worry about us.
+	 *
+	 * The return value isn't volatile, so requires an explicit flush.
+	 * Ordering matters: wq_ret must be visible before the requester can
+	 * observe WT_WORKQ_NONE and read it.
+	 */
+	toc->wq_ret = ret;
+	toc->wq_state = WT_WORKQ_NONE;
+	WT_MEMORY_FLUSH;
+
+	/* If the calling thread is sleeping, wake it up. */
+	if (toc->wq_sleeping)
+		__wt_unlock(env, toc->mtx);
+}
diff --git a/src/support/simple_setup.c b/src/support/simple_setup.c
new file mode 100644
index 00000000000..a4464fead69
--- /dev/null
+++ b/src/support/simple_setup.c
@@ -0,0 +1,94 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include <stdlib.h>
+
+#include "wiredtiger.h"
+
+extern const char *progname;
+
+static ENV *__env;
+
+/*
+ * wiredtiger_simple_setup --
+ *	Standard setup for simple applications.
+ *
+ *	Creates an ENV (cached in the file-static __env for the matching
+ *	teardown call), optionally sets the cache size, opens the
+ *	environment and returns an open DB handle through dbp.  Returns 0
+ *	(EXIT_SUCCESS) on success, a WiredTiger error code on failure.
+ *
+ *	NOTE(review): the progname parameter shadows the file-scope
+ *	extern "progname" declared above -- confirm that's intentional.
+ */
+int
+wiredtiger_simple_setup(
+    const char *progname, DB **dbp, u_int32_t cache_size, u_int32_t flags)
+{
+	DB *db;
+	ENV *env;
+	int ret;
+
+	db = *dbp = NULL;
+
+	if ((ret = wiredtiger_env_init(&env, flags)) != 0) {
+		fprintf(stderr,
+		    "%s: wiredtiger_env_init: %s\n",
+		    progname, wiredtiger_strerror(ret));
+		return (ret);
+	}
+	__env = env;	/* Remember the handle for teardown. */
+
+	/* A cache_size of 0 means "use the default". */
+	if (cache_size != 0 &&
+	    (ret = env->cache_size_set(env, cache_size)) != 0) {
+		env->err(env, ret, "Env.cache_size_set");
+		goto err;
+	}
+
+	if ((ret = env->open(env, NULL, 0, 0)) != 0) {
+		env->err(env, ret, "%s: Env.open", progname);
+		goto err;
+	}
+	if ((ret = env->db(env, 0, &db)) != 0) {
+		env->err(env, ret, "%s: Env.db", progname);
+		goto err;
+	}
+	if ((ret = db->errpfx_set(db, progname)) != 0) {
+		db->err(db, ret, "%s: Db.errpfx_set", progname);
+		goto err;
+	}
+
+	*dbp = db;
+	return (EXIT_SUCCESS);
+
+	/* Teardown closes both the DB handle (if created) and __env. */
+err:	wiredtiger_simple_teardown(progname, db);
+	return (ret);
+}
+
+/*
+ * wiredtiger_simple_teardown --
+ *	Standard teardown for simple applications.
+ *
+ *	Close the DB handle (if any), then the cached ENV handle; both
+ *	closes are always attempted and the first failure is returned.
+ *	Returns EXIT_SUCCESS or EXIT_FAILURE.
+ */
+int
+wiredtiger_simple_teardown(const char *progname, DB *db)
+{
+	int ret, tret;
+
+	ret = 0;
+	if (db != NULL && (tret = db->close(db, 0)) != 0) {
+		/*
+		 * Bug fix: report the close call's own error (tret), not
+		 * ret -- ret is still 0 here, so the message used to print
+		 * "Successful return" for a failing close.
+		 */
+		fprintf(stderr,
+		    "%s: Db.close: %s\n", progname, wiredtiger_strerror(tret));
+		if (ret == 0)
+			ret = tret;
+	}
+
+	if (__env != NULL) {
+		if ((tret = __env->close(__env, 0)) != 0) {
+			/* Same fix: tret, not ret. */
+			fprintf(stderr, "%s: Env.close: %s\n",
+			    progname, wiredtiger_strerror(tret));
+			if (ret == 0)
+				ret = tret;
+		}
+		__env = NULL;
+	}
+
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/src/support/stat.c b/src/support/stat.c
new file mode 100644
index 00000000000..bf08a95b12f
--- /dev/null
+++ b/src/support/stat.c
@@ -0,0 +1,370 @@
+/* DO NOT EDIT: automatically built by dist/stat.py. */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_stat_alloc_cache_stats --
+ *	Allocate and describe the cache statistics array.  Generated by
+ *	dist/stat.py -- regenerate rather than hand-editing.  The array has
+ *	one extra zero'd slot beyond the named entries; NOTE(review):
+ *	presumably a terminator for the stat-print code -- confirm.
+ */
+int
+__wt_stat_alloc_cache_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 10, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_CACHE_BYTES_INUSE].desc = "bytes in the cache";
+	stats[WT_STAT_CACHE_BYTES_MAX].desc =
+	    "maximum bytes configured for the cache";
+	stats[WT_STAT_CACHE_EVICT_HAZARD].desc =
+	    "pages selected for eviction not evicted because of a hazard reference";
+	stats[WT_STAT_CACHE_EVICT_MODIFIED].desc =
+	    "modified pages selected for eviction";
+	stats[WT_STAT_CACHE_EVICT_UNMODIFIED].desc =
+	    "unmodified pages selected for eviction";
+	stats[WT_STAT_CACHE_PAGES_INUSE].desc = "pages in the cache";
+	stats[WT_STAT_OVERFLOW_READ].desc =
+	    "overflow pages read from the file";
+	stats[WT_STAT_PAGE_READ].desc = "pages read from a file";
+	stats[WT_STAT_PAGE_WRITE].desc = "pages written to a file";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_cache_stats --
+ *	Zero the clearable cache counters.  Generated by dist/stat.py.
+ *	The bytes-in-use, bytes-max and pages-in-use entries are not
+ *	reset here.
+ */
+void
+__wt_stat_clear_cache_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_CACHE_EVICT_HAZARD].v = 0;
+	stats[WT_STAT_CACHE_EVICT_MODIFIED].v = 0;
+	stats[WT_STAT_CACHE_EVICT_UNMODIFIED].v = 0;
+	stats[WT_STAT_OVERFLOW_READ].v = 0;
+	stats[WT_STAT_PAGE_READ].v = 0;
+	stats[WT_STAT_PAGE_WRITE].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_database_stats --
+ *	Allocate and describe the per-database statistics array.
+ *	Generated by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_database_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 27, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_BASE_RECNO].desc = "base record number";
+	stats[WT_STAT_DUP_TREE].desc = "duplicate data off-page trees";
+	stats[WT_STAT_FIXED_LEN].desc = "database fixed-record size";
+	stats[WT_STAT_INTLMAX].desc = "maximum internal page size";
+	stats[WT_STAT_INTLMIN].desc = "minimum internal page size";
+	stats[WT_STAT_ITEM_COL_DELETED].desc =
+	    "column store deleted data items";
+	stats[WT_STAT_ITEM_DATA_OVFL].desc = "total overflow data items";
+	stats[WT_STAT_ITEM_DUP_DATA].desc = "total duplicate data items";
+	stats[WT_STAT_ITEM_KEY_OVFL].desc = "total overflow keys";
+	stats[WT_STAT_ITEM_TOTAL_DATA].desc = "total data items";
+	stats[WT_STAT_ITEM_TOTAL_KEY].desc = "total keys";
+	stats[WT_STAT_LEAFMAX].desc = "maximum leaf page size";
+	stats[WT_STAT_LEAFMIN].desc = "minimum leaf page size";
+	stats[WT_STAT_MAGIC].desc = "magic number";
+	stats[WT_STAT_MAJOR].desc = "major version number";
+	stats[WT_STAT_MINOR].desc = "minor version number";
+	stats[WT_STAT_PAGE_COL_FIX].desc =
+	    "column-store fixed-size leaf pages";
+	stats[WT_STAT_PAGE_COL_INTERNAL].desc = "column-store internal pages";
+	stats[WT_STAT_PAGE_COL_RLE].desc =
+	    "column-store repeat-count compressed fixed-size leaf pages";
+	stats[WT_STAT_PAGE_COL_VARIABLE].desc =
+	    "column-store variable-size leaf pages";
+	stats[WT_STAT_PAGE_DUP_INTERNAL].desc = "duplicate internal pages";
+	stats[WT_STAT_PAGE_DUP_LEAF].desc = "duplicate leaf pages";
+	stats[WT_STAT_PAGE_OVERFLOW].desc = "overflow pages";
+	stats[WT_STAT_PAGE_ROW_INTERNAL].desc = "row-store internal pages";
+	stats[WT_STAT_PAGE_ROW_LEAF].desc = "row-store leaf pages";
+	stats[WT_STAT_TREE_LEVEL].desc = "number of levels in the btree";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_database_stats --
+ *	Zero all per-database statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_database_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_BASE_RECNO].v = 0;
+	stats[WT_STAT_DUP_TREE].v = 0;
+	stats[WT_STAT_FIXED_LEN].v = 0;
+	stats[WT_STAT_INTLMAX].v = 0;
+	stats[WT_STAT_INTLMIN].v = 0;
+	stats[WT_STAT_ITEM_COL_DELETED].v = 0;
+	stats[WT_STAT_ITEM_DATA_OVFL].v = 0;
+	stats[WT_STAT_ITEM_DUP_DATA].v = 0;
+	stats[WT_STAT_ITEM_KEY_OVFL].v = 0;
+	stats[WT_STAT_ITEM_TOTAL_DATA].v = 0;
+	stats[WT_STAT_ITEM_TOTAL_KEY].v = 0;
+	stats[WT_STAT_LEAFMAX].v = 0;
+	stats[WT_STAT_LEAFMIN].v = 0;
+	stats[WT_STAT_MAGIC].v = 0;
+	stats[WT_STAT_MAJOR].v = 0;
+	stats[WT_STAT_MINOR].v = 0;
+	stats[WT_STAT_PAGE_COL_FIX].v = 0;
+	stats[WT_STAT_PAGE_COL_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_COL_RLE].v = 0;
+	stats[WT_STAT_PAGE_COL_VARIABLE].v = 0;
+	stats[WT_STAT_PAGE_DUP_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_DUP_LEAF].v = 0;
+	stats[WT_STAT_PAGE_OVERFLOW].v = 0;
+	stats[WT_STAT_PAGE_ROW_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_ROW_LEAF].v = 0;
+	stats[WT_STAT_TREE_LEVEL].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_db_stats --
+ *	Allocate and describe the DB-handle statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_db_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 11, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DB_ALLOC].desc = "database allocations";
+	stats[WT_STAT_DB_ALLOC_FILE].desc = "database extensions";
+	stats[WT_STAT_DB_FREE].desc = "database frees";
+	stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].desc =
+	    "duplicate key/data pairs inserted";
+	stats[WT_STAT_HUFFMAN_DATA].desc = "huffman data compression in bytes";
+	stats[WT_STAT_HUFFMAN_KEY].desc = "huffman key compression in bytes";
+	stats[WT_STAT_ITEMS_INSERTED].desc = "key/data pairs inserted";
+	stats[WT_STAT_OVERFLOW_DATA].desc = "overflow data items inserted";
+	stats[WT_STAT_OVERFLOW_KEY].desc = "overflow key items inserted";
+	stats[WT_STAT_REPEAT_COUNT].desc = "repeat value compression count";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_db_stats --
+ *	Zero the DB-handle statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_db_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DB_ALLOC].v = 0;
+	stats[WT_STAT_DB_ALLOC_FILE].v = 0;
+	stats[WT_STAT_DB_FREE].v = 0;
+	stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].v = 0;
+	stats[WT_STAT_HUFFMAN_DATA].v = 0;
+	stats[WT_STAT_HUFFMAN_KEY].v = 0;
+	stats[WT_STAT_ITEMS_INSERTED].v = 0;
+	stats[WT_STAT_OVERFLOW_DATA].v = 0;
+	stats[WT_STAT_OVERFLOW_KEY].v = 0;
+	stats[WT_STAT_REPEAT_COUNT].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_env_stats --
+ *	Allocate and describe the environment statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_env_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 9, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DATABASE_OPEN].desc = "database open";
+	stats[WT_STAT_MEMALLOC].desc = "memory allocations";
+	stats[WT_STAT_MEMFREE].desc = "memory frees";
+	stats[WT_STAT_MTX_LOCK].desc = "mutex lock calls";
+	stats[WT_STAT_TOTAL_READ_IO].desc = "total read I/Os";
+	stats[WT_STAT_TOTAL_WRITE_IO].desc = "total write I/Os";
+	stats[WT_STAT_WORKQ_PASSES].desc = "workQ queue passes";
+	stats[WT_STAT_WORKQ_YIELD].desc = "workQ yields";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_env_stats --
+ *	Zero the environment statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_env_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DATABASE_OPEN].v = 0;
+	stats[WT_STAT_MEMALLOC].v = 0;
+	stats[WT_STAT_MEMFREE].v = 0;
+	stats[WT_STAT_MTX_LOCK].v = 0;
+	stats[WT_STAT_TOTAL_READ_IO].v = 0;
+	stats[WT_STAT_TOTAL_WRITE_IO].v = 0;
+	stats[WT_STAT_WORKQ_PASSES].v = 0;
+	stats[WT_STAT_WORKQ_YIELD].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_fh_stats --
+ *	Allocate and describe the file-handle statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_fh_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 4, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_FSYNC].desc = "fsyncs";
+	stats[WT_STAT_READ_IO].desc = "read I/Os";
+	stats[WT_STAT_WRITE_IO].desc = "write I/Os";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_fh_stats --
+ *	Zero the file-handle statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_fh_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_FSYNC].v = 0;
+	stats[WT_STAT_READ_IO].v = 0;
+	stats[WT_STAT_WRITE_IO].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_method_stats --
+ *	Allocate and describe the per-method call-count statistics array.
+ *	Generated by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_method_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 69, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].desc =
+	    "db.btree_compare_dup_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].desc =
+	    "db.btree_compare_dup_set";
+	stats[WT_STAT_DB_BTREE_COMPARE_GET].desc = "db.btree_compare_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].desc =
+	    "db.btree_compare_int_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].desc =
+	    "db.btree_compare_int_set";
+	stats[WT_STAT_DB_BTREE_COMPARE_SET].desc = "db.btree_compare_set";
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].desc =
+	    "db.btree_dup_offpage_get";
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].desc =
+	    "db.btree_dup_offpage_set";
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].desc = "db.btree_itemsize_get";
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].desc = "db.btree_itemsize_set";
+	stats[WT_STAT_DB_BTREE_PAGESIZE_GET].desc = "db.btree_pagesize_get";
+	stats[WT_STAT_DB_BTREE_PAGESIZE_SET].desc = "db.btree_pagesize_set";
+	stats[WT_STAT_DB_BULK_LOAD].desc = "db.bulk_load";
+	stats[WT_STAT_DB_CLOSE].desc = "db.close";
+	stats[WT_STAT_DB_COLUMN_SET].desc = "db.column_set";
+	stats[WT_STAT_DB_COL_DEL].desc = "db.col_del";
+	stats[WT_STAT_DB_COL_DEL_RESTART].desc = "db.col_del method restarts";
+	stats[WT_STAT_DB_COL_GET].desc = "db.col_get";
+	stats[WT_STAT_DB_COL_PUT].desc = "db.col_put";
+	stats[WT_STAT_DB_COL_PUT_RESTART].desc = "db.col_put method restarts";
+	stats[WT_STAT_DB_DUMP].desc = "db.dump";
+	stats[WT_STAT_DB_ERRCALL_GET].desc = "db.errcall_get";
+	stats[WT_STAT_DB_ERRCALL_SET].desc = "db.errcall_set";
+	stats[WT_STAT_DB_ERRFILE_GET].desc = "db.errfile_get";
+	stats[WT_STAT_DB_ERRFILE_SET].desc = "db.errfile_set";
+	stats[WT_STAT_DB_ERRPFX_GET].desc = "db.errpfx_get";
+	stats[WT_STAT_DB_ERRPFX_SET].desc = "db.errpfx_set";
+	stats[WT_STAT_DB_HUFFMAN_SET].desc = "db.huffman_set";
+	stats[WT_STAT_DB_OPEN].desc = "db.open";
+	stats[WT_STAT_DB_ROW_DEL].desc = "db.row_del";
+	stats[WT_STAT_DB_ROW_DEL_RESTART].desc = "db.row_del method restarts";
+	stats[WT_STAT_DB_ROW_GET].desc = "db.row_get";
+	stats[WT_STAT_DB_ROW_PUT].desc = "db.row_put";
+	stats[WT_STAT_DB_ROW_PUT_RESTART].desc = "db.row_put method restarts";
+	stats[WT_STAT_DB_STAT_CLEAR].desc = "db.stat_clear";
+	stats[WT_STAT_DB_STAT_PRINT].desc = "db.stat_print";
+	stats[WT_STAT_DB_SYNC].desc = "db.sync";
+	stats[WT_STAT_DB_VERIFY].desc = "db.verify";
+	stats[WT_STAT_ENV_CACHE_SIZE_GET].desc = "env.cache_size_get";
+	stats[WT_STAT_ENV_CACHE_SIZE_SET].desc = "env.cache_size_set";
+	stats[WT_STAT_ENV_CLOSE].desc = "env.close";
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].desc =
+	    "env.data_update_initial_get";
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].desc =
+	    "env.data_update_initial_set";
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].desc =
+	    "env.data_update_max_get";
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].desc =
+	    "env.data_update_max_set";
+	stats[WT_STAT_ENV_DB].desc = "env.db";
+	stats[WT_STAT_ENV_ERRCALL_GET].desc = "env.errcall_get";
+	stats[WT_STAT_ENV_ERRCALL_SET].desc = "env.errcall_set";
+	stats[WT_STAT_ENV_ERRFILE_GET].desc = "env.errfile_get";
+	stats[WT_STAT_ENV_ERRFILE_SET].desc = "env.errfile_set";
+	stats[WT_STAT_ENV_ERRPFX_GET].desc = "env.errpfx_get";
+	stats[WT_STAT_ENV_ERRPFX_SET].desc = "env.errpfx_set";
+	stats[WT_STAT_ENV_HAZARD_SIZE_GET].desc = "env.hazard_size_get";
+	stats[WT_STAT_ENV_HAZARD_SIZE_SET].desc = "env.hazard_size_set";
+	stats[WT_STAT_ENV_MSGCALL_GET].desc = "env.msgcall_get";
+	stats[WT_STAT_ENV_MSGCALL_SET].desc = "env.msgcall_set";
+	stats[WT_STAT_ENV_MSGFILE_GET].desc = "env.msgfile_get";
+	stats[WT_STAT_ENV_MSGFILE_SET].desc = "env.msgfile_set";
+	stats[WT_STAT_ENV_OPEN].desc = "env.open";
+	stats[WT_STAT_ENV_STAT_CLEAR].desc = "env.stat_clear";
+	stats[WT_STAT_ENV_STAT_PRINT].desc = "env.stat_print";
+	stats[WT_STAT_ENV_SYNC].desc = "env.sync";
+	stats[WT_STAT_ENV_TOC].desc = "env.toc";
+	stats[WT_STAT_ENV_TOC_SIZE_GET].desc = "env.toc_size_get";
+	stats[WT_STAT_ENV_TOC_SIZE_SET].desc = "env.toc_size_set";
+	stats[WT_STAT_ENV_VERBOSE_GET].desc = "env.verbose_get";
+	stats[WT_STAT_ENV_VERBOSE_SET].desc = "env.verbose_set";
+	stats[WT_STAT_WT_TOC_CLOSE].desc = "wt_toc.close";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_method_stats --
+ *	Zero the per-method call-count statistics values.  Generated by
+ *	dist/stat.py -- regenerate rather than hand-editing.
+ */
+void
+__wt_stat_clear_method_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_PAGESIZE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_PAGESIZE_SET].v = 0;
+	stats[WT_STAT_DB_BULK_LOAD].v = 0;
+	stats[WT_STAT_DB_CLOSE].v = 0;
+	stats[WT_STAT_DB_COLUMN_SET].v = 0;
+	stats[WT_STAT_DB_COL_DEL].v = 0;
+	stats[WT_STAT_DB_COL_DEL_RESTART].v = 0;
+	stats[WT_STAT_DB_COL_GET].v = 0;
+	stats[WT_STAT_DB_COL_PUT].v = 0;
+	stats[WT_STAT_DB_COL_PUT_RESTART].v = 0;
+	stats[WT_STAT_DB_DUMP].v = 0;
+	stats[WT_STAT_DB_ERRCALL_GET].v = 0;
+	stats[WT_STAT_DB_ERRCALL_SET].v = 0;
+	stats[WT_STAT_DB_ERRFILE_GET].v = 0;
+	stats[WT_STAT_DB_ERRFILE_SET].v = 0;
+	stats[WT_STAT_DB_ERRPFX_GET].v = 0;
+	stats[WT_STAT_DB_ERRPFX_SET].v = 0;
+	stats[WT_STAT_DB_HUFFMAN_SET].v = 0;
+	stats[WT_STAT_DB_OPEN].v = 0;
+	stats[WT_STAT_DB_ROW_DEL].v = 0;
+	stats[WT_STAT_DB_ROW_DEL_RESTART].v = 0;
+	stats[WT_STAT_DB_ROW_GET].v = 0;
+	stats[WT_STAT_DB_ROW_PUT].v = 0;
+	stats[WT_STAT_DB_ROW_PUT_RESTART].v = 0;
+	stats[WT_STAT_DB_STAT_CLEAR].v = 0;
+	stats[WT_STAT_DB_STAT_PRINT].v = 0;
+	stats[WT_STAT_DB_SYNC].v = 0;
+	stats[WT_STAT_DB_VERIFY].v = 0;
+	stats[WT_STAT_ENV_CACHE_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_CACHE_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_CLOSE].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].v = 0;
+	stats[WT_STAT_ENV_DB].v = 0;
+	stats[WT_STAT_ENV_ERRCALL_GET].v = 0;
+	stats[WT_STAT_ENV_ERRCALL_SET].v = 0;
+	stats[WT_STAT_ENV_ERRFILE_GET].v = 0;
+	stats[WT_STAT_ENV_ERRFILE_SET].v = 0;
+	stats[WT_STAT_ENV_ERRPFX_GET].v = 0;
+	stats[WT_STAT_ENV_ERRPFX_SET].v = 0;
+	stats[WT_STAT_ENV_HAZARD_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_HAZARD_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_MSGCALL_GET].v = 0;
+	stats[WT_STAT_ENV_MSGCALL_SET].v = 0;
+	stats[WT_STAT_ENV_MSGFILE_GET].v = 0;
+	stats[WT_STAT_ENV_MSGFILE_SET].v = 0;
+	stats[WT_STAT_ENV_OPEN].v = 0;
+	stats[WT_STAT_ENV_STAT_CLEAR].v = 0;
+	stats[WT_STAT_ENV_STAT_PRINT].v = 0;
+	stats[WT_STAT_ENV_SYNC].v = 0;
+	stats[WT_STAT_ENV_TOC].v = 0;
+	stats[WT_STAT_ENV_TOC_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_TOC_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_VERBOSE_GET].v = 0;
+	stats[WT_STAT_ENV_VERBOSE_SET].v = 0;
+	stats[WT_STAT_WT_TOC_CLOSE].v = 0;
+}
diff --git a/src/support/strerror.c b/src/support/strerror.c
new file mode 100644
index 00000000000..17a4653438a
--- /dev/null
+++ b/src/support/strerror.c
@@ -0,0 +1,41 @@
+/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ *	Return a string for any error value.
+ *
+ *	WiredTiger's own (negative) error codes get fixed strings;
+ *	positive values fall through to the system strerror.  Generated by
+ *	dist/api_err.py -- regenerate rather than hand-editing.
+ */
+char *
+wiredtiger_strerror(int error)
+{
+	static char errbuf[64];
+	char *p;
+
+	if (error == 0)
+		return ("Successful return: 0");
+
+	switch (error) {
+	case WT_ERROR:
+		return ("WT_ERROR: non-specific WiredTiger error");
+	case WT_NOTFOUND:
+		return ("WT_NOTFOUND: database item not found");
+	case WT_READONLY:
+		return ("WT_READONLY: modification attempted of a read-only database");
+	case WT_RESTART:
+		return ("WT_RESTART: restart the operation (internal)");
+	case WT_TOOSMALL:
+		return ("WT_TOOSMALL: buffer too small");
+	default:
+		if (error > 0 && (p = strerror(error)) != NULL)
+			return (p);
+		break;
+	}
+
+	/*
+	 * !!!
+	 * Not thread-safe (static buffer), but this is never supposed to
+	 * happen.
+	 */
+	(void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+	return (errbuf);
+}
diff --git a/src/support/version.c b/src/support/version.c
new file mode 100644
index 00000000000..dbd60162c16
--- /dev/null
+++ b/src/support/version.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_version --
+ *	Return library version information: the version string, plus the
+ *	numeric components through the optional output parameters.
+ */
+char *
+wiredtiger_version(int *majorp, int *minorp, int *patchp)
+{
+	/* Each output pointer is optional; skip the NULLs. */
+	if (majorp)
+		*majorp = WIREDTIGER_VERSION_MAJOR;
+	if (minorp)
+		*minorp = WIREDTIGER_VERSION_MINOR;
+	if (patchp)
+		*patchp = WIREDTIGER_VERSION_PATCH;
+	return ((char *)WIREDTIGER_VERSION_STRING);
+}
diff --git a/src/utilities/db_dump/util_dump.c b/src/utilities/db_dump/util_dump.c
new file mode 100644
index 00000000000..68cc6d69061
--- /dev/null
+++ b/src/utilities/db_dump/util_dump.c
@@ -0,0 +1,83 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+/*
+ * main --
+ *	db_dump: dump a database to stdout (or the -f output file),
+ *	optionally in debug (-d) or printable (-p) format.
+ */
+int
+main(int argc, char *argv[])
+{
+	extern char *optarg;
+	extern int optind;
+	DB *db;
+	u_int32_t flags;
+	int ch, ret, tret;
+
+	WT_UTILITY_INTRO(progname, argv);
+
+	flags = 0;
+	/*
+	 * Bug fix: 'V' was handled in the switch below but missing from the
+	 * getopt option string, so -V always fell through to usage().
+	 */
+	while ((ch = getopt(argc, argv, "df:pV")) != EOF)
+		switch (ch) {
+		case 'd':
+			flags = WT_DEBUG;
+			break;
+		case 'f':			/* output file */
+			if (freopen(optarg, "w", stdout) == NULL) {
+				fprintf(stderr, "%s: %s: reopen: %s\n",
+				    progname, optarg, strerror(errno));
+				return (EXIT_FAILURE);
+			}
+			break;
+		case 'p':
+			flags = WT_PRINTABLES;
+			break;
+		case 'V':			/* version */
+			printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+			return (EXIT_SUCCESS);
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= optind;
+	argv += optind;
+
+	/* The remaining argument is the database name. */
+	if (argc != 1)
+		return (usage());
+
+	if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+		if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+			db->err(db, ret, "Db.open: %s", *argv);
+			goto err;
+		}
+		if ((ret = db->dump(db, stdout, NULL, flags)) != 0) {
+			db->err(db, ret, "Db.dump");
+			goto err;
+		}
+	}
+
+	if (0) {
+err:		ret = 1;
+	}
+	if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+		ret = tret;
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * usage --
+ *	Print a usage message for db_dump and return a failing exit status.
+ */
+int
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: %s [-dpV] [-f output-file] database\n", progname);
+	return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_load/util_load.c b/src/utilities/db_load/util_load.c
new file mode 100644
index 00000000000..6ededed7c28
--- /dev/null
+++ b/src/utilities/db_load/util_load.c
@@ -0,0 +1,292 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int bulk_callback(DB *, DBT **, DBT **);
+int bulk_read(DBT *dbt, int);
+int config_read(char **);
+int config_read_single(char *);
+int config_set(DB *);
+int usage(void);
+
+struct {
+ int pagesize_set;
+ u_long allocsize, intlmin, intlmax, leafmin, leafmax;
+} config;
+
+/*
+ * main --
+ *	db_load: bulk-load key/data lines from stdin (or the -f input
+ *	file) into a newly-created database.
+ */
+int
+main(int argc, char *argv[])
+{
+	extern char *optarg;
+	extern int optind;
+	DB *db;
+	int ch, ret, text_input, tret, verbose;
+	char **config_list, **config_next;
+
+	WT_UTILITY_INTRO(progname, argv);
+
+	/*
+	 * Bug fix: db was passed to wiredtiger_simple_teardown while still
+	 * uninitialized when config_read failed and jumped to err before
+	 * wiredtiger_simple_setup ever assigned it.  Start it out NULL.
+	 */
+	db = NULL;
+
+	/*
+	 * We can't handle configuration-line information until we've opened
+	 * the DB handle, so we need a place to store it for now.
+	 */
+	if ((config_next =
+	    config_list = calloc(argc + 1, sizeof(char *))) == NULL) {
+		fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+		return (EXIT_FAILURE);
+	}
+
+	text_input = verbose = 0;
+	while ((ch = getopt(argc, argv, "c:f:TVv")) != EOF)
+		switch (ch) {
+		case 'c':			/* command-line option */
+			*config_next++ = optarg;
+			break;
+		case 'f':			/* input file */
+			if (freopen(optarg, "r", stdin) == NULL) {
+				fprintf(stderr, "%s: %s: reopen: %s\n",
+				    progname, optarg, strerror(errno));
+				return (EXIT_FAILURE);
+			}
+			break;
+		case 'T':
+			text_input = 1;
+			break;
+		case 'V':			/* version */
+			printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+			return (EXIT_SUCCESS);
+		case 'v':
+			verbose = 1;
+			break;
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= optind;
+	argv += optind;
+
+	/* The remaining argument is the database name. */
+	if (argc != 1)
+		return (usage());
+
+	/*
+	 * Read through the command-line configuration options and convert
+	 * to the config structure.  The list (which holds pointers into
+	 * argv, not copies) is no longer needed afterward; free it.
+	 */
+	ret = config_read(config_list);
+	free(config_list);
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * Right now, we only support text input -- require the T option to
+	 * match Berkeley DB's API.
+	 */
+	if (text_input == 0) {
+		fprintf(stderr,
+		    "%s: the -T option is currently required\n", progname);
+		return (EXIT_FAILURE);
+	}
+
+	if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+		if (config_set(db) != 0)
+			goto err;
+
+		(void)remove(*argv);
+
+		if ((ret = db->open(db, *argv, 0600, WT_CREATE)) != 0) {
+			db->err(db, ret, "Db.open: %s", *argv);
+			goto err;
+		}
+
+		if ((ret = db->bulk_load(db, WT_DUPLICATES,
+		    verbose ? __wt_progress : NULL, bulk_callback)) != 0) {
+			db->err(db, ret, "Db.bulk_load");
+			goto err;
+		}
+		if (verbose)
+			printf("\n");
+	}
+
+	if (0) {
+err:		ret = 1;
+	}
+	if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+		ret = tret;
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * config_read --
+ *	Convert each command-line option in the NULL-terminated list into
+ *	the config structure; stop and return the first failure.
+ */
+int
+config_read(char **list)
+{
+	char **p;
+	int ret;
+
+	for (p = list; *p != NULL; ++p) {
+		ret = config_read_single(*p);
+		if (ret != 0)
+			return (ret);
+	}
+	return (0);
+}
+
+/*
+ * config_read_single --
+ *	Process a single command-line configuration option, converting it
+ *	into the config structure.
+ *
+ *	Options are keyword=value, where value is an unsigned decimal
+ *	number.  Returns 0 on success, non-zero on a malformed option or
+ *	unknown keyword.
+ */
+int
+config_read_single(char *opt)
+{
+	u_long v;
+	char *p, *ep;
+
+	/* Get pointers to the two parts of an X=Y format string. */
+	if ((p = strchr(opt, '=')) == NULL || p[1] == '\0')
+		goto format;
+	*p++ = '\0';
+
+	/*
+	 * Bug fix: clear errno before strtoul -- a leftover ERANGE from an
+	 * earlier call could otherwise trigger the error check -- and
+	 * reject trailing garbage after the number (e.g. "intlmin=10k",
+	 * previously accepted as 10).
+	 */
+	errno = 0;
+	v = strtoul(p, &ep, 10);
+	if (*ep != '\0' || (v == ULONG_MAX && errno == ERANGE)) {
+format:		fprintf(stderr,
+		    "%s: -c option %s is not correctly formatted\n",
+		    progname, opt);
+		return (1);
+	}
+	if (strcmp(opt, "allocsize") == 0) {
+		config.allocsize = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "intlmin") == 0) {
+		config.intlmin = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "intlmax") == 0) {
+		config.intlmax = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "leafmin") == 0) {
+		config.leafmin = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "leafmax") == 0) {
+		config.leafmax = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+
+	fprintf(stderr,
+	    "%s: -c option %s has an unknown keyword\n", progname, opt);
+	return (1);
+}
+
+/*
+ * config_set --
+ *	Set the command-line configuration options on the database handle.
+ *
+ *	Reads the current page-size settings, overlays any non-zero values
+ *	collected from -c options, and writes the result back.  A no-op
+ *	unless at least one page-size option was given.  Returns 0 on
+ *	success, 1 on failure.
+ */
+int
+config_set(DB *db)
+{
+	u_int32_t allocsize, intlmin, intlmax, leafmin, leafmax;
+	int ret;
+
+	if (config.pagesize_set) {
+		/* Start from the handle's current values. */
+		if ((ret = db->btree_pagesize_get(db,
+		    &allocsize, &intlmin, &intlmax, &leafmin, &leafmax)) != 0) {
+			db->err(db, ret, "Db.btree_pagesize_get");
+			return (1);
+		}
+		/* Zero means "not specified on the command line". */
+		if (config.allocsize != 0)
+			allocsize = config.allocsize;
+		if (config.intlmin != 0)
+			intlmin = config.intlmin;
+		if (config.intlmax != 0)
+			intlmax = config.intlmax;
+		if (config.leafmin != 0)
+			leafmin = config.leafmin;
+		if (config.leafmax != 0)
+			leafmax = config.leafmax;
+		if ((ret = db->btree_pagesize_set(db,
+		    allocsize, intlmin, intlmax, leafmin, leafmax)) != 0) {
+			db->err(db, ret, "Db.btree_pagesize_set");
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * bulk_read --
+ *	Read a newline-terminated record from stdin into a DBT, growing
+ *	the DBT's buffer as necessary.
+ *
+ *	Returns 0 on success, 1 on a clean EOF before any key bytes (end
+ *	of input), WT_ERROR on a mid-record EOF, or errno if allocation
+ *	fails.
+ */
+int
+bulk_read(DBT *dbt, int iskey)
+{
+	static u_int64_t line = 0;
+	size_t len;
+	void *tmp;
+	int ch;
+
+	++line;
+	for (len = 0;; ++len) {
+		if ((ch = getchar()) == EOF) {
+			if (iskey && len == 0)
+				return (1);
+			/*
+			 * Bug fix: cast for %llu -- u_int64_t need not be
+			 * unsigned long long on every platform.
+			 */
+			fprintf(stderr, "%s: corrupted input at line %llu\n",
+			    progname, (unsigned long long)line);
+			return (WT_ERROR);
+		}
+		if (ch == '\n')
+			break;
+		if (len >= dbt->mem_size) {
+			/*
+			 * Bug fix: don't overwrite dbt->data with realloc's
+			 * return value -- on failure that lost (leaked) the
+			 * original buffer.
+			 */
+			if ((tmp = realloc(dbt->data, len + 128)) == NULL)
+				return (errno);
+			dbt->data = tmp;
+			dbt->mem_size = len + 128;
+		}
+		((u_int8_t *)(dbt->data))[len] = ch;
+	}
+	dbt->size = len;
+	return (0);
+}
+
+/*
+ * bulk_callback --
+ *	Bulk-load callback function: read the next key/data pair from
+ *	stdin and hand the pairs back through keyp/datap.
+ *
+ *	The DBTs are function-static so their buffers persist between
+ *	calls (and are reused by each call); the caller must consume the
+ *	pair before the next invocation.
+ */
+int
+bulk_callback(DB *db, DBT **keyp, DBT **datap)
+{
+	static DBT key, data;
+	int ret;
+
+	WT_CC_QUIET(db, NULL);		/* db is unused; quiet the compiler. */
+
+	/* A non-zero return here includes the normal end-of-input case. */
+	if ((ret = bulk_read(&key, 1)) != 0)
+		return (ret);
+	if ((ret = bulk_read(&data, 0)) != 0)
+		return (ret);
+
+	*keyp = &key;
+	*datap = &data;
+	return (0);
+}
+
+/*
+ * usage --
+ *	Print a usage message for db_load and return a failing exit status.
+ */
+int
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: %s [-TVv] [-c configuration] [-f input-file] database\n",
+	    progname);
+	return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_stat/util_stat.c b/src/utilities/db_stat/util_stat.c
new file mode 100644
index 00000000000..afb2f94cba8
--- /dev/null
+++ b/src/utilities/db_stat/util_stat.c
@@ -0,0 +1,67 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ extern char *optarg;
+ extern int optind;
+ DB *db;
+ int ch, ret, tret;
+
+ WT_UTILITY_INTRO(progname, argv);
+
+ while ((ch = getopt(argc, argv, "V")) != EOF)
+ switch (ch) {
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /* The remaining argument is the database name. */
+ if (argc != 1)
+ return (usage());
+
+ if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+ if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+ db->err(db, ret, "Db.open: %s", *argv);
+ goto err;
+ }
+ if ((ret = db->stat_print(db, stdout, 0)) != 0) {
+ db->err(db, ret, "Db.stat: %s", *argv);
+ goto err;
+ }
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+ ret = tret;
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+int
+usage()
+{
+ (void)fprintf(stderr, "usage: %s [-V] database\n", progname);
+ return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_verify/util_verify.c b/src/utilities/db_verify/util_verify.c
new file mode 100644
index 00000000000..5c5bd02407f
--- /dev/null
+++ b/src/utilities/db_verify/util_verify.c
@@ -0,0 +1,74 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ extern char *optarg;
+ extern int optind;
+ DB *db;
+ int ch, ret, tret, verbose;
+
+ WT_UTILITY_INTRO(progname, argv);
+
+ verbose = 0;
+ while ((ch = getopt(argc, argv, "Vv")) != EOF)
+ switch (ch) {
+ case 'v': /* verbose */
+ verbose = 1;
+ break;
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /* The remaining argument is the database name. */
+ if (argc != 1)
+ return (usage());
+
+ if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+ if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+ db->err(db, ret, "Db.open: %s", *argv);
+ goto err;
+ }
+ if ((ret =
+ db->verify(db, verbose ? __wt_progress : NULL, 0)) != 0) {
+ db->err(db, ret, "Db.verify: %s", *argv);
+ goto err;
+ }
+ if (verbose)
+ printf("\n");
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+ ret = tret;
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+int
+usage()
+{
+ (void)fprintf(stderr, "usage: %s [-Vv] database\n", progname);
+ return (EXIT_FAILURE);
+}