summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
commit7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63 (patch)
tree0e0fd0f6b190dbcd283ca3c4040b5dcd89a94014 /src
parent6f87637341366fb90f890a5ef860e90c57b36d1f (diff)
downloadmongo-7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63.tar.gz
Restructure the tree to ease merge.
refs #27 --HG-- branch : keith rename : lint/fl => dist/lint/fl rename : lint/lint.current => dist/lint/lint.current rename : inc_posix/bitstring.h => include/bitstring.h rename : inc_posix/btree.h => include/btree.h rename : inc_posix/cache.h => include/cache.h rename : inc_posix/debug.h => include/debug.h rename : inc_posix/extern.h => include/extern.h rename : inc_posix/fh.h => include/fh.h rename : inc_posix/mem.h => include/mem.h rename : inc_posix/misc.h => include/misc.h rename : inc_posix/mutex.h => include/mutex.h rename : inc_posix/queue.h => include/queue.h rename : inc_posix/serial.h => include/serial.h rename : inc_posix/stat.h => include/stat.h rename : inc_posix/util.h => include/util.h rename : inc_posix/walk.h => include/walk.h rename : inc_posix/wiredtiger.in => include/wiredtiger.in rename : inc_posix/wt_internal.in => include/wt_internal.in rename : btree/bt_alloc.c => src/btree/bt_alloc.c rename : btree/bt_bulk.c => src/btree/bt_bulk.c rename : btree/bt_cache.c => src/btree/bt_cache.c rename : btree/bt_close.c => src/btree/bt_close.c rename : btree/bt_cmp.c => src/btree/bt_cmp.c rename : btree/bt_debug.c => src/btree/bt_debug.c rename : btree/bt_desc.c => src/btree/bt_desc.c rename : btree/bt_discard.c => src/btree/bt_discard.c rename : btree/bt_dump.c => src/btree/bt_dump.c rename : btree/bt_evict.c => src/btree/bt_evict.c rename : btree/bt_misc.c => src/btree/bt_misc.c rename : btree/bt_open.c => src/btree/bt_open.c rename : btree/bt_ovfl.c => src/btree/bt_ovfl.c rename : btree/bt_page.c => src/btree/bt_page.c rename : btree/bt_read.c => src/btree/bt_read.c rename : btree/bt_reconcile.c => src/btree/bt_reconcile.c rename : btree/bt_ret.c => src/btree/bt_ret.c rename : btree/bt_rw.c => src/btree/bt_rw.c rename : btree/bt_stat.c => src/btree/bt_stat.c rename : btree/bt_sync.c => src/btree/bt_sync.c rename : btree/bt_vrfy.c => src/btree/bt_vrfy.c rename : btree/bt_walk.c => src/btree/bt_walk.c rename : btree/c_drain.c => src/btree/c_drain.c 
rename : btree/c_init.c => src/btree/c_init.c rename : btree/c_page.c => src/btree/c_page.c rename : btree/c_read.c => src/btree/c_read.c rename : btree/col_get.c => src/btree/col_get.c rename : btree/col_put.c => src/btree/col_put.c rename : btree/col_srch.c => src/btree/col_srch.c rename : btree/row_get.c => src/btree/row_get.c rename : btree/row_put.c => src/btree/row_put.c rename : btree/row_srch.c => src/btree/row_srch.c rename : db/db_err.c => src/db/db_err.c rename : db/db_getset.c => src/db/db_getset.c rename : db/db_handle.c => src/db/db_handle.c rename : db/db_huffman.c => src/db/db_huffman.c rename : db/db_open.c => src/db/db_open.c rename : db/db_stat.c => src/db/db_stat.c rename : db/db_sync.c => src/db/db_sync.c rename : env/env_err.c => src/env/env_err.c rename : env/env_getset.c => src/env/env_getset.c rename : env/env_global.c => src/env/env_global.c rename : env/env_handle.c => src/env/env_handle.c rename : env/env_init.c => src/env/env_init.c rename : env/env_msg.c => src/env/env_msg.c rename : env/env_open.c => src/env/env_open.c rename : env/env_stat.c => src/env/env_stat.c rename : env/env_sync.c => src/env/env_sync.c rename : env/env_toc.c => src/env/env_toc.c rename : env/env_workq.c => src/env/env_workq.c rename : os_posix/os_abort.c => src/os_posix/os_abort.c rename : os_posix/os_alloc.c => src/os_posix/os_alloc.c rename : os_posix/os_filesize.c => src/os_posix/os_filesize.c rename : os_posix/os_fsync.c => src/os_posix/os_fsync.c rename : os_posix/os_mtx.c => src/os_posix/os_mtx.c rename : os_posix/os_open.c => src/os_posix/os_open.c rename : os_posix/os_rw.c => src/os_posix/os_rw.c rename : os_posix/os_sleep.c => src/os_posix/os_sleep.c rename : os_posix/os_thread.c => src/os_posix/os_thread.c rename : os_posix/os_yield.c => src/os_posix/os_yield.c rename : support/api.c => src/support/api.c rename : support/cksum.c => src/support/cksum.c rename : support/err.c => src/support/err.c rename : support/hazard.c => src/support/hazard.c rename 
: support/huffman.c => src/support/huffman.c rename : support/pow.c => src/support/pow.c rename : support/prime.c => src/support/prime.c rename : support/progress.c => src/support/progress.c rename : support/scratch.c => src/support/scratch.c rename : support/serial.c => src/support/serial.c rename : support/simple_setup.c => src/support/simple_setup.c rename : support/stat.c => src/support/stat.c rename : support/strerror.c => src/support/strerror.c rename : support/version.c => src/support/version.c rename : utilities/db_dump/util_dump.c => src/utilities/db_dump/util_dump.c rename : utilities/db_load/util_load.c => src/utilities/db_load/util_load.c rename : utilities/db_stat/util_stat.c => src/utilities/db_stat/util_stat.c rename : utilities/db_verify/util_verify.c => src/utilities/db_verify/util_verify.c
Diffstat (limited to 'src')
-rw-r--r--src/btree/bt_alloc.c106
-rw-r--r--src/btree/bt_bulk.c1467
-rw-r--r--src/btree/bt_cache.c133
-rw-r--r--src/btree/bt_close.c86
-rw-r--r--src/btree/bt_cmp.c74
-rw-r--r--src/btree/bt_debug.c661
-rw-r--r--src/btree/bt_desc.c132
-rw-r--r--src/btree/bt_discard.c234
-rw-r--r--src/btree/bt_dump.c472
-rw-r--r--src/btree/bt_evict.c944
-rw-r--r--src/btree/bt_misc.c175
-rw-r--r--src/btree/bt_open.c279
-rw-r--r--src/btree/bt_ovfl.c72
-rw-r--r--src/btree/bt_page.c656
-rw-r--r--src/btree/bt_read.c272
-rw-r--r--src/btree/bt_reconcile.c982
-rw-r--r--src/btree/bt_ret.c179
-rw-r--r--src/btree/bt_rw.c85
-rw-r--r--src/btree/bt_stat.c348
-rw-r--r--src/btree/bt_sync.c61
-rw-r--r--src/btree/bt_vrfy.c1346
-rw-r--r--src/btree/bt_walk.c306
-rw-r--r--src/btree/c_drain.c940
-rw-r--r--src/btree/c_init.c133
-rw-r--r--src/btree/c_page.c69
-rw-r--r--src/btree/c_read.c273
-rw-r--r--src/btree/col_get.c40
-rw-r--r--src/btree/col_put.c229
-rw-r--r--src/btree/col_srch.c211
-rw-r--r--src/btree/row_get.c61
-rw-r--r--src/btree/row_put.c288
-rw-r--r--src/btree/row_srch.c196
-rw-r--r--src/db/db_err.c64
-rw-r--r--src/db/db_getset.c85
-rw-r--r--src/db/db_handle.c184
-rw-r--r--src/db/db_huffman.c233
-rw-r--r--src/db/db_open.c104
-rw-r--r--src/db/db_stat.c72
-rw-r--r--src/db/db_sync.c20
-rw-r--r--src/env/env_err.c83
-rw-r--r--src/env/env_getset.c70
-rw-r--r--src/env/env_global.c72
-rw-r--r--src/env/env_handle.c137
-rw-r--r--src/env/env_init.c41
-rw-r--r--src/env/env_msg.c138
-rw-r--r--src/env/env_open.c132
-rw-r--r--src/env/env_stat.c86
-rw-r--r--src/env/env_sync.c30
-rw-r--r--src/env/env_toc.c238
-rw-r--r--src/env/env_workq.c94
-rw-r--r--src/os_posix/os_abort.c25
-rw-r--r--src/os_posix/os_alloc.c359
-rw-r--r--src/os_posix/os_filesize.c27
-rw-r--r--src/os_posix/os_fsync.c29
-rw-r--r--src/os_posix/os_mtx.c148
-rw-r--r--src/os_posix/os_open.c128
-rw-r--r--src/os_posix/os_rw.c56
-rw-r--r--src/os_posix/os_sleep.c25
-rw-r--r--src/os_posix/os_thread.c31
-rw-r--r--src/os_posix/os_yield.c24
-rw-r--r--src/support/api.c1597
-rw-r--r--src/support/cksum.c134
-rw-r--r--src/support/err.c247
-rw-r--r--src/support/hazard.c133
-rw-r--r--src/support/huffman.c692
-rw-r--r--src/support/pow.c56
-rw-r--r--src/support/prime.c75
-rw-r--r--src/support/progress.c17
-rw-r--r--src/support/scratch.c98
-rw-r--r--src/support/serial.c123
-rw-r--r--src/support/simple_setup.c94
-rw-r--r--src/support/stat.c370
-rw-r--r--src/support/strerror.c41
-rw-r--r--src/support/version.c26
-rw-r--r--src/utilities/db_dump/util_dump.c83
-rw-r--r--src/utilities/db_load/util_load.c292
-rw-r--r--src/utilities/db_stat/util_stat.c67
-rw-r--r--src/utilities/db_verify/util_verify.c74
78 files changed, 18464 insertions, 0 deletions
diff --git a/src/btree/bt_alloc.c b/src/btree/bt_alloc.c
new file mode 100644
index 00000000000..4477ce4e0f9
--- /dev/null
+++ b/src/btree/bt_alloc.c
@@ -0,0 +1,106 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_file_extend(WT_TOC *, uint32_t *, uint32_t);
+
+#ifdef HAVE_DIAGNOSTIC
+static int __wt_file_free_write(WT_TOC *, uint32_t, uint32_t);
+#endif
+
+/*
+ * __wt_file_alloc --
+ * Alloc a chunk of space from the underlying file.
+ *
+ * Sets *addrp to the address of the newly allocated chunk; size is in
+ * bytes.  Space always comes from extending the file (see
+ * __wt_file_extend below), so this cannot fail and always returns 0.
+ */
+int
+__wt_file_alloc(WT_TOC *toc, uint32_t *addrp, uint32_t size)
+{
+ IDB *idb;
+
+ idb = toc->db->idb;
+
+ /* Take the space by growing the file. */
+ __wt_file_extend(toc, addrp, size);
+
+ WT_STAT_INCR(idb->stats, DB_ALLOC);
+
+ return (0);
+}
+
+/*
+ * __wt_file_extend --
+ * Extend the file to allocate space.
+ *
+ * Returns the current end-of-file (converted to a page address with
+ * WT_OFF_TO_ADDR) through addrp, then bumps the cached file size by
+ * the allocation size.  Only the in-memory WT_FH file size is
+ * updated; no I/O happens here.
+ */
+static void
+__wt_file_extend(WT_TOC *toc, uint32_t *addrp, uint32_t size)
+{
+ DB *db;
+ IDB *idb;
+ WT_FH *fh;
+
+ db = toc->db;
+ idb = db->idb;
+ fh = idb->fh;
+
+ /* Extend the file. */
+ *addrp = WT_OFF_TO_ADDR(db, fh->file_size);
+ fh->file_size += size;
+
+ WT_STAT_INCR(idb->stats, DB_ALLOC_FILE);
+}
+
+/*
+ * __wt_file_free --
+ * Free a chunk of space to the underlying file.
+ *
+ * No free list is maintained here: freeing only updates statistics,
+ * and in HAVE_DIAGNOSTIC builds overwrites the chunk so a stale page
+ * image can't be read back by accident.
+ */
+int
+__wt_file_free(WT_TOC *toc, uint32_t addr, uint32_t size)
+{
+ WT_STATS *stats;
+
+ stats = toc->db->idb->stats;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_file_free_write(toc, addr, size));
+#endif
+
+ WT_STAT_INCR(stats, DB_FREE);
+
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_file_free_write --
+ * Overwrite the space in the file so future reads don't get fooled.
+ * DIAGNOSTIC only.
+ */
+static int
+__wt_file_free_write(WT_TOC *toc, uint32_t addr, uint32_t size)
+{
+ DBT *tmp;
+ WT_PAGE_DISK *dsk;
+ uint32_t allocsize;
+ int ret;
+
+ allocsize = toc->db->allocsize;
+ ret = 0;
+
+ /* Build one allocation unit of zeroes in a scratch buffer. */
+ WT_RET(__wt_scr_alloc(toc, allocsize, &tmp));
+ memset(tmp->data, 0, allocsize);
+
+ /* Stamp every allocation unit in the chunk as a free page. */
+ dsk = tmp->data;
+ dsk->type = WT_PAGE_FREE;
+ for (; size >= allocsize; size -= allocsize)
+ WT_ERR(__wt_page_disk_write(toc, dsk, addr++, allocsize));
+
+err: __wt_scr_release(&tmp);
+ return (ret);
+}
+#endif
diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c
new file mode 100644
index 00000000000..f88c0d5e8ae
--- /dev/null
+++ b/src/btree/bt_bulk.c
@@ -0,0 +1,1467 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_STACK --
+ * We maintain a stack of parent pages as we build the tree, encapsulated
+ * in this structure.
+ */
+typedef struct {
+ WT_PAGE *page; /* page header */
+ uint8_t *first_free; /* page's first free byte */
+ uint32_t space_avail; /* page's space available */
+
+ DBT *tmp; /* page-in-a-buffer */
+ void *data; /* last on-page WT_COL/WT_ROW */
+} WT_STACK_ELEM;
+/* The stack itself: an array of per-level elements plus its size. */
+typedef struct {
+ WT_STACK_ELEM *elem; /* stack */
+ u_int size; /* stack size */
+} WT_STACK;
+
+static int __wt_bulk_dbt_copy(ENV *, DBT *, DBT *);
+static int __wt_bulk_dup_offpage(WT_TOC *, DBT **, DBT **, DBT *, WT_ITEM *,
+ uint32_t, uint32_t, WT_OFF *, int (*)(DB *, DBT **, DBT **));
+static int __wt_bulk_fix(WT_TOC *, void (*)(const char *,
+ uint64_t), int (*)(DB *, DBT **, DBT **));
+static int __wt_bulk_ovfl_copy(WT_TOC *, WT_OVFL *, WT_OVFL *);
+static int __wt_bulk_ovfl_write(WT_TOC *, DBT *, WT_OVFL *);
+static int __wt_bulk_promote(
+ WT_TOC *, WT_PAGE *, uint64_t, WT_STACK *, u_int, uint32_t *);
+static int __wt_bulk_scratch_page(
+ WT_TOC *, uint32_t, uint32_t, uint32_t, WT_PAGE **, DBT **);
+static int __wt_bulk_stack_put(WT_TOC *, WT_STACK *);
+static int __wt_bulk_var(WT_TOC *, uint32_t, void (*)(const char *,
+ uint64_t), int (*)(DB *, DBT **, DBT **));
+static int __wt_item_build_key(WT_TOC *, DBT *, WT_ITEM *, WT_OVFL *);
+
+/*
+ * __wt_db_bulk_load --
+ * Db.bulk_load method.
+ *
+ * f is an optional progress-reporting callback (called with the
+ * table name and running insert count); cb is the application's
+ * input callback, returning key/data pairs until it returns non-zero.
+ */
+int
+__wt_db_bulk_load(WT_TOC *toc, uint32_t flags,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ IDB *idb;
+ uint32_t addr;
+
+ db = toc->db;
+ idb = db->idb;
+
+ /*
+ * XXX
+ * Write out the description record -- this goes away when we figure
+ * out how the table schema is going to work, but for now, we use the
+ * first sector, and this file extend makes sure we don't allocate it
+ * as a table page.
+ */
+ WT_RET(__wt_file_alloc(toc, &addr, 512));
+
+ /* Column stores accept no flags. */
+ if (F_ISSET(idb, WT_COLUMN))
+ WT_DB_FCHK(db, "DB.bulk_load", flags, 0);
+
+ /*
+ * There are two styles of bulk-load: variable length pages or
+ * fixed-length pages.
+ */
+ if (F_ISSET(idb, WT_COLUMN) && db->fixed_len != 0)
+ WT_RET(__wt_bulk_fix(toc, f, cb));
+ else
+ WT_RET(__wt_bulk_var(toc, flags, f, cb));
+
+ /* Get a permanent root page reference. */
+ return (__wt_root_pin(toc));
+}
+
+/*
+ * __wt_bulk_fix
+ * Db.bulk_load method for column-store, fixed-length database pages.
+ *
+ * Reads data items from cb() until it returns non-zero (1 means
+ * end-of-input), packing fixed-length records -- optionally with a
+ * leading uint16_t repeat count when run-length encoding is on --
+ * onto leaf pages, promoting a key to the parent as each page fills.
+ */
+static int
+__wt_bulk_fix(WT_TOC *toc,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, *tmp;
+ IDB *idb;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_STACK stack;
+ uint64_t insert_cnt;
+ uint32_t len, space_avail;
+ uint16_t *last_repeat;
+ uint8_t *first_free, *last_data;
+ int rle, ret;
+
+ db = toc->db;
+ tmp = NULL;
+ idb = db->idb;
+ insert_cnt = 0;
+ WT_CLEAR(stack);
+
+ rle = F_ISSET(idb, WT_RLE) ? 1 : 0;
+
+ /* Figure out how large is the chunk we're storing on the page. */
+ len = db->fixed_len;
+ if (rle)
+ len += sizeof(uint16_t);
+
+ /* Get a scratch buffer and make it look like our work page. */
+ WT_ERR(__wt_bulk_scratch_page(toc, db->leafmin,
+ rle ? WT_PAGE_COL_RLE : WT_PAGE_COL_FIX, WT_LLEAF, &page, &tmp));
+ dsk = page->dsk;
+ dsk->start_recno = 1;
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (key != NULL) {
+ __wt_api_db_errx(db,
+ "column database keys are implied and so should "
+ "not be set by the bulk load input routine");
+ ret = WT_ERROR;
+ goto err;
+ }
+ if (data->size != db->fixed_len)
+ WT_ERR(__wt_database_wrong_fixed_size(toc, data->size));
+
+ /*
+ * We use the high bit of the data field as a "deleted" value,
+ * make sure the user's data doesn't set it.
+ */
+ if (WT_FIX_DELETE_ISSET(data->data)) {
+ __wt_api_db_errx(db,
+ "the first bit may not be stored in fixed-length "
+ "column-store database items");
+ ret = WT_ERROR;
+ goto err;
+ }
+
+ /* Report on progress every 100 inserts. */
+ if (f != NULL && ++insert_cnt % 100 == 0)
+ f(toc->name, insert_cnt);
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+
+ /*
+ * If doing run-length encoding, check to see if this record
+ * matches the last data inserted. If there's a match try
+ * and increment that item's repeat count instead of entering
+ * new data.
+ *
+ * last_repeat/last_data are only valid once an item has been
+ * stored with rle set; the dsk->u.entries != 0 guard ensures
+ * they aren't read before that first store.
+ */
+ if (rle && dsk->u.entries != 0)
+ if (*last_repeat < UINT16_MAX &&
+ memcmp(last_data, data->data, data->size) == 0) {
+ ++*last_repeat;
+ ++page->records;
+ WT_STAT_INCR(idb->stats, REPEAT_COUNT);
+ continue;
+ }
+
+ /*
+ * We now have the data item to store on the page. If there
+ * is insufficient space on the current page, allocate a new
+ * one.
+ */
+ if (len > space_avail) {
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ */
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ dsk->u.entries = 0;
+ page->records = 0;
+ dsk->start_recno = insert_cnt;
+ WT_ERR(
+ __wt_file_alloc(toc, &page->addr, db->leafmin));
+ __wt_set_ff_and_sa_from_offset(page,
+ WT_PAGE_BYTE(page), &first_free, &space_avail);
+ }
+
+ ++dsk->u.entries;
+ ++page->records;
+
+ /*
+ * Copy the data item onto the page -- if doing run-length
+ * encoding, track the location of the item for comparison.
+ */
+ if (rle) {
+ last_repeat = (uint16_t *)first_free;
+ *last_repeat = 1;
+ first_free += sizeof(uint16_t);
+ space_avail -= sizeof(uint16_t);
+ last_data = first_free;
+ }
+ memcpy(first_free, data->data, data->size);
+ first_free += data->size;
+ space_avail -= data->size;
+ }
+
+ /* A ret of 1 just means we've reached the end of the input. */
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ /* Promote a key from any partially-filled page and write it. */
+ if (dsk->u.entries != 0) {
+ /*
+ * NOTE(review): unlike __wt_bulk_var, this promote's return
+ * is not wrapped in WT_ERR -- a promote failure here can be
+ * masked if the following page write succeeds.  Confirm this
+ * is intentional.
+ */
+ ret = __wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL);
+ WT_ERR(__wt_page_write(toc, page));
+ }
+
+ /* Wrap up reporting. */
+ if (f != NULL)
+ f(toc->name, insert_cnt);
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_var --
+ * Db.bulk_load method for row or column-store variable-length database
+ * pages.
+ *
+ * Reads key/data pairs from cb() until it returns non-zero (1 means
+ * end-of-input), building WT_ITEMs and packing them onto leaf pages,
+ * promoting a key to the parent as each page fills.  For row stores
+ * with WT_DUPLICATES set, runs of equal keys are tracked and, once a
+ * run crosses roughly 25% of a page, pushed into an off-page
+ * duplicate tree via __wt_bulk_dup_offpage.
+ */
+static int
+__wt_bulk_var(WT_TOC *toc, uint32_t flags,
+ void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, key_copy, data_copy;
+ DBT *lastkey, *lastkey_copy, lastkey_std;
+ DBT *tmp1, *tmp2;
+ ENV *env;
+ IDB *idb;
+ WT_ITEM key_item, data_item, *dup_key, *dup_data;
+ WT_OFF off;
+ WT_OVFL key_ovfl, data_ovfl;
+ WT_PAGE *page, *next;
+ WT_STACK stack;
+ uint64_t insert_cnt;
+ uint32_t dup_count, dup_space, len, next_space_avail, space_avail;
+ uint8_t *first_free, *next_first_free, *p, type;
+ int ret;
+
+ db = toc->db;
+ tmp1 = tmp2 = NULL;
+ env = toc->env;
+ idb = db->idb;
+ ret = 0;
+
+ WT_CLEAR(stack);
+ dup_space = dup_count = 0;
+ insert_cnt = 0;
+ type = F_ISSET(idb, WT_COLUMN) ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF;
+
+ lastkey = &lastkey_std;
+ WT_CLEAR(data_copy);
+ WT_CLEAR(key_copy);
+ WT_CLEAR(key_item);
+ WT_CLEAR(lastkey_std);
+ /*
+ * NOTE(review): if this first allocation fails, the err: label
+ * tests lastkey_copy before it has been assigned -- confirm
+ * __wt_scr_alloc clears *p on failure.
+ */
+ WT_ERR(__wt_scr_alloc(toc, 0, &lastkey_copy));
+
+ /* Get a scratch buffer and make it look like our work page. */
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, db->leafmin, type, WT_LLEAF, &page, &tmp1));
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+ if (type == WT_PAGE_COL_VAR)
+ page->dsk->start_recno = 1;
+
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (F_ISSET(idb, WT_COLUMN) ) {
+ if (key != NULL) {
+ __wt_api_db_errx(db,
+ "column database keys are implied and "
+ "so should not be returned by the bulk "
+ "load input routine");
+ ret = WT_ERROR;
+ goto err;
+ }
+ } else {
+ if (key == NULL && !LF_ISSET(WT_DUPLICATES)) {
+ __wt_api_db_errx(db,
+ "keys must be specified unless duplicates "
+ "are configured");
+ ret = WT_ERROR;
+ goto err;
+ }
+ if (key != NULL && key->size == 0) {
+ __wt_api_db_errx(db,
+ "zero-length keys are not supported");
+ ret = WT_ERROR;
+ goto err;
+ }
+ }
+
+ /* Report on progress every 100 inserts. */
+ if (f != NULL && ++insert_cnt % 100 == 0)
+ f(toc->name, insert_cnt);
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+
+ /*
+ * We don't have a key to store on the page if we're building a
+ * column-store, and we don't store the key on the page in the
+ * case of a row-store duplicate data item. The check from here
+ * on is if "key == NULL" for both cases, that is, there's no
+ * key to store.
+ */
+
+skip_read: /*
+ * We pushed a set of duplicates off-page, and that routine
+ * returned an ending key/data pair to us.
+ */
+
+ /*
+ * Copy the caller's DBTs, we don't want to modify them. But,
+ * copy them carefully, all we want is a pointer and a length.
+ */
+ if (key != NULL) {
+ key_copy.data = key->data;
+ key_copy.size = key->size;
+ key = &key_copy;
+ }
+ /* NOTE(review): assumes cb() always returns a non-NULL data
+ * item -- confirm against the bulk-load callers. */
+ data_copy.data = data->data;
+ data_copy.size = data->size;
+ data = &data_copy;
+
+ /* Build the data item we're going to store on the page. */
+ WT_ERR(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, 0));
+
+ /*
+ * Check for duplicate keys; we don't store the key on the page
+ * in the case of a duplicate.
+ *
+ * !!!
+ * Do a fast check of the old and new sizes -- note checking
+ * lastkey->size is safe -- it's initialized to 0, and we do
+ * not allow zero-length keys.
+ */
+ if (LF_ISSET(WT_DUPLICATES) &&
+ (key == NULL ||
+ (lastkey->size == key->size &&
+ db->btree_compare(db, lastkey, key) == 0))) {
+ /*
+ * The first duplicate in the set is already on the
+ * page, but with an item type set to WT_ITEM_DATA or
+ * WT_ITEM_DATA_OVFL. Correct the type and dup_count.
+ */
+ if (++dup_count == 1) {
+ dup_count = 2;
+ WT_ITEM_SET_TYPE(dup_data,
+ WT_ITEM_TYPE(dup_data) == WT_ITEM_DATA ?
+ WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL);
+ }
+
+ /* Reset the type of the current item to a duplicate. */
+ WT_ITEM_SET_TYPE(&data_item,
+ WT_ITEM_TYPE(&data_item) == WT_ITEM_DATA ?
+ WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL);
+
+ WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED);
+
+ key = NULL;
+ } else {
+ /*
+ * It's a new key, but if duplicates are possible we'll
+ * need a copy of the key for comparison with the next
+ * key. If the key is Huffman encoded or an overflow
+ * object, we can't use the on-page version, we have to
+ * save a copy.
+ */
+ if (LF_ISSET(WT_DUPLICATES) &&
+ (key->size > db->leafitemsize ||
+ idb->huffman_key != NULL)) {
+ WT_ERR(
+ __wt_bulk_dbt_copy(env, key, lastkey_copy));
+ lastkey = lastkey_copy;
+ } else
+ lastkey = NULL;
+
+ dup_count = 0;
+ }
+
+ /* Build the key item we're going to store on the page. */
+ if (key != NULL)
+ WT_ERR(__wt_item_build_key(
+ toc, key, &key_item, &key_ovfl));
+
+ /*
+ * We now have the key/data items to store on the page. If
+ * there is insufficient space on the current page, allocate
+ * a new one.
+ */
+ if ((key == NULL ? 0 : WT_ITEM_SPACE_REQ(key->size)) +
+ WT_ITEM_SPACE_REQ(data->size) > space_avail) {
+ WT_ERR(__wt_bulk_scratch_page(toc,
+ db->leafmin, type, WT_LLEAF, &next, &tmp2));
+ __wt_set_ff_and_sa_from_offset(next,
+ WT_PAGE_BYTE(next),
+ &next_first_free, &next_space_avail);
+ if (type == WT_PAGE_COL_VAR)
+ next->dsk->start_recno = insert_cnt;
+
+ /*
+ * If in the middle of loading a set of duplicates, but
+ * the set hasn't yet reached the boundary where we'd
+ * push them offpage, we can't split them across the two
+ * pages. Move the entire set to the new page. This
+ * can waste up to 25% of the old page, but it would be
+ * difficult and messy to move them and then go back
+ * and fix things up if and when they moved offpage.
+ *
+ * We use a check of dup_count instead of checking the
+ * WT_DUPLICATES flag, since we have to check it anyway.
+ */
+ if (dup_count != 0) {
+ /*
+ * Reset the page entry and record counts -- we
+ * are moving a single key plus the duplicate
+ * set.
+ *
+ * Since dup_count was already incremented to
+ * reflect the data item we're loading now, it
+ * is the right number of elements to move, that
+ * is, move (dup_count - 1) + 1 for the key.
+ */
+ page->dsk->u.entries -= dup_count;
+ page->records -= dup_count - 1;
+ next->dsk->u.entries += dup_count;
+ next->records += dup_count - 1;
+
+ /*
+ * Move the duplicate set and adjust the page
+ * information for "next" -- we don't have to
+ * fix up "page", we're never going to use it
+ * again.
+ */
+ len =
+ (uint32_t)(first_free - (uint8_t *)dup_key);
+ memcpy(next_first_free, dup_key, len);
+ next_first_free += len;
+ next_space_avail -= len;
+
+ /*
+ * We'll never have to move this dup set to
+ * another primary page -- if the dup set
+ * continues to grow, it will be moved
+ * off-page. We still need to know where
+ * the dup set starts, though, for the
+ * possible move off-page: it's the second
+ * entry on the page, where the first entry
+ * is the dup set's key.
+ */
+ dup_key = (WT_ITEM *)WT_PAGE_BYTE(next);
+ dup_data = (WT_ITEM *)((uint8_t *)dup_key +
+ WT_ITEM_SPACE_REQ(WT_ITEM_LEN(dup_key)));
+
+ /*
+ * The "lastkey" value just moved to a new page.
+ * If it's an overflow item, we have a copy; if
+ * it's not, then we need to reset it.
+ */
+ if (lastkey == &lastkey_std) {
+ lastkey_std.data =
+ WT_ITEM_BYTE(dup_key);
+ lastkey_std.size = WT_ITEM_LEN(dup_key);
+ }
+ }
+
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ */
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ __wt_scr_release(&tmp1);
+
+ /*
+ * Discard the last page, and switch to the next page.
+ *
+ * XXX
+ * The obvious speed-up here is to re-initialize page
+ * instead of discarding it and acquiring it again as
+ * as soon as the just-allocated page fills up. I am
+ * not doing that deliberately: eventually we'll use
+ * asynchronous I/O in bulk load, which means the page
+ * won't be reusable until the I/O completes.
+ */
+ page = next;
+ first_free = next_first_free;
+ space_avail = next_space_avail;
+ next = NULL;
+ next_first_free = NULL;
+ next_space_avail = 0;
+ tmp1 = tmp2;
+ tmp2 = NULL;
+ }
+
+ ++page->records;
+
+ /* Copy the key item onto the page. */
+ if (key != NULL) {
+ ++page->dsk->u.entries;
+
+ memcpy(first_free, &key_item, sizeof(key_item));
+ memcpy(first_free +
+ sizeof(key_item), key->data, key->size);
+ space_avail -= WT_ITEM_SPACE_REQ(key->size);
+
+ /*
+ * If processing duplicates we'll need a copy of the key
+ * for comparison with the next key. If the key was an
+ * overflow or Huffman encoded item, we already have a
+ * copy -- otherwise, use the copy we just put on the
+ * page.
+ *
+ * We also save the location for the key of any current
+ * duplicate set in case we have to move the set to a
+ * different page (the case where a duplicate set isn't
+ * large enough to move offpage, but doesn't entirely
+ * fit on this page).
+ */
+ if (LF_ISSET(WT_DUPLICATES)) {
+ if (lastkey == NULL) {
+ lastkey = &lastkey_std;
+ lastkey_std.data =
+ WT_ITEM_BYTE(first_free);
+ lastkey_std.size = key->size;
+ }
+ dup_key = (WT_ITEM *)first_free;
+ }
+ first_free += WT_ITEM_SPACE_REQ(key->size);
+ }
+
+ /* Copy the data item onto the page. */
+ ++page->dsk->u.entries;
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free + sizeof(data_item), data->data, data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+
+ /*
+ * If duplicates: if this isn't a duplicate data item, save
+ * the item location, since it's potentially the first of a
+ * duplicate data set, and we need to know where duplicate
+ * data sets start. Additionally, reset the counter and
+ * space calculation.
+ */
+ if (LF_ISSET(WT_DUPLICATES) && dup_count == 0) {
+ dup_space = data->size;
+ dup_data = (WT_ITEM *)first_free;
+ }
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+
+ /*
+ * If duplicates: check to see if the duplicate set crosses
+ * the (roughly) 25% of the page space boundary. If it does,
+ * move it offpage.
+ */
+ if (LF_ISSET(WT_DUPLICATES) && dup_count != 0) {
+ dup_space += data->size;
+
+ if (dup_space < db->leafmin / db->btree_dup_offpage)
+ continue;
+
+ /*
+ * Move the duplicate set off our page, and read in the
+ * rest of the off-page duplicate set.
+ */
+ WT_ERR(__wt_bulk_dup_offpage(toc, &key, &data, lastkey,
+ dup_data,
+ (uint32_t)(first_free - (uint8_t *)dup_data),
+ dup_count, &off, cb));
+
+ /* Reset the page entry and record counts. */
+ page->dsk->u.entries -= (dup_count - 1);
+ page->records -= dup_count;
+ page->records += WT_RECORDS(&off);
+
+ /*
+ * Replace the duplicate set with a WT_OFF structure,
+ * that is, we've replaced dup_count entries with a
+ * single entry.
+ */
+ WT_ITEM_SET(&data_item, WT_ITEM_OFF, sizeof(WT_OFF));
+ p = (uint8_t *)dup_data;
+ memcpy(p, &data_item, sizeof(data_item));
+ memcpy(p + sizeof(data_item), &off, sizeof(WT_OFF));
+ __wt_set_ff_and_sa_from_offset(page,
+ (uint8_t *)p + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)),
+ &first_free, &space_avail);
+
+ /* Reset local counters. */
+ dup_count = dup_space = 0;
+
+ goto skip_read;
+ }
+ }
+
+ /* A ret of 1 just means we've reached the end of the input. */
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ /* Promote a key from any partially-filled page and write it. */
+ if (page->dsk->u.entries != 0) {
+ WT_ERR(__wt_bulk_promote(
+ toc, page, page->records, &stack, 0, NULL));
+ WT_ERR(__wt_page_write(toc, page));
+ }
+
+ /* Wrap up reporting. */
+ if (f != NULL)
+ f(toc->name, insert_cnt);
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (lastkey_copy != NULL)
+ __wt_scr_release(&lastkey_copy);
+ if (tmp1 != NULL)
+ __wt_scr_release(&tmp1);
+ if (tmp2 != NULL)
+ __wt_scr_release(&tmp2);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_dup_offpage --
+ * Move the last set of duplicates on the page to a page of their own,
+ * then load the rest of the duplicate set.
+ *
+ * Returns 0 on success, 1 when cb() signalled end-of-input, or an
+ * error code.  dup_len is the byte length of the on-page duplicate
+ * set being moved.
+ */
+static int
+__wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey,
+ WT_ITEM *dup_data, uint32_t dup_len, uint32_t dup_count, WT_OFF *off,
+ int (*cb)(DB *, DBT **, DBT **))
+{
+ DB *db;
+ DBT *key, *data, *tmp;
+ IDB *idb;
+ WT_ITEM data_item;
+ WT_OVFL data_ovfl;
+ WT_PAGE *page;
+ WT_STACK stack;
+ uint32_t root_addr, space_avail;
+ uint8_t *first_free;
+ int ret, success_return;
+
+ db = toc->db;
+ idb = db->idb;
+ success_return = 0;
+
+ /*
+ * This routine is the same as the bulk load routine, except it loads
+ * only data items into off-page duplicate trees. It's passed a lot
+ * of state from the bulk load routine, and updates that state as a
+ * side-effect.
+ *
+ * In summary, the bulk load routine stops loading a primary btree leaf
+ * page, calls us to load a set of duplicate data items into a separate
+ * btree, and then continues on with its primary leaf page when we
+ * return. The arguments are complex enough that it's worth describing
+ * them:
+ *
+ * keyp/datap --
+ * The key and data pairs the application is filling in -- we
+ * get them passed to us because we get additional key/data
+ * pairs returned to us, and the last one we get is likely to
+ * be consumed by our caller.
+ * lastkey --
+ * The last key pushed onto the caller's page -- we use this to
+ * compare against future keys we read.
+ * dup_data --
+ * On-page reference to the first duplicate data item in the set.
+ * dup_count --
+ * Count of duplicates in the set.
+ * off --
+ * Callers WT_OFF structure, which we have to fill in.
+ * cb --
+ * User's callback function.
+ */
+
+ WT_CLEAR(data_item);
+ WT_CLEAR(stack);
+ ret = 0;
+
+ /*
+ * Get a scratch buffer and make it look like our work page.
+ *
+ * NOTE(review): tmp is not initialized to NULL, so if this first
+ * call fails the err: path tests an uninitialized pointer --
+ * confirm __wt_bulk_scratch_page clears *tmp on failure.
+ */
+ WT_ERR(__wt_bulk_scratch_page(toc,
+ db->leafmin, WT_PAGE_DUP_LEAF, WT_LLEAF, &page, &tmp));
+ __wt_set_ff_and_sa_from_offset(
+ page, WT_PAGE_BYTE(page), &first_free, &space_avail);
+
+ /* Move the duplicates onto the newly allocated page. */
+ page->records = dup_count;
+ page->dsk->u.entries = dup_count;
+ memcpy(first_free, dup_data, (size_t)dup_len);
+ first_free += dup_len;
+ space_avail -= dup_len;
+
+ /*
+ * Unless we have enough duplicates to split this page, it will be the
+ * "root" of the offpage duplicates.
+ */
+ root_addr = page->addr;
+
+ /* Read in new duplicate records until the key changes. */
+ while ((ret = cb(db, &key, &data)) == 0) {
+ if (key->size == 0) {
+ __wt_api_db_errx(
+ db, "zero-length keys are not supported");
+ /*
+ * NOTE(review): this direct return bypasses the err:
+ * cleanup -- the stack and the scratch buffer are not
+ * released on this path.
+ */
+ return (WT_ERROR);
+ }
+ WT_STAT_INCR(idb->stats, ITEMS_INSERTED);
+ WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED);
+
+ /* Loading duplicates, so a key change means we're done. */
+ if (lastkey->size != key->size ||
+ db->btree_compare_dup(db, lastkey, key) != 0) {
+ *keyp = key;
+ *datap = data;
+ break;
+ }
+
+ /* Build the data item we're going to store on the page. */
+ WT_ERR(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, WT_IS_DUP));
+
+ /*
+ * If there's insufficient space available, allocate a new
+ * page.
+ */
+ if (WT_ITEM_SPACE_REQ(data->size) > space_avail) {
+ /*
+ * We've finished with the page: promote its first key
+ * to its parent and discard it, then switch to the new
+ * page.
+ *
+ * If we promoted a key, we might have split, and so
+ * there may be a new offpage duplicates root page.
+ *
+ * NOTE(review): WT_RET here (vs. WT_ERR on the next
+ * line) also skips the err: cleanup -- confirm this
+ * inconsistency is intentional.
+ */
+ WT_RET(__wt_bulk_promote(toc,
+ page, page->records, &stack, 0, &root_addr));
+ WT_ERR(__wt_page_write(toc, page));
+ page->records = 0;
+ page->dsk->u.entries = 0;
+ __wt_set_ff_and_sa_from_offset(page,
+ WT_PAGE_BYTE(page), &first_free, &space_avail);
+ }
+
+ ++dup_count; /* Total duplicate count */
+ ++page->records; /* On-page key/data count */
+ ++page->dsk->u.entries; /* On-page entry count */
+
+ /* Copy the data item onto the page. */
+ WT_ITEM_SET_LEN(&data_item, data->size);
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free + sizeof(data_item), data->data, data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+ }
+
+ /*
+ * Ret values of 1 and 0 are both "OK", the ret value of 1 means we
+ * reached the end of the bulk input. Save the successful return
+ * for our final return value.
+ */
+ if (ret != 0 && ret != 1)
+ goto err;
+ success_return = ret;
+
+ /* Promote a key from the partially-filled page and write it. */
+ WT_ERR(
+ __wt_bulk_promote(toc, page, page->records, &stack, 0, &root_addr));
+ WT_ERR(__wt_page_write(toc, page));
+
+ /* Fill in the caller's WT_OFF structure. */
+ WT_RECORDS(off) = dup_count;
+ off->addr = root_addr;
+ off->size = db->intlmin;
+
+err: WT_TRET(__wt_bulk_stack_put(toc, &stack));
+ if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret == 0 ? success_return : ret);
+}
+
+/*
+ * __wt_bulk_promote --
+ * Promote the first entry on a page to its parent.
+ */
+static int
+__wt_bulk_promote(WT_TOC *toc, WT_PAGE *page, uint64_t incr,
+ WT_STACK *stack, u_int level, uint32_t *dup_root_addrp)
+{
+ DB *db;
+ DBT *key, key_build, *next_tmp;
+ ENV *env;
+ WT_ITEM *key_item, item;
+ WT_OFF off;
+ WT_OVFL tmp_ovfl;
+ WT_PAGE *next, *parent;
+ WT_PAGE_DISK *dsk;
+ WT_STACK_ELEM *elem;
+ uint32_t next_space_avail;
+ uint8_t *next_first_free;
+ u_int type;
+ int need_promotion, ret;
+ void *parent_data;
+
+ db = toc->db;
+ env = toc->env;
+ dsk = page->dsk;
+ WT_CLEAR(item);
+ next_tmp = NULL;
+ next = parent = NULL;
+ ret = 0;
+
+ /*
+  * "incr" is the record count this promotion adds; it is pushed into
+  * every parent page's count below.  "dup_root_addrp" is written only
+  * when a new off-page duplicates root (WT_PAGE_DUP_INT) is created.
+  */
+
+ /*
+  * If it's a row-store, get a copy of the first item on the page -- it
+  * might be an overflow item, in which case we need to make a copy for
+  * the database. Most versions of Berkeley DB tried to reference count
+  * overflow items if they were promoted to internal pages. That turned
+  * out to be hard to get right, so I'm not doing it again.
+  *
+  * If it's a column-store page, we don't promote a key at all.
+  */
+ switch (dsk->type) {
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ key = &key_build;
+ WT_CLEAR(key_build);
+
+ key_item = (WT_ITEM *)WT_PAGE_BYTE(page);
+ switch (WT_ITEM_TYPE(key_item)) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_DATA_DUP:
+ key->data = WT_ITEM_BYTE(key_item);
+ key->size = WT_ITEM_LEN(key_item);
+ switch (dsk->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ITEM_SET(&item, WT_ITEM_KEY, key->size);
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ WT_ITEM_SET(&item, WT_ITEM_KEY_DUP, key->size);
+ break;
+ default: /* Not possible */
+ break;
+ }
+ break;
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ /*
+  * Assume overflow keys remain overflow keys when they
+  * are promoted; not necessarily true if internal nodes
+  * are larger than leaf nodes), but that's unlikely.
+  */
+ WT_CLEAR(tmp_ovfl);
+ WT_RET(__wt_bulk_ovfl_copy(toc,
+ WT_ITEM_BYTE_OVFL(key_item), &tmp_ovfl));
+ key->data = &tmp_ovfl;
+ key->size = sizeof(tmp_ovfl);
+ switch (dsk->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ITEM_SET(&item,
+ WT_ITEM_KEY_OVFL, sizeof(WT_OVFL));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ WT_ITEM_SET(&item,
+ WT_ITEM_KEY_DUP_OVFL, sizeof(WT_OVFL));
+ break;
+ default: /* Not possible */
+ break;
+ }
+ break;
+ /* NOTE(review): WT_ILLEGAL_FORMAT presumably expands to the
+  * switch's default: case and returns an error -- confirm. */
+ WT_ILLEGAL_FORMAT(db);
+ }
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ key = NULL;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+  * There are two paths into this code based on whether the page already
+  * has a parent.
+  *
+  * If we have a page with no parent page, create the parent page. In
+  * this path, there's not much to do -- allocate a parent page, copy
+  * reference information from the page to the parent, and we're done.
+  * This is a modified root-split: we're putting a single key on an
+  * internal page, which is illegal, but we know another page on this
+  * page's level will be created, and it will be promoted to the parent
+  * at some point. This is case #1.
+  *
+  * The second path into this code is if we have a page and its parent,
+  * but the page's reference information doesn't fit on the parent and
+  * we have to split the parent. This path has two different cases,
+  * based on whether the page's parent itself has a parent.
+  *
+  * Here's a diagram of case #2, where the parent also has a parent:
+  *
+  * P2 -> P1 -> L (case #2)
+  *
+  * The promoted key from leaf L won't fit onto P1, and so we split P1:
+  *
+  * P2 -> P1
+  * -> P3 -> L
+  *
+  * In case #2, allocate P3 and copy reference information from the leaf
+  * page to it, then recursively call the promote code to promote the
+  * first entry from P3 to P2.
+  *
+  * Here's a diagram of case #3, where the parent does not have a parent,
+  * in other words, a root split:
+  *
+  * P1 -> L (case #3)
+  *
+  * The promoted key from leaf L won't fit onto P1, and so we split P1:
+  *
+  * P1 ->
+  * P2 -> L
+  *
+  * In case #3, we allocate P2, copy reference information from the page
+  * to it, and then recursively call the promote code twice: first to
+  * promote the first entry from P1 to a new page, and again to promote
+  * the first entry from P2 to a new page, creating a new root level of
+  * the tree:
+  *
+  * P3 -> P1
+  * -> P2 -> L
+  */
+ /*
+  * To simplify the rest of the code, check to see if there's room for
+  * another entry in our stack structure. Allocate the stack in groups
+  * of 20, which is probably big enough for any tree we'll ever see in
+  * the field, we'll never test the realloc code unless we work at it.
+  */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_STACK_ALLOC_INCR 2
+#else
+#define WT_STACK_ALLOC_INCR 20
+#endif
+ /*
+  * Grow when "level" references the last slot so there is always a
+  * trailing all-zero element; stack walkers (see __wt_bulk_stack_put
+  * and the record-count loop below) stop at page == NULL.
+  */
+ if (stack->size == 0 || level == stack->size - 1) {
+ uint32_t bytes_allocated = stack->size * sizeof(WT_STACK_ELEM);
+ WT_RET(__wt_realloc(env, &bytes_allocated,
+ (stack->size + WT_STACK_ALLOC_INCR) * sizeof(WT_STACK_ELEM),
+ &stack->elem));
+ stack->size += WT_STACK_ALLOC_INCR;
+ /*
+  * Note, the stack structure may be entirely uninitialized here,
+  * that is, everything set to 0 bytes. That's OK: the level of
+  * the stack starts out at 0, that is, the 0th element of the
+  * stack is the 1st level of internal/parent pages in the tree.
+  */
+ }
+
+ elem = &stack->elem[level];
+ parent = elem->page;
+ if (parent == NULL) {
+split: switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ type = WT_PAGE_COL_INT;
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ type = WT_PAGE_DUP_INT;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ type = WT_PAGE_ROW_INT;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, db->intlmin, type, dsk->level + 1, &next, &next_tmp));
+ __wt_set_ff_and_sa_from_offset(next,
+ WT_PAGE_BYTE(next), &next_first_free, &next_space_avail);
+
+ /*
+  * Column stores set the starting record number to the starting
+  * record number of the promoted leaf -- the new leaf is always
+  * the first record in the new parent's page. Ignore the type
+  * of the database, it's simpler ot just promote 0 up the tree
+  * in row store databases.
+  */
+ next->dsk->start_recno = page->dsk->start_recno;
+
+ /*
+  * If we don't have a parent page, it's case #1 -- allocate the
+  * parent page immediately.
+  */
+ if (parent == NULL) {
+ /*
+  * Case #1 -- there's no parent, it's a root split. No
+  * additional work in the main tree. In an off-page
+  * duplicates tree, return the new root of the off-page
+  * tree.
+  */
+ if (type == WT_PAGE_DUP_INT)
+ *dup_root_addrp = next->addr;
+ need_promotion = 0;
+ } else {
+ /*
+  * Case #2 and #3.
+  *
+  * Case #3: a root split, so we have to promote a key
+  * from both of the parent pages: promote the key from
+  * the existing parent page.
+  */
+ if (stack->elem[level + 1].page == NULL)
+ WT_ERR(__wt_bulk_promote(toc, parent,
+ incr, stack, level + 1, dup_root_addrp));
+ need_promotion = 1;
+
+ /* Write the last parent page, we have a new one. */
+ WT_ERR(__wt_page_write(toc, parent));
+ __wt_scr_release(&stack->elem[level].tmp);
+ }
+
+ /* There's a new parent page, reset the stack. */
+ elem = &stack->elem[level];
+ elem->page = parent = next;
+ elem->first_free = next_first_free;
+ elem->space_avail = next_space_avail;
+ elem->tmp = next_tmp;
+ next = NULL;
+ next_first_free = NULL;
+ next_space_avail = 0;
+ next_tmp = NULL;
+ } else
+ need_promotion = 0;
+
+ /*
+  * See if the promoted data will fit (if they don't, we have to split).
+  * We don't need to check for overflow keys: if the key was an overflow,
+  * we already created a smaller, on-page version of it.
+  *
+  * If there's room, copy the promoted data onto the parent's page.
+  */
+ switch (parent->dsk->type) {
+ case WT_PAGE_COL_INT:
+ if (elem->space_avail < sizeof(WT_OFF))
+ goto split;
+
+ /* Create the WT_OFF reference. */
+ WT_RECORDS(&off) = page->records;
+ off.addr = page->addr;
+ off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
+
+ /* Store the data item. */
+ ++parent->dsk->u.entries;
+ parent_data = elem->first_free;
+ memcpy(elem->first_free, &off, sizeof(off));
+ elem->first_free += sizeof(WT_OFF);
+ elem->space_avail -= sizeof(WT_OFF);
+
+ /* Track the last entry on the page for record count updates. */
+ stack->elem[level].data = parent_data;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_DUP_INT:
+ if (elem->space_avail <
+ WT_ITEM_SPACE_REQ(sizeof(WT_OFF)) +
+ WT_ITEM_SPACE_REQ(key->size))
+ goto split;
+
+ /* Store the key. */
+ ++parent->dsk->u.entries;
+ memcpy(elem->first_free, &item, sizeof(item));
+ memcpy(elem->first_free + sizeof(item), key->data, key->size);
+ elem->first_free += WT_ITEM_SPACE_REQ(key->size);
+ elem->space_avail -= WT_ITEM_SPACE_REQ(key->size);
+
+ /* Create the WT_ITEM(WT_OFF) reference. */
+ WT_ITEM_SET(&item, WT_ITEM_OFF, sizeof(WT_OFF));
+ WT_RECORDS(&off) = page->records;
+ off.addr = page->addr;
+ off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin;
+
+ /* Store the data item. */
+ ++parent->dsk->u.entries;
+ parent_data = elem->first_free;
+ memcpy(elem->first_free, &item, sizeof(item));
+ memcpy(elem->first_free + sizeof(item), &off, sizeof(off));
+ elem->first_free += WT_ITEM_SPACE_REQ(sizeof(WT_OFF));
+ elem->space_avail -= WT_ITEM_SPACE_REQ(sizeof(WT_OFF));
+
+ /* Track the last entry on the page for record count updates. */
+ stack->elem[level].data = parent_data;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ parent->records += page->records;
+
+ /*
+  * The promotion for case #2 and the second part of case #3 -- promote
+  * the key from the newly allocated internal page to its parent.
+  *
+  * NOTE(review): WT_RET (not WT_ERR) is safe here: on this path the
+  * split code above already cleared next_tmp, so the err label would
+  * have nothing to release anyway.
+  */
+ if (need_promotion)
+ WT_RET(__wt_bulk_promote(
+ toc, parent, incr, stack, level + 1, dup_root_addrp));
+ else {
+ /*
+  * We've finished promoting the new page's key into the tree.
+  * What remains is to push the new record counts all the way
+  * to the root. We've already corrected our current "parent"
+  * page, so proceed from there to the root.
+  */
+ for (elem =
+ &stack->elem[level + 1]; elem->page != NULL; ++elem) {
+ switch (elem->page->dsk->type) {
+ case WT_PAGE_COL_INT:
+ WT_RECORDS((WT_OFF *)elem->data) += incr;
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_DUP_INT:
+ WT_RECORDS(
+ (WT_OFF *)WT_ITEM_BYTE(elem->data)) += incr;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ elem->page->records += incr;
+ }
+ }
+
+err: if (next_tmp != NULL)
+ __wt_scr_release(&next_tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_item_build_key --
+ * Process an inserted key item and return an WT_ITEM structure and byte
+ * string to be stored on the page.
+ */
+static int
+__wt_item_build_key(WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl)
+{
+ DB *db;
+ IDB *idb;
+ WT_STATS *stats;
+
+ db = toc->db;
+ idb = db->idb;
+ stats = idb->stats;
+
+ /*
+  * The caller's DBT is only ever re-pointed at other memory, never
+  * grown in place.  Huffman-encoded output lands in the WT_TOC key
+  * return area, which is free for our use during bulk insert and
+  * reconciliation -- we aren't returning key/data pairs there.
+  */
+
+ /* Huffman-compress the key if the database is configured for it. */
+ if (idb->huffman_key != NULL) {
+ WT_RET(__wt_huffman_encode(
+ idb->huffman_key, dbt->data, dbt->size,
+ &toc->key.data, &toc->key.mem_size, &toc->key.size));
+ if (toc->key.size > dbt->size)
+ WT_STAT_INCRV(stats,
+ HUFFMAN_KEY, toc->key.size - dbt->size);
+ dbt->data = toc->key.data;
+ dbt->size = toc->key.size;
+ }
+
+ /* Keys that fit on a leaf page are stored as-is. */
+ if (dbt->size <= db->leafitemsize) {
+ WT_ITEM_SET(item, WT_ITEM_KEY, dbt->size);
+ return (0);
+ }
+
+ /* Too-large keys move to an overflow page; re-point the DBT at it. */
+ WT_STAT_INCR(stats, OVERFLOW_KEY);
+ WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl));
+ dbt->data = ovfl;
+ dbt->size = sizeof(*ovfl);
+ WT_ITEM_SET(item, WT_ITEM_KEY_OVFL, dbt->size);
+ return (0);
+}
+
+/*
+ * __wt_item_build_data --
+ * Process an inserted data item and return an WT_ITEM structure and byte
+ * string to be stored on the page.
+ */
+int
+__wt_item_build_data(
+ WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl, u_int flags)
+{
+ DB *db;
+ IDB *idb;
+ WT_STATS *stats;
+
+ WT_ENV_FCHK(toc->env,
+ "__wt_item_build_data", flags, WT_APIMASK_BT_BUILD_DATA_ITEM);
+
+ db = toc->db;
+ idb = db->idb;
+ stats = idb->stats;
+
+ /*
+  * The caller's DBT is only ever re-pointed at other memory, never
+  * grown in place.  Huffman-encoded output lands in the WT_TOC data
+  * return area, which is free for our use during bulk insert and
+  * reconciliation -- we aren't returning key/data pairs there.
+  */
+ WT_CLEAR(*item);
+ if (LF_ISSET(WT_IS_DUP))
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_DUP);
+ else
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA);
+
+ /*
+  * A zero-length item is a common value (a deleted column-store
+  * variable-length item) -- it needs no further work.
+  */
+ if (dbt->size == 0) {
+ WT_ITEM_SET_LEN(item, 0);
+ return (0);
+ }
+
+ /* Huffman-compress the data if the database is configured for it. */
+ if (idb->huffman_data != NULL) {
+ WT_RET(__wt_huffman_encode(
+ idb->huffman_data, dbt->data, dbt->size,
+ &toc->data.data, &toc->data.mem_size, &toc->data.size));
+ if (toc->data.size > dbt->size)
+ WT_STAT_INCRV(stats,
+ HUFFMAN_DATA, toc->data.size - dbt->size);
+ dbt->data = toc->data.data;
+ dbt->size = toc->data.size;
+ }
+
+ /* Too-large items move to an overflow page; re-point the DBT at it. */
+ if (dbt->size > db->leafitemsize) {
+ WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl));
+ dbt->data = ovfl;
+ dbt->size = sizeof(*ovfl);
+ if (LF_ISSET(WT_IS_DUP))
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_DUP_OVFL);
+ else
+ WT_ITEM_SET_TYPE(item, WT_ITEM_DATA_OVFL);
+ WT_STAT_INCR(stats, OVERFLOW_DATA);
+ }
+
+ WT_ITEM_SET_LEN(item, dbt->size);
+ return (0);
+}
+
+/*
+ * __wt_bulk_ovfl_copy --
+ * Copy bulk-loaded overflow items in the database, returning the WT_OVFL
+ * structure, filled in.
+ */
+static int
+__wt_bulk_ovfl_copy(WT_TOC *toc, WT_OVFL *from, WT_OVFL *to)
+{
+ DB *db;
+ DBT *scratch;
+ WT_PAGE *page;
+ uint32_t bytes;
+ int ret;
+
+ db = toc->db;
+ scratch = NULL;
+
+ /* Build a scratch buffer shaped like an overflow page. */
+ bytes = WT_ALIGN(sizeof(WT_PAGE_DISK) + from->size, db->allocsize);
+ WT_RET(__wt_bulk_scratch_page(
+ toc, bytes, WT_PAGE_OVFL, WT_LLEAF, &page, &scratch));
+ page->dsk->u.datalen = from->size;
+
+ /* Hand the caller the new address; the size is unchanged. */
+ to->addr = page->addr;
+ to->size = from->size;
+
+ /* Copy the item: read the old page, write it to its new location. */
+ ret = __wt_page_disk_read(toc, page->dsk, from->addr, from->size);
+ if (ret == 0)
+ ret = __wt_page_disk_write(toc, page->dsk, to->addr, from->size);
+
+ __wt_scr_release(&scratch);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_ovfl_write --
+ * Store bulk-loaded overflow items in the database, returning the page
+ * addr.
+ */
+static int
+__wt_bulk_ovfl_write(WT_TOC *toc, DBT *dbt, WT_OVFL *to)
+{
+ DB *db;
+ DBT *scratch;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ uint32_t bytes;
+ int ret;
+
+ db = toc->db;
+ scratch = NULL;
+
+ /* Build a scratch buffer shaped like an overflow page. */
+ bytes = WT_ALIGN(sizeof(WT_PAGE_DISK) + dbt->size, db->allocsize);
+ WT_ERR(__wt_bulk_scratch_page(
+ toc, bytes, WT_PAGE_OVFL, WT_LLEAF, &page, &scratch));
+
+ /* Hand the caller the page's address and the item's size. */
+ to->addr = page->addr;
+ to->size = dbt->size;
+
+ /* Set the on-disk length, copy the item into place, write it out. */
+ dsk = page->dsk;
+ dsk->u.datalen = dbt->size;
+ memcpy((uint8_t *)dsk + sizeof(WT_PAGE_DISK), dbt->data, dbt->size);
+
+ ret = __wt_page_write(toc, page);
+
+err: if (scratch != NULL)
+ __wt_scr_release(&scratch);
+
+ return (ret);
+}
+
+/*
+ * __wt_bulk_scratch_page --
+ * Allocate a scratch buffer and make it look like a database page.
+ */
+static int
+__wt_bulk_scratch_page(WT_TOC *toc, uint32_t page_size,
+ uint32_t page_type, uint32_t page_level, WT_PAGE **page_ret, DBT **tmp_ret)
+{
+ DBT *tmp;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ uint32_t size;
+ int ret;
+
+ /*
+  * Initialize tmp before the first WT_ERR: if __wt_scr_alloc fails
+  * without setting it, the err label below would otherwise test (and
+  * possibly release) an uninitialized pointer.
+  */
+ tmp = NULL;
+ ret = 0;
+
+ /*
+  * Allocate a scratch buffer and make sure it's big enough to hold a
+  * WT_PAGE structure plus the page itself, and clear the memory so
+  * it's never random bytes.
+  */
+ size = page_size + sizeof(WT_PAGE);
+ WT_ERR(__wt_scr_alloc(toc, size, &tmp));
+ memset(tmp->data, 0, size);
+
+ /*
+  * Set up the page and allocate a file address.
+  *
+  * We don't run the leaf pages through the cache -- that means passing
+  * a lot of messages we don't want to bother with. We're the only user
+  * of the file, which means we can grab file space whenever we want.
+  */
+ page = tmp->data;
+ page->dsk = dsk =
+ (WT_PAGE_DISK *)((uint8_t *)tmp->data + sizeof(WT_PAGE));
+ WT_ERR(__wt_file_alloc(toc, &page->addr, page_size));
+ page->size = page_size;
+ dsk->type = (uint8_t)page_type;
+ dsk->level = (uint8_t)page_level;
+
+ *page_ret = page;
+ *tmp_ret = tmp;
+ return (0);
+
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_bulk_stack_put --
+ * Push out the tree's stack of pages.
+ */
+static int
+__wt_bulk_stack_put(WT_TOC *toc, WT_STACK *stack)
+{
+ ENV *env;
+ IDB *idb;
+ WT_STACK_ELEM *elem;
+ int ret;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ret = 0;
+
+ /* Write out every page on the stack; errors accumulate in ret. */
+ for (elem = stack->elem; elem->page != NULL; ++elem) {
+ WT_TRET(__wt_page_write(toc, elem->page));
+
+ /*
+  * If we've reached the last element in the stack, it's the
+  * root page of the tree. Update the in-memory root address
+  * and the descriptor record.
+  */
+ if ((elem + 1)->page == NULL) {
+ idb->root_off.addr = elem->page->addr;
+ idb->root_off.size = elem->page->size;
+ WT_RECORDS(&idb->root_off) = elem->page->records;
+ WT_TRET(__wt_desc_write(toc));
+ }
+
+ __wt_scr_release(&elem->tmp);
+ }
+ __wt_free(env, stack->elem, stack->size * sizeof(WT_STACK_ELEM));
+
+ /*
+  * Return the accumulated error status: the previous code returned 0
+  * unconditionally, silently discarding any page-write or descriptor
+  * write failure collected by WT_TRET above.
+  */
+ return (ret);
+}
+
+/*
+ * __wt_bulk_dbt_copy --
+ * Get a copy of DBT referenced object.
+ */
+static int
+__wt_bulk_dbt_copy(ENV *env, DBT *orig, DBT *copy)
+{
+ /* Grow the target buffer if it's too small, then copy the bytes. */
+ if (orig->size > copy->mem_size)
+ WT_RET(__wt_realloc(
+ env, &copy->mem_size, orig->size, &copy->data));
+ memcpy(copy->data, orig->data, orig->size);
+ copy->size = orig->size;
+
+ return (0);
+}
diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c
new file mode 100644
index 00000000000..43d4f7e6596
--- /dev/null
+++ b/src/btree/bt_cache.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache.
+ */
+int
+__wt_cache_create(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* The cache is allocated zeroed and hangs off the IENV. */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache));
+ cache = ienv->cache;
+
+ /*
+  * Allocate the cache's mutexes.  NOTE(review): the third argument to
+  * __wt_mtx_alloc differs (1, 1, 0) -- presumably "initially locked";
+  * confirm against the mutex implementation.
+  */
+ WT_ERR(
+ __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict));
+ WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read));
+ WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile));
+
+ /* Allocate statistics and record the configured maximum cache size. */
+ WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats));
+
+ WT_STAT_SET(
+ cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE);
+
+ return (0);
+
+ /* On error, tear down whatever was partially created. */
+err: (void)__wt_cache_destroy(env);
+ return (ret);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ uint64_t in, out;
+
+ /*
+  * Other threads update these 64-bit counters, and we may be running
+  * on a 32-bit machine, so the reads can be stale or inconsistent.
+  * Both values only ever grow; clamp the difference at zero rather
+  * than returning a huge bogus count.
+  */
+ in = cache->stat_pages_in;
+ out = cache->stat_pages_out;
+ return (in > out ? in - out : 0);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ uint64_t in, out;
+
+ /*
+  * Other threads update these 64-bit counters, and we may be running
+  * on a 32-bit machine, so the reads can be stale or inconsistent.
+  * Both values only ever grow; clamp the difference at zero rather
+  * than returning a huge bogus count.
+  */
+ in = cache->stat_bytes_in;
+ out = cache->stat_bytes_out;
+ return (in > out ? in - out : 0);
+}
+
+/*
+ * __wt_cache_stats --
+ * Update the cache statistics for return to the application.
+ */
+void
+__wt_cache_stats(ENV *env)
+{
+ WT_CACHE *cache;
+ WT_STATS *stats;
+
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ /* Refresh the derived in-use statistics from the raw counters. */
+ WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache));
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache.
+ */
+int
+__wt_cache_destroy(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ /* Nothing to do if the cache was never created. */
+ if (cache == NULL)
+ return (0);
+
+ /* Discard mutexes. */
+ if (cache->mtx_evict != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_evict);
+ if (cache->mtx_read != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_read);
+ if (cache->mtx_reconcile != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_reconcile);
+
+ /* Discard allocated memory, and clear. */
+ __wt_free(env, cache->stats, 0);
+ __wt_free(env, ienv->cache, sizeof(WT_CACHE));
+
+ /* NOTE(review): ret is never set non-zero here; kept for symmetry. */
+ return (ret);
+}
diff --git a/src/btree/bt_close.c b/src/btree/bt_close.c
new file mode 100644
index 00000000000..6bf58e98d7e
--- /dev/null
+++ b/src/btree/bt_close.c
@@ -0,0 +1,86 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_bt_close_page(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_bt_close --
+ * Close the tree.
+ */
+int
+__wt_bt_close(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ WT_CACHE *cache;
+ int ret;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ cache = env->ienv->cache;
+ ret = 0;
+
+ /*
+  * XXX
+  * We assume two threads can't call the close method at the same time,
+  * nor can close be called while other threads are in the tree -- the
+  * higher level API has to ensure this.
+  */
+
+ /* If the file was never opened, there's nothing to close. */
+ if (WT_UNOPENED_DATABASE(idb))
+ return (0);
+
+ /*
+  * The tree walk is depth first, that is, the worker function is not
+  * called on internal pages until all children have been visited; so,
+  * we don't have to worry about a page being dirtied after the visit.
+  *
+  * Lock out the cache evictions thread, though, we don't want it trying
+  * to evict pages we're flushing.
+  */
+ __wt_lock(env, cache->mtx_reconcile);
+ /* WT_TRET accumulates any walk failure but lets cleanup continue. */
+ WT_TRET(__wt_tree_walk(toc, NULL,
+ WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_close_page, NULL));
+ __wt_evict_db_clear(toc);
+ __wt_unlock(env, cache->mtx_reconcile);
+
+ /* There's no root page any more, kill the pointer to catch mistakes. */
+ idb->root_page.page = NULL;
+
+ /* Close the underlying file handle. */
+ WT_TRET(__wt_close(env, idb->fh));
+ idb->fh = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_bt_close_page --
+ * Close a page.
+ */
+static int
+__wt_bt_close_page(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+ WT_CC_QUIET(arg, NULL);
+
+ /* Flush any pending modifications before the page goes away. */
+ if (WT_PAGE_IS_MODIFIED(page))
+ WT_RET(__wt_page_reconcile(toc, page));
+
+ /*
+  * Discarding is safe: the walk is depth first, so all of this page's
+  * children were visited (and discarded) before we got here, and no
+  * one will read the page after we drop it.
+  */
+ __wt_page_discard(toc, page);
+
+ return (0);
+}
diff --git a/src/btree/bt_cmp.c b/src/btree/bt_cmp.c
new file mode 100644
index 00000000000..8cfddc0496a
--- /dev/null
+++ b/src/btree/bt_cmp.c
@@ -0,0 +1,74 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_lex_compare --
+ * Lexicographic comparison routine.
+ */
+int
+__wt_bt_lex_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt)
+{
+ uint32_t len;
+ uint8_t *us, *ts;
+
+ /* The DB handle is unused here; it's part of the comparator API. */
+ WT_CC_QUIET(db, NULL);
+
+ /*
+  * Returns < 0, 0 or > 0 as the user's item sorts before, equal to or
+  * after the tree's item: byte-by-byte unsigned comparison, ties
+  * broken by length (the shorter item sorts first).  The "user" and
+  * "tree" names make clear which item the application supplied.
+  */
+ len = user_dbt->size < tree_dbt->size ? user_dbt->size : tree_dbt->size;
+ for (us = user_dbt->data,
+ ts = tree_dbt->data; len > 0; --len, ++us, ++ts)
+ if (*us != *ts)
+ return (*us < *ts ? -1 : 1);
+
+ /* Identical up to the shorter length: compare the lengths. */
+ if (user_dbt->size == tree_dbt->size)
+ return (0);
+ return (user_dbt->size < tree_dbt->size ? -1 : 1);
+}
+
+/*
+ * __wt_bt_int_compare --
+ * Integer comparison routine.
+ */
+int
+__wt_bt_int_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt)
+{
+ uint64_t u, t;
+
+ /*
+  * Each DBT holds the integer's low-order bytes in machine order;
+  * db->btree_compare_int is the number of significant bytes.  Returns
+  * < 0, 0 or > 0 as the user's value compares to the tree's value.
+  */
+ u = t = 0;
+ memcpy(&u, user_dbt->data, (size_t)db->btree_compare_int);
+ memcpy(&t, tree_dbt->data, (size_t)db->btree_compare_int);
+
+ if (u < t)
+ return (-1);
+ return (u > t ? 1 : 0);
+}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
new file mode 100644
index 00000000000..e27607aba6a
--- /dev/null
+++ b/src/btree/bt_debug.c
@@ -0,0 +1,661 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_debug_dsk_col_fix(DB *, WT_PAGE_DISK *, FILE *);
+static void __wt_debug_dsk_col_int(WT_PAGE_DISK *, FILE *);
+static void __wt_debug_dsk_col_rle(DB *, WT_PAGE_DISK *, FILE *);
+static int __wt_debug_dsk_item(WT_TOC *, WT_PAGE_DISK *, FILE *);
+static void __wt_debug_inmem_col_fix(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_inmem_col_int(WT_PAGE *, FILE *);
+static void __wt_debug_inmem_col_rle(WT_TOC *, WT_PAGE *, FILE *);
+static int __wt_debug_inmem_col_var(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_inmem_row_int(WT_PAGE *, FILE *);
+static int __wt_debug_inmem_row_leaf(WT_TOC *, WT_PAGE *, FILE *);
+static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *);
+static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp);
+static void __wt_debug_off(WT_OFF *, const char *, FILE *);
+static void __wt_debug_page_hdr(WT_TOC *, WT_PAGE *, FILE *);
+static void __wt_debug_pair(const char *, void *, uint32_t, FILE *);
+static void __wt_debug_repl(WT_REPL *, FILE *);
+static void __wt_debug_rleexp(WT_RLE_EXPAND *, FILE *);
+static int __wt_debug_set_fp(const char *, FILE **, int *);
+
+static int
+__wt_debug_set_fp(const char *ofile, FILE **fpp, int *close_varp)
+{
+ FILE *fp;
+
+ *close_varp = 0;
+
+ /* If we were giving a stream, use it. */
+ if ((fp = *fpp) != NULL)
+ return (0);
+
+ /* If we were given a file, use it. */
+ if (ofile != NULL) {
+ if ((fp = fopen(ofile, "w")) == NULL)
+ return (WT_ERROR);
+ *fpp = fp;
+ *close_varp = 1;
+ return (0);
+ }
+
+ /* Default to stdout. */
+ *fpp = stdout;
+ return (0);
+}
+
/*
 * __wt_debug_dump --
 *	Dump a database in debugging mode.
 *
 *	ofile/fp select the output target (see __wt_debug_set_fp); the
 *	stream is closed here only if this function opened it.
 */
int
__wt_debug_dump(WT_TOC *toc, char *ofile, FILE *fp)
{
    int do_close, ret;

    WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));

    /*
     * We use the verification code to do debugging dumps because if we're
     * dumping in debugging mode, we want to confirm the page is OK before
     * walking it.
     */
    ret = __wt_verify(toc, NULL, fp);

    if (do_close)
        (void)fclose(fp);

    return (ret);
}
+
+/*
+ * __wt_debug_page --
+ * Dump a page in debugging mode.
+ */
+int
+__wt_debug_page(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp)
+{
+ WT_PAGE_DISK *dsk;
+ DB *db;
+ int do_close, ret;
+
+ db = toc->db;
+ dsk = page->dsk;
+ ret = 0;
+
+ WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));
+
+ __wt_debug_page_hdr(toc, page, fp);
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_INT:
+ fprintf(fp,
+ "\trecords %llu, starting recno %llu, level %lu, "
+ "entries %lu, lsn %lu/%lu\n",
+ (unsigned long long)page->records,
+ (unsigned long long)dsk->start_recno,
+ (u_long)dsk->level, (u_long)dsk->u.entries,
+ (u_long)dsk->lsn[0], (u_long)dsk->lsn[1]);
+ break;
+ case WT_PAGE_OVFL:
+ fprintf(fp, "size %lu\n", (u_long)dsk->u.datalen);
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ret = __wt_debug_dsk_item(toc, dsk, fp);
+ break;
+ case WT_PAGE_COL_FIX:
+ __wt_debug_dsk_col_fix(db, dsk, fp);
+ break;
+ case WT_PAGE_COL_RLE:
+ __wt_debug_dsk_col_rle(db, dsk, fp);
+ break;
+ case WT_PAGE_COL_INT:
+ __wt_debug_dsk_col_int(dsk, fp);
+ break;
+ default:
+ break;
+ }
+
+ fprintf(fp, "}\n");
+
+ if (do_close)
+ (void)fclose(fp);
+
+ return (ret);
+}
+
+/*
+ * __wt_debug_inmem --
+ * Dump the in-memory information for a page.
+ */
+int
+__wt_debug_inmem(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp)
+{
+ DB *db;
+ int do_close;
+
+ db = toc->db;
+
+ WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close));
+
+ __wt_debug_page_hdr(toc, page, fp);
+
+ /* Dump the WT_{ROW,COL}_INDX array. */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_FIX:
+ __wt_debug_inmem_col_fix(toc, page, fp);
+ break;
+ case WT_PAGE_COL_INT:
+ __wt_debug_inmem_col_int(page, fp);
+ break;
+ case WT_PAGE_COL_RLE:
+ __wt_debug_inmem_col_rle(toc, page, fp);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_RET(__wt_debug_inmem_col_var(toc, page, fp));
+ break;
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__wt_debug_inmem_row_leaf(toc, page, fp));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ __wt_debug_inmem_row_int(page, fp);
+ break;
+ case WT_PAGE_OVFL:
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ fprintf(fp, "}\n");
+
+ if (do_close)
+ (void)fclose(fp);
+
+ return (0);
+}
+
/*
 * __wt_debug_inmem_col_fix --
 *	Dump an in-memory WT_PAGE_COL_FIX page.
 *
 *	Each entry is a fixed-length record; deleted records are flagged
 *	in-band via the WT_FIX_DELETE bit.
 */
static void
__wt_debug_inmem_col_fix(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_REPL *repl;
    uint32_t fixed_len, i;

    /* All records on the page share the database's fixed length. */
    fixed_len = toc->db->fixed_len;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp, "\tdata {");
        if (WT_FIX_DELETE_ISSET(cip->data))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(cip->data, fixed_len, fp);
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL)
            __wt_debug_repl(repl, fp);
    }
}
+
/*
 * __wt_debug_inmem_col_int --
 *	Dump an in-memory WT_PAGE_COL_INT page.
 *
 *	Internal column-store entries are WT_OFF references to subtrees.
 */
static void
__wt_debug_inmem_col_int(WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i)
        __wt_debug_off(cip->data, "\t", fp);
}
+
/*
 * __wt_debug_inmem_col_rle --
 *	Dump an in-memory WT_PAGE_COL_RLE page.
 *
 *	Each entry is a repeat-count plus a single fixed-length record;
 *	per-record modifications live in the RLE expansion array.
 */
static void
__wt_debug_inmem_col_rle(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_RLE_EXPAND *exp;
    uint32_t fixed_len, i;

    fixed_len = toc->db->fixed_len;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp,
            "\trepeat %lu {", (u_long)WT_RLE_REPEAT_COUNT(cip->data));
        if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(
                WT_RLE_REPEAT_DATA(cip->data), fixed_len, fp);
        fprintf(fp, "}\n");

        /* Dump any expansion entries overriding individual records. */
        if ((exp = WT_COL_RLEEXP(page, cip)) != NULL)
            __wt_debug_rleexp(exp, fp);
    }
}
+
/*
 * __wt_debug_inmem_col_var --
 *	Dump an in-memory WT_PAGE_COL_VAR page.
 *
 *	Returns non-zero if dumping an item's data fails (e.g., overflow
 *	item processing).
 */
static int
__wt_debug_inmem_col_var(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_COL *cip;
    WT_REPL *repl;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, cip, i) {
        fprintf(fp, "\tdata {");
        WT_RET(__wt_debug_item_data(toc, cip->data, fp));
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL)
            __wt_debug_repl(repl, fp);
    }
    return (0);
}
+
/*
 * __wt_debug_inmem_row_leaf --
 *	Dump an in-memory WT_PAGE_DUP_LEAF or WT_PAGE_ROW_LEAF page.
 *
 *	Keys still in compressed/overflow form are reported as requiring
 *	processing rather than decoded here.
 */
static int
__wt_debug_inmem_row_leaf(WT_TOC *toc, WT_PAGE *page, FILE *fp)
{
    WT_REPL *repl;
    WT_ROW *rip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, rip, i) {
        if (__wt_key_process(rip))
            fprintf(fp, "\tkey: {requires processing}\n");
        else
            __wt_debug_dbt("\tkey", rip, fp);

        fprintf(fp, "\tdata: {");
        WT_RET(__wt_debug_item_data(toc, rip->data, fp));
        fprintf(fp, "}\n");

        /* Dump any replacement chain for this entry. */
        if ((repl = WT_ROW_REPL(page, rip)) != NULL)
            __wt_debug_repl(repl, fp);
    }

    return (0);
}
+
/*
 * __wt_debug_inmem_row_int --
 *	Dump an in-memory WT_PAGE_DUP_INT or WT_PAGE_ROW_INT page.
 *
 *	Each entry is a key plus a WT_OFF reference to a subtree.
 */
static void
__wt_debug_inmem_row_int(WT_PAGE *page, FILE *fp)
{
    WT_ROW *rip;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_INDX_FOREACH(page, rip, i) {
        if (__wt_key_process(rip))
            fprintf(fp, "\tkey: {requires processing}\n");
        else
            __wt_debug_dbt("\tkey", rip, fp);

        __wt_debug_off(rip->data, "\t", fp);
    }
}
+
+/*
+ * __wt_debug_repl --
+ * Dump a replacement array.
+ */
+static void
+__wt_debug_repl(WT_REPL *repl, FILE *fp)
+{
+ if (fp == NULL) /* Default to stderr */
+ fp = stderr;
+
+ for (; repl != NULL; repl = repl->next)
+ if (WT_REPL_DELETED_ISSET(repl))
+ fprintf(fp, "\trepl: {deleted}\n");
+ else
+ __wt_debug_pair(
+ "\trepl", WT_REPL_DATA(repl), repl->size, fp);
+}
+
/*
 * __wt_debug_rleexp --
 *	Dump a column store expansion array.
 *
 *	Only the head of each entry's replacement chain is dumped.
 */
static void
__wt_debug_rleexp(WT_RLE_EXPAND *exp, FILE *fp)
{
    WT_REPL *repl;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    for (; exp != NULL; exp = exp->next) {
        repl = exp->repl;
        if (WT_REPL_DELETED_ISSET(repl))
            fprintf(fp, "\trepl: {deleted}\n");
        else
            __wt_debug_pair(
                "\trepl", WT_REPL_DATA(repl), repl->size, fp);
    }
}
+
/*
 * __wt_debug_dsk_item --
 *	Dump a page of WT_ITEM's.
 *
 *	Stops and returns the error from the first item that fails.
 */
static int
__wt_debug_dsk_item(WT_TOC *toc, WT_PAGE_DISK *dsk, FILE *fp)
{
    WT_ITEM *item;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_ITEM_FOREACH(dsk, item, i)
        WT_RET(__wt_debug_item(toc, item, fp));
    return (0);
}
+
/*
 * __wt_debug_item --
 *	Dump a single WT_ITEM.
 *
 *	Prints the type/length header, per-type extras (overflow address
 *	and size), then the item data.  Deleted and off-page items return
 *	early: there is no data to dump (off-page items print the WT_OFF
 *	reference instead).
 */
static int
__wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    WT_OVFL *ovfl;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    db = toc->db;

    fprintf(fp, "\t%s: len %lu",
        __wt_item_type_string(item), (u_long)WT_ITEM_LEN(item));

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Overflow items: show where the data actually lives. */
        ovfl = WT_ITEM_BYTE_OVFL(item);
        fprintf(fp, ", addr %lu, size %lu",
            (u_long)ovfl->addr, (u_long)ovfl->size);
        break;
    case WT_ITEM_DEL:
        fprintf(fp, "\n");
        return (0);
    case WT_ITEM_OFF:
        __wt_debug_off(WT_ITEM_BYTE_OFF(item), ", ", fp);
        return (0);
    WT_ILLEGAL_FORMAT(db);
    }

    fprintf(fp, "\n\t{");
    WT_RET(__wt_debug_item_data(toc, item, fp));
    fprintf(fp, "}\n");
    return (0);
}
+
/*
 * __wt_debug_dsk_col_int --
 *	Dump a WT_PAGE_COL_INT page: one WT_OFF subtree reference per entry.
 */
static void
__wt_debug_dsk_col_int(WT_PAGE_DISK *dsk, FILE *fp)
{
    WT_OFF *off;
    uint32_t i;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_OFF_FOREACH(dsk, off, i)
        __wt_debug_off(off, "\t", fp);
}
+
/*
 * __wt_debug_dsk_col_fix --
 *	Dump a WT_PAGE_COL_FIX page: fixed-length records with in-band
 *	delete flags.
 */
static void
__wt_debug_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_FIX_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\t{");
        if (WT_FIX_DELETE_ISSET(p))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(p, db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}
+
/*
 * __wt_debug_dsk_col_rle --
 *	Dump a WT_PAGE_COL_RLE page: repeat-counted fixed-length records.
 */
static void
__wt_debug_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    WT_RLE_REPEAT_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\trepeat %lu {",
            (u_long)WT_RLE_REPEAT_COUNT(p));
        if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(p)))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(
                WT_RLE_REPEAT_DATA(p), db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}
+
/*
 * __wt_debug_item_data --
 *	Dump a single item's data in debugging mode.
 *
 *	On-page, uncompressed items are printed directly ("onpage"); items
 *	that are Huffman-compressed or stored off-page are first expanded
 *	into a scratch buffer ("process").  Deleted and off-page items are
 *	printed as fixed placeholder strings.
 */
static int
__wt_debug_item_data(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    DBT *tmp;
    IDB *idb;
    uint32_t size;
    uint8_t *p;
    int ret;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    db = toc->db;
    tmp = NULL;
    idb = db->idb;
    ret = 0;

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
        if (idb->huffman_key != NULL)
            goto process;
        goto onpage;
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        if (idb->huffman_data != NULL)
            goto process;
        /* Uncompressed, on-page: print the bytes in place. */
onpage:    p = WT_ITEM_BYTE(item);
        size = WT_ITEM_LEN(item);
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Compressed or off-page: expand into a scratch buffer. */
process:    WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
        WT_ERR(__wt_item_process(toc, item, tmp));
        p = tmp->data;
        size = tmp->size;
        break;
    case WT_ITEM_DEL:
        p = (uint8_t *)"deleted";
        size = 7;
        break;
    case WT_ITEM_OFF:
        p = (uint8_t *)"offpage";
        size = 7;
        break;
    WT_ILLEGAL_FORMAT_ERR(db, ret);
    }

    __wt_print_byte_string(p, size, fp);

err:    if (tmp != NULL)
        __wt_scr_release(&tmp);
    return (ret);
}
+
/*
 * __wt_debug_off --
 *	Dump a WT_OFF structure (an off-page subtree reference).
 *
 *	The prefix string is printed verbatim before the text.
 */
static void
__wt_debug_off(WT_OFF *off, const char *prefix, FILE *fp)
{
    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    fprintf(fp, "%soffpage: addr %lu, size %lu, records %llu\n",
        prefix, (u_long)off->addr, (u_long)off->size,
        (unsigned long long)WT_RECORDS(off));
}
+
/*
 * __wt_debug_dbt --
 *	Dump a single DBT in debugging mode, with an optional tag.
 */
void
__wt_debug_dbt(const char *tag, void *arg_dbt, FILE *fp)
{
    DBT *dbt;

    if (fp == NULL)                /* Default to stderr */
        fp = stderr;

    /*
     * The argument isn't necessarily a DBT structure, but the first two
     * fields of the argument are always a void *data/uint32_t size pair.
     */
    dbt = arg_dbt;
    __wt_debug_pair(tag, dbt->data, dbt->size, fp);
}
+
+/*
+ * __wt_debug_pair --
+ * Dump a single data/size pair, with an optional tag.
+ */
+static void
+__wt_debug_pair(const char *tag, void *data, uint32_t size, FILE *fp)
+{
+ if (fp == NULL) /* Default to stderr */
+ fp = stderr;
+
+ if (tag != NULL)
+ fprintf(fp, "%s: ", tag);
+ fprintf(fp, "%lu {", (u_long)size);
+ __wt_print_byte_string(data, size, fp);
+ fprintf(fp, "}\n");
+}
+#endif
+
+/*
+ * __wt_debug_page_hdr --
+ * Standard debug page-header output.
+ */
+static void
+__wt_debug_page_hdr(WT_TOC *toc, WT_PAGE *page, FILE *fp)
+{
+ DB *db;
+
+ db = toc->db;
+
+ fprintf(fp,
+ "addr: %lu-%lu {\n\t%s: size %lu\n",
+ (u_long)page->addr,
+ (u_long)page->addr + (WT_OFF_TO_ADDR(db, page->size) - 1),
+ __wt_page_type_string(page->dsk), (u_long)page->size);
+
+}
diff --git a/src/btree/bt_desc.c b/src/btree/bt_desc.c
new file mode 100644
index 00000000000..2fc024d1e8c
--- /dev/null
+++ b/src/btree/bt_desc.c
@@ -0,0 +1,132 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_desc_io(WT_TOC *, void *, int);
+
/*
 * __wt_desc_stat --
 *	Fill in the statistics from the database description.
 *
 *	Reads the on-disk description record from page 0 and copies its
 *	fields into the database's statistics structure.
 */
int
__wt_desc_stat(WT_TOC *toc)
{
    WT_PAGE_DESC desc;
    WT_STATS *stats;

    stats = toc->db->idb->dstats;

    WT_RET(__wt_desc_io(toc, &desc, 1));

    WT_STAT_SET(stats, MAGIC, desc.magic);
    WT_STAT_SET(stats, MAJOR, desc.majorv);
    WT_STAT_SET(stats, MINOR, desc.minorv);
    WT_STAT_SET(stats, INTLMAX, desc.intlmax);
    WT_STAT_SET(stats, INTLMIN, desc.intlmin);
    WT_STAT_SET(stats, LEAFMAX, desc.leafmax);
    WT_STAT_SET(stats, LEAFMIN, desc.leafmin);
    WT_STAT_SET(stats, BASE_RECNO, desc.recno_offset);
    WT_STAT_SET(stats, FIXED_LEN, desc.fixed_len);

    return (0);
}
+
/*
 * __wt_desc_read --
 *	Read the descriptor structure from page 0.
 *
 *	Copies the on-disk configuration (page sizes, root/free references,
 *	fixed record length) into the DB and IDB handles.
 */
int
__wt_desc_read(WT_TOC *toc)
{
    DB *db;
    WT_PAGE_DESC desc;

    db = toc->db;

    WT_RET(__wt_desc_io(toc, &desc, 1));

    db->intlmax = desc.intlmax;        /* Update DB handle */
    db->intlmin = desc.intlmin;
    db->leafmax = desc.leafmax;
    db->leafmin = desc.leafmin;
    db->idb->root_off.addr = desc.root_addr;
    db->idb->root_off.size = desc.root_size;
    WT_RECORDS(&db->idb->root_off) = desc.records;
    db->idb->free_addr = desc.free_addr;
    db->idb->free_size = desc.free_size;
    db->fixed_len = desc.fixed_len;

    /*
     * XXX
     * This is the wrong place to do this -- need to think about how
     * to update open/configuration information in a reasonable way.
     */
    if (db->fixed_len != 0)
        F_SET(db->idb, WT_COLUMN);

    return (0);
}
+
+/*
+ * __wt_desc_write --
+ * Update the description page.
+ */
+int
+__wt_desc_write(WT_TOC *toc)
+{
+ DB *db;
+ IDB *idb;
+ WT_PAGE_DESC desc;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+ ret = 0;
+
+ desc.magic = WT_BTREE_MAGIC;
+ desc.majorv = WT_BTREE_MAJOR_VERSION;
+ desc.minorv = WT_BTREE_MINOR_VERSION;
+ desc.intlmax = db->intlmax;
+ desc.intlmin = db->intlmin;
+ desc.leafmax = db->leafmax;
+ desc.leafmin = db->leafmin;
+ desc.recno_offset = 0;
+ desc.root_addr = idb->root_off.addr;
+ desc.root_size = idb->root_off.size;
+ desc.records = WT_RECORDS(&idb->root_off);
+ desc.free_addr = idb->free_addr;
+ desc.free_size = idb->free_size;
+ desc.fixed_len = (uint8_t)db->fixed_len;
+ desc.flags = 0;
+ if (F_ISSET(idb, WT_RLE))
+ F_SET(&desc, WT_PAGE_DESC_RLE);
+
+ WT_RET(__wt_desc_io(toc, &desc, 0));
+
+ return (ret);
+}
+
/*
 * __wt_desc_io --
 *	Read/write the WT_DESC sector.
 *
 *	NOTE(review): 512 bytes are transferred to/from the caller's
 *	buffer, but every caller passes a stack WT_PAGE_DESC -- this
 *	assumes sizeof(WT_PAGE_DESC) >= 512, otherwise the read overruns
 *	the structure.  TODO: confirm against the WT_PAGE_DESC definition.
 */
static int
__wt_desc_io(WT_TOC *toc, void *p, int is_read)
{
    WT_FH *fh;
    ENV *env;

    fh = toc->db->idb->fh;
    env = toc->env;

    return (is_read ?
        __wt_read(env, fh, (off_t)0, 512, p) :
        __wt_write(env, fh, (off_t)0, 512, p));
}
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
new file mode 100644
index 00000000000..8e189204ce0
--- /dev/null
+++ b/src/btree/bt_discard.c
@@ -0,0 +1,234 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_page_discard_dup(ENV *, WT_PAGE *);
+static void __wt_page_discard_rleexp(ENV *, WT_PAGE *);
+static void __wt_page_discard_repl(ENV *, WT_PAGE *);
+static void __wt_page_discard_repl_list(ENV *, WT_REPL *);
+static inline int __wt_row_key_on_page(WT_PAGE *, WT_ROW *);
+
/*
 * __wt_page_discard --
 *	Free all memory associated with a page: the in-memory index array,
 *	the replacement/expansion arrays, the subtree-reference array, the
 *	disk image, and the WT_PAGE structure itself.
 */
void
__wt_page_discard(WT_TOC *toc, WT_PAGE *page)
{
    ENV *env;
    WT_ROW *rip;
    uint32_t i, type;
    void *last_key;

    env = toc->env;
    type = page->dsk->type;

    /* Never discard a dirty page. */
    WT_ASSERT(env, !WT_PAGE_IS_MODIFIED(page));

    /* Free the in-memory index array. */
    switch (type) {
    case WT_PAGE_DUP_INT:
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        /*
         * For each entry, see if the key was an allocation (that is,
         * if it points somewhere other than the original page), and
         * if so, free the memory.  This test is a superset of the
         * __wt_key_process test, that is, any key requiring processing
         * but not yet processed, must reference on-page information.
         */
        last_key = NULL;
        WT_INDX_FOREACH(page, rip, i) {
            if (__wt_row_key_on_page(page, rip))
                continue;

            /*
             * Only test the first entry for duplicate key/data
             * pairs, the others reference the same memory.  (This
             * test only makes sense for WT_PAGE_ROW_LEAF pages,
             * but there is no cost in doing the test for duplicate
             * leaf pages as well.)
             */
            if (rip->key == last_key)
                continue;
            last_key = rip->key;
            __wt_free(env, rip->key, rip->size);
        }
        __wt_free(env, page->u.irow, page->indx_count * sizeof(WT_ROW));
        break;
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_COL_RLE:
    case WT_PAGE_COL_VAR:
        __wt_free(env, page->u.icol, page->indx_count * sizeof(WT_COL));
        break;
    default:
        break;
    }

    /* Free the modified/deletion replacements array. */
    switch (type) {
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_LEAF:
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
        if (page->u2.repl != NULL)
            __wt_page_discard_repl(env, page);
        break;
    default:
        break;
    }

    /* Free the run-length encoded column store expansion array. */
    switch (type) {
    case WT_PAGE_COL_RLE:
        if (page->u2.rleexp != NULL)
            __wt_page_discard_rleexp(env, page);
        break;
    default:
        break;
    }

    /* Free the subtree-reference array. */
    switch (type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        if (page->u3.ref != NULL)
            __wt_free(env, page->u3.ref,
                page->indx_count * sizeof(WT_REF));
        break;
    case WT_PAGE_ROW_LEAF:
        if (WT_PAGE_DUP_TREES(page))
            __wt_page_discard_dup(env, page);
        break;
    default:
        break;
    }

    /* Finally, the disk image and the page structure itself. */
    if (page->dsk != NULL)
        __wt_free(env, page->dsk, page->size);
    __wt_free(env, page, sizeof(WT_PAGE));
}
+
/*
 * __wt_page_discard_repl --
 *	Discard the replacement array.
 */
static void
__wt_page_discard_repl(ENV *env, WT_PAGE *page)
{
    WT_REPL **replp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of replacements, free the
     * linked list anchored in that slot.
     */
    WT_REPL_FOREACH(page, replp, i)
        if (*replp != NULL)
            __wt_page_discard_repl_list(env, *replp);

    /* Free the page's array of replacements. */
    __wt_free(env, page->u2.repl, page->indx_count * sizeof(WT_REPL *));
}
+
/*
 * __wt_page_discard_rleexp --
 *	Discard the run-length encoded column store expansion array.
 */
static void
__wt_page_discard_rleexp(ENV *env, WT_PAGE *page)
{
    WT_RLE_EXPAND **expp, *exp, *a;
    u_int i;

    /*
     * For each non-NULL slot in the page's run-length encoded column
     * store expansion array, free the linked list of WT_RLE_EXPAND
     * structures anchored in that slot.
     */
    WT_RLE_EXPAND_FOREACH(page, expp, i) {
        if ((exp = *expp) == NULL)
            continue;
        /*
         * Free the linked list of WT_REPL structures anchored in the
         * WT_RLE_EXPAND entry.
         */
        __wt_page_discard_repl_list(env, exp->repl);
        /* Then free the WT_RLE_EXPAND list itself. */
        do {
            a = exp->next;
            __wt_free(env, exp, sizeof(WT_RLE_EXPAND));
        } while ((exp = a) != NULL);
    }

    /* Free the page's expansion array. */
    __wt_free(
        env, page->u2.rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *));
}
+
/*
 * __wt_page_discard_repl_list --
 *	Walk a WT_REPL forward-linked list and free the per-thread combination
 *	of a WT_REPL structure and its associated data.
 */
static void
__wt_page_discard_repl_list(ENV *env, WT_REPL *repl)
{
    WT_REPL *a;
    WT_TOC_UPDATE *update;

    do {
        a = repl->next;

        /*
         * Entries are carved from a shared WT_TOC_UPDATE buffer; the
         * buffer is freed once the number of entries discarded (out)
         * catches up with the number allocated (in).
         */
        update = repl->update;
        WT_ASSERT(env, update->out < update->in);
        if (++update->out == update->in)
            __wt_free(env, update, update->len);
    } while ((repl = a) != NULL);
}
+
/*
 * __wt_page_discard_dup --
 *	Walk the off-page duplicates tree array.
 */
static void
__wt_page_discard_dup(ENV *env, WT_PAGE *page)
{
    WT_REF **dupp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of off-page duplicate
     * references, free the reference.
     */
    WT_DUP_FOREACH(page, dupp, i)
        if (*dupp != NULL)
            __wt_free(env, *dupp, sizeof(WT_REF));

    /* Free the page's array of off-page duplicate references. */
    __wt_free(env, page->u3.dup, page->indx_count * sizeof(WT_REF *));
}
+
+/*
+ * __wt_row_key_on_page --
+ * Return if a WT_ROW structure's key references on-page data.
+ */
+static inline int
+__wt_row_key_on_page(WT_PAGE *page, WT_ROW *rip)
+{
+ uint8_t *p;
+
+ p = rip->key;
+ return (p >= (uint8_t *)page->dsk &&
+ p < (uint8_t *)page->dsk + page->size ? 1 : 0);
+}
diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c
new file mode 100644
index 00000000000..4d46fceff27
--- /dev/null
+++ b/src/btree/bt_dump.c
@@ -0,0 +1,472 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * WT_DSTUFF --
 *	Per-dump state threaded through the tree-walk callback.
 */
typedef struct {
    void (*p)                /* Print function */
        (uint8_t *, uint32_t, FILE *);
    FILE *stream;                /* Dump stream */

    void (*f)(const char *, uint64_t);    /* Progress callback */
    uint64_t fcnt;                /* Progress counter */

    DBT *dupkey;                /* Offpage duplicate tree key */
} WT_DSTUFF;
+
+static int __wt_dump_page(WT_TOC *, WT_PAGE *, void *);
+static void __wt_dump_page_col_fix(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_col_rle(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_col_var(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_dup_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static int __wt_dump_page_row_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
+static void __wt_print_byte_string_hex(uint8_t *, uint32_t, FILE *);
+static void __wt_print_byte_string_nl(uint8_t *, uint32_t, FILE *);
+
/*
 * __wt_db_dump --
 *	Db.dump method.
 *
 *	flags selects the output format: WT_DEBUG routes through the
 *	verifier, WT_PRINTABLES selects printable-character output,
 *	anything else hex output.
 */
int
__wt_db_dump(WT_TOC *toc,
    FILE *stream, void (*f)(const char *, uint64_t), uint32_t flags)
{
    WT_DSTUFF dstuff;
    int ret;

    if (LF_ISSET(WT_DEBUG)) {
        /*
         * We use the verification code to do debugging dumps because
         * if we're dumping in debugging mode, we want to confirm the
         * page is OK before blindly reading it.
         */
        return (__wt_verify(toc, f, stream));
    }

    dstuff.p = flags == WT_PRINTABLES ?
        __wt_print_byte_string_nl : __wt_print_byte_string_hex;
    dstuff.stream = stream;
    dstuff.f = f;
    dstuff.fcnt = 0;
    dstuff.dupkey = NULL;

    /*
     * Note we do not have a hazard reference for the root page, and that's
     * safe -- root pages are pinned into memory when a database is opened,
     * and never re-written until the database is closed.
     */
    fprintf(stream, "VERSION=1\n");
    fprintf(stream, "HEADER=END\n");
    ret = __wt_tree_walk(toc, NULL, 0, __wt_dump_page, &dstuff);
    fprintf(stream, "DATA=END\n");

    /* Wrap up reporting. */
    if (f != NULL)
        f(toc->name, dstuff.fcnt);

    return (ret);
}
+
/*
 * __wt_dump_page --
 *	Depth-first recursive walk of a btree.
 *
 *	Tree-walk callback: leaf pages are dumped, internal pages produce
 *	no output of their own.
 */
static int
__wt_dump_page(WT_TOC *toc, WT_PAGE *page, void *arg)
{
    DB *db;
    WT_DSTUFF *dp;

    db = toc->db;
    dp = arg;

    switch (page->dsk->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        break;
    case WT_PAGE_COL_FIX:
        __wt_dump_page_col_fix(toc, page, dp);
        break;
    case WT_PAGE_COL_RLE:
        WT_RET(__wt_dump_page_col_rle(toc, page, dp));
        break;
    case WT_PAGE_COL_VAR:
        WT_RET(__wt_dump_page_col_var(toc, page, dp));
        break;
    case WT_PAGE_DUP_LEAF:
        WT_RET(__wt_dump_page_dup_leaf(toc, page, dp));
        break;
    case WT_PAGE_ROW_LEAF:
        WT_RET(__wt_dump_page_row_leaf(toc, page, dp));
        break;
    WT_ILLEGAL_FORMAT(db);
    }

    /* Report progress every 10 pages. */
    if (dp->f != NULL && ++dp->fcnt % 10 == 0)
        dp->f(toc->name, dp->fcnt);

    return (0);
}
+
+/*
+ * __wt_dump_page_col_fix --
+ * Dump a WT_PAGE_COL_FIX page.
+ */
+static void
+__wt_dump_page_col_fix(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ WT_COL *cip;
+ WT_REPL *repl;
+ uint32_t i;
+
+ db = toc->db;
+
+ /* Walk the page, dumping data items. */
+ WT_INDX_FOREACH(page, cip, i) {
+ if ((repl = WT_COL_REPL(page, cip)) == NULL) {
+ if (!WT_FIX_DELETE_ISSET(cip->data))
+ dp->p(cip->data, db->fixed_len, dp->stream);
+ } else
+ if (!WT_REPL_DELETED_ISSET(repl))
+ dp->p(WT_REPL_DATA(repl),
+ db->fixed_len, dp->stream);
+ }
+}
+
/*
 * __wt_dump_page_col_rle --
 *	Dump a WT_PAGE_COL_RLE page.
 *
 *	Each on-page entry covers a run of identical records; individual
 *	records modified since the page was read live in the expansion
 *	array and override the run's value at their record number.
 */
static int
__wt_dump_page_col_rle(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
{
    DB *db;
    ENV *env;
    WT_COL *cip;
    WT_RLE_EXPAND *exp, **expsort, **expp;
    WT_REPL *repl;
    uint64_t recno;
    uint32_t i, n_expsort;
    uint16_t n_repeat;

    db = toc->db;
    env = toc->env;
    expsort = NULL;
    n_expsort = 0;

    recno = page->dsk->start_recno;
    WT_INDX_FOREACH(page, cip, i) {
        /*
         * Get a sorted list of any expansion entries we've created for
         * this set of records.  The sort function returns a NULL-
         * terminated array of references to WT_RLE_EXPAND structures,
         * sorted by record number.
         */
        WT_RET(__wt_rle_expand_sort(
            env, page, cip, &expsort, &n_expsort));

        /*
         * Dump the records.  We use the WT_REPL entry for records in
         * in the WT_RLE_EXPAND array, and original data otherwise.
         */
        for (expp = expsort,
            n_repeat = WT_RLE_REPEAT_COUNT(cip->data);
            n_repeat > 0; --n_repeat, ++recno)
            if ((exp = *expp) != NULL && exp->recno == recno) {
                ++expp;
                repl = exp->repl;
                if (WT_REPL_DELETED_ISSET(repl))
                    continue;
                dp->p(
                    WT_REPL_DATA(repl), repl->size, dp->stream);
            } else
                dp->p(WT_RLE_REPEAT_DATA(cip->data),
                    db->fixed_len, dp->stream);
    }
    /* Free the sort array. */
    if (expsort != NULL)
        __wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *));

    return (0);
}
+
/*
 * __wt_dump_page_col_var --
 *	Dump a WT_PAGE_COL_VAR page.
 *
 *	Replacements win over on-page items; Huffman-compressed and
 *	overflow items are expanded through a scratch buffer first.
 */
static int
__wt_dump_page_col_var(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
{
    DB *db;
    DBT *tmp;
    WT_COL *cip;
    WT_ITEM *item;
    WT_REPL *repl;
    uint32_t i;
    int ret;
    void *huffman;

    db = toc->db;
    huffman = db->idb->huffman_data;
    ret = 0;

    WT_RET(__wt_scr_alloc(toc, 0, &tmp));
    WT_INDX_FOREACH(page, cip, i) {
        /* Check for replace or deletion. */
        if ((repl = WT_COL_REPL(page, cip)) != NULL) {
            if (!WT_REPL_DELETED_ISSET(repl))
                dp->p(
                    WT_REPL_DATA(repl), repl->size, dp->stream);
            continue;
        }

        /* Process the original data. */
        item = cip->data;
        switch (WT_ITEM_TYPE(item)) {
        case WT_ITEM_DATA:
            if (huffman == NULL) {
                dp->p(WT_ITEM_BYTE(item),
                    WT_ITEM_LEN(item), dp->stream);
                break;
            }
            /* FALLTHROUGH */
        case WT_ITEM_DATA_OVFL:
            WT_ERR(__wt_item_process(toc, item, tmp));
            dp->p(tmp->data, tmp->size, dp->stream);
            break;
        case WT_ITEM_DEL:
            break;
        WT_ILLEGAL_FORMAT_ERR(db, ret);
        }
    }

err:    __wt_scr_release(&tmp);
    return (ret);
}
+
+/*
+ * __wt_dump_page_dup_leaf --
+ * Dump a WT_PAGE_DUP_LEAF page.
+ */
+static int
+__wt_dump_page_dup_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ DBT *dupkey, *tmp;
+ WT_ITEM *item;
+ WT_REPL *repl;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+ void *huffman;
+
+ db = toc->db;
+ dupkey = dp->dupkey;
+ huffman = db->idb->huffman_data;
+ ret = 0;
+
+ WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Check for deletion. */
+ if ((repl = WT_ROW_REPL(
+ page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /* Output the key, we're going to need it. */
+ dp->p(dupkey->data, dupkey->size, dp->stream);
+
+ /* Output the replacement item. */
+ if (repl != NULL) {
+ dp->p(WT_REPL_DATA(repl), repl->size, dp->stream);
+ continue;
+ }
+
+ /* Process the original data. */
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA_DUP:
+ if (huffman == NULL) {
+ dp->p(WT_ITEM_BYTE(item),
+ WT_ITEM_LEN(item), dp->stream);
+ break;
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_ERR(__wt_item_process(toc, item, tmp));
+ dp->p(tmp->data, tmp->size, dp->stream);
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+ }
+
+err: __wt_scr_release(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_dump_page_row_leaf --
+ * Dump a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
+{
+ DB *db;
+ DBT *key, *data, *key_tmp, *data_tmp, key_local, data_local;
+ WT_ITEM *item;
+ WT_OFF *off;
+ WT_REF *ref;
+ WT_REPL *repl;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+ void *huffman;
+
+ db = toc->db;
+ key = data = key_tmp = data_tmp = NULL;
+ huffman = db->idb->huffman_data;
+ ret = 0;
+
+ WT_ERR(__wt_scr_alloc(toc, 0, &key_tmp));
+ WT_ERR(__wt_scr_alloc(toc, 0, &data_tmp));
+ WT_CLEAR(key_local);
+ WT_CLEAR(data_local);
+
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Check for deletion. */
+ if ((repl = WT_ROW_REPL(
+ page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /*
+ * The key and data variables reference the DBT's we'll print.
+ * Set the key.
+ */
+ if (__wt_key_process(rip)) {
+ WT_ERR(__wt_item_process(toc, rip->key, key_tmp));
+ key = key_tmp;
+ } else
+ key = (DBT *)rip;
+
+ /*
+ * If the item was ever replaced, we're done: it can't be an
+ * off-page tree, and we don't care what kind of item it was
+ * originally. Dump the data from the replacement entry.
+ *
+ * XXX
+ * This is wrong -- if an off-page dup tree is reconciled,
+ * the off-page reference will change underfoot.
+ */
+ if (repl != NULL) {
+ dp->p(key->data, key->size, dp->stream);
+ dp->p(WT_REPL_DATA(repl), repl->size, dp->stream);
+ continue;
+ }
+
+ /* Set data to reference the data we'll dump. */
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ if (huffman == NULL) {
+ data_local.data = WT_ITEM_BYTE(item);
+ data_local.size = WT_ITEM_LEN(item);
+ data = &data_local;
+ break;
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ WT_ERR(__wt_item_process(toc, item, data_tmp));
+ data = data_tmp;
+ break;
+ case WT_ITEM_OFF:
+ /*
+ * Set the key and recursively call the tree-walk code
+ * for any off-page duplicate trees. (Check for any
+ * off-page duplicate trees locally because we already
+ * have to walk the page, so it's faster than walking
+ * the page both here and in the tree-walk function.)
+ */
+ dp->dupkey = key;
+
+ ref = WT_ROW_DUP(page, rip);
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, 0, __wt_dump_page, dp);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ goto err;
+ continue;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ dp->p(key->data, key->size, dp->stream);
+ dp->p(data->data, data->size, dp->stream);
+ }
+
+err: /* Discard any space allocated to hold off-page key/data items. */
+ if (key_tmp != NULL)
+ __wt_scr_release(&key_tmp);
+ if (data_tmp != NULL)
+ __wt_scr_release(&data_tmp);
+
+ return (ret);
+}
+
/* Map a nibble (0-15) to its lowercase hexadecimal digit character. */
static const char hex[] = "0123456789abcdef";
+
/*
 * __wt_print_byte_string_nl --
 *	Output a single byte string in printable characters, where possible.
 *	In addition, terminate with a <newline> character, unless the entry
 *	is itself terminated with a <newline> character.
 */
static void
__wt_print_byte_string_nl(uint8_t *data, uint32_t size, FILE *stream)
{
    /* Guard the empty string: data[size - 1] would read out of bounds. */
    if (size > 0 && data[size - 1] == '\n')
        --size;
    __wt_print_byte_string(data, size, stream);
    fprintf(stream, "\n");
}
+
/*
 * __wt_print_byte_string --
 *	Output a single byte string in printable characters, where possible;
 *	non-printable bytes are output as two lowercase hex digits.
 */
void
__wt_print_byte_string(uint8_t *data, uint32_t size, FILE *stream)
{
    int ch;

    for (; size > 0; --size, ++data) {
        ch = data[0];
        if (isprint(ch))
            fprintf(stream, "%c", ch);
        else
            /*
             * "%02x" prints the byte as two hex digits.  The
             * original passed hex-digit *characters* to a "%x%x"
             * format, which printed the hex of their ASCII codes
             * (e.g. byte 0x00 printed as "3030", not "00").
             */
            fprintf(stream, "%02x", data[0]);
    }
}
+
/*
 * __wt_print_byte_string_hex --
 *	Output a single byte string in hexadecimal characters, followed by
 *	a <newline>.
 */
static void
__wt_print_byte_string_hex(uint8_t *data, uint32_t size, FILE *stream)
{
    /*
     * "%02x" prints each byte as two lowercase hex digits; the original
     * "%x%x" on hex-digit characters printed their ASCII codes instead.
     */
    for (; size > 0; --size, ++data)
        fprintf(stream, "%02x", data[0]);
    fprintf(stream, "\n");
}
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
new file mode 100644
index 00000000000..cd4cb87bfb4
--- /dev/null
+++ b/src/btree/bt_evict.c
@@ -0,0 +1,944 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_evict(WT_TOC *);
+static int __wt_evict_compare_lru(const void *a, const void *b);
+static int __wt_evict_compare_page(const void *a, const void *b);
+static void __wt_evict_hazard_check(WT_TOC *);
+static int __wt_evict_hazard_compare(const void *a, const void *b);
+static void __wt_evict_page(WT_TOC *, int);
+static int __wt_evict_page_subtrees(WT_PAGE *);
+static void __wt_evict_set(WT_TOC *);
+static void __wt_evict_state_check(WT_TOC *);
+static int __wt_evict_walk(WT_TOC *);
+static int __wt_evict_walk_single(WT_TOC *, IDB *, uint);
+static void __wt_evict_write(WT_TOC *);
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_evict_hazard_validate(ENV *, WT_PAGE *);
+#endif
+
+/*
+ * Tuning constants -- I hesitate to call this tuning, but we should review some
+ * number of pages from each file's in-memory tree for each page we evict, and
+ * we should amortize the comparison of the hazard references across some number
+ * of eviction candidates.
+ */
+#define WT_EVICT_GROUP 10 /* Evict N pages at a time */
+#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */
+
+/*
+ * WT_EVICT_FOREACH --
+ * Walk a list of eviction candidates.
+ */
+#define WT_EVICT_FOREACH(cache, p, i) \
+ for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p))
+
+/*
+ * WT_EVICT_CLR --
+ *	Clear an eviction list entry.
+ */
+#define WT_EVICT_CLR(p) do { \
+ (p)->ref = NULL; \
+ (p)->idb = WT_DEBUG_POINT; \
+} while (0)
+
/*
 * __wt_workq_evict_server --
 *    See if the eviction server thread needs to be awakened.
 */
void
__wt_workq_evict_server(ENV *env, int force)
{
    WT_CACHE *cache;
    uint64_t bytes_inuse, bytes_max;

    cache = env->ienv->cache;

    /* If the eviction server is running, there's nothing to do. */
    if (!cache->evict_sleeping)
        return;

    /*
     * If we're locking out reads, or over our cache limit, or forcing the
     * issue (when closing the environment), run the eviction server.
     */
    bytes_inuse = __wt_cache_bytes_inuse(cache);
    bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
    if (!force && !cache->read_lockout && bytes_inuse < bytes_max)
        return;

    WT_VERBOSE(env, WT_VERB_EVICT, (env,
        "waking eviction server: force %sset, read lockout %sset, "
        "bytes inuse %s max (%lluMB %s %lluMB), ",
        force ? "" : "not ", cache->read_lockout ? "" : "not ",
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_inuse / WT_MEGABYTE),
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_max / WT_MEGABYTE)));

    /*
     * Clear the sleeping flag before releasing the mutex the server
     * blocks on; the server re-sets the flag before its next sleep.
     */
    cache->evict_sleeping = 0;
    __wt_unlock(env, cache->mtx_evict);
}
+
/*
 * __wt_cache_evict_server --
 *    Thread to evict pages from the cache.  Sleeps on cache->mtx_evict
 *    until awakened by __wt_workq_evict_server, then evicts until the
 *    cache is sufficiently below its maximum size.
 */
void *
__wt_cache_evict_server(void *arg)
{
    ENV *env;
    IENV *ienv;
    WT_CACHE *cache;
    WT_TOC *toc;
    uint64_t bytes_inuse, bytes_max;
    int ret;

    env = arg;
    ienv = env->ienv;
    cache = ienv->cache;
    ret = 0;

    /* We need a thread of control because we're reading/writing pages. */
    toc = NULL;
    WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc));

    /*
     * Multiple pages are marked for eviction by the eviction server, which
     * means nobody can read them -- but, this thread of control has to
     * update higher pages in the tree when it writes this page, which
     * requires reading other pages, which might themselves be marked for
     * eviction.  Set a flag to allow this thread of control to see pages
     * marked for eviction -- we know it's safe, because only this thread
     * is writing pages.
     *
     * Reconciliation is probably running because the cache is full, which
     * means reads are locked out -- reconciliation can read, regardless.
     */
    F_SET(toc, WT_READ_EVICT | WT_READ_PRIORITY);

    /*
     * Allocate memory for a copy of the hazard references -- it's a fixed
     * size so doesn't need run-time adjustments.
     */
    cache->hazard_elem = env->toc_size * env->hazard_size;
    WT_ERR(__wt_calloc(
        env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard));
    cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *);

    for (;;) {
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server sleeping"));
        cache->evict_sleeping = 1;
        __wt_lock(env, cache->mtx_evict);
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server waking"));

        /*
         * Check for environment exit; do it here, instead of the top of
         * the loop, because doing it here keeps us from doing a bunch of
         * work when simply awakened to quit.
         */
        if (!F_ISSET(ienv, WT_SERVER_RUN))
            break;

        for (;;) {
            /* Single-thread reconciliation. */
            __wt_lock(env, cache->mtx_reconcile);
            ret = __wt_evict(toc);
            __wt_unlock(env, cache->mtx_reconcile);
            if (ret != 0)
                goto err;

            /*
             * If we've locked out reads, keep evicting until we
             * get to at least 5% under the maximum cache.  Else,
             * quit evicting as soon as we get under the maximum
             * cache.
             */
            bytes_inuse = __wt_cache_bytes_inuse(cache);
            bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
            if (cache->read_lockout) {
                if (bytes_inuse <= bytes_max - (bytes_max / 20))
                    break;
            } else if (bytes_inuse < bytes_max)
                break;
        }
    }

err:    /* Discard the eviction and hazard-copy arrays, then the WT_TOC. */
    if (cache->evict != NULL)
        __wt_free(env, cache->evict, cache->evict_len);
    if (cache->hazard != NULL)
        __wt_free(env, cache->hazard, cache->hazard_len);
    if (toc != NULL)
        WT_TRET(toc->close(toc, 0));

    if (ret != 0)
        __wt_api_env_err(env, ret, "cache eviction server error");

    WT_VERBOSE(
        env, WT_VERB_EVICT, (env, "cache eviction server exiting"));

    return (NULL);
}
+
/*
 * __wt_evict --
 *    Evict pages from the cache.  One call selects candidates, removes
 *    duplicates, orders them by LRU and discards up to WT_EVICT_GROUP
 *    pages.
 */
static int
__wt_evict(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    uint elem, i, j;

    env = toc->env;
    cache = env->ienv->cache;

    /* Get some more pages to consider for eviction. */
    WT_RET(__wt_evict_walk(toc));

    /*
     * We have an array of page eviction references that may contain NULLs,
     * as well as duplicate entries.
     *
     * First, sort the array by WT_REF address, then delete any duplicates.
     * The reason is because we might evict the page but leave a duplicate
     * entry in the "saved" area of the array, and that would be a NULL
     * dereference on the next run.  (If someone ever tries to remove this
     * duplicate cleanup for better performance, you can't fix it just by
     * checking the WT_REF state -- that only works if you are discarding
     * a page from a single level of the tree; if you are discarding a
     * page and its parent, the duplicate of the page's WT_REF might have
     * been free'd before a subsequent review of the eviction array.)
     */
    evict = cache->evict;
    elem = cache->evict_elem;
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
    for (i = 0; i < elem; i = j)
        for (j = i + 1; j < elem; ++j) {
            /*
             * If the leading pointer hits a NULL, we're done, the
             * NULLs all sorted to the top of the array.
             */
            if (evict[j].ref == NULL)
                goto done_duplicates;

            /* Delete the second and any subsequent duplicates. */
            if (evict[i].ref == evict[j].ref)
                WT_EVICT_CLR(&evict[j]);
            else
                break;
        }
done_duplicates:

    /* Second, sort the array by LRU. */
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru);

    /*
     * Discarding pages is done in 6 steps:
     *    Set the WT_EVICT state
     *    Check for any hazard references
     *    Confirm the pages are evictable (not pinned, no subtrees)
     *    Discard clean pages
     *    Reconcile dirty pages (making them clean)
     *    Discard clean pages
     *
     * The reason we release clean pages, then reconcile dirty pages, then
     * release clean pages again is because reconciling a dirty page is a
     * slow operation, and this releases space sooner.  (Arguably, we are
     * going to discard all of the pages anyway, so what does it matter if
     * we make clean pages wait for the dirty page writes?  On the other
     * hand, it's a small change and benefits any thread waiting to read a
     * clean page we picked for discarding, unlikely though that may be.)
     */
    __wt_evict_set(toc);
    __wt_evict_hazard_check(toc);
    __wt_evict_state_check(toc);
    __wt_evict_page(toc, 0);
    __wt_evict_write(toc);
    __wt_evict_page(toc, 1);

    return (0);
}
+
/*
 * __wt_evict_walk --
 *    Fill in the candidate array by walking the next set of pages in
 *    each open file.
 */
static int
__wt_evict_walk(WT_TOC *toc)
{
    ENV *env;
    IDB *idb;
    IENV *ienv;
    WT_CACHE *cache;
    uint elem, i;
    int ret;

    env = toc->env;
    ienv = env->ienv;
    cache = ienv->cache;

    /*
     * Resize the array in which we're tracking pages, as necessary, then
     * get some pages from each underlying file.  We hold a mutex for the
     * entire time -- it's slow, but (1) how often do new files get added
     * or removed to/from the system, and (2) it's all in-memory stuff, so
     * it's not that slow.
     */
    ret = 0;
    __wt_lock(env, ienv->mtx);
    /* WT_EVICT_WALK_BASE slots are reserved for carried-over candidates. */
    elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE);
    if (elem <= cache->evict_elem || (ret = __wt_realloc(env,
        &cache->evict_len,
        elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) {
        cache->evict_elem = elem;

        i = WT_EVICT_WALK_BASE;
        TAILQ_FOREACH(idb, &ienv->dbqh, q) {
            if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0)
                break;
            i += WT_EVICT_WALK_PER_TABLE;
        }
    }
    __wt_unlock(env, ienv->mtx);
    return (ret);
}
+
/*
 * __wt_evict_walk_single --
 *    Get a few page eviction candidates from a single underlying file,
 *    storing them in the eviction array starting at "slot".
 */
static int
__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot)
{
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    int i, restarted_once;

    cache = toc->env->ienv->cache;

    /*
     * Tricky little loop that restarts the walk as necessary, without
     * resetting the count of pages retrieved.
     */
    i = restarted_once = 0;

    /*
     * If we haven't yet opened a tree-walk structure, do so.  (Note the
     * restart label is inside the if-body: jumping to it re-begins the
     * walk without re-testing the condition, which is the intent.)
     */
    if (idb->evict_walk.tree == NULL)
restart:    WT_RET(__wt_walk_begin(toc, &idb->root_page, &idb->evict_walk));

    /* Get the next WT_EVICT_WALK_PER_TABLE entries. */
    do {
        evict = &cache->evict[slot];
        WT_RET(__wt_walk_next(toc, &idb->evict_walk, &evict->ref));

        /*
         * Restart the walk as necessary, but only once (after one
         * restart we've already acquired all of the pages, and we
         * could loop infinitely on a tree with a single, pinned, page).
         */
        if (evict->ref == NULL) {
            if (restarted_once++)
                break;
            goto restart;
        }

        evict->idb = idb;
        ++slot;
    } while (++i < WT_EVICT_WALK_PER_TABLE);

    return (0);
}
+
+/*
+ * __wt_evict_db_clear --
+ * Remove any entries for a file from the eviction list.
+ */
+void
+__wt_evict_db_clear(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint i;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Discard any entries in the eviction list to a file we're closing
+ * (the caller better have locked out the eviction thread).
+ */
+ if (cache->evict == NULL)
+ return;
+ WT_EVICT_FOREACH(cache, evict, i)
+ if (evict->ref != NULL && evict->idb == idb)
+ WT_EVICT_CLR(evict);
+}
+
+/*
+ * __wt_evict_set --
+ * Set the WT_EVICT flag on a set of pages.
+ */
+static void
+__wt_evict_set(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * Set the entry state so readers don't try and use the pages. Once
+ * that's done, any thread searching for a page will either see our
+ * state value, or will have already set a hazard reference to the page.
+ * We don't evict a page with a hazard reference set, so we can't race.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ ref->state = WT_EVICT;
+ }
+}
+
/*
 * __wt_evict_hazard_check --
 *    Compare the list of hazard references to the list of pages to be
 *    discarded; any page with a hazard reference is returned to service.
 */
static void
__wt_evict_hazard_check(WT_TOC *toc)
{
    ENV *env;
    IENV *ienv;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE **hazard, **end_hazard, *page;
    WT_REF *ref;
    WT_STATS *stats;
    uint i;

    env = toc->env;
    ienv = env->ienv;
    cache = ienv->cache;
    stats = cache->stats;

    /* Sort the eviction candidates by WT_PAGE address. */
    qsort(cache->evict, (size_t)WT_EVICT_GROUP,
        sizeof(WT_EVICT_LIST), __wt_evict_compare_page);

    /* Copy the hazard reference array and sort it by WT_PAGE address. */
    hazard = cache->hazard;
    end_hazard = hazard + cache->hazard_elem;
    memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *));
    qsort(hazard, (size_t)cache->hazard_elem,
        sizeof(WT_PAGE *), __wt_evict_hazard_compare);

    /*
     * Walk the lists in parallel and look for matches: both lists are
     * sorted ascending, so the hazard cursor never needs to move
     * backward across candidates.
     */
    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;

        /*
         * Look for the page in the hazard list until we reach the end
         * of the list or find a hazard pointer larger than the page.
         */
        for (page = ref->page;
            hazard < end_hazard && *hazard < page; ++hazard)
            ;
        /*
         * Hazard list exhausted: no remaining candidate can match
         * (their pages sort even higher), so stop entirely.
         */
        if (hazard == end_hazard)
            break;

        /*
         * If we find a matching hazard reference, the page is in use:
         * remove it from the eviction list.
         *
         * No memory flush needed, the state field is declared volatile.
         */
        if (*hazard == page) {
            WT_VERBOSE(env, WT_VERB_EVICT, (env,
                "eviction skipped page addr %lu (hazard reference)",
                page->addr));
            WT_STAT_INCR(stats, CACHE_EVICT_HAZARD);

            /*
             * A page with a low LRU and a hazard reference?
             *
             * Set the page's LRU so we don't select it again.
             * Return the page to service.
             * Discard our reference.
             */
            ref->page->read_gen = ++cache->read_gen;
            ref->state = WT_OK;
            WT_EVICT_CLR(evict);
        }
    }
}
+
/*
 * __wt_evict_state_check --
 *    Confirm these are pages we want to evict; pinned pages and pages
 *    with in-memory subtrees are returned to service.
 */
static void
__wt_evict_state_check(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;

    /*
     * We "own" the pages (we've flagged them for eviction, and there were
     * no hazard references).  Now do checks to see if these are pages we
     * can evict -- we have to wait until after we own the page because the
     * page might be updated and race with us.
     */
    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /* Ignore pinned pages. */
        if (F_ISSET(page, WT_PINNED)) {
            WT_VERBOSE(env, WT_VERB_EVICT, (env,
                "eviction skipped page addr %lu (pinned)",
                page->addr));
            goto skip;
        }

        /* Ignore pages with in-memory subtrees. */
        switch (page->dsk->type) {
        case WT_PAGE_COL_INT:
        case WT_PAGE_DUP_INT:
        case WT_PAGE_ROW_INT:
        case WT_PAGE_ROW_LEAF:
            if (__wt_evict_page_subtrees(page)) {
                WT_VERBOSE(env, WT_VERB_EVICT, (env,
                    "eviction skipped page addr %lu (subtrees)",
                    page->addr));
                goto skip;
            }
            break;
        default:
            break;
        }

        /* This page passed all checks; keep it in the list. */
        continue;

skip:        /*
         * Set the page's LRU so we don't select it again.
         * Return the page to service.
         * Discard our reference.
         */
        page->read_gen = ++cache->read_gen;
        ref->state = WT_OK;
        WT_EVICT_CLR(evict);
    }
}
+
/*
 * __wt_evict_write --
 *    Write any modified pages, making them clean for the second
 *    __wt_evict_page pass.
 */
static void
__wt_evict_write(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;

    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /*
         * Ignore clean pages, only modified pages need writing.
         * (The original comment said "Ignore dirty pages", which
         * contradicted the test below.)
         */
        if (!WT_PAGE_IS_MODIFIED(page))
            continue;

        /*
         * We're using our WT_TOC handle, it needs to reference the
         * correct DB handle.
         *
         * XXX
         * This is pretty sleazy, but I'm hesitant to try and drive
         * a separate DB/IDB handle down through the reconciliation
         * code.
         */
        toc->db = evict->idb->db;
        (void)__wt_page_reconcile(toc, page);
    }
}
+
/*
 * __wt_evict_page --
 *    Evict (discard) clean cache pages; called twice per eviction pass,
 *    before and after dirty pages are reconciled.
 */
static void
__wt_evict_page(WT_TOC *toc, int was_dirty)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_PAGE *page;
    WT_REF *ref;
    WT_STATS *stats;
    uint i;

    env = toc->env;
    cache = env->ienv->cache;
    stats = cache->stats;

    WT_EVICT_FOREACH(cache, evict, i) {
        if ((ref = evict->ref) == NULL)
            continue;
        page = ref->page;

        /*
         * The first time we're called, we get rid of the clean pages;
         * the second time we're called, we get rid of the pages that
         * were dirty but have since been cleaned.  Ignore dirty pages
         * in all cases, it's simpler.
         */
        if (WT_PAGE_IS_MODIFIED(page))
            continue;

        if (was_dirty)
            WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED);
        else
            WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED);

#ifdef HAVE_DIAGNOSTIC
        __wt_evict_hazard_validate(env, page);
#endif
        WT_VERBOSE(env, WT_VERB_EVICT, (env,
            "cache evicting page addr %lu", page->addr));

        /*
         * Copy a page reference, then make the cache entry available
         * for re-use.
         *
         * No memory flush needed, the state field is declared volatile.
         */
        ref->page = NULL;
        ref->state = WT_EMPTY;

        /* Remove the entry from the eviction list. */
        WT_EVICT_CLR(evict);

        /* We've got more space. */
        WT_CACHE_PAGE_OUT(cache, page->size);

        /* The page can no longer be found, free the memory. */
        __wt_page_discard(toc, page);
    }
}
+
/*
 * __wt_evict_page_subtrees --
 *    Return non-zero if a page has an in-memory subtree (and so must not
 *    be evicted), zero otherwise.
 */
static int
__wt_evict_page_subtrees(WT_PAGE *page)
{
    WT_REF *ref, **dupp;
    uint32_t i;

    /*
     * Return if a page has an in-memory subtree -- this array search could
     * be replaced by a reference count in the page, but (1) the eviction
     * thread isn't where I expect performance problems, (2) I hate to lose
     * more bytes on every page, (3) how often will an internal page be
     * evicted anyway?
     */
    switch (page->dsk->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        /* Internal pages: any non-empty child slot is an in-memory page. */
        WT_REF_FOREACH(page, ref, i)
            if (ref->state != WT_EMPTY)
                return (1);
        break;
    case WT_PAGE_ROW_LEAF:
        /* Row-store leaves: check off-page duplicate trees, if any. */
        if (WT_PAGE_DUP_TREES(page))
            WT_DUP_FOREACH(page, dupp, i)
                if (*dupp != NULL && (*dupp)->state != WT_EMPTY)
                    return (1);
        break;
    default:
        break;
    }

    return (0);
}
+
+/*
+ * __wt_evict_compare_page --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's address.
+ */
+static int
+__wt_evict_compare_page(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ WT_PAGE *a_page, *b_page;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the page address in ascending order. */
+ a_page = a_ref->page;
+ b_page = b_ref->page;
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+/*
+ * __wt_evict_compare_lru --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's read
+ * generation.
+ */
+static int
+__wt_evict_compare_lru(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ uint32_t a_lru, b_lru;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the LRU in ascending order. */
+ a_lru = a_ref->page->read_gen;
+ b_lru = b_ref->page->read_gen;
+ return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0));
+}
+
+/*
+ * __wt_evict_hazard_compare --
+ * Qsort function: sort hazard list based on the page's address.
+ */
+static int
+__wt_evict_hazard_compare(const void *a, const void *b)
+{
+ WT_PAGE *a_page, *b_page;
+
+ a_page = *(WT_PAGE **)a;
+ b_page = *(WT_PAGE **)b;
+
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+#ifdef HAVE_DIAGNOSTIC
/*
 * __wt_evict_hazard_validate --
 *    Diagnostic check: abort the process if a page selected for eviction
 *    appears on any thread's hazard list.  (The original comment read
 *    "Return if a page is or isn't on the hazard list", which did not
 *    match the abort behavior.)
 */
static void
__wt_evict_hazard_validate(ENV *env, WT_PAGE *page)
{
    IENV *ienv;
    WT_PAGE **hp;
    WT_TOC **tp, *toc;

    ienv = env->ienv;

    /* Scan every WT_TOC's hazard array for the page. */
    for (tp = ienv->toc; (toc = *tp) != NULL; ++tp)
        for (hp = toc->hazard;
            hp < toc->hazard + toc->env->hazard_size; ++hp)
            if (*hp == page) {
                __wt_api_env_errx(env,
                    "hazard eviction check for page %lu "
                    "failed",
                    (u_long)page->addr);
                __wt_abort(env);
            }
}
+
/*
 * __wt_evict_dump --
 *    Display the eviction list (diagnostic builds only); scans the
 *    entire candidate array, not just the WT_EVICT_GROUP head.
 */
void
__wt_evict_dump(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    WT_MBUF mb;
    uint n;
    int sep;

    env = toc->env;
    cache = env->ienv->cache;

    __wt_mb_init(env, &mb);
    __wt_mb_add(&mb, "eviction list");

    /* ':' separates the label from the first entry, ',' thereafter. */
    for (sep = ':', n = 0; n < cache->evict_elem; ++n) {
        evict = &cache->evict[n];
        if (evict->ref == NULL)
            continue;
        __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr);
        sep = ',';
    }
    __wt_mb_discard(&mb);
}
+
/*
 * __wt_evict_cache_dump --
 *    Dump the in-memory cache, one tree at a time.  (Comment corrected:
 *    the function is __wt_evict_cache_dump, not __wt_evict_dump_cache.)
 */
int
__wt_evict_cache_dump(WT_TOC *toc)
{
    IDB *idb;
    IENV *ienv;

    ienv = toc->env->ienv;

    TAILQ_FOREACH(idb, &ienv->dbqh, q)
        WT_RET(__wt_evict_tree_dump(toc, idb));
    return (0);
}
+
/*
 * __wt_evict_tree_dump --
 *    Dump an in-memory tree (diagnostic builds only).
 */
int
__wt_evict_tree_dump(WT_TOC *toc, IDB *idb)
{
    ENV *env;
    WT_CACHE *cache;
    WT_REF *ref;
    WT_WALK walk;
    WT_MBUF mb;
    int sep;

    env = toc->env;
    cache = env->ienv->cache;

    WT_VERBOSE(env, WT_VERB_EVICT, (env,
        "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)",
        idb->name,
        __wt_cache_pages_inuse(cache),
        __wt_cache_bytes_inuse(cache),
        WT_STAT(cache->stats, CACHE_BYTES_MAX)));

    __wt_mb_init(env, &mb);
    __wt_mb_add(&mb, "in-memory page list");

    WT_CLEAR(walk);
    WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk));
    for (sep = ':';;) {
        /*
         * NOTE(review): a WT_RET failure here returns without calling
         * __wt_walk_end or __wt_mb_discard -- confirm whether the walk
         * structure leaks on the error path (diagnostic-only code).
         */
        WT_RET(__wt_walk_next(toc, &walk, &ref));
        if (ref == NULL)
            break;
        __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr);
        sep = ',';
    }
    __wt_walk_end(env, &walk);
    __wt_mb_discard(&mb);

    return (0);
}
+
/*
 * __wt_evict_cache_count --
 *    Return the count of nodes in the cache, summed across all open trees.
 */
int
__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp)
{
    IDB *idb;
    IENV *ienv;
    uint64_t nodes;

    ienv = toc->env->ienv;

    *nodesp = 0;
    TAILQ_FOREACH(idb, &ienv->dbqh, q) {
        WT_RET(__wt_evict_tree_count(toc, idb, &nodes));
        *nodesp += nodes;
    }
    return (0);
}
+
/*
 * __wt_evict_tree_count --
 *    Return a count of in-memory nodes in one tree.
 */
int
__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp)
{
    ENV *env;
    WT_REF *ref;
    WT_WALK walk;
    uint64_t nodes;

    env = toc->env;

    WT_CLEAR(walk);
    WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk));
    for (nodes = 0;;) {
        /*
         * NOTE(review): a WT_RET failure here returns without calling
         * __wt_walk_end -- confirm whether the walk structure leaks on
         * the error path (diagnostic-only code).
         */
        WT_RET(__wt_walk_next(toc, &walk, &ref));
        if (ref == NULL)
            break;
        ++nodes;
    }
    *nodesp = nodes;
    __wt_walk_end(env, &walk);

    return (0);
}
+#endif
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
new file mode 100644
index 00000000000..c0f58002522
--- /dev/null
+++ b/src/btree/bt_misc.c
@@ -0,0 +1,175 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_build_verify --
+ * Verify the Btree build itself.
+ */
+int
+__wt_bt_build_verify(void)
+{
+ static const struct {
+ char *name;
+ u_int size, expected;
+ } size_check[] = {
+ { "WT_COL", sizeof(WT_COL), WT_COL_SIZE },
+ { "WT_ITEM", sizeof(WT_ITEM), WT_ITEM_SIZE },
+ { "WT_OFF", sizeof(WT_OFF), WT_OFF_SIZE },
+ { "WT_OVFL", sizeof(WT_OVFL), WT_OVFL_SIZE },
+ { "WT_PAGE", sizeof(WT_PAGE), WT_PAGE_SIZE },
+ { "WT_PAGE_DESC", sizeof(WT_PAGE_DESC), WT_PAGE_DESC_SIZE },
+ { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), WT_PAGE_DISK_SIZE },
+ { "WT_ROW", sizeof(WT_ROW), WT_ROW_SIZE }
+ };
+ static const struct {
+ char *name;
+ u_int size, align;
+ } align_check[] = {
+ { "WT_OFF", sizeof(WT_OFF), sizeof(uint32_t) },
+ { "WT_OVFL", sizeof(WT_OVFL), sizeof(uint32_t) },
+ { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), sizeof(uint32_t) },
+ { "WT_TOC_UPDATE", sizeof(WT_TOC_UPDATE), sizeof(uint32_t) }
+ };
+ u_int i;
+
+ /*
+ * The compiler had better not have padded our structures -- make
+ * sure the page header structure is exactly what we expect.
+ */
+ for (i = 0; i < WT_ELEMENTS(size_check); ++i) {
+ if (size_check[i].size == size_check[i].expected)
+ continue;
+ __wt_api_env_errx(NULL,
+ "WiredTiger build failed, the %s header structure is not "
+ "the correct size (expected %u, got %u)",
+ size_check[i].name,
+ size_check[i].expected, size_check[i].size);
+ return (WT_ERROR);
+ }
+
+ /* There are also structures that must be aligned correctly. */
+ for (i = 0; i < WT_ELEMENTS(align_check); ++i) {
+ if (WT_ALIGN(align_check[i].size,
+ align_check[i].align) == align_check[i].size)
+ continue;
+ __wt_api_env_errx(NULL,
+ "Build verification failed, the %s structure is not"
+ " correctly aligned", align_check[i].name);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We mix-and-match 32-bit unsigned values and size_t's, mostly because
+ * we allocate and handle 32-bit objects, and lots of the underlying C
+ * library expects size_t values for the length of memory objects. We
+ * check, just to be sure.
+ */
+ if (sizeof(size_t) < sizeof(uint32_t)) {
+ __wt_api_env_errx(NULL, "%s",
+ "Build verification failed, a size_t is smaller than "
+ "4-bytes");
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
/*
 * __wt_set_ff_and_sa_from_offset --
 *    Set first-free and space-available values from an address positioned
 *    one past the last used byte on the page.
 *
 *    NOTE(review): plain `inline` on a definition in a .c file relies on
 *    pre-C99 / GNU89 inline semantics -- confirm the build's compiler
 *    flags, or a strict C99 build may fail to emit an external definition.
 */
inline void
__wt_set_ff_and_sa_from_offset(WT_PAGE *page,
    void *p, uint8_t **first_freep, uint32_t *space_availp)
{
    /* First free byte is p itself. */
    *first_freep = (uint8_t *)p;
    /* Space available is the page size minus the offset of p. */
    *space_availp =
        page->size - (uint32_t)((uint8_t *)p - (uint8_t *)page->dsk);
}
+
+/*
+ * __wt_page_write_gen_check --
+ * Confirm the page's write generation number is correct.
+ */
+inline int
+__wt_page_write_gen_check(WT_PAGE *page, uint32_t write_gen)
+{
+ return (page->write_gen == write_gen ? 0 : WT_RESTART);
+}
+
+/*
+ * __wt_page_type_string --
+ * Return a string representing the page type.
+ */
+const char *
+__wt_page_type_string(WT_PAGE_DISK *dsk)
+{
+ switch (dsk->type) {
+ case WT_PAGE_INVALID:
+ return ("invalid");
+ case WT_PAGE_COL_FIX:
+ return ("column-store fixed-length leaf");
+ case WT_PAGE_COL_INT:
+ return ("column-store internal");
+ case WT_PAGE_COL_RLE:
+ return ("column-store fixed-length run-length encoded leaf");
+ case WT_PAGE_COL_VAR:
+ return ("column-store variable-length leaf");
+ case WT_PAGE_DUP_INT:
+ return ("duplicate tree internal");
+ case WT_PAGE_DUP_LEAF:
+ return ("duplicate tree leaf");
+ case WT_PAGE_OVFL:
+ return ("overflow");
+ case WT_PAGE_ROW_INT:
+ return ("row-store internal");
+ case WT_PAGE_ROW_LEAF:
+ return ("row-store leaf");
+ default:
+ break;
+ }
+ return ("unknown");
+}
+
+/*
+ * __wt_item_type_string --
+ * Return a string representing the item type.
+ */
+const char *
+__wt_item_type_string(WT_ITEM *item)
+{
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_KEY:
+ return ("key");
+ case WT_ITEM_KEY_OVFL:
+ return ("key-overflow");
+ case WT_ITEM_KEY_DUP:
+ return ("key-duplicate");
+ case WT_ITEM_KEY_DUP_OVFL:
+ return ("key-duplicate-overflow");
+ case WT_ITEM_DATA:
+ return ("data");
+ case WT_ITEM_DATA_OVFL:
+ return ("data-overflow");
+ case WT_ITEM_DATA_DUP:
+ return ("data-duplicate");
+ case WT_ITEM_DATA_DUP_OVFL:
+ return ("data-duplicate-overflow");
+ case WT_ITEM_DEL:
+ return ("deleted");
+ case WT_ITEM_OFF:
+ return ("off-page");
+ default:
+ break;
+ }
+ return ("unknown");
+}
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
new file mode 100644
index 00000000000..c746782221e
--- /dev/null
+++ b/src/btree/bt_open.c
@@ -0,0 +1,279 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_open_verify(DB *);
+static int __wt_open_verify_page_sizes(DB *);
+
/*
 * __wt_bt_open --
 *    Open a Btree.
 */
int
__wt_bt_open(WT_TOC *toc, int ok_create)
{
    DB *db;
    ENV *env;
    IDB *idb;

    db = toc->db;
    env = toc->env;
    idb = db->idb;

    /* Check page size configuration. */
    WT_RET(__wt_open_verify(db));

    /* Open the file. */
    WT_RET(__wt_open(env, idb->name, idb->mode, ok_create, &idb->fh));

    /*
     * If the file size is 0, write a description page; if the file size
     * is non-zero, update the DB handle based on the on-disk description
     * page.  (If the file isn't empty, there must be a description page.)
     */
    if (idb->fh->file_size == 0)
        WT_RET(__wt_desc_write(toc));
    else {
        WT_RET(__wt_desc_read(toc));

        /* If there's a root page, pin it. */
        if (idb->root_off.addr != WT_ADDR_INVALID)
            WT_RET(__wt_root_pin(toc));
    }

    return (0);
}
+
/*
 * __wt_open_verify --
 *    Verify anything we can't verify before we're about to open the file;
 *    set defaults as necessary.
 */
static int
__wt_open_verify(DB *db)
{
    IDB *idb;

    idb = db->idb;

    /* Verify the page sizes. */
    WT_RET(__wt_open_verify_page_sizes(db));

    /*
     * Verify other configuration combinations: fixed-length column
     * stores have no length byte to drive the decoder, so Huffman
     * compression cannot apply.
     */
    if (db->fixed_len != 0 && (idb->huffman_key || idb->huffman_data)) {
        __wt_api_db_errx(db,
            "Fixed size column-store databases may not be Huffman "
            "compressed");
        return (WT_ERROR);
    }

    return (0);
}
+
+/*
+ * __wt_open_verify_page_sizes --
+ * Verify the page sizes.
+ */
+static int
+__wt_open_verify_page_sizes(DB *db)
+{
+ IDB *idb;
+
+ idb = db->idb;
+
+ /*
+ * The application can set lots of page sizes. It's complicated, so
+ * instead of verifying the relationships when they're set, verify
+ * then when the database is opened and we know we have the final
+ * values. (Besides, if we verify the relationships when they're set,
+ * the application has to set them in a specific order or we'd need
+ * one set function that took 10 parameters.)
+ *
+ * If the values haven't been set, set the defaults.
+ *
+ * Default to a small fragment size, so overflow items don't consume
+ * a lot of space.
+ */
+ if (db->allocsize == 0)
+ db->allocsize = WT_BTREE_ALLOCATION_SIZE;
+
+ /* Allocation sizes must be a power-of-two, nothing else makes sense. */
+ if (!__wt_ispo2(db->allocsize)) {
+ __wt_api_db_errx(db,
+ "the allocation size must be a power of two");
+ return (WT_ERROR);
+ }
+
+ /*
+ * Limit allocation units to 256MB, and page sizes to 128MB. There's
+ * no reason (other than testing) we can't support larger sizes (any
+ * sizes up to the smaller of an off_t and a size_t should work), but
+ * an application specifying larger allocation or page sizes is almost
+ * certainly making a mistake.
+ */
+ if (db->allocsize > WT_BTREE_ALLOCATION_SIZE_MAX) {
+ __wt_api_db_errx(db,
+ "the allocation size must less than or equal to %luMB",
+ (u_long)(WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE));
+ return (WT_ERROR);
+ }
+
+ /*
+ * Internal pages are also usually small, we want it to fit into the
+ * L1 cache. We try and put at least 40 keys on each internal page
+ * (40 because that results in 100M keys in a level 5 Btree). But,
+ * if it's a small page, push anything bigger than about 50 bytes
+ * off-page. Here's the table:
+ * Pagesize Largest key retained on-page:
+ * 512B 50 bytes
+ * 1K 50 bytes
+ * 2K 51 bytes
+ * 4K 102 bytes
+ * 8K 204 bytes
+ * and so on, roughly doubling for each power-of-two.
+ */
+ if (db->intlmin == 0)
+ db->intlmin = WT_BTREE_INTLMIN_DEFAULT;
+ if (db->intlmax == 0)
+ db->intlmax = WT_MAX(db->intlmin, WT_BTREE_INTLMAX_DEFAULT);
+ if (db->intlitemsize == 0) {
+ if (db->intlmin <= 1024)
+ db->intlitemsize = 50;
+ else
+ db->intlitemsize = db->intlmin / 40;
+ }
+
+ /*
+ * Leaf pages are larger to amortize I/O across a large chunk of the
+ * data space, but still minimize the chance of a broken write. We
+ * only require 20 key/data pairs fit onto a leaf page. Again, if it's
+ * a small page, push anything bigger than about 80 bytes off-page.
+ * Here's the table:
+ * Pagesize Largest key or data item retained on-page:
+ * 512B 80 bytes
+ * 1K 80 bytes
+ * 2K 80 bytes
+ * 4K 80 bytes
+ * 8K 204 bytes
+ * 16K 409 bytes
+ * and so on, roughly doubling for each power-of-two.
+ */
+ if (db->leafmin == 0)
+ db->leafmin = WT_BTREE_LEAFMIN_DEFAULT;
+ if (db->leafmax == 0)
+ db->leafmax = WT_MAX(db->leafmin, WT_BTREE_LEAFMAX_DEFAULT);
+ if (db->leafitemsize == 0) {
+ if (db->leafmin <= 4096)
+ db->leafitemsize = 80;
+ else
+ db->leafitemsize = db->leafmin / 40;
+ }
+
+ /* Final checks for safety. */
+ if (db->intlmin % db->allocsize != 0 ||
+ db->intlmax % db->allocsize != 0 ||
+ db->leafmin % db->allocsize != 0 ||
+ db->leafmax % db->allocsize != 0) {
+ __wt_api_db_errx(db,
+ "all page sizes must be a multiple of %lu bytes",
+ (u_long)db->allocsize);
+ return (WT_ERROR);
+ }
+
+ if (db->intlmin > db->intlmax || db->leafmin > db->leafmax) {
+ __wt_api_db_errx(db,
+ "minimum page sizes must be less than or equal to maximum "
+ "page sizes");
+ return (WT_ERROR);
+ }
+
+ if (db->intlmin > WT_BTREE_PAGE_SIZE_MAX ||
+ db->intlmax > WT_BTREE_PAGE_SIZE_MAX ||
+ db->leafmin > WT_BTREE_PAGE_SIZE_MAX ||
+ db->leafmax > WT_BTREE_PAGE_SIZE_MAX) {
+ __wt_api_db_errx(db,
+ "all page sizes must less than or equal to %luMB",
+ (u_long)WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We only have 3 bytes of length for on-page items, so the maximum
+ * on-page item size is limited to 16MB.
+ */
+ if (db->intlitemsize > WT_ITEM_MAX_LEN)
+ db->intlitemsize = WT_ITEM_MAX_LEN;
+ if (db->leafitemsize > WT_ITEM_MAX_LEN)
+ db->leafitemsize = WT_ITEM_MAX_LEN;
+
+ /*
+ * By default, any duplicate set that reaches 25% of a leaf page is
+ * moved into its own separate tree.
+ */
+ if (db->btree_dup_offpage == 0)
+ db->btree_dup_offpage = 4;
+
+ /*
+ * A leaf page must hold at least 2 key/data pairs, otherwise the
+ * whole btree thing breaks down because we can't split. We have
+ * to include WT_DESC_SIZE in leaf page calculations, it's not
+ * strictly necessary in internal pages because page 0 is always
+ * a leaf page. The additional 10 bytes is for slop -- Berkeley DB
+ * took roughly a decade to get the calculation correct, and that
+ * way I can skip the suspense.
+ */
+#define WT_MINIMUM_DATA_SPACE(db, s) \
+ (((s) - (WT_PAGE_DISK_SIZE + WT_PAGE_DESC_SIZE + 10)) / 4)
+ if (db->intlitemsize > WT_MINIMUM_DATA_SPACE(db, db->intlmin)) {
+ __wt_api_db_errx(db,
+ "The internal page size is too small for its maximum item "
+ "size");
+ return (WT_ERROR);
+ }
+ if (db->leafitemsize > WT_MINIMUM_DATA_SPACE(db, db->leafmin)) {
+ __wt_api_db_errx(db,
+ "The leaf page size is too small for its maximum item "
+ "size");
+ return (WT_ERROR);
+ }
+
+ /*
+ * A fixed-size column store should be able to store at least 20
+ * objects on a page, otherwise it just doesn't make sense.
+ */
+ if (F_ISSET(idb, WT_COLUMN) &&
+ db->fixed_len != 0 && db->leafmin / db->fixed_len < 20) {
+ __wt_api_db_errx(db,
+ "The leaf page size cannot store at least 20 fixed-length "
+ "objects");
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_root_pin --
+ *	Read in the root page and pin it into memory.
+ *
+ *	Returns 0 on success, or the error from the page read.  On success
+ *	the root page is flagged WT_PINNED and the hazard reference taken by
+ *	__wt_page_in is released: the pin, not the hazard reference, is what
+ *	keeps the root in memory from then on.
+ */
+int
+__wt_root_pin(WT_TOC *toc)
+{
+	IDB *idb;
+
+	idb = toc->db->idb;
+
+	/* Get the root page (read it from disk if it isn't cached). */
+	WT_RET(__wt_page_in(toc, NULL, &idb->root_page, &idb->root_off, 0));
+
+	/* Pin it, then drop the now-redundant hazard reference. */
+	F_SET(idb->root_page.page, WT_PINNED);
+	__wt_hazard_clear(toc, idb->root_page.page);
+
+	return (0);
+}
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
new file mode 100644
index 00000000000..09eac77264b
--- /dev/null
+++ b/src/btree/bt_ovfl.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ovfl_in --
+ *	Read an overflow item from the disk.
+ *
+ *	toc   -- per-thread operation context
+ *	ovfl  -- overflow reference (file address and data byte count)
+ *	store -- caller's DBT; on success holds the overflow item's bytes
+ *
+ *	Returns 0 on success, else an error from allocation or the read.
+ */
+int
+__wt_ovfl_in(WT_TOC *toc, WT_OVFL *ovfl, DBT *store)
+{
+	DB *db;
+	ENV *env;
+	WT_STATS *stats;
+	uint32_t size;
+
+	env = toc->env;
+	db = toc->db;
+	stats = env->ienv->cache->stats;
+
+	/*
+	 * Read an overflow page, using an overflow structure from a page for
+	 * which we (better) have a hazard reference.
+	 *
+	 * Overflow reads are synchronous. That may bite me at some point, but
+	 * WiredTiger supports large page sizes, and overflow items should be
+	 * rare.
+	 */
+	WT_VERBOSE(env, WT_VERB_READ, (env,
+	    "overflow read addr/size %lu/%lu",
+	    (u_long)ovfl->addr, (u_long)ovfl->size));
+	WT_STAT_INCR(stats, OVERFLOW_READ);
+
+	/*
+	 * The only caller that wants a copy of the overflow pages (as opposed
+	 * to the contents of the overflow pages), is the verify code. For that
+	 * reason, it reads its own overflow pages, it doesn't call this code.
+	 *
+	 * But, we still have to verify the checksum, which means we have to
+	 * read the entire set of pages, then copy the interesting information
+	 * to the beginning of the buffer. The copy is a shift in a single
+	 * buffer and so should be fast, but it's still not a good thing. If
+	 * it ever becomes a problem, then we either have to pass the fact that
+	 * it's a "page" back to our caller and let them deal with the offset,
+	 * or add a new field to the DBT that flags the start of the allocated
+	 * buffer, instead of using the "data" field to indicate both the start
+	 * of the data and the start of the allocated memory.
+	 *
+	 * Re-allocate memory as necessary to hold the overflow pages.
+	 */
+	size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size);
+	if (store->mem_size < size)
+		WT_RET(__wt_realloc(env, &store->mem_size, size, &store->data));
+
+	/* Read the page. */
+	WT_RET(__wt_page_disk_read(toc, store->data, ovfl->addr, size));
+
+	/* Copy the actual data in the DBT down to the start of the data. */
+	(void)memmove(store->data,
+	    (uint8_t *)store->data + sizeof(WT_PAGE_DISK), ovfl->size);
+	store->size = ovfl->size;
+
+	return (0);
+}
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
new file mode 100644
index 00000000000..915d038751b
--- /dev/null
+++ b/src/btree/bt_page.c
@@ -0,0 +1,656 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static void __wt_page_inmem_col_fix(DB *, WT_PAGE *);
+static void __wt_page_inmem_col_int(WT_PAGE *);
+static void __wt_page_inmem_col_rle(DB *, WT_PAGE *);
+static void __wt_page_inmem_col_var(WT_PAGE *);
+static int __wt_page_inmem_dup_leaf(DB *, WT_PAGE *);
+static int __wt_page_inmem_int_ref(WT_TOC *, uint32_t, WT_PAGE *);
+static int __wt_page_inmem_row_int(DB *, WT_PAGE *);
+static int __wt_page_inmem_row_leaf(DB *, WT_PAGE *);
+
+/*
+ * __wt_page_in --
+ *	Acquire a hazard reference to a page; if the page is not in-memory,
+ *	read it from the disk and build an in-memory version.
+ *
+ *	toc        -- per-thread operation context
+ *	parent     -- the page's parent (NULL for the root page)
+ *	ref        -- the parent's reference to the wanted page
+ *	off        -- the page's file address/size pair
+ *	dsk_verify -- non-zero to verify the disk image after reading
+ *
+ *	Returns 0 with a hazard reference held on the page, or the error
+ *	returned by the read request (including WT_RESTART when the read
+ *	server's request table is full).
+ */
+int
+__wt_page_in(
+    WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	int ret;
+
+	env = toc->env;
+	cache = env->ienv->cache;
+
+	/*
+	 * Loop until we either hold a hazard reference on an in-memory
+	 * version of the page, or the read request fails.
+	 */
+	for (;;)
+		switch (ref->state) {
+		case WT_OK:
+			/*
+			 * The page is in memory: get a hazard reference, update
+			 * the page's LRU and return.
+			 */
+			if (__wt_hazard_set(toc, ref)) {
+				ref->page->read_gen = ++cache->read_gen;
+				return (0);
+			}
+			/* FALLTHROUGH */
+		case WT_EVICT:
+			/*
+			 * The page is being considered for eviction, wait for
+			 * that to resolve.
+			 */
+			__wt_yield();
+			break;
+		case WT_EMPTY:
+			/* The page isn't in memory, request it be read. */
+			__wt_cache_read_serial(
+			    toc, parent, ref, off, dsk_verify, ret);
+			if (ret != 0)
+				return (ret);
+			break;
+		default:
+			WT_ABORT(env, "WT_REF->state invalid");
+			break;
+		}
+	/* NOTREACHED */
+}
+
+/*
+ * __wt_page_inmem --
+ *	Build in-memory page information.
+ *
+ *	Sizes, allocates and fills in the per-page index arrays for the disk
+ *	image referenced by page->dsk.  Returns 0 on success; on failure the
+ *	partially-built in-memory structures are discarded and the error is
+ *	returned.
+ */
+int
+__wt_page_inmem(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	ENV *env;
+	WT_PAGE_DISK *dsk;
+	uint32_t nindx;
+	int ret;
+
+	db = toc->db;
+	env = toc->env;
+	dsk = page->dsk;
+	ret = 0;
+
+	/* The page must not already have in-memory information. */
+	WT_ASSERT(env, page->u.indx == NULL);
+
+	/* Determine the maximum number of indexes we'll need for this page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_LEAF:
+		nindx = dsk->u.entries;
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		/* Internal row pages store key/offpage-ref pairs. */
+		nindx = dsk->u.entries / 2;
+		break;
+	case WT_PAGE_ROW_LEAF:
+		/*
+		 * Row store leaf pages support duplicates, so the real worst
+		 * case is one key plus some number of duplicate data items.
+		 * The number is configurable, that is, you can configure when
+		 * a duplicate set is big enough to be pushed off the page;
+		 * we're conservative here.
+		 */
+		nindx = dsk->u.entries - 1;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * XXX
+	 * We don't yet have a free-list on which to put empty pages -- for
+	 * now, we handle them.
+	 */
+	if (nindx == 0)
+		return (0);
+
+	/* Allocate an array of WT_{ROW,COL}_INDX structures for the page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+		WT_ERR((__wt_calloc(env,
+		    nindx, sizeof(WT_COL), &page->u.icol)));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_ERR((__wt_calloc(env,
+		    nindx, sizeof(WT_ROW), &page->u.irow)));
+		break;
+	default:
+		/* Unknown types were already rejected by the first switch. */
+		break;
+	}
+
+	/* Allocate reference array for internal pages. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		WT_ERR(__wt_page_inmem_int_ref(toc, nindx, page));
+		break;
+	default:
+		break;
+	}
+
+	/* Fill in the structures. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+		__wt_page_inmem_col_fix(db, page);
+		break;
+	case WT_PAGE_COL_INT:
+		__wt_page_inmem_col_int(page);
+		break;
+	case WT_PAGE_COL_RLE:
+		__wt_page_inmem_col_rle(db, page);
+		break;
+	case WT_PAGE_COL_VAR:
+		__wt_page_inmem_col_var(page);
+		break;
+	case WT_PAGE_DUP_LEAF:
+		WT_ERR(__wt_page_inmem_dup_leaf(db, page));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		WT_ERR(__wt_page_inmem_row_int(db, page));
+		break;
+	case WT_PAGE_ROW_LEAF:
+		WT_ERR(__wt_page_inmem_row_leaf(db, page));
+		break;
+	default:
+		break;
+	}
+	return (0);
+
+err:	__wt_page_discard(toc, page);
+	return (ret);
+}
+
+/*
+ * __wt_page_inmem_col_fix --
+ *	Build in-memory index for fixed-length column-store leaf pages.
+ */
+static void
+__wt_page_inmem_col_fix(DB *db, WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_PAGE_DISK *dsk;
+	uint32_t n;
+	uint8_t *data;
+
+	dsk = page->dsk;
+
+	/*
+	 * The page holds nothing but fixed-length objects: walk it, pointing
+	 * one index slot at each object in page order.
+	 */
+	col = page->u.icol;
+	WT_FIX_FOREACH(db, dsk, data, n)
+		(col++)->data = data;
+
+	/* One record per object, one index entry per record. */
+	page->indx_count = page->records = dsk->u.entries;
+}
+
+/*
+ * __wt_page_inmem_col_int --
+ *	Build in-memory index for column-store internal pages.
+ */
+static void
+__wt_page_inmem_col_int(WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_OFF *off;
+	WT_PAGE_DISK *dsk;
+	uint64_t total;
+	uint32_t n;
+
+	dsk = page->dsk;
+	total = 0;
+
+	/*
+	 * The page holds WT_OFF structures, one per child subtree: point an
+	 * index slot at each one while summing the subtree record counts.
+	 */
+	col = page->u.icol;
+	WT_OFF_FOREACH(dsk, off, n) {
+		(col++)->data = off;
+		total += WT_RECORDS(off);
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = total;
+}
+
+/*
+ * __wt_page_inmem_col_rle --
+ *	Build in-memory index for fixed-length, run-length encoded,
+ *	column-store leaf pages.
+ */
+static void
+__wt_page_inmem_col_rle(DB *db, WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_PAGE_DISK *dsk;
+	uint64_t total;
+	uint32_t n;
+	uint8_t *data;
+
+	dsk = page->dsk;
+	total = 0;
+
+	/*
+	 * The page holds fixed-length objects, each preceded by a repeat
+	 * count: point an index slot at each object while summing the
+	 * counts to get the page's record total.
+	 */
+	col = page->u.icol;
+	WT_RLE_REPEAT_FOREACH(db, dsk, data, n) {
+		(col++)->data = data;
+		total += WT_RLE_REPEAT_COUNT(data);
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = total;
+}
+
+/*
+ * __wt_page_inmem_col_var --
+ *	Build in-memory index for variable-length, data-only leaf pages in
+ *	column-store trees.
+ */
+static void
+__wt_page_inmem_col_var(WT_PAGE *page)
+{
+	WT_COL *col;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	uint32_t n;
+
+	dsk = page->dsk;
+
+	/*
+	 * The page holds unsorted data items: on-page data (WT_ITEM_DATA),
+	 * overflow (WT_ITEM_DATA_OVFL) or deleted (WT_ITEM_DEL) items.
+	 * Point one index slot at each item in page order.
+	 */
+	col = page->u.icol;
+	WT_ITEM_FOREACH(dsk, item, n)
+		(col++)->data = item;
+
+	page->indx_count = page->records = dsk->u.entries;
+}
+
+/*
+ * __wt_page_inmem_dup_leaf --
+ *	Build in-memory index for variable-length, data-only leaf pages in
+ *	duplicate trees.
+ *
+ *	Returns 0 on success, or the error raised by WT_ILLEGAL_FORMAT for
+ *	an unexpected item type.
+ */
+static int
+__wt_page_inmem_dup_leaf(DB *db, WT_PAGE *page)
+{
+	WT_ROW *rip;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	uint32_t i;
+
+	dsk = page->dsk;
+
+	/*
+	 * Walk the page, building indices and finding the end of the page.
+	 * The page contains sorted data items. The data items are on-page
+	 * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DATA_DUP_OVFL) items.
+	 *
+	 * These data values are sorted, so we want to treat them as keys, and
+	 * we return them as on-page WT_ITEM values, so we want to treat them
+	 * as data. Set both the WT_ROW key and data fields.
+	 */
+	rip = page->u.irow;
+	WT_ITEM_FOREACH(dsk, item, i) {
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_DATA_DUP:
+			/* On-page item: key points at the item's bytes. */
+			__wt_key_set
+			    (rip, WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			/* Overflow item: flag the key for later processing. */
+			__wt_key_set_process(rip, item);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+		rip->data = item;
+		++rip;
+	}
+
+	page->indx_count = dsk->u.entries;
+	page->records = dsk->u.entries;
+	return (0);
+}
+
+/*
+ * __wt_page_inmem_row_int --
+ *	Build in-memory index for row-store and off-page duplicate tree
+ *	internal pages.
+ *
+ *	Returns 0 on success, or the error raised by WT_ILLEGAL_FORMAT for
+ *	an unexpected item type.
+ */
+static int
+__wt_page_inmem_row_int(DB *db, WT_PAGE *page)
+{
+	IDB *idb;
+	WT_ITEM *item;
+	WT_OFF *off;
+	WT_PAGE_DISK *dsk;
+	WT_ROW *rip;
+	uint64_t records;
+	uint32_t i;
+	void *huffman;
+
+	idb = db->idb;
+	dsk = page->dsk;
+	rip = page->u.irow;
+	records = 0;
+
+	/*
+	 * Duplicate internal pages hold data values, so they're encoded with
+	 * the data Huffman table; regular internal pages hold keys.
+	 */
+	huffman =
+	    dsk->type == WT_PAGE_DUP_INT ? idb->huffman_data : idb->huffman_key;
+
+	/*
+	 * Walk the page, building indices and finding the end of the page.
+	 *
+	 * The page contains sorted key/offpage-reference pairs. Keys are row
+	 * store internal pages with on-page/overflow (WT_ITEM_KEY/KEY_OVFL)
+	 * items, or row store duplicate internal pages with on-page/overflow
+	 * (WT_ITEM_KEY_DUP/WT_ITEM_KEY_DUP_OVFL) items. In both cases,
+	 * offpage references are WT_ITEM_OFF items.
+	 */
+	WT_ITEM_FOREACH(dsk, item, i)
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_KEY:
+		case WT_ITEM_KEY_DUP:
+			if (huffman == NULL) {
+				__wt_key_set(rip,
+				    WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+				break;
+			}
+			/* FALLTHROUGH */
+		case WT_ITEM_KEY_OVFL:
+		case WT_ITEM_KEY_DUP_OVFL:
+			/* Encoded or overflow keys need later processing. */
+			__wt_key_set_process(rip, item);
+			break;
+		case WT_ITEM_OFF:
+			/* The offpage reference completes the pair. */
+			off = WT_ITEM_BYTE_OFF(item);
+			records += WT_RECORDS(off);
+			rip->data = item;
+			++rip;
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+	page->indx_count = dsk->u.entries / 2;
+	page->records = records;
+	return (0);
+}
+
+/*
+ * __wt_page_inmem_row_leaf --
+ *	Build in-memory index for row-store leaf pages.
+ *
+ *	Returns 0 on success, or an allocation error, or the error raised
+ *	by WT_ILLEGAL_FORMAT for an unexpected item type.
+ */
+static int
+__wt_page_inmem_row_leaf(DB *db, WT_PAGE *page)
+{
+	ENV *env;
+	IDB *idb;
+	WT_ITEM *item;
+	WT_PAGE_DISK *dsk;
+	WT_REF *ref;
+	WT_ROW *rip;
+	uint32_t i, indx_count;
+	uint64_t records;
+
+	env = db->env;
+	idb = db->idb;
+	dsk = page->dsk;
+	records = 0;
+
+	/*
+	 * Walk a row-store page of WT_ITEMs, building indices and finding the
+	 * end of the page.
+	 *
+	 * The page contains key/data pairs. Keys are on-page (WT_ITEM_KEY) or
+	 * overflow (WT_ITEM_KEY_OVFL) items. The data sets are either: a
+	 * single on-page (WT_ITEM_DATA) or overflow (WT_ITEM_DATA_OVFL) item;
+	 * a group of duplicate data items where each duplicate is an on-page
+	 * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DATA_DUP_OVFL) item; or an
+	 * offpage reference (WT_ITEM_OFF).
+	 */
+	rip = NULL;
+	indx_count = 0;
+	WT_ITEM_FOREACH(dsk, item, i)
+		switch (WT_ITEM_TYPE(item)) {
+		case WT_ITEM_KEY:
+		case WT_ITEM_KEY_OVFL:
+			/* A key starts (or continues) the slot array. */
+			if (rip == NULL)
+				rip = page->u.irow;
+			else
+				++rip;
+			if (idb->huffman_key != NULL ||
+			    WT_ITEM_TYPE(item) == WT_ITEM_KEY_OVFL)
+				__wt_key_set_process(rip, item);
+			else
+				__wt_key_set(rip,
+				    WT_ITEM_BYTE(item), WT_ITEM_LEN(item));
+			++indx_count;
+			break;
+		case WT_ITEM_DATA_DUP:
+		case WT_ITEM_DATA_DUP_OVFL:
+			/*
+			 * If the second or subsequent duplicate, move to the
+			 * next slot and copy the previous key.
+			 */
+			if (rip->data != NULL) {
+				__wt_key_set(rip + 1, rip->key, rip->size);
+				++rip;
+				++indx_count;
+			}
+			/* FALLTHROUGH */
+		case WT_ITEM_DATA:
+		case WT_ITEM_DATA_OVFL:
+			rip->data = item;
+			++records;
+			break;
+		case WT_ITEM_OFF:
+			rip->data = item;
+			records += WT_ROW_OFF_RECORDS(rip);
+
+			/*
+			 * We need a WT_REF entry for any item referencing an
+			 * off-page duplicate tree. Create the array of WT_REF
+			 * pointers and fill in a WT_REF structure.
+			 */
+			if (page->u3.dup == NULL)
+				WT_RET(__wt_calloc(env, indx_count,
+				    sizeof(WT_REF *), &page->u3.dup));
+			WT_RET(__wt_calloc(env, 1, sizeof(WT_REF), &ref));
+			ref->state = WT_EMPTY;
+			page->u3.dup[WT_ROW_SLOT(page, rip)] = ref;
+
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+	page->indx_count = indx_count;
+	page->records = records;
+
+	return (0);
+}
+
+/*
+ * __wt_item_process --
+ *	Overflow and/or compressed on-page items need processing before
+ *	we look at them.
+ *
+ *	toc     -- per-thread operation context
+ *	item    -- the on-page WT_ITEM to process
+ *	dbt_ret -- caller's DBT; on success references the plain-text item
+ *
+ *	Returns 0 on success, else an error from the overflow read, memory
+ *	allocation or Huffman decoding.
+ */
+int
+__wt_item_process(WT_TOC *toc, WT_ITEM *item, DBT *dbt_ret)
+{
+	DB *db;
+	DBT *tmp;
+	ENV *env;
+	IDB *idb;
+	uint32_t size;
+	int ret;
+	void *huffman, *p;
+
+	db = toc->db;
+	tmp = NULL;
+	env = toc->env;
+	idb = db->idb;
+	ret = 0;
+
+	/*
+	 * 3 cases: compressed on-page item, or compressed or uncompressed
+	 * overflow item.
+	 *
+	 * The onpage/offpage labels below are jump targets inside the switch:
+	 * the key cases set the key Huffman table and jump over the data
+	 * cases, which fall through after setting the data Huffman table.
+	 */
+	switch (WT_ITEM_TYPE(item)) {
+	case WT_ITEM_KEY:
+		huffman = idb->huffman_key;
+		goto onpage;
+	case WT_ITEM_KEY_DUP:
+	case WT_ITEM_DATA:
+	case WT_ITEM_DATA_DUP:
+		huffman = idb->huffman_data;
+onpage:		p = WT_ITEM_BYTE(item);
+		size = WT_ITEM_LEN(item);
+		break;
+	case WT_ITEM_KEY_OVFL:
+		huffman = idb->huffman_key;
+		goto offpage;
+	case WT_ITEM_KEY_DUP_OVFL:
+	case WT_ITEM_DATA_OVFL:
+	case WT_ITEM_DATA_DUP_OVFL:
+		huffman = idb->huffman_data;
+offpage:	/*
+		 * It's an overflow item -- if it's not encoded, we can read
+		 * it directly into the user's return DBT, otherwise we have to
+		 * have our own buffer as temporary space, and the decode call
+		 * will put a decoded version into the user's return DBT.
+		 */
+		if (huffman == NULL)
+			tmp = dbt_ret;
+		else
+			WT_RET(__wt_scr_alloc(toc, 0, &tmp));
+		WT_RET(__wt_ovfl_in(toc, WT_ITEM_BYTE_OVFL(item), tmp));
+		p = tmp->data;
+		size = tmp->size;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * If the item is not compressed, and it's not an overflow item, copy
+	 * it into the caller's DBT. If the item is not compressed, and it's
+	 * an overflow item, it was already copied into the caller's DBT.
+	 *
+	 * If the item is compressed, pass it to the decode routines, they'll
+	 * copy a decoded version into the caller's DBT.
+	 */
+	if (huffman == NULL) {
+		if (tmp != dbt_ret) {
+			if (size > dbt_ret->mem_size)
+				WT_ERR(__wt_realloc(
+				    env, &dbt_ret->mem_size,
+				    size, &dbt_ret->data));
+			memcpy(dbt_ret->data, p, size);
+			dbt_ret->size = size;
+		}
+	} else
+		WT_ERR(__wt_huffman_decode(huffman, p, size,
+		    &dbt_ret->data, &dbt_ret->mem_size, &dbt_ret->size));
+
+	/* Release the scratch buffer, if we allocated one. */
+err:	if (tmp != NULL && tmp != dbt_ret)
+		__wt_scr_release(&tmp);
+
+	return (ret);
+}
+
+/*
+ * __wt_page_inmem_int_ref --
+ *	Allocate and initialize the reference array for internal pages.
+ */
+static int
+__wt_page_inmem_int_ref(WT_TOC *toc, uint32_t nindx, WT_PAGE *page)
+{
+	WT_REF *ref;
+	uint32_t slot;
+
+	/*
+	 * Allocate an array of WT_REF structures for internal pages. In the
+	 * case of an internal page, we know all of the slots are going to be
+	 * filled in -- every slot on the page references a subtree. In the
+	 * case of row-store leaf pages, the only slots that get filled in are
+	 * slots that reference off-page duplicate trees. So, if it's an
+	 * internal page, it's a simple one-time allocation; if a leaf page,
+	 * we'll do similar work, but lazily in the routine that fills in the
+	 * in-memory information.
+	 */
+	WT_RET(__wt_calloc(
+	    toc->env, nindx, sizeof(WT_REF), &page->u3.ref));
+
+	/* Every subtree reference starts out empty (not yet in memory). */
+	ref = page->u3.ref;
+	for (slot = 0; slot < nindx; ++slot)
+		ref[slot].state = WT_EMPTY;
+	return (0);
+}
+
+/*
+ * __wt_key_set --
+ *	Set a key/size pair, where the key does not require further processing.
+ *
+ *	NOTE(review): a non-zero size is what marks a key as fully processed
+ *	(see __wt_key_process), so callers must never pass size 0 here --
+ *	confirm zero-length keys are impossible.
+ */
+inline void
+__wt_key_set(WT_ROW *rip, void *key, uint32_t size)
+{
+	rip->key = key;
+	rip->size = size;
+}
+
+/*
+ * __wt_key_set_process --
+ *	Set a key/size pair, where the key requires further processing.
+ *
+ *	The key field holds the raw on-page WT_ITEM; size 0 is the
+ *	"needs processing" flag tested by __wt_key_process.
+ */
+inline void
+__wt_key_set_process(WT_ROW *rip, void *key)
+{
+	rip->key = key;
+	rip->size = 0;
+}
+
+/*
+ * __wt_key_process --
+ *	Return if a key requires processing.
+ */
+inline int
+__wt_key_process(WT_ROW *rip)
+{
+	/* A zero size marks a key still in its raw on-page form. */
+	if (rip->size == 0)
+		return (1);
+	return (0);
+}
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
new file mode 100644
index 00000000000..f7e594d2217
--- /dev/null
+++ b/src/btree/bt_read.c
@@ -0,0 +1,272 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_cache_read(WT_READ_REQ *);
+
+/*
+ * __wt_workq_read_server --
+ *	See if the read server thread needs to be awakened.
+ *
+ *	env   -- environment handle
+ *	force -- non-zero to wake the server even when reads are locked out
+ *		 (environment close, or a priority read is waiting)
+ */
+void
+__wt_workq_read_server(ENV *env, int force)
+{
+	WT_CACHE *cache;
+	uint64_t bytes_inuse, bytes_max;
+
+	cache = env->ienv->cache;
+
+	/*
+	 * If we're 10% over the maximum cache, shut out reads (which include
+	 * page allocations) until we evict to at least 5% under the maximum
+	 * cache. The idea is that we don't want to run on the edge all the
+	 * time -- if we're seriously out of space, get things under control
+	 * before opening up for more reads.
+	 */
+	bytes_inuse = __wt_cache_bytes_inuse(cache);
+	bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+	if (cache->read_lockout) {
+		/* Release the lockout at 95% of the maximum. */
+		if (bytes_inuse <= bytes_max - (bytes_max / 20))
+			cache->read_lockout = 0;
+	} else if (bytes_inuse > bytes_max + (bytes_max / 10)) {
+		/* Impose the lockout at 110% of the maximum. */
+		WT_VERBOSE(env, WT_VERB_READ, (env,
+		    "workQ locks out reads: bytes-inuse %llu of bytes-max %llu",
+		    (unsigned long long)bytes_inuse,
+		    (unsigned long long)bytes_max));
+		cache->read_lockout = 1;
+	}
+
+	/* If the cache read server is running, there's nothing to do. */
+	if (!cache->read_sleeping)
+		return;
+
+	/*
+	 * If reads are locked out and we're not forcing the issue (that's when
+	 * closing the environment, or if there's a priority read waiting to be
+	 * handled), we're done.
+	 */
+	if (!force && cache->read_lockout)
+		return;
+
+	/* Wake the server: clear its sleep flag and release its mutex. */
+	cache->read_sleeping = 0;
+	__wt_unlock(env, cache->mtx_read);
+}
+
+/*
+ * __wt_cache_read_serial_func --
+ *	Read/allocation serialization function called when a page-in requires
+ *	allocation or a read.
+ *
+ *	Returns 0 if the request was queued for the read server, WT_RESTART
+ *	if the request table is full and the caller should retry.
+ *
+ *	NOTE(review): the slot scan does no locking -- presumably this runs
+ *	serialized (it's invoked through __wt_cache_read_serial); confirm
+ *	against the workQ serialization code.
+ */
+int
+__wt_cache_read_serial_func(WT_TOC *toc)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	WT_OFF *off;
+	WT_PAGE *parent;
+	WT_READ_REQ *rr, *rr_end;
+	WT_REF *ref;
+	int dsk_verify;
+
+	/* Unpack the arguments packed by the serialization macro. */
+	__wt_cache_read_unpack(toc, parent, ref, off, dsk_verify);
+
+	env = toc->env;
+	cache = env->ienv->cache;
+
+	/* Find an empty slot and enter the read request. */
+	rr = cache->read_request;
+	rr_end = rr + WT_ELEMENTS(cache->read_request);
+	for (; rr < rr_end; ++rr)
+		if (WT_READ_REQ_ISEMPTY(rr)) {
+			WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify);
+			return (0);
+		}
+	__wt_api_env_errx(env, "read server request table full");
+	return (WT_RESTART);
+}
+
+/*
+ * __wt_cache_read_server --
+ *	Thread to do database reads.
+ *
+ *	arg is the ENV handle; returns NULL on thread exit.  Sleeps on the
+ *	cache's read mutex until awakened by the workQ, then drains the
+ *	read-request table, and exits when WT_SERVER_RUN is cleared or on a
+ *	serious error.
+ */
+void *
+__wt_cache_read_server(void *arg)
+{
+	ENV *env;
+	IENV *ienv;
+	WT_CACHE *cache;
+	WT_READ_REQ *rr, *rr_end;
+	WT_TOC *toc;
+	int didwork, ret;
+
+	env = arg;
+	ienv = env->ienv;
+	cache = ienv->cache;
+	ret = 0;
+
+	rr = cache->read_request;
+	rr_end = rr + WT_ELEMENTS(cache->read_request);
+
+	for (;;) {
+		WT_VERBOSE(env,
+		    WT_VERB_READ, (env, "cache read server sleeping"));
+		cache->read_sleeping = 1;
+		__wt_lock(env, cache->mtx_read);
+		WT_VERBOSE(
+		    env, WT_VERB_READ, (env, "cache read server waking"));
+
+		/*
+		 * Check for environment exit; do it here, instead of the top of
+		 * the loop because doing it here keeps us from doing a bunch of
+		 * work when simply awakened to quit.
+		 */
+		if (!F_ISSET(ienv, WT_SERVER_RUN))
+			break;
+
+		/*
+		 * Walk the read-request queue, looking for reads (defined by
+		 * a valid WT_TOC handle). If we find a read request, perform
+		 * it, flush the result and clear the request slot, then wake
+		 * up the requesting thread. The request slot clear doesn't
+		 * need to be flushed, but we have to flush the read result,
+		 * might as well include it. If we don't find any work, go to
+		 * sleep.
+		 */
+		do {
+			didwork = 0;
+			for (rr = cache->read_request; rr < rr_end; ++rr) {
+				if ((toc = rr->toc) == NULL)
+					continue;
+				/* Locked-out reads wait, priority reads run. */
+				if (cache->read_lockout &&
+				    !F_ISSET(toc, WT_READ_PRIORITY))
+					continue;
+
+				/*
+				 * The read server thread does both general file
+				 * allocation and cache page instantiation. In
+				 * a file allocation, there's no pagep field
+				 * in which to return a page.
+				 */
+				ret = __wt_cache_read(rr);
+
+				WT_READ_REQ_CLR(rr);
+				__wt_toc_serialize_wrapup(toc, NULL, ret);
+
+				didwork = 1;
+
+				/*
+				 * Any error terminates the request; a serious
+				 * error causes the read server to exit.
+				 */
+				if (ret != 0) {
+					if (ret != WT_RESTART)
+						goto err;
+					ret = 0;
+				}
+			}
+		} while (didwork);
+	}
+
+	/*
+	 * The err label sits inside the if body: normal exit skips the error
+	 * report (ret is 0), the goto jumps straight to it.
+	 */
+	if (ret != 0)
+err:		__wt_api_env_err(env, ret, "cache read server error");
+
+	WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting"));
+	return (NULL);
+}
+
+/*
+ * __wt_cache_read --
+ *	Read a page from the file.
+ *
+ *	Reads the page described by the read request, verifies it if asked,
+ *	builds its in-memory version and installs it in the parent's WT_REF.
+ *	Returns 0 on success (including the no-op case where another thread
+ *	already brought the page in), else an error with all memory freed.
+ */
+static int
+__wt_cache_read(WT_READ_REQ *rr)
+{
+	ENV *env;
+	WT_CACHE *cache;
+	WT_OFF *off;
+	WT_PAGE *page;
+	WT_PAGE_DISK *dsk;
+	WT_REF *ref;
+	WT_TOC *toc;
+	uint32_t addr, size;
+	int ret;
+
+	toc = rr->toc;
+	ref = rr->ref;
+	off = rr->off;
+	addr = off->addr;
+	size = off->size;
+
+	env = toc->env;
+	cache = env->ienv->cache;
+	dsk = NULL;
+	ret = 0;
+
+	/*
+	 * Check to see if some other thread brought the page into the cache
+	 * while our request was in the queue. If the state is anything
+	 * other than empty, it's not our problem.
+	 */
+	if (ref->state != WT_EMPTY)
+		return (0);
+
+	/*
+	 * The page isn't in the cache, and since we're the only path for the
+	 * page to get into the cache, we don't have to worry further, and
+	 * we might as well get to it.
+	 *
+	 * Allocate memory for the in-memory page information and for the page
+	 * itself. They're two separate allocation calls so we (hopefully) get
+	 * better alignment from the underlying heap memory allocator.
+	 */
+	WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page));
+	WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &dsk));
+
+	/* Read the page. */
+	WT_VERBOSE(env, WT_VERB_READ,
+	    (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size));
+
+	WT_ERR(__wt_page_disk_read(toc, dsk, addr, size));
+	WT_CACHE_PAGE_IN(cache, size);
+
+	/* If the page needs to be verified, that's next. */
+	if (rr->dsk_verify)
+		WT_ERR(__wt_verify_dsk_page(toc, dsk, addr, size));
+
+	/*
+	 * Fill in the WT_PAGE addr, size.
+	 * Reference the parent's WT_PAGE and parent's WT_OFF structures.
+	 * Reference the underlying disk page.
+	 */
+	page->addr = addr;
+	page->size = size;
+	page->parent = rr->parent;
+	page->parent_off = off;
+	page->dsk = dsk;
+
+	/* Build the in-memory version of the page. */
+	WT_ERR(__wt_page_inmem(toc, page));
+
+	/*
+	 * The page is now available -- set the LRU so the page is not selected
+	 * for eviction.
+	 */
+	page->read_gen = ++cache->read_gen;
+	ref->page = page;
+	ref->state = WT_OK;
+
+	return (0);
+
+	/*
+	 * Free the disk buffer through the local pointer, not page->dsk:
+	 * page->dsk isn't set until after the read and verify succeed, so
+	 * freeing page->dsk here would leak the buffer if either failed.
+	 * (page is always non-NULL at this point, its allocation uses
+	 * WT_RET, not WT_ERR.)
+	 */
+err:	if (dsk != NULL)
+		__wt_free(env, dsk, size);
+	__wt_free(env, page, sizeof(WT_PAGE));
+	return (ret);
+}
diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c
new file mode 100644
index 00000000000..7a57cfe4a97
--- /dev/null
+++ b/src/btree/bt_reconcile.c
@@ -0,0 +1,982 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_rle_expand_compare(const void *, const void *);
+static int __wt_rec_col_fix(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_int(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_rle(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_col_var(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_page_write(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_parent_update(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_row(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static int __wt_rec_row_int(WT_TOC *, WT_PAGE *, WT_PAGE *);
+static inline void __wt_rec_set_page_size(WT_TOC *, WT_PAGE *, uint8_t *);
+
+/*
+ * __wt_rec_set_page_size --
+ * Set the page's size to the minimum number of allocation units.
+ */
+static inline void
+__wt_rec_set_page_size(WT_TOC *toc, WT_PAGE *page, uint8_t *first_free)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /*
+ * Set the page's size to the minimum number of allocation units needed
+ * (the page size can either grow or shrink).
+ *
+ * Set the page size before verifying the page, the verification code
+ * checks for entries that extend past the end of the page, and expects
+ * the WT_PAGE->size field to be valid.
+ */
+ page->size = WT_ALIGN(first_free - (uint8_t *)page->dsk, db->allocsize);
+}
+
+/*
+ * __wt_page_reconcile --
+ * Format an in-memory page to its on-disk format, and write it.
+ */
+int
+__wt_page_reconcile(WT_TOC *toc, WT_PAGE *page)
+{
+ DB *db;
+ DBT *tmp;
+ ENV *env;
+ WT_PAGE *new, _new;
+ WT_PAGE_DISK *dsk;
+ uint32_t max;
+ int ret;
+
+ db = toc->db;
+ tmp = NULL;
+ env = toc->env;
+ dsk = page->dsk;
+
+ /* If the page isn't dirty, we should never have been called. */
+ WT_ASSERT(env, WT_PAGE_IS_MODIFIED(page));
+
+ WT_VERBOSE(env, WT_VERB_EVICT,
+ (env, "reconcile addr %lu (page %p, type %s)",
+ (u_long)page->addr, page, __wt_page_type_string(dsk)));
+
+ /*
+ * Update the disk generation before reading the page. The workQ will
+ * update the write generation after it makes a change, and if we have
+ * different disk and write generation numbers, the page may be dirty.
+ * We technically requires a flush (the eviction server might run on a
+ * different core before a flush naturally occurred).
+ */
+ WT_PAGE_DISK_WRITE(page);
+ WT_MEMORY_FLUSH;
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Fixed-width pages without run-length encoding cannot change
+ * size.
+ */
+ max = page->size;
+ break;
+ case WT_PAGE_COL_RLE:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Other leaf page types can grow, allocate the maximum leaf
+ * page size.
+ */
+ max = db->leafmax;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * All internal page types can grow, allocate the maximum
+ * internal page size.
+ */
+ max = db->intlmax;
+ break;
+ case WT_PAGE_OVFL:
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ /*
+ * Initialize a WT_PAGE page on the stack and allocate a scratch buffer
+ * for its contents. We use two pieces of memory because we want the
+ * page contents to be aligned for direct I/O. The WT_PAGE structure
+ * is relatively small, the stack is fine.
+ */
+ WT_CLEAR(_new);
+ new = &_new;
+ WT_ERR(__wt_scr_alloc(toc, max, &tmp));
+ memset(tmp->data, 0, max);
+ new->addr = page->addr;
+ new->size = max;
+ new->dsk = tmp->data;
+ new->dsk->start_recno = dsk->start_recno;
+ new->dsk->type = dsk->type;
+ new->dsk->level = dsk->level;
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ WT_ERR(__wt_rec_col_fix(toc, page, new));
+ break;
+ case WT_PAGE_COL_RLE:
+ WT_ERR(__wt_rec_col_rle(toc, page, new));
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_rec_col_var(toc, page, new));
+ break;
+ case WT_PAGE_COL_INT:
+ WT_ERR(__wt_rec_col_int(toc, page, new));
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__wt_rec_row_int(toc, page, new));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_DUP_LEAF:
+ WT_ERR(__wt_rec_row(toc, page, new));
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ /* Write the new page to disk. */
+ WT_ERR(__wt_rec_page_write(toc, page, new));
+
+ /* Free the original page -- update the address and size. */
+ WT_ERR(__wt_file_free(toc, page->addr, page->size));
+
+ /*
+ * Update the backing address.
+ *
+ * XXX
+ * This is more for diagnostic information than anything else, that is,
+ * this will match the WT_REF->addr in the parent.
+ *
+ * The parent's WT_REF->size may be different, that is, page->size is
+ * the original page size at the original address and the size of the
+ * page's buffer in memory, NOT the size of the newly written page at
+ * the new address. We may NOT update the size here, otherwise we
+ * can no longer figure out if WT_ROW/WT_COL items reference on-page
+ * data vs. allocated data.
+ */
+ page->addr = new->addr;
+
+err: if (tmp != NULL)
+ __wt_scr_release(&tmp);
+
+ return (ret);
+}
+
+/*
+ * __wt_rec_col_int --
+ * Reconcile a column store internal page.
+ */
+static int
+__wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ WT_COL *cip;
+ WT_OFF *from;
+ WT_PAGE_DISK *dsk;
+ WT_REPL *repl;
+ uint32_t i, space_avail;
+ uint8_t *first_free;
+
+ dsk = new->dsk;
+ __wt_set_ff_and_sa_from_offset(
+ new, WT_PAGE_BYTE(new), &first_free, &space_avail);
+
+ WT_INDX_FOREACH(page, cip, i) {
+ if ((repl = WT_COL_REPL(page, cip)) != NULL)
+ from = WT_REPL_DATA(repl);
+ else
+ from = cip->data;
+
+ /*
+ * XXX
+ * We don't yet handle splits: we allocated the maximum page
+ * size, but it still wasn't enough. We must allocate another
+ * page and split the parent.
+ */
+ if (sizeof(WT_OFF) > space_avail) {
+ fprintf(stderr,
+ "__wt_rec_col_int: page %lu split\n",
+ (u_long)page->addr);
+ __wt_abort(toc->env);
+ }
+
+ memcpy(first_free, from, sizeof(WT_OFF));
+ first_free += sizeof(WT_OFF);
+ space_avail -= sizeof(WT_OFF);
+ ++dsk->u.entries;
+ }
+
+ new->records = page->records;
+ __wt_rec_set_page_size(toc, new, first_free);
+
+ return (0);
+}
+
/*
 * __wt_rec_row_int --
 *	Reconcile a row store, or off-page duplicate tree, internal page.
 */
static int
__wt_rec_row_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	WT_ITEM *key_item, *data_item, *next;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	WT_ROW *rip;
	uint32_t i, len, space_avail;
	uint8_t *first_free;

	dsk = new->dsk;
	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We have to walk both the WT_ROW structures as well as the original
	 * page: the problem is keys that require processing.  When a page is
	 * read into memory from a simple database, the WT_ROW key/size pair
	 * is set to reference an on-page group of bytes in the key's WT_ITEM
	 * structure.  As Btree keys are immutable, that original WT_ITEM is
	 * usually what we want to write, and we can pretty easily find it by
	 * moving to immediately before the on-page key.
	 *
	 * Keys that require processing are harder (for example, a Huffman
	 * encoded key).  When we have to use a key that requires processing,
	 * we process the key and set the WT_ROW key/size pair to reference
	 * the allocated memory that holds the key.  At that point we've lost
	 * any reference to the original WT_ITEM structure, which is what we
	 * want to re-write when reconciling the page.  We don't want to make
	 * the WT_ROW structure bigger by another sizeof(void *) bytes, so we
	 * walk the original page at the same time we walk the WT_PAGE array
	 * when reconciling the page so we can find the original WT_ITEM.
	 */
	key_item = WT_PAGE_BYTE(page);
	WT_INDX_FOREACH(page, rip, i) {
		/*
		 * Copy the paired items off the old page into the new page; if
		 * the page has been replaced, update its information.
		 *
		 * XXX
		 * Internal pages can't grow, yet, so we could more easily just
		 * update the old page.  We do the copy because eventually we
		 * will have to split the internal pages, and they'll be able to
		 * grow.
		 */
		data_item = WT_ITEM_NEXT(key_item);
		/* Replacement WT_OFFs are patched over the on-page copy. */
		if ((repl = WT_ROW_REPL(page, rip)) != NULL)
			memcpy(WT_ITEM_BYTE(data_item),
			    WT_REPL_DATA(repl), sizeof(WT_OFF));
		next = WT_ITEM_NEXT(data_item);
		/* len covers the key item plus its paired data item. */
		len = (uint32_t)((uint8_t *)next - (uint8_t *)key_item);

		/*
		 * XXX
		 * We don't yet handle splits: we allocated the maximum page
		 * size, but it still wasn't enough.  We must allocate another
		 * page and split the parent.
		 */
		if (len > space_avail) {
			fprintf(stderr,
			    "__wt_rec_row_int: page %lu split\n",
			    (u_long)page->addr);
			__wt_abort(toc->env);
		}

		/* Copy the key/data item pair as one contiguous chunk. */
		memcpy(first_free, key_item, len);
		first_free += len;
		space_avail -= len;
		++dsk->u.entries;

		key_item = next;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	return (0);
}
+
/*
 * __wt_rec_col_fix --
 *	Reconcile a fixed-width, column-store leaf page (does not handle
 *	run-length encoding).
 */
static int
__wt_rec_col_fix(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	DB *db;
	DBT *tmp;
	ENV *env;
	WT_COL *cip;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	uint32_t i, len, space_avail;
	uint8_t *data, *first_free;
	int ret;

	db = toc->db;
	tmp = NULL;
	env = toc->env;
	dsk = new->dsk;
	ret = 0;

	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We need a "deleted" data item to store on the page.  Make sure the
	 * WT_TOC's scratch buffer is big enough.  Clear the buffer's contents
	 * and set the delete flag.
	 */
	len = db->fixed_len;
	WT_ERR(__wt_scr_alloc(toc, len, &tmp));
	memset(tmp->data, 0, len);
	WT_FIX_DELETE_SET(tmp->data);

	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a reference to the data, on- or off- page, and see if
		 * it's been deleted.
		 */
		if ((repl = WT_COL_REPL(page, cip)) != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				data = tmp->data;	/* Replaced deleted */
			else				/* Replaced data */
				data = WT_REPL_DATA(repl);
		} else if (WT_FIX_DELETE_ISSET(cip->data))
			data = tmp->data;		/* On-page deleted */
		else
			data = cip->data;		/* On-page data */

		/*
		 * When reconciling a fixed-width page that doesn't support
		 * run-length encoding, the on-page information can't change
		 * size -- there's no reason to ever split such a page.
		 */
		WT_ASSERT(env, len <= space_avail);

		/* Append the fixed-width entry to the new page. */
		memcpy(first_free, data, len);
		first_free += len;
		space_avail -= len;
		++dsk->u.entries;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

err:	if (tmp != NULL)
		__wt_scr_release(&tmp);
	return (ret);
}
+
/*
 * __wt_rec_col_rle --
 *	Reconcile a fixed-width, run-length encoded, column-store leaf page.
 */
static int
__wt_rec_col_rle(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	DB *db;
	DBT *tmp;
	ENV *env;
	WT_COL *cip;
	WT_PAGE_DISK *dsk;
	WT_RLE_EXPAND *exp, **expsort, **expp;
	WT_REPL *repl;
	uint64_t recno;
	uint32_t i, len, n_expsort, space_avail;
	uint16_t n, nrepeat, repeat_count;
	uint8_t *data, *first_free, *last_data;
	int from_repl, ret;

	db = toc->db;
	tmp = NULL;
	env = toc->env;
	expsort = NULL;
	dsk = new->dsk;
	n_expsort = 0;		/* Necessary for the sort function */
	last_data = NULL;	/* No previous entry to merge runs into yet */
	ret = 0;

	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We need a "deleted" data item to store on the page.  Make sure the
	 * WT_TOC's scratch buffer is big enough.  Clear the buffer's contents
	 * and set the delete flag.
	 */
	len = db->fixed_len + sizeof(uint16_t);
	WT_ERR(__wt_scr_alloc(toc, len, &tmp));
	memset(tmp->data, 0, len);
	WT_RLE_REPEAT_COUNT(tmp->data) = 1;
	WT_FIX_DELETE_SET(WT_RLE_REPEAT_DATA(tmp->data));

	/* Set recno to the first record on the page. */
	recno = page->dsk->start_recno;
	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a sorted list of any expansion entries we've created for
		 * this set of records.  The sort function returns a NULL-
		 * terminated array of references to WT_RLE_EXPAND structures,
		 * sorted by record number.
		 */
		WT_ERR(__wt_rle_expand_sort(
		    env, page, cip, &expsort, &n_expsort));

		/*
		 * Generate entries for the new page: loop through the repeat
		 * records, checking for WT_RLE_EXPAND entries that match the
		 * current record number.
		 */
		nrepeat = WT_RLE_REPEAT_COUNT(cip->data);
		for (expp = expsort, n = 1;
		    n <= nrepeat; n += repeat_count, recno += repeat_count) {
			from_repl = 0;
			if ((exp = *expp) != NULL && recno == exp->recno) {
				++expp;

				/* Use the WT_RLE_EXPAND's WT_REPL field. */
				repl = exp->repl;
				if (WT_REPL_DELETED_ISSET(repl))
					data = tmp->data;
				else {
					from_repl = 1;
					data = WT_REPL_DATA(repl);
				}
				/* An expansion entry covers one record. */
				repeat_count = 1;
			} else {
				if (WT_FIX_DELETE_ISSET(cip->data))
					data = tmp->data;
				else
					data = cip->data;
				/*
				 * The repeat count is the number of records
				 * up to the next WT_RLE_EXPAND record, or
				 * up to the end of this entry if we have no
				 * more WT_RLE_EXPAND records.
				 */
				if (exp == NULL)
					repeat_count = (nrepeat - n) + 1;
				else
					repeat_count =
					    (uint16_t)(exp->recno - recno);
			}

			/*
			 * In all cases, check the last entry written on the
			 * page to see if it's identical, and increment its
			 * repeat count where possible (the count is bounded
			 * by the uint16_t repeat field).
			 */
			if (last_data != NULL &&
			    memcmp(WT_RLE_REPEAT_DATA(last_data),
			    WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 &&
			    WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) {
				WT_RLE_REPEAT_COUNT(last_data) += repeat_count;
				continue;
			}

			/*
			 * XXX
			 * We don't yet handle splits: we allocated the maximum
			 * leaf page size, but it still wasn't enough.  We must
			 * allocate another leaf page and split the parent.
			 */
			if (len > space_avail) {
				fprintf(stderr,
				    "__wt_rec_col_rle: page %lu split\n",
				    (u_long)page->addr);
				__wt_abort(env);
			}

			/*
			 * Most of the formats already include a repeat count:
			 * specifically the deleted buffer, or any entry we're
			 * copying from the original page.  However, entries
			 * that were deleted or replaced are read from a WT_REPL
			 * structure, which has no repeat count.
			 */
			last_data = first_free;
			if (from_repl) {
				WT_RLE_REPEAT_COUNT(last_data) = repeat_count;
				memcpy(WT_RLE_REPEAT_DATA(
				    last_data), data, db->fixed_len);
			} else
				memcpy(last_data, data, len);
			first_free += len;
			space_avail -= len;
			++dsk->u.entries;
		}
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	/* Free the sort array. */
err:	if (expsort != NULL)
		__wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *));

	if (tmp != NULL)
		__wt_scr_release(&tmp);

	return (ret);
}
+
+/*
+ * __wt_rle_expand_compare --
+ * Qsort function: sort WT_RLE_EXPAND structures based on the record
+ * offset, in ascending order.
+ */
+static int
+__wt_rle_expand_compare(const void *a, const void *b)
+{
+ WT_RLE_EXPAND *a_exp, *b_exp;
+
+ a_exp = *(WT_RLE_EXPAND **)a;
+ b_exp = *(WT_RLE_EXPAND **)b;
+
+ return (a_exp->recno > b_exp->recno ? 1 : 0);
+}
+
/*
 * __wt_rle_expand_sort --
 *	Return the current on-page index's array of WT_RLE_EXPAND structures,
 *	sorted by record offset.
 */
int
__wt_rle_expand_sort(ENV *env,
    WT_PAGE *page, WT_COL *cip, WT_RLE_EXPAND ***expsortp, uint32_t *np)
{
	WT_RLE_EXPAND *exp;
	uint16_t n;

	/* Figure out how big the array needs to be. */
	for (n = 0,
	    exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n)
		;

	/*
	 * Allocate that big an array -- always allocate at least one slot,
	 * our caller expects NULL-termination.
	 *
	 * The array and its size (*np) are owned by the caller and reused
	 * across calls; grow it -- with a few slots of slop to avoid
	 * reallocating on every call -- only when the current allocation
	 * can't hold n entries plus the NULL terminator.
	 */
	if (n >= *np) {
		if (*expsortp != NULL)
			__wt_free(
			    env, *expsortp, *np * sizeof(WT_RLE_EXPAND *));
		WT_RET(__wt_calloc(
		    env, n + 10, sizeof(WT_RLE_EXPAND *), expsortp));
		*np = n + 10;
	}

	/* Enter the WT_RLE_EXPAND structures into the array. */
	for (n = 0,
	    exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n)
		(*expsortp)[n] = exp;

	/* Sort the entries. */
	if (n != 0)
		qsort(*expsortp, (size_t)n,
		    sizeof(WT_RLE_EXPAND *), __wt_rle_expand_compare);

	/* NULL-terminate the array. */
	(*expsortp)[n] = NULL;

	return (0);
}
+
/*
 * __wt_rec_col_var --
 *	Reconcile a variable-width column-store leaf page.
 */
static int
__wt_rec_col_var(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc;
	DBT *data, data_dbt;
	WT_COL *cip;
	WT_ITEM data_item;
	WT_OVFL data_ovfl;
	WT_PAGE_DISK *dsk;
	WT_REPL *repl;
	uint32_t i, len, space_avail;
	uint8_t *first_free;

	dsk = new->dsk;
	/* Position first_free/space_avail just past the new page's header. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	WT_CLEAR(data_dbt);
	WT_CLEAR(data_item);
	data = &data_dbt;

	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a reference to the data: it's either a replacement value
		 * or the original on-page item.
		 */
		if ((repl = WT_COL_REPL(page, cip)) != NULL) {
			/*
			 * Check for deletion, else build the data's WT_ITEM
			 * chunk from the most recent replacement value.
			 */
			if (WT_REPL_DELETED_ISSET(repl)) {
				WT_CLEAR(data_item);
				WT_ITEM_SET(&data_item, WT_ITEM_DEL, 0);
				len = WT_ITEM_SPACE_REQ(0);
			} else {
				data->data = WT_REPL_DATA(repl);
				data->size = repl->size;
				WT_RET(__wt_item_build_data(
				    toc, data, &data_item, &data_ovfl, 0));
				len = WT_ITEM_SPACE_REQ(data->size);
			}
			data_loc = DATA_OFF_PAGE;
		} else {
			/* On-page items are copied verbatim, header included. */
			data->data = cip->data;
			data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(cip->data));
			len = data->size;
			data_loc = DATA_ON_PAGE;
		}

		/*
		 * XXX
		 * We don't yet handle splits -- we allocated the maximum leaf
		 * page size, but it still wasn't enough.  We must allocate
		 * another leaf page and split the parent.
		 */
		if (len > space_avail) {
			fprintf(stderr,
			    "__wt_rec_col_var: page %lu split\n",
			    (u_long)page->addr);
			__wt_abort(toc->env);
		}

		switch (data_loc) {
		case DATA_ON_PAGE:
			/* The on-page item already includes its WT_ITEM header. */
			memcpy(first_free, data->data, data->size);
			first_free += data->size;
			space_avail -= data->size;
			break;
		case DATA_OFF_PAGE:
			/* Write the built WT_ITEM header, then the data bytes. */
			memcpy(first_free, &data_item, sizeof(data_item));
			memcpy(first_free +
			    sizeof(data_item), data->data, data->size);
			first_free += len;
			space_avail -= len;
			/* No break needed: last case in the switch. */
		}
		++dsk->u.entries;
	}

	/* Carry the record count forward, then size the new page. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	return (0);
}
+
+/*
+ * __wt_rec_row --
+ * Reconcile a row-store leaf page.
+ */
+static int
+__wt_rec_row(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc;
+ enum { KEY_ON_PAGE, KEY_NONE } key_loc;
+ DB *db;
+ DBT *key, key_dbt, *data, data_dbt;
+ WT_ITEM key_item, data_item, *item;
+ WT_OVFL data_ovfl;
+ WT_PAGE_DISK *dsk;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ uint32_t i, len, space_avail, type;
+ uint8_t *first_free;
+
+ db = toc->db;
+ dsk = new->dsk;
+ __wt_set_ff_and_sa_from_offset(
+ new, WT_PAGE_BYTE(new), &first_free, &space_avail);
+
+ WT_CLEAR(data_dbt);
+ WT_CLEAR(key_dbt);
+ WT_CLEAR(data_item);
+ WT_CLEAR(key_item);
+
+ key = &key_dbt;
+ data = &data_dbt;
+
+ /*
+ * Walk the page, accumulating key/data groups (groups, because a key
+ * can reference a duplicate data set).
+ *
+ * We have to walk both the WT_ROW structures as well as the original
+ * page: the problem is keys that require processing. When a page is
+ * read into memory from a simple database, the WT_ROW key/size pair
+ * is set to reference an on-page group of bytes in the key's WT_ITEM
+ * structure. As Btree keys are immutable, that original WT_ITEM is
+ * usually what we want to write, and we can pretty easily find it by
+ * moving to immediately before the on-page key.
+ *
+ * Keys that require processing are harder (for example, a Huffman
+ * encoded key). When we have to use a key that requires processing,
+ * we process the key and set the WT_ROW key/size pair to reference
+ * the allocated memory that holds the key. At that point we've lost
+ * any reference to the original WT_ITEM structure, which is what we
+ * want to re-write when reconciling the page. We don't want to make
+ * the WT_ROW structure bigger by another sizeof(void *) bytes, so we
+ * walk the original page at the same time we walk the WT_PAGE array
+ * when reconciling the page so we can find the original WT_ITEM.
+ */
+ item = NULL;
+ WT_INDX_FOREACH(page, rip, i) {
+ /* Move to the next key on the original page. */
+ if (item == NULL)
+ item = (WT_ITEM *)WT_PAGE_BYTE(page);
+ else
+ do {
+ item = WT_ITEM_NEXT(item);
+ } while (WT_ITEM_TYPE(item) != WT_ITEM_KEY &&
+ WT_ITEM_TYPE(item) != WT_ITEM_KEY_OVFL);
+
+ /*
+ * Get a reference to the data. We get the data first because
+ * it may have been deleted, in which case we ignore the pair.
+ */
+ if ((repl = WT_ROW_REPL(page, rip)) != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ continue;
+
+ /*
+ * Build the data's WT_ITEM chunk from the most recent
+ * replacement value.
+ */
+ data->data = WT_REPL_DATA(repl);
+ data->size = repl->size;
+ WT_RET(__wt_item_build_data(
+ toc, data, &data_item, &data_ovfl, 0));
+ data_loc = DATA_OFF_PAGE;
+ } else {
+ /* Copy the item off the page. */
+ data->data = rip->data;
+ data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(rip->data));
+ data_loc = DATA_ON_PAGE;
+ }
+
+ /*
+ * Check if the key is a duplicate (the key preceding it on the
+ * page references the same information). We don't store the
+ * key for the second and subsequent data items in duplicated
+ * groups.
+ */
+ if (WT_ROW_INDX_IS_DUPLICATE(page, rip)) {
+ type = data_loc == DATA_ON_PAGE ?
+ WT_ITEM_TYPE(rip->data) : WT_ITEM_TYPE(&data_item);
+ switch (type) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ type = WT_ITEM_DATA_DUP;
+ break;
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ type = WT_ITEM_DATA_DUP_OVFL;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ if (data_loc == DATA_ON_PAGE)
+ WT_ITEM_SET_TYPE(rip->data, type);
+ else
+ WT_ITEM_SET_TYPE(&data_item, type);
+ key_loc = KEY_NONE;
+ } else {
+ /* Take the key's WT_ITEM from the original page. */
+ key->data = item;
+ key->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(item));
+ key_loc = KEY_ON_PAGE;
+ }
+
+ len = 0;
+ switch (key_loc) {
+ case KEY_ON_PAGE:
+ len = key->size;
+ break;
+ case KEY_NONE:
+ break;
+ }
+ switch (data_loc) {
+ case DATA_OFF_PAGE:
+ len += WT_ITEM_SPACE_REQ(data->size);
+ break;
+ case DATA_ON_PAGE:
+ len += data->size;
+ break;
+ }
+
+ /*
+ * XXX
+ * We don't yet handle splits -- we allocated the maximum leaf
+ * page size, but it still wasn't enough. We must allocate
+ * another leaf page and split the parent.
+ */
+ if (len > space_avail) {
+ fprintf(stderr, "__wt_rec_row: page %lu split\n",
+ (u_long)page->addr);
+ __wt_abort(toc->env);
+ }
+
+ switch (key_loc) {
+ case KEY_ON_PAGE:
+ memcpy(first_free, key->data, key->size);
+ first_free += key->size;
+ space_avail -= key->size;
+ ++dsk->u.entries;
+ break;
+ case KEY_NONE:
+ break;
+ }
+ switch (data_loc) {
+ case DATA_ON_PAGE:
+ memcpy(first_free, data->data, data->size);
+ first_free += data->size;
+ space_avail -= data->size;
+ ++dsk->u.entries;
+ break;
+ case DATA_OFF_PAGE:
+ memcpy(first_free, &data_item, sizeof(data_item));
+ memcpy(first_free +
+ sizeof(WT_ITEM), data->data, data->size);
+ first_free += WT_ITEM_SPACE_REQ(data->size);
+ space_avail -= WT_ITEM_SPACE_REQ(data->size);
+ ++dsk->u.entries;
+ break;
+ }
+ }
+
+ __wt_rec_set_page_size(toc, new, first_free);
+
+ return (0);
+}
+
+/*
+ * __wt_rec_page_write --
+ * Write a newly reconciled page.
+ */
+static int
+__wt_rec_page_write(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ ENV *env;
+ int ret;
+
+ env = toc->env;
+
+ /*
+ * XXX
+ * We fail if the page gets emptied -- we'll need to do some kind of
+ * reverse split where the internal page disappears. That shouldn't
+ * be difficult, but I haven't written it yet.
+ */
+ if (new->dsk->u.entries == 0) {
+ new->addr = WT_ADDR_INVALID;
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "reconcile removing empty page %lu", (u_long)page->addr));
+ fprintf(stderr, "PAGE %lu EMPTIED\n", (u_long)page->addr);
+ __wt_abort(env);
+ } else {
+ /*
+ * Allocate file space for the page.
+ *
+ * The cache eviction server is the only thread allocating space
+ * from the file, so there's no need to do any serialization.
+ */
+ WT_RET(__wt_file_alloc(toc, &new->addr, new->size));
+
+ /*
+ * Write the page to disk.
+ *
+ * !!!
+ * This is safe for now, but it's a problem when we switch to
+ * asynchronous I/O: the scenario is (1) schedule the write,
+ * (2) discard the newly-clean in-memory version, (3) another
+ * thread tries to read down the tree before the write finishes.
+ */
+ WT_RET(__wt_page_write(toc, new));
+
+ WT_VERBOSE(env, WT_VERB_EVICT,
+ (env, "reconcile move %lu to %lu, resize %lu to %lu",
+ (u_long)page->addr, (u_long)new->addr,
+ (u_long)page->size, (u_long)new->size));
+ }
+
+ /* Update the page's parent. */
+ if ((ret = __wt_rec_parent_update(toc, page, new)) != 0) {
+ (void)__wt_file_free(toc, new->addr, new->size);
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_rec_parent_update --
+ * Update a parent page's reference when a page is reconciled.
+ */
+static int
+__wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
+{
+ IDB *idb;
+ WT_OFF *parent_off;
+
+ idb = toc->db->idb;
+
+ /*
+ * If we're writing the root of the tree, then we have to update the
+ * descriptor record, there's no parent to update.
+ */
+ if (page->addr == idb->root_off.addr) {
+ idb->root_off.addr = new->addr;
+ idb->root_off.size = new->size;
+ return (__wt_desc_write(toc));
+ }
+
+ /*
+ * Update the relevant WT_OFF structure. There are two memory locations
+ * that change (address and size), and we could race, but that's not a
+ * problem. Only a single thread ever reconciles a page at a time, and
+ * pages cannot leave memory while they have children.
+ */
+ parent_off = page->parent_off;
+ WT_RECORDS(parent_off) = new->records;
+ parent_off->addr = new->addr;
+ parent_off->size = new->size;
+
+ /*
+ * Mark the parent page as dirty.
+ *
+ * There's no chance we need to flush this write -- the eviction thread
+ * is the only thread that eventually cares if the page is dirty or not,
+ * and it's our update that's making it dirty. (The workQ thread does
+ * have to flush its set-modified update, of course).
+ *
+ * We don't care if we race with the workQ; if the workQ thread races
+ * with us, the page will still be marked dirty and that's all we care
+ * about.
+ */
+ WT_PAGE_SET_MODIFIED(page->parent);
+
+ return (0);
+}
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
new file mode 100644
index 00000000000..8cdf8d90ce1
--- /dev/null
+++ b/src/btree/bt_ret.c
@@ -0,0 +1,179 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dbt_return --
+ * Retrun a WT_PAGE/WT_{ROW,COL}_INDX pair to the application.
+ */
+int
+__wt_dbt_return(WT_TOC *toc, DBT *key, DBT *data, int key_return)
+{
+ DB *db;
+ DBT local_key, local_data;
+ ENV *env;
+ IDB *idb;
+ WT_COL *cip;
+ WT_ITEM *item;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ void *data_ret;
+ uint32_t size_ret;
+ int (*callback)(DB *, DBT *, DBT *), ret;
+
+ db = toc->db;
+ env = toc->env;
+ idb = db->idb;
+ callback = data->callback;
+ ret = 0;
+
+ page = toc->srch_page;
+ dsk = page->dsk;
+ cip = toc->srch_ip;
+ rip = toc->srch_ip;
+ repl = toc->srch_repl;
+
+ /*
+ * Handle the key item -- the key may be unchanged, in which case we
+ * don't touch it, it's already correct.
+ *
+ * If the key/data items are being passed to a callback routine and
+ * there's nothing special about them (they aren't uninstantiated
+ * overflow or compressed items), then give the callback a pointer to
+ * the on-page data. (We use a local DBT in this case, so we don't
+ * touch potentially allocated application DBT memory.) Else, copy
+ * the items into the application's DBTs.
+ *
+ * If the key/data item are uninstantiated overflow and/or compressed
+ * items, they require processing before being copied into the DBTs.
+ * Don't allocate WT_INDX memory for key/data items here. (We never
+ * allocate WT_INDX memory for data items. We do allocate WT_INDX
+ * memory for keys, but if we are looking at a key only to return it,
+ * it's not that likely to be accessed again (think of a cursor moving
+ * through the tree). Use memory in the application's DBT instead, it
+ * is discarded when the WT_TOC is discarded.
+ *
+ * Key return implies a reference to a WT_ROW index (we don't return
+ * record number keys yet, that will probably change when I add cursor
+ * support).
+ */
+ if (key_return) {
+ if (__wt_key_process(rip)) {
+ WT_RET(__wt_item_process(toc, rip->key, &toc->key));
+
+ key->data = toc->key.data;
+ key->size = toc->key.size;
+ } else if (callback == NULL) {
+ if (toc->key.mem_size < rip->size)
+ WT_RET(__wt_realloc(env,
+ &toc->key.mem_size,
+ rip->size, &toc->key.data));
+ memcpy(toc->key.data, rip->key, rip->size);
+ toc->key.size = rip->size;
+
+ key->data = toc->key.data;
+ key->size = toc->key.size;
+ } else {
+ WT_CLEAR(local_key);
+ key = &local_key;
+ key->data = rip->key;
+ key->size = rip->size;
+ }
+ }
+
+ /*
+ * Handle the data item.
+ *
+ * If the item was ever replaced, it's easy, take the last replacement
+ * data item, it's just a byte string.
+ */
+ if (repl != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ return (WT_NOTFOUND);
+ data->data = WT_REPL_DATA(repl);
+ data->size = repl->size;
+ return (callback == NULL ? 0 : callback(db, key, data));
+ }
+
+ /* Otherwise, take the item from the original page. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ data_ret = cip->data;
+ size_ret = db->fixed_len;
+ break;
+ case WT_PAGE_COL_RLE:
+ data_ret = WT_RLE_REPEAT_DATA(cip->data);
+ size_ret = db->fixed_len;
+ break;
+ case WT_PAGE_COL_VAR:
+ item = cip->data;
+ goto item_set;
+ case WT_PAGE_ROW_LEAF:
+ case WT_PAGE_DUP_LEAF:
+ item = rip->data;
+item_set: switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ if (idb->huffman_data == NULL) {
+ data_ret = WT_ITEM_BYTE(item);
+ size_ret = WT_ITEM_LEN(item);
+ }
+ /* FALLTHROUGH */
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_RET(__wt_item_process(toc, item, &toc->data));
+ data_ret = toc->data.data;
+ size_ret = toc->data.size;
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+ * When we get here, data_ret and size_ret are set to the byte string
+ * and the length we're going to return. That byte string has been
+ * decoded, we called __wt_item_process above in all cases where the
+ * item could be encoded.
+ */
+ if (callback == NULL) {
+ /*
+ * We're copying the key/data pair out to the caller. If we
+ * haven't yet copied the data_ret/size_ret pair into the return
+ * DBT (potentially done by __wt_item_process), do so now.
+ */
+ if (data_ret != toc->data.data) {
+ if (toc->data.mem_size < size_ret)
+ WT_RET(__wt_realloc(env,
+ &toc->data.mem_size,
+ size_ret, &toc->data.data));
+ memcpy(toc->data.data, data_ret, size_ret);
+ toc->data.size = size_ret;
+ }
+
+ data->data = toc->data.data;
+ data->size = toc->data.size;
+ } else {
+ /*
+ * If we're given a callback function, use the data_ret/size_ret
+ * fields as set.
+ */
+ WT_CLEAR(local_data);
+ data = &local_data;
+ data->data = data_ret;
+ data->size = size_ret;
+ ret = callback(db, key, data);
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_rw.c b/src/btree/bt_rw.c
new file mode 100644
index 00000000000..ad8f12482b1
--- /dev/null
+++ b/src/btree/bt_rw.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_disk_read --
+ *	Read a file page.
+ */
+int
+__wt_page_disk_read(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+	ENV *env;
+	off_t offset;
+	uint32_t disk_cksum;
+
+	db = toc->db;
+	env = toc->env;
+
+	WT_STAT_INCR(env->ienv->cache->stats, PAGE_READ);
+
+	/* Read the page from the underlying file. */
+	offset = WT_ADDR_TO_OFF(db, addr);
+	WT_RET(__wt_read(env, db->idb->fh, offset, size, dsk));
+
+	/*
+	 * The checksum was computed with the on-page checksum field zeroed
+	 * out: save the on-disk value, clear the field, then compare.
+	 */
+	disk_cksum = dsk->checksum;
+	dsk->checksum = 0;
+	if (disk_cksum == __wt_cksum(dsk, size))
+		return (0);
+
+	__wt_api_env_errx(env,
+	    "read checksum error: addr/size %lu/%lu at offset %llu",
+	    (u_long)addr, (u_long)size, (unsigned long long)offset);
+	return (WT_ERROR);
+}
+
+/*
+ * __wt_page_write --
+ *	Write a file page.
+ */
+inline int
+__wt_page_write(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_PAGE_DISK *dsk;
+
+	/* The in-memory page carries its own disk image, address and size. */
+	dsk = page->dsk;
+	return (__wt_page_disk_write(toc, dsk, page->addr, page->size));
+}
+
+/*
+ * __wt_page_disk_write --
+ *	Write a file page.
+ */
+int
+__wt_page_disk_write(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+	ENV *env;
+
+	db = toc->db;
+	env = toc->env;
+
+	/* In diagnostic builds, confirm the page is well-formed before I/O. */
+	WT_ASSERT(env, __wt_verify_dsk_page(toc, dsk, addr, size) == 0);
+
+	WT_STAT_INCR(env->ienv->cache->stats, PAGE_WRITE);
+
+	/* Checksum the page with the checksum field itself zeroed out. */
+	dsk->checksum = 0;
+	dsk->checksum = __wt_cksum(dsk, size);
+
+	return (
+	    __wt_write(env, db->idb->fh, WT_ADDR_TO_OFF(db, addr), size, dsk));
+}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
new file mode 100644
index 00000000000..5beb931f578
--- /dev/null
+++ b/src/btree/bt_stat.c
@@ -0,0 +1,348 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_stat_page_col_fix(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_col_rle(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_col_var(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_dup_leaf(WT_TOC *, WT_PAGE *);
+static int __wt_stat_page_row_leaf(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_page_stat --
+ *	Stat any Btree page.
+ *
+ * The arg pointer is forwarded unchanged to the row-leaf walker, which in
+ * turn passes it when recursively walking off-page duplicate trees.
+ */
+int
+__wt_page_stat(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	DB *db;
+	IDB *idb;
+	WT_PAGE_DISK *dsk;
+	WT_STATS *stats;
+
+	db = toc->db;
+	idb = db->idb;
+	dsk = page->dsk;
+	stats = idb->dstats;
+
+	/*
+	 * All internal pages and overflow pages are trivial, all we track is
+	 * a count of the page type.  Leaf pages are handed to a per-type
+	 * helper that also counts the items they hold.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+		WT_STAT_INCR(stats, PAGE_COL_FIX);
+		WT_RET(__wt_stat_page_col_fix(toc, page));
+		break;
+	case WT_PAGE_COL_INT:
+		WT_STAT_INCR(stats, PAGE_COL_INTERNAL);
+		break;
+	case WT_PAGE_COL_RLE:
+		WT_STAT_INCR(stats, PAGE_COL_RLE);
+		WT_RET(__wt_stat_page_col_rle(toc, page));
+		break;
+	case WT_PAGE_COL_VAR:
+		WT_STAT_INCR(stats, PAGE_COL_VARIABLE);
+		WT_RET(__wt_stat_page_col_var(toc, page));
+		break;
+	case WT_PAGE_DUP_INT:
+		WT_STAT_INCR(stats, PAGE_DUP_INTERNAL);
+		break;
+	case WT_PAGE_DUP_LEAF:
+		WT_STAT_INCR(stats, PAGE_DUP_LEAF);
+		WT_RET(__wt_stat_page_dup_leaf(toc, page));
+		break;
+	case WT_PAGE_OVFL:
+		WT_STAT_INCR(stats, PAGE_OVERFLOW);
+		break;
+	case WT_PAGE_ROW_INT:
+		WT_STAT_INCR(stats, PAGE_ROW_INTERNAL);
+		break;
+	case WT_PAGE_ROW_LEAF:
+		WT_STAT_INCR(stats, PAGE_ROW_LEAF);
+		WT_RET(__wt_stat_page_row_leaf(toc, page, arg));
+		break;
+	WT_ILLEGAL_FORMAT(db);		/* unknown page type: error return */
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_fix --
+ *	Stat a WT_PAGE_COL_FIX page.
+ */
+static int
+__wt_stat_page_col_fix(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_COL *cip;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+	int deleted;
+
+	stats = toc->db->idb->dstats;
+
+	/*
+	 * Walk the page, counting data items.  A slot is deleted if its most
+	 * recent replacement is a delete, or, lacking any replacement, if the
+	 * on-page item carries the fixed-width delete flag.
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		if ((repl = WT_COL_REPL(page, cip)) != NULL)
+			deleted = WT_REPL_DELETED_ISSET(repl);
+		else
+			deleted = WT_FIX_DELETE_ISSET(cip->data);
+
+		if (deleted)
+			WT_STAT_INCR(stats, ITEM_COL_DELETED);
+		else
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_rle --
+ *	Stat a WT_PAGE_COL_RLE page.
+ */
+static int
+__wt_stat_page_col_rle(WT_TOC *toc, WT_PAGE *page)
+{
+	WT_COL *cip;
+	WT_RLE_EXPAND *exp;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+
+	stats = toc->db->idb->dstats;
+
+	/*
+	 * Walk the page, counting data items; each run-length encoded cell
+	 * counts for its full repeat count, not as a single item.
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
+			WT_STAT_INCRV(stats,
+			    ITEM_COL_DELETED, WT_RLE_REPEAT_COUNT(cip->data));
+		else
+			WT_STAT_INCRV(stats,
+			    ITEM_TOTAL_DATA, WT_RLE_REPEAT_COUNT(cip->data));
+
+		/*
+		 * Check for corrections.
+		 *
+		 * XXX
+		 * This gets the count wrong if an application changes existing
+		 * records, or updates a deleted record two times in a row --
+		 * we'll incorrectly count the records as unique, when they are
+		 * changes to the same record.  I'm not fixing it as I don't
+		 * expect the WT_COL_RLEEXP data structure to be permanent, it's
+		 * too likely to become a linked list in bad cases.
+		 */
+		for (exp =
+		    WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next) {
+			repl = exp->repl;
+			if (WT_REPL_DELETED_ISSET(repl))
+				WT_STAT_INCR(stats, ITEM_COL_DELETED);
+			else
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_col_var --
+ *	Stat a WT_PAGE_COL_VAR page.
+ */
+static int
+__wt_stat_page_col_var(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	WT_COL *cip;
+	WT_REPL *repl;
+	WT_STATS *stats;
+	uint32_t i;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, cip, i) {
+		switch (WT_ITEM_TYPE(cip->data)) {
+		case WT_ITEM_DATA:
+			repl = WT_COL_REPL(page, cip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl))
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_OVFL:
+			repl = WT_COL_REPL(page, cip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		case WT_ITEM_DEL:
+			/* On-page deleted item: count it as deleted. */
+			WT_STAT_INCR(stats, ITEM_COL_DELETED);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_dup_leaf --
+ *	Stat a WT_PAGE_DUP_LEAF page.
+ */
+static int
+__wt_stat_page_dup_leaf(WT_TOC *toc, WT_PAGE *page)
+{
+	DB *db;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	WT_STATS *stats;
+	uint32_t i;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, rip, i) {
+		switch (WT_ITEM_TYPE(rip->data)) {
+		case WT_ITEM_DATA_DUP:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DUP_DATA);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) {
+				WT_STAT_INCR(stats, ITEM_DUP_DATA);
+				WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+				WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			}
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+	}
+	return (0);
+}
+
+/*
+ * __wt_stat_page_row_leaf --
+ *	Stat a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	DB *db;
+	WT_OFF *off;
+	WT_REF *ref;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	WT_STATS *stats;
+	uint32_t i;
+	int ret;
+
+	db = toc->db;
+	stats = db->idb->dstats;
+
+	/*
+	 * Walk the page, counting regular and overflow data items, and checking
+	 * to be sure any replacements weren't deletions.  If the item has been
+	 * replaced, assume it was replaced by an item of the same size (it's
+	 * too expensive to figure out if it will require the same space or not,
+	 * especially if there's Huffman encoding).
+	 */
+	WT_INDX_FOREACH(page, rip, i) {
+		switch (WT_ITEM_TYPE(rip->data)) {
+		case WT_ITEM_DATA:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;	/* deleted: skip key count too */
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_DUP:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DUP_DATA);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_DATA_DUP_OVFL:
+			repl = WT_ROW_REPL(page, rip);
+			if (repl != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			WT_STAT_INCR(stats, ITEM_DUP_DATA);
+			WT_STAT_INCR(stats, ITEM_DATA_OVFL);
+			WT_STAT_INCR(stats, ITEM_TOTAL_DATA);
+			break;
+		case WT_ITEM_OFF:
+			/*
+			 * Recursively call the tree-walk code for any off-page
+			 * duplicate trees.  (Check for any off-page duplicate
+			 * trees locally because we already have to walk the
+			 * page, so it's faster than walking the page both here
+			 * and in the tree-walk function.)
+			 *
+			 * NOTE(review): unlike the data-item cases above, this
+			 * case does not check WT_ROW_REPL for a deletion before
+			 * counting -- confirm off-page duplicate trees cannot
+			 * be deleted via a replacement.
+			 */
+			ref = WT_ROW_REF(page, rip);
+			off = WT_ROW_OFF(rip);
+			WT_RET(__wt_page_in(toc, page, ref, off, 0));
+			ret = __wt_tree_walk(toc, ref, 0, __wt_page_stat, arg);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				return (ret);
+			WT_STAT_INCR(stats, DUP_TREE);
+			break;
+		WT_ILLEGAL_FORMAT(db);
+		}
+
+		/*
+		 * If the data item wasn't deleted, count the key.
+		 *
+		 * If we have processed the key, we have lost the information as
+		 * to whether or not it's an overflow key -- we can figure out
+		 * if it's Huffman encoded by looking at the huffman key, but
+		 * that doesn't tell us if it's an overflow key or not.  To fix
+		 * this we'd have to maintain a reference to the on-page key and
+		 * check it, and I'm not willing to spend the additional pointer
+		 * in the WT_ROW structure.
+		 */
+		if (__wt_key_process(rip))
+			switch (WT_ITEM_TYPE(rip->key)) {
+			case WT_ITEM_KEY_OVFL:
+				WT_STAT_INCR(stats, ITEM_KEY_OVFL);
+				/* FALLTHROUGH */
+			case WT_ITEM_KEY:
+				WT_STAT_INCR(stats, ITEM_TOTAL_KEY);
+				break;
+			WT_ILLEGAL_FORMAT(db);
+			}
+		else
+			WT_STAT_INCR(stats, ITEM_TOTAL_KEY);
+
+	}
+	return (0);
+}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
new file mode 100644
index 00000000000..af5a9d65258
--- /dev/null
+++ b/src/btree/bt_sync.c
@@ -0,0 +1,61 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_bt_tree_sync(WT_TOC *, WT_PAGE *, void *);
+
+/*
+ * __wt_bt_sync --
+ *	Sync the tree.
+ */
+int
+__wt_bt_sync(WT_TOC *toc)
+{
+	ENV *env;
+	IDB *idb;
+	WT_CACHE *cache;
+	int ret;
+
+	env = toc->env;
+	idb = toc->db->idb;
+	cache = env->ienv->cache;
+
+	/* A never-opened database has nothing to flush. */
+	if (WT_UNOPENED_DATABASE(idb))
+		return (0);
+
+	/*
+	 * The tree walk is depth first, that is, the worker function is not
+	 * called on internal pages until all children have been visited; so,
+	 * we don't have to worry about a page being dirtied after the visit.
+	 *
+	 * Lock out the cache eviction thread, though, we don't want it trying
+	 * to reconcile pages we're flushing.
+	 */
+	__wt_lock(env, cache->mtx_reconcile);
+	ret = __wt_tree_walk(toc, NULL,
+	    WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_tree_sync, NULL);
+	__wt_unlock(env, cache->mtx_reconcile);
+	return (ret);
+}
+
+/*
+ * __wt_bt_tree_sync --
+ *	Sync a page: reconcile it if it's dirty, otherwise do nothing.
+ */
+static int
+__wt_bt_tree_sync(WT_TOC *toc, WT_PAGE *page, void *arg)
+{
+	WT_CC_QUIET(arg, NULL);
+
+	/* Clean pages need no work. */
+	if (!WT_PAGE_IS_MODIFIED(page))
+		return (0);
+
+	return (__wt_page_reconcile(toc, page));
+}
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
new file mode 100644
index 00000000000..19e9fccb82a
--- /dev/null
+++ b/src/btree/bt_vrfy.c
@@ -0,0 +1,1346 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's a bunch of stuff we pass around during verification, group it
+ * together to make the code prettier.
+ */
+typedef struct {
+	uint32_t frags;				/* Total frags */
+	bitstr_t *fragbits;			/* Frag tracking bit list */
+
+	FILE *stream;				/* Dump file stream */
+
+	void (*f)(const char *, uint64_t);	/* Progress callback */
+	uint64_t fcnt;				/* Progress counter */
+
+	WT_PAGE *leaf;				/* Last leaf-page seen; a
+						   hazard reference is held
+						   until it's compared against
+						   the next internal key */
+} WT_VSTUFF;
+
+static int __wt_verify_addfrag(WT_TOC *, uint32_t, uint32_t, WT_VSTUFF *);
+static int __wt_verify_checkfrag(DB *, WT_VSTUFF *);
+static int __wt_verify_delfmt(DB *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_fix(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_int(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_col_rle(DB *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_item(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_dsk_ovfl(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t);
+static int __wt_verify_eof(DB *, uint32_t, uint32_t);
+static int __wt_verify_eop(DB *, uint32_t, uint32_t);
+static int __wt_verify_key_order(WT_TOC *, WT_PAGE *);
+static int __wt_verify_overflow_col(WT_TOC *, WT_PAGE *, WT_VSTUFF *);
+static int __wt_verify_overflow_common(
+ WT_TOC *, WT_OVFL *, uint32_t, uint32_t, WT_VSTUFF *);
+static int __wt_verify_overflow_row(WT_TOC *, WT_PAGE *, WT_VSTUFF *);
+static int __wt_verify_pc(WT_TOC *, WT_ROW *, WT_PAGE *, int);
+static int __wt_verify_tree(WT_TOC *,
+ WT_ROW *, uint64_t, uint64_t, uint32_t, WT_REF *, WT_VSTUFF *);
+
+/*
+ * __wt_db_verify --
+ *	Verify a Btree.
+ */
+int
+__wt_db_verify(WT_TOC *toc, void (*f)(const char *, uint64_t))
+{
+	/* Public API entry point: verify without a debug-dump stream. */
+	return (__wt_verify(toc, f, NULL));
+}
+
+/*
+ * __wt_verify --
+ *	Verify a Btree, optionally dumping each page in debugging mode.
+ *
+ * The progress callback f (if non-NULL) is invoked with a page counter as
+ * verification proceeds; stream (if non-NULL) gets a debug dump of each
+ * page in diagnostic builds.
+ */
+int
+__wt_verify(
+    WT_TOC *toc, void (*f)(const char *, uint64_t), FILE *stream)
+{
+	DB *db;
+	ENV *env;
+	IDB *idb;
+	WT_VSTUFF vstuff;
+	int ret;
+
+	env = toc->env;
+	db = toc->db;
+	idb = db->idb;
+	ret = 0;
+
+	memset(&vstuff, 0, sizeof(vstuff));
+	vstuff.stream = stream;
+	vstuff.f = f;
+
+	/*
+	 * Allocate a bit array, where each bit represents a single allocation
+	 * size piece of the file.  This is how we track the parts of the file
+	 * we've verified.  Storing this on the heap seems reasonable: with a
+	 * minimum allocation size of 512B, we would allocate 4MB to verify a
+	 * 16GB file.  To verify larger files than we can handle this way, we'd
+	 * have to write parts of the bit array into a disk file.
+	 *
+	 * !!!
+	 * There's one portability issue -- the bitstring package uses "ints",
+	 * not unsigned ints, or any fixed size.  If an "int" can't hold a
+	 * big enough value, we could lose.  There's a check here to make we
+	 * don't overflow.  I don't ever expect to see this error message, but
+	 * better safe than sorry.
+	 */
+	vstuff.frags = WT_OFF_TO_ADDR(db, idb->fh->file_size);
+	if (vstuff.frags > INT_MAX) {
+		__wt_api_db_errx(db, "file is too large to verify");
+		/*
+		 * Set the error return explicitly: a bare goto here would
+		 * return success, ret is still 0 at this point.
+		 */
+		ret = WT_ERROR;
+		goto err;
+	}
+	WT_ERR(bit_alloc(env, vstuff.frags, &vstuff.fragbits));
+
+	/*
+	 * The first sector of the file is the description record -- ignore
+	 * it for now.
+	 */
+	bit_nset(vstuff.fragbits, 0, 0);
+
+	/* Verify the tree, starting at the root. */
+	WT_ERR(__wt_verify_tree(toc, NULL, WT_RECORDS(&idb->root_off),
+	    (uint64_t)1, WT_NOLEVEL, &idb->root_page, &vstuff));
+
+	WT_ERR(__wt_verify_checkfrag(db, &vstuff));
+
+err:	/* Wrap up reporting and free allocated memory. */
+	if (vstuff.f != NULL)
+		vstuff.f(toc->name, vstuff.fcnt);
+	if (vstuff.fragbits != NULL)
+		__wt_free(env, vstuff.fragbits, 0);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_tree --
+ *	Verify a tree, recursively descending through it in depth-first fashion.
+ * The page argument was physically verified (so we know it's correctly formed),
+ * and the in-memory version built.  Our job is to check logical relationships
+ * in the page and in the tree.
+ *
+ * Fix: the level/record-count/start-recno failure branches previously did a
+ * bare goto err with ret still 0, silently returning success; they now set
+ * ret to WT_ERROR first.
+ */
+static int
+__wt_verify_tree(
+    WT_TOC *toc,		/* Thread of control */
+    WT_ROW *parent_rip,		/* Internal key referencing this page, if any */
+    uint64_t parent_records,	/* Parent's count of records in this tree */
+    uint64_t start_recno,	/* First record on this page */
+    uint32_t level,		/* Page's tree level */
+    WT_REF *ref,		/* Already verified page reference */
+    WT_VSTUFF *vs)		/* The verify package */
+{
+	DB *db;
+	WT_COL *cip;
+	WT_ITEM *item;
+	WT_OFF *off;
+	WT_PAGE *page;
+	WT_PAGE_DISK *dsk;
+	WT_REPL *repl;
+	WT_ROW *rip;
+	uint64_t records;
+	uint32_t i;
+	int is_root, ret;
+
+	db = toc->db;
+	page = ref->page;
+	dsk = page->dsk;
+	ret = 0;
+
+	/* Report progress every 10 pages. */
+	if (vs->f != NULL && ++vs->fcnt % 10 == 0)
+		vs->f(toc->name, vs->fcnt);
+
+	/* Update frags list. */
+	WT_ERR(__wt_verify_addfrag(toc, page->addr, page->size, vs));
+
+#ifdef DIAGNOSTIC
+	/* Optionally dump the page in debugging mode. */
+	if (vs->stream != NULL)
+		return (__wt_debug_page(toc, page, NULL, vs->stream));
+#endif
+
+	/*
+	 * The page's physical structure was verified when it was read into
+	 * memory by the read server thread, and then the in-memory version
+	 * of the page was built.  Now we make sure the page and tree are
+	 * logically consistent.
+	 *
+	 * !!!
+	 * The problem: (1) the read server has to build the in-memory version
+	 * of the page because the read server is the thread that flags when
+	 * any thread can access the page in the tree; (2) we can't build the
+	 * in-memory version of the page until the physical structure is known
+	 * to be OK, so the read server has to verify at least the physical
+	 * structure of the page; (3) doing complete page verification requires
+	 * reading additional pages (for example, overflow keys imply reading
+	 * overflow pages in order to test the key's order in the page); (4)
+	 * the read server cannot read additional pages because it will hang
+	 * waiting on itself.  For this reason, we split page verification
+	 * into a physical verification, which allows the in-memory version
+	 * of the page to be built, and then a subsequent logical verification
+	 * which happens here.
+	 */
+
+	/*
+	 * If passed a level of WT_NOLEVEL, that is, the only level that can't
+	 * possibly be a valid database page level, this is the root page of
+	 * the tree.
+	 *
+	 * If it's the root, use this page's level to initialize expected the
+	 * values for the rest of the tree.
+	 */
+	is_root = level == WT_NOLEVEL ? 1 : 0;
+	if (is_root)
+		level = dsk->level;
+
+	/* Check that tree levels and record counts match up. */
+	if (dsk->level != level) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has a tree level of %lu where the "
+		    "expected level was %lu",
+		    (u_long)page->addr, (u_long)dsk->level, (u_long)level);
+		ret = WT_ERROR;		/* don't return success on failure */
+		goto err;
+	}
+
+	/*
+	 * Check the record counts.
+	 *
+	 * Confirm the number of records found on this page (by summing the
+	 * WT_OFF structure record counts) matches the WT_OFF structure record
+	 * count in our parent.  Use the in-memory record count for internal
+	 * pages -- we could sum the record counts as we walk the page below,
+	 * but we did that when building the in-memory version of the page,
+	 * there's no reason to do it again.
+	 */
+	if (page->records != parent_records) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has a record count of %llu where the "
+		    "expected record count was %llu",
+		    (u_long)page->addr, (unsigned long long)page->records,
+		    (unsigned long long)parent_records);
+		ret = WT_ERROR;		/* don't return success on failure */
+		goto err;
+	}
+
+	/* Check the starting record number. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+		if (dsk->start_recno != start_recno) {
+			__wt_api_db_errx(db,
+			    "page at addr %lu has a starting record of %llu "
+			    "where the expected starting record was %llu",
+			    (u_long)page->addr,
+			    (unsigned long long)dsk->start_recno,
+			    (unsigned long long)start_recno);
+			ret = WT_ERROR;	/* don't return success on failure */
+			goto err;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Check on-page overflow page references.
+	 *
+	 * There's a potential performance problem here: we read key overflow
+	 * pages twice, once when checking the overflow page itself, and again
+	 * when checking the key ordering.  It's a pain to combine the two
+	 * tests (the page types with overflow items aren't exactly the same
+	 * as the page types with ordered keys, and the underlying functions
+	 * that instantiate (and decompress) overflow pages don't want to know
+	 * anything about verification), and I don't want to keep the overflow
+	 * keys in the cache, it's likely to be wasted space.  Until it's a
+	 * problem, I'm going to assume the second read of the overflow key is
+	 * satisfied in the operating system buffer cache, and not worry about
+	 * it.  Table verify isn't likely to be a performance path anyway.
+	 */
+	switch (dsk->type) {
+	case WT_PAGE_COL_VAR:
+		WT_RET(__wt_verify_overflow_col(toc, page, vs));
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_overflow_row(toc, page, vs));
+		break;
+	default:
+		break;
+	}
+
+	/* Check on-page key ordering. */
+	switch (dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_key_order(toc, page));
+		break;
+	default:
+		break;
+	}
+
+	/* Check tree connections and recursively descend the tree. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_INT:
+		/* For each entry in an internal page, verify the subtree. */
+		start_recno = dsk->start_recno;
+		WT_INDX_FOREACH(page, cip, i) {
+			/* cip references the subtree containing the record */
+			ref = WT_COL_REF(page, cip);
+			off = WT_COL_OFF(cip);
+			records = WT_COL_OFF_RECORDS(cip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, NULL,
+			    records, start_recno, level - 1, ref, vs);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+			start_recno += records;
+		}
+		break;
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		/*
+		 * There are two row-store, logical connection checks:
+		 *
+		 * First, compare the internal node key leading to the current
+		 * page against the first entry on the current page.  The
+		 * internal node key must compare less than or equal to the
+		 * first entry on the current page.
+		 *
+		 * Second, compare the largest key we've seen on any leaf page
+		 * against the next internal node key we find.  This check is
+		 * a little tricky: every time we find a leaf page, we save a
+		 * reference in the vs->leaf field.  The next time we're about
+		 * to indirect through an entry on an internal node, we compare
+		 * the last entry on that saved page against the internal node
+		 * entry's key.  In that comparison, the leaf page's key must
+		 * be less than the internal node entry's key.
+		 */
+		if (parent_rip != NULL)
+			WT_ERR(__wt_verify_pc(toc, parent_rip, page, 1));
+
+		/* For each entry in an internal page, verify the subtree. */
+		WT_INDX_FOREACH(page, rip, i) {
+			/*
+			 * At each off-page entry, we compare the current entry
+			 * against the largest key in the subtree rooted to the
+			 * immediate left of the current item; this key must
+			 * compare less than or equal to the current item.  The
+			 * trick here is we need the last leaf key, not the last
+			 * internal node key.  It's returned to us in the leaf
+			 * field of the vs structure, whenever we verify a leaf
+			 * page.  Discard the leaf node as soon as we've used it
+			 * in a comparison.
+			 */
+			if (vs->leaf != NULL) {
+				WT_ERR(
+				    __wt_verify_pc(toc, rip, vs->leaf, 0));
+				__wt_hazard_clear(toc, vs->leaf);
+				vs->leaf = NULL;
+			}
+			/* rip references the subtree containing the record */
+			ref = WT_ROW_REF(page, rip);
+			off = WT_ROW_OFF(rip);
+			records = WT_ROW_OFF_RECORDS(rip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, rip,
+			    records, (uint64_t)0, level - 1, ref, vs);
+
+			/*
+			 * Remaining special handling of the last verified leaf
+			 * page: if we kept a reference to that page, don't
+			 * release the hazard reference until after comparing
+			 * the last key on that page against the next key in the
+			 * tree.
+			 */
+			if (vs->leaf != ref->page)
+				__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+		}
+		break;
+	case WT_PAGE_ROW_LEAF:
+		/*
+		 * For each entry in a row-store leaf page, verify any off-page
+		 * duplicates tree.
+		 */
+		WT_INDX_FOREACH(page, rip, i) {
+			/* Ignore anything except off-page duplicate trees. */
+			if ((repl = WT_ROW_REPL(
+			    page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl))
+				continue;
+			item = rip->data;
+			if (WT_ITEM_TYPE(item) != WT_ITEM_OFF)
+				continue;
+
+			/* Verify the off-page duplicate tree. */
+			ref = WT_ROW_DUP(page, rip);
+			off = WT_ROW_OFF(rip);
+			records = WT_ROW_OFF_RECORDS(rip);
+			WT_ERR(__wt_page_in(toc, page, ref, off, 1));
+			ret = __wt_verify_tree(toc, NULL,
+			    records, (uint64_t)0, WT_NOLEVEL, ref, vs);
+			__wt_hazard_clear(toc, ref->page);
+			if (ret != 0)
+				goto err;
+		}
+		/* FALLTHROUGH */
+	case WT_PAGE_DUP_LEAF:
+		/*
+		 * Retain a reference to all row-store leaf pages, we need them
+		 * to check their last entry against the next internal key in
+		 * the tree.
+		 */
+		vs->leaf = page;
+		return (0);
+	default:
+		break;
+	}
+
+	/*
+	 * The largest key on the last leaf page in the tree is never needed,
+	 * there aren't any internal pages after it.  So, we get here with
+	 * vs->leaf needing to be released.
+	 */
+err:	if (vs->leaf != NULL) {
+		__wt_hazard_clear(toc, vs->leaf);
+		vs->leaf = NULL;
+	}
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_pc --
+ *	Compare a key on a parent page to a designated entry on a child page.
+ *
+ * If first_entry is non-zero, compare against the child's first key (the
+ * parent key must sort less than or equal to it); otherwise, compare
+ * against the child's last key (which must sort before the parent key).
+ */
+static int
+__wt_verify_pc(WT_TOC *toc, WT_ROW *parent_rip, WT_PAGE *child, int first_entry)
+{
+	DB *db;
+	DBT *cd_ref, *pd_ref, *scratch1, *scratch2;
+	WT_ROW *child_rip;
+	int cmp, ret, (*func)(DB *, const DBT *, const DBT *);
+
+	db = toc->db;
+	scratch1 = scratch2 = NULL;
+	ret = 0;
+
+	/* Set the comparison function. */
+	switch (child->dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+		func = db->btree_compare_dup;
+		break;
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		func = db->btree_compare;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/*
+	 * The two keys we're going to compare may be overflow keys -- don't
+	 * bother instantiating the keys in the tree, there's no reason to
+	 * believe we're going to be working in this database.
+	 */
+	child_rip = first_entry ?
+	    child->u.irow : child->u.irow + (child->indx_count - 1);
+	if (__wt_key_process(child_rip)) {
+		WT_ERR(__wt_scr_alloc(toc, 0, &scratch1));
+		WT_ERR(__wt_item_process(toc, child_rip->key, scratch1));
+		cd_ref = scratch1;
+	} else
+		cd_ref = (DBT *)child_rip;
+	if (__wt_key_process(parent_rip)) {
+		WT_ERR(__wt_scr_alloc(toc, 0, &scratch2));
+		/*
+		 * Use WT_ERR, not WT_RET: a direct return here leaked the
+		 * scratch buffers, they must be released at the err label.
+		 */
+		WT_ERR(__wt_item_process(toc, parent_rip->key, scratch2));
+		pd_ref = scratch2;
+	} else
+		pd_ref = (DBT *)parent_rip;
+
+	/* Compare the parent's key against the child's key. */
+	cmp = func(db, cd_ref, pd_ref);
+
+	if (first_entry && cmp < 0) {
+		__wt_api_db_errx(db,
+		    "the first key on page at addr %lu sorts before its "
+		    "reference key on its parent's page",
+		    (u_long)child->addr);
+		ret = WT_ERROR;
+	}
+	if (!first_entry && cmp >= 0) {
+		__wt_api_db_errx(db,
+		    "the last key on the page at addr %lu sorts after a parent "
+		    "page's key for the subsequent page",
+		    (u_long)child->addr);
+		ret = WT_ERROR;
+	}
+
+err:	if (scratch1 != NULL)
+		__wt_scr_release(&scratch1);
+	if (scratch2 != NULL)
+		__wt_scr_release(&scratch2);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_key_order --
+ *	Check on-page key ordering.
+ *
+ * Fix: the current/last buffers were never swapped at the bottom of the
+ * loop, so last->dbt remained NULL and no comparison was ever performed;
+ * also use WT_ERR instead of WT_RET so the scratch buffers aren't leaked
+ * on error.
+ */
+static int
+__wt_verify_key_order(WT_TOC *toc, WT_PAGE *page)
+{
+	struct {
+		DBT *dbt;			/* DBT to compare */
+		DBT *scratch;			/* scratch buffer */
+	} *current, *last, *tswap, _a, _b;
+	DB *db;
+	WT_PAGE_DISK *dsk;
+	WT_ROW *rip;
+	uint32_t i;
+	int (*func)(DB *, const DBT *, const DBT *), ret;
+
+	db = toc->db;
+	dsk = page->dsk;
+	ret = 0;
+
+	WT_CLEAR(_a);
+	WT_CLEAR(_b);
+	current = &_a;
+	WT_ERR(__wt_scr_alloc(toc, 0, &current->scratch));
+	last = &_b;
+	WT_ERR(__wt_scr_alloc(toc, 0, &last->scratch));
+
+	/* Set the comparison function. */
+	switch (dsk->type) {
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+		func = db->btree_compare_dup;
+		break;
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		func = db->btree_compare;
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	/* Walk the page, comparing each key against the previous key. */
+	WT_INDX_FOREACH(page, rip, i) {
+		/* Skip duplicates */
+		if (WT_ROW_INDX_IS_DUPLICATE(page, rip))
+			continue;
+
+		/*
+		 * The two keys we're going to compare may be overflow keys --
+		 * don't bother instantiating the keys in the tree, there's no
+		 * reason to believe we're going to be working in this database.
+		 */
+		if (__wt_key_process(rip)) {
+			WT_ERR(__wt_item_process(
+			    toc, rip->key, current->scratch));
+			current->dbt = current->scratch;
+		} else
+			current->dbt = (DBT *)rip;
+
+		/* Compare the current key against the last key. */
+		if (last->dbt != NULL &&
+		    func(db, last->dbt, current->dbt) >= 0) {
+			__wt_api_db_errx(db,
+			    "the %lu and %lu keys on page at addr %lu are "
+			    "incorrectly sorted",
+			    (u_long)WT_ROW_SLOT(page, rip) - 1,
+			    (u_long)WT_ROW_SLOT(page, rip),
+			    (u_long)page->addr);
+			ret = WT_ERROR;
+			goto err;
+		}
+
+		/* Swap the buffers: the current key becomes the last key. */
+		tswap = last;
+		last = current;
+		current = tswap;
+	}
+
+err:	if (_a.scratch != NULL)
+		__wt_scr_release(&_a.scratch);
+	if (_b.scratch != NULL)
+		__wt_scr_release(&_b.scratch);
+
+	return (ret);
+}
+
+/*
+ * __wt_verify_dsk_page --
+ *	Verify a single Btree page as read from disk.
+ *
+ * Checks the page header fields (type, LSN, level, unused bytes), then
+ * dispatches to a per-page-type item verifier.
+ */
+int
+__wt_verify_dsk_page(
+    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+	DB *db;
+
+	db = toc->db;
+
+	/* Check the page type. */
+	switch (dsk->type) {
+	case WT_PAGE_FREE:
+		/*
+		 * Free pages are only written in diagnostic mode, and the
+		 * type is the only thing that can be verified about them.
+		 */
+		return (0);
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		break;
+	case WT_PAGE_INVALID:
+	default:
+		__wt_api_db_errx(db,
+		    "page at addr %lu has an invalid type of %lu",
+		    (u_long)addr, (u_long)dsk->type);
+		return (WT_ERROR);
+	}
+
+	/*
+	 * FUTURE:
+	 * Check the LSN against the existing log files.
+	 */
+	if (dsk->lsn[0] != 0 || dsk->lsn[1] != 0) {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has non-zero lsn header fields",
+		    (u_long)addr);
+		return (WT_ERROR);
+	}
+
+	/*
+	 * Ignore the checksum -- it was verified when we first read the
+	 * page.
+	 */
+
+	/* Check the page level: leaf types must be WT_LLEAF, internal above. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_FIX:
+	case WT_PAGE_COL_RLE:
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_OVFL:
+	case WT_PAGE_ROW_LEAF:
+		if (dsk->level != WT_LLEAF)
+			goto err_level;
+		break;
+	case WT_PAGE_COL_INT:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_ROW_INT:
+		if (dsk->level <= WT_LLEAF) {
+err_level:		__wt_api_db_errx(db,
+			    "%s page at addr %lu has incorrect tree level "
+			    "of %lu",
+			    __wt_page_type_string(dsk),
+			    (u_long)addr, (u_long)dsk->level);
+			return (WT_ERROR);
+		}
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	if (dsk->unused[0] != '\0' || dsk->unused[1] != '\0') {
+		__wt_api_db_errx(db,
+		    "page at addr %lu has non-zero unused header fields",
+		    (u_long)addr);
+		return (WT_ERROR);
+	}
+
+	/* Verify the items on the page. */
+	switch (dsk->type) {
+	case WT_PAGE_COL_VAR:
+	case WT_PAGE_DUP_INT:
+	case WT_PAGE_DUP_LEAF:
+	case WT_PAGE_ROW_INT:
+	case WT_PAGE_ROW_LEAF:
+		WT_RET(__wt_verify_dsk_item(toc, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_INT:
+		WT_RET(__wt_verify_dsk_col_int(db, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_FIX:
+		WT_RET(__wt_verify_dsk_col_fix(db, dsk, addr, size));
+		break;
+	case WT_PAGE_COL_RLE:
+		WT_RET(__wt_verify_dsk_col_rle(db, dsk, addr, size));
+		break;
+	case WT_PAGE_OVFL:
+		WT_RET(__wt_verify_dsk_ovfl(toc, dsk, addr, size));
+		break;
+	WT_ILLEGAL_FORMAT(db);
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_verify_dsk_item --
+ * Walk a disk page of WT_ITEMs, and verify them.
+ */
+static int
+__wt_verify_dsk_item(
+ WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ enum { IS_FIRST, WAS_KEY, WAS_DATA, WAS_DUP_DATA } last_item_type;
+ DB *db;
+ WT_ITEM *item;
+ WT_OVFL *ovfl;
+ WT_OFF *off;
+ off_t file_size;
+ uint8_t *end;
+ uint32_t i, item_num, item_len, item_type;
+
+ db = toc->db;
+ file_size = db->idb->fh->file_size;
+
+ end = (uint8_t *)dsk + size;
+
+ last_item_type = IS_FIRST;
+ item_num = 0;
+ WT_ITEM_FOREACH(dsk, item, i) {
+ ++item_num;
+
+ /* Check if this item is entirely on the page. */
+ if ((uint8_t *)item + sizeof(WT_ITEM) > end)
+ goto eop;
+
+ item_type = WT_ITEM_TYPE(item);
+ item_len = WT_ITEM_LEN(item);
+
+ /* Check the item's type. */
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_OVFL:
+ if (dsk->type != WT_PAGE_ROW_INT &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_KEY_DUP_OVFL:
+ if (dsk->type != WT_PAGE_DUP_INT)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_OVFL:
+ if (dsk->type != WT_PAGE_COL_VAR &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DATA_DUP_OVFL:
+ if (dsk->type != WT_PAGE_DUP_LEAF &&
+ dsk->type != WT_PAGE_ROW_LEAF)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_DEL:
+ /* Deleted items only appear on column-store pages. */
+ if (dsk->type != WT_PAGE_COL_VAR)
+ goto item_vs_page;
+ break;
+ case WT_ITEM_OFF:
+ if (dsk->type != WT_PAGE_DUP_INT &&
+ dsk->type != WT_PAGE_ROW_INT &&
+ dsk->type != WT_PAGE_ROW_LEAF) {
+item_vs_page: __wt_api_db_errx(db,
+ "illegal item and page type combination "
+ "(item %lu on page at addr %lu is a %s "
+ "item on a %s page)",
+ (u_long)item_num, (u_long)addr,
+ __wt_item_type_string(item),
+ __wt_page_type_string(dsk));
+ return (WT_ERROR);
+ }
+ break;
+ default:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu has an illegal type "
+ "of %lu",
+ (u_long)item_num, (u_long)addr, (u_long)item_type);
+ return (WT_ERROR);
+ }
+
+ /*
+ * Check the item type ordering. For row-stores, check for:
+ * two keys in a row,
+ * two non-dup data items in a row,
+ * a non-dup data item followed by a dup data item
+ * a data item as the first item on a page.
+ *
+ * Column-stores only have data items, and we already checked
+ * to see if there was anything else on the page. Skip the
+ * order check.
+ */
+ if (dsk->type == WT_PAGE_COL_VAR)
+ goto skip_order_check;
+
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_KEY_DUP_OVFL:
+ switch (last_item_type) {
+ case IS_FIRST:
+ case WAS_DATA:
+ case WAS_DUP_DATA:
+ last_item_type = WAS_KEY;
+ break;
+ case WAS_KEY:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is first of "
+ "two adjacent keys",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ }
+ break;
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DEL:
+ case WT_ITEM_OFF:
+ if (last_item_type == IS_FIRST) {
+ __wt_api_db_errx(db,
+ "page at addr %lu begins with a data item",
+ (u_long)addr);
+ return (WT_ERROR);
+ }
+ switch (item_type) {
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ case WT_ITEM_DEL:
+ case WT_ITEM_OFF:
+ switch (last_item_type) {
+ case IS_FIRST:
+ case WAS_DATA:
+ case WAS_DUP_DATA:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is "
+ "the first of two adjacent data "
+ "items",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ case WAS_KEY:
+ last_item_type = WAS_DATA;
+ break;
+ }
+ break;
+ case WT_ITEM_DATA_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ switch (last_item_type) {
+ case WAS_DATA:
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu is "
+ "a non-duplicate data item "
+ "followed by a duplicate data item",
+ (u_long)item_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ case IS_FIRST:
+ case WAS_DUP_DATA:
+ case WAS_KEY:
+ last_item_type = WAS_DUP_DATA;
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+skip_order_check:
+ /* Check the item's length. */
+ switch (item_type) {
+ case WT_ITEM_KEY:
+ case WT_ITEM_KEY_DUP:
+ case WT_ITEM_DATA:
+ case WT_ITEM_DATA_DUP:
+ /* The length is variable, we can't check it. */
+ break;
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ if (item_len != sizeof(WT_OVFL))
+ goto item_len;
+ break;
+ case WT_ITEM_DEL:
+ if (item_len != 0)
+ goto item_len;
+ break;
+ case WT_ITEM_OFF:
+ if (item_len != sizeof(WT_OFF)) {
+item_len: __wt_api_db_errx(db,
+ "item %lu on page at addr %lu has an "
+ "incorrect length",
+ (u_long)item_num, (u_long)addr);
+ return (WT_ERROR);
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* Check if the item is entirely on the page. */
+ if ((uint8_t *)WT_ITEM_NEXT(item) > end)
+ goto eop;
+
+ /* Check if the referenced item is entirely in the file. */
+ switch (item_type) {
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ ovfl = WT_ITEM_BYTE_OVFL(item);
+ if (WT_ADDR_TO_OFF(db, ovfl->addr) +
+ WT_HDR_BYTES_TO_ALLOC(db, ovfl->size) > file_size)
+ goto eof;
+ break;
+ case WT_ITEM_OFF:
+ off = WT_ITEM_BYTE_OFF(item);
+ if (WT_ADDR_TO_OFF(db, off->addr) +
+ off->size > file_size)
+ goto eof;
+ break;
+ default:
+ break;
+ }
+ }
+ return (0);
+
+eof: return (__wt_verify_eof(db, item_num, addr));
+eop: return (__wt_verify_eop(db, item_num, addr));
+}
+
+/*
+ * __wt_verify_dsk_col_int --
+ * Walk a WT_PAGE_COL_INT disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ IDB *idb;
+ WT_OFF *off;
+ uint8_t *end;
+ uint32_t i, entry_num;
+
+ idb = db->idb;
+ end = (uint8_t *)dsk + size;
+
+ entry_num = 0;
+ WT_OFF_FOREACH(dsk, off, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if ((uint8_t *)off + sizeof(WT_OFF) > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Check if the reference is past the end-of-file. */
+ if (WT_ADDR_TO_OFF(
+ db, off->addr) + off->size > idb->fh->file_size)
+ return (__wt_verify_eof(db, entry_num, addr));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_dsk_col_fix --
+ * Walk a WT_PAGE_COL_FIX disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ u_int len;
+ uint32_t i, j, entry_num;
+ uint8_t *data, *end, *p;
+
+ len = db->fixed_len;
+ end = (uint8_t *)dsk + size;
+
+ entry_num = 0;
+ WT_FIX_FOREACH(db, dsk, data, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if (data + len > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Deleted items are entirely nul bytes. */
+ p = data;
+ if (WT_FIX_DELETE_ISSET(data)) {
+ if (*p != WT_FIX_DELETE_BYTE)
+ goto delfmt;
+ for (j = 1; j < db->fixed_len; ++j)
+ if (*++p != '\0')
+ goto delfmt;
+ }
+ }
+
+ return (0);
+
+delfmt: return (__wt_verify_delfmt(db, entry_num, addr));
+}
+
+/*
+ * __wt_verify_dsk_col_rle --
+ * Walk a WT_PAGE_COL_RLE disk page and verify it.
+ */
+static int
+__wt_verify_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ u_int len;
+ uint32_t i, j, entry_num;
+ uint8_t *data, *end, *last_data, *p;
+
+ end = (uint8_t *)dsk + size;
+
+ last_data = NULL;
+ len = db->fixed_len + sizeof(uint16_t);
+
+ entry_num = 0;
+ WT_RLE_REPEAT_FOREACH(db, dsk, data, i) {
+ ++entry_num;
+
+ /* Check if this entry is entirely on the page. */
+ if (data + len > end)
+ return (__wt_verify_eop(db, entry_num, addr));
+
+ /* Count must be non-zero. */
+ if (WT_RLE_REPEAT_COUNT(data) == 0) {
+ __wt_api_db_errx(db,
+ "fixed-length entry %lu on page at addr "
+ "%lu has a repeat count of 0",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ /* Deleted items are entirely nul bytes. */
+ p = WT_RLE_REPEAT_DATA(data);
+ if (WT_FIX_DELETE_ISSET(p)) {
+ if (*p != WT_FIX_DELETE_BYTE)
+ goto delfmt;
+ for (j = 1; j < db->fixed_len; ++j)
+ if (*++p != '\0')
+ goto delfmt;
+ }
+
+ /*
+ * If the previous data is the same as this data, we
+ * missed an opportunity for compression -- complain.
+ */
+ if (last_data != NULL &&
+ memcmp(WT_RLE_REPEAT_DATA(last_data),
+ WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 &&
+ WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) {
+ __wt_api_db_errx(db,
+ "fixed-length entries %lu and %lu on page "
+ "at addr %lu are identical and should have "
+ "been compressed",
+ (u_long)entry_num,
+ (u_long)entry_num - 1, (u_long)addr);
+ return (WT_ERROR);
+ }
+ last_data = data;
+ }
+
+ return (0);
+
+delfmt: return (__wt_verify_delfmt(db, entry_num, addr));
+}
+
+/*
+ * __wt_verify_overflow_col --
+ * Check on-page column-store overflow references.
+ */
+static int
+__wt_verify_overflow_col(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs)
+{
+ WT_COL *cip;
+ WT_ITEM *item;
+ uint32_t i;
+
+ /* Walk the in-memory page, verifying overflow items. */
+ WT_INDX_FOREACH(page, cip, i) {
+ item = cip->data;
+ if (WT_ITEM_TYPE(item) == WT_ITEM_DATA_OVFL)
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_COL_SLOT(page, cip) + 1, page->addr, vs));
+ }
+ return (0);
+}
+
+/*
+ * __wt_verify_overflow_row --
+ * Check on-page row-store overflow references.
+ */
+static int
+__wt_verify_overflow_row(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs)
+{
+ WT_ITEM *item;
+ WT_ROW *rip;
+ uint32_t i;
+ int check_data;
+
+ /*
+ * Walk the in-memory page, verifying overflow items. We service 4
+ * page types here: DUP_INT, DUP_LEAF, ROW_INT and ROW_LEAF. In the
+ * case of DUP_INT, DUP_LEAF and ROW_INT, we only check the key, as
+ * there is either no data item, or the data item is known to not be
+ * an overflow page. In the case of ROW_LEAF, we have to check both
+ * the key and the data item.
+ */
+ check_data = page->dsk->type == WT_PAGE_ROW_LEAF ? 1 : 0;
+
+ /* Walk the in-memory page, verifying overflow items. */
+ WT_INDX_FOREACH(page, rip, i) {
+ item = rip->key;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_KEY_OVFL:
+ case WT_ITEM_KEY_DUP_OVFL:
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_ROW_SLOT(page, rip) + 1, page->addr, vs));
+ break;
+ default:
+ break;
+ }
+
+ if (!check_data)
+ continue;
+
+ item = rip->data;
+ switch (WT_ITEM_TYPE(item)) {
+ case WT_ITEM_DATA_OVFL:
+ case WT_ITEM_DATA_DUP_OVFL:
+ WT_RET(__wt_verify_overflow_common(
+ toc, WT_ITEM_BYTE_OVFL(item),
+ WT_ROW_SLOT(page, rip) + 1, page->addr, vs));
+ break;
+ default:
+ break;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __wt_verify_overflow_common --
+ * Common code that reads in an overflow page and checks it.
+ */
+static int
+__wt_verify_overflow_common(WT_TOC *toc,
+ WT_OVFL *ovfl, uint32_t entry_num, uint32_t page_ref_addr, WT_VSTUFF *vs)
+{
+ DB *db;
+ DBT *scratch1;
+ WT_PAGE_DISK *dsk;
+ uint32_t addr, size;
+ int ret;
+
+ db = toc->db;
+ scratch1 = NULL;
+ ret = 0;
+
+ addr = ovfl->addr;
+ size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size);
+
+ /* Allocate enough memory to hold the overflow pages. */
+ WT_RET(__wt_scr_alloc(toc, size, &scratch1));
+
+ /* Read the page. */
+ dsk = scratch1->data;
+ WT_ERR(__wt_page_disk_read(toc, dsk, addr, size));
+
+ /*
+ * Verify the disk image -- this function would normally be called
+ * from the asynchronous read server, but overflow pages are read
+ * synchronously. Regardless, we break the overflow verification code
+ * into two parts, on-disk format checking and internal checking,
+ * just so it looks like all of the other page type checking.
+ */
+ WT_ERR(__wt_verify_dsk_ovfl(toc, dsk, addr, size));
+
+ /* Add the fragments. */
+ WT_ERR(__wt_verify_addfrag(toc, addr, size, vs));
+
+ /*
+ * The only other thing to check is that the size we have in the page
+ * matches the size on the underlying overflow page.
+ */
+ if (ovfl->size != dsk->u.datalen) {
+ __wt_api_db_errx(db,
+ "overflow page reference in item %lu on page at addr %lu "
+ "does not match the data size on the overflow page",
+ (u_long)entry_num, (u_long)page_ref_addr);
+ ret = WT_ERROR;
+ }
+
+err: __wt_scr_release(&scratch1);
+
+ return (ret);
+}
+
+/*
+ * __wt_verify_dsk_ovfl --
+ * Verify a WT_PAGE_OVFL disk page.
+ */
+static int
+__wt_verify_dsk_ovfl(
+ WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
+{
+ DB *db;
+ uint32_t len;
+ uint8_t *p;
+
+ db = toc->db;
+
+ if (dsk->u.datalen == 0) {
+ __wt_api_db_errx(db,
+ "overflow page at addr %lu has no data", (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ /* Any page data after the overflow record should be nul bytes. */
+ p = (uint8_t *)dsk + (sizeof(WT_PAGE_DISK) + dsk->u.datalen);
+ len = size - (sizeof(WT_PAGE_DISK) + dsk->u.datalen);
+ for (; len > 0; ++p, --len)
+ if (*p != '\0') {
+ __wt_api_db_errx(db,
+ "overflow page at addr %lu has non-zero trailing "
+ "bytes",
+ (u_long)addr);
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_eop --
+ * Generic item extends past the end-of-page error.
+ */
+static int
+__wt_verify_eop(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "item %lu on page at addr %lu extends past the end of the page",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_eof --
+ * Generic item references non-existent file pages error.
+ */
+static int
+__wt_verify_eof(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "off-page item %lu on page at addr %lu references non-existent "
+ "file pages",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_delfmt --
+ * WT_PAGE_COL_FIX and WT_PAGE_COL_RLE error where a deleted item has
+ * non-nul bytes.
+ */
+static int
+__wt_verify_delfmt(DB *db, uint32_t entry_num, uint32_t addr)
+{
+ __wt_api_db_errx(db,
+ "deleted fixed-length entry %lu on page at addr %lu has non-nul "
+ "bytes",
+ (u_long)entry_num, (u_long)addr);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_verify_addfrag --
+ * Add the WT_PAGE's fragments to the list, and complain if we've already
+ * verified this chunk of the file.
+ */
+static int
+__wt_verify_addfrag(WT_TOC *toc, uint32_t addr, uint32_t size, WT_VSTUFF *vs)
+{
+ DB *db;
+ uint32_t frags, i;
+
+ db = toc->db;
+
+ frags = WT_OFF_TO_ADDR(db, size);
+ for (i = 0; i < frags; ++i)
+ if (bit_test(vs->fragbits, addr + i)) {
+ __wt_api_db_errx(db,
+ "page fragment at addr %lu already verified",
+ (u_long)addr);
+ return (0);
+ }
+ bit_nset(vs->fragbits, addr, addr + (frags - 1));
+ return (0);
+}
+
+/*
+ * __wt_verify_checkfrag --
+ * Verify we've checked all the fragments in the file.
+ */
+static int
+__wt_verify_checkfrag(DB *db, WT_VSTUFF *vs)
+{
+ int ffc, ffc_start, ffc_end, frags, ret;
+
+ frags = (int)vs->frags; /* XXX: bitstring.h wants "ints" */
+ ret = 0;
+
+ /* Check for page fragments we haven't verified. */
+ for (ffc_start = ffc_end = -1;;) {
+ bit_ffc(vs->fragbits, frags, &ffc);
+ if (ffc != -1) {
+ bit_set(vs->fragbits, ffc);
+ if (ffc_start == -1) {
+ ffc_start = ffc_end = ffc;
+ continue;
+ }
+ if (ffc_end == ffc - 1) {
+ ffc_end = ffc;
+ continue;
+ }
+ }
+ if (ffc_start != -1) {
+ if (ffc_start == ffc_end)
+ __wt_api_db_errx(db,
+ "fragment %d was never verified",
+ ffc_start);
+ else
+ __wt_api_db_errx(db,
+ "fragments %d to %d were never verified",
+ ffc_start, ffc_end);
+ ret = WT_ERROR;
+ }
+ ffc_start = ffc_end = ffc;
+ if (ffc == -1)
+ break;
+ }
+ return (ret);
+}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
new file mode 100644
index 00000000000..f5ef9674f9b
--- /dev/null
+++ b/src/btree/bt_walk.c
@@ -0,0 +1,306 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There are two tree-walk implementations: a textbook, depth-first recursive
+ * tree walk in __wt_tree_walk(), and a non-recursive, depth-first tree walk
+ * in __wt_walk_{begin,end,next}().
+ *
+ * The simple recursive walk is sufficient in most cases -- a hazard reference
+ * is obtained on each page in turn, a worker function is called on the page,
+ * then the hazard reference is released.
+ *
+ * The complicated tree walk routine was added because the cache eviction code
+ * needs:
+ * + to walk the tree a few pages at a time, that is, periodically wake,
+ * visit some pages, then go back to sleep, which requires enough state
+ * to restart the traversal at any point,
+ * + to only visit pages that currently appear in the cache,
+ * + to return the WT_REF structure (not the WT_PAGE structure),
+ * + to walk files not associated with the current WT_TOC's DB handle,
+ * + and finally, it doesn't require a hazard reference.
+ *
+ * My guess is we'll generalize a more complicated walk at some point, which
+ * means some or all of those behaviors will become configurable, and that's
+ * why the code lives here instead of in the eviction code.
+ */
+
+/*
+ * __wt_tree_walk --
+ * Depth-first recursive walk of a btree, calling a worker function on
+ * each page.
+ */
+int
+__wt_tree_walk(WT_TOC *toc, WT_REF *ref,
+ uint32_t flags, int (*work)(WT_TOC *, WT_PAGE *, void *), void *arg)
+{
+ IDB *idb;
+ WT_COL *cip;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint32_t i;
+ int ret;
+
+ WT_ENV_FCHK(
+ toc->env, "__wt_tree_walk", flags, WT_APIMASK_BT_TREE_WALK);
+
+ idb = toc->db->idb;
+
+ /*
+ * A NULL WT_REF means to start at the top of the tree -- it's just
+ * a convenience.
+ */
+ page = ref == NULL ? idb->root_page.page : ref->page;
+
+ /*
+ * Walk any internal pages, descending through any off-page references.
+ *
+ * Descending into row-store off-page duplicate trees is optional for
+ * two reasons. (1) it may be faster to call this function recursively
+ * from the worker function, which is already walking the page, and (2)
+ * information for off-page dup trees is split (the key is on the
+ * row-leaf page, and the data is obviously in the off-page dup tree):
+ * we need the key when we dump the data, and that would be a hard
+ * special case in this code. Functions where it's both possible and
+ * no slower to walk off-page dupliate trees in this code can request
+ * it be done here.
+ */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_INT:
+ WT_INDX_FOREACH(page, cip, i) {
+ /* cip references the subtree containing the record */
+ ref = WT_COL_REF(page, cip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_COL_OFF(cip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_INDX_FOREACH(page, rip, i) {
+ /* rip references the subtree containing the record */
+ ref = WT_ROW_REF(page, rip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (!LF_ISSET(WT_WALK_OFFDUP))
+ break;
+ WT_INDX_FOREACH(page, rip, i) {
+ if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF)
+ break;
+
+ /*
+ * Recursively call the tree-walk function for the
+ * off-page duplicate tree.
+ */
+ ref = WT_ROW_REF(page, rip);
+ if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK)
+ continue;
+
+ off = WT_ROW_OFF(rip);
+ WT_RET(__wt_page_in(toc, page, ref, off, 0));
+ ret = __wt_tree_walk(toc, ref, flags, work, arg);
+ __wt_hazard_clear(toc, ref->page);
+ if (ret != 0)
+ return (ret);
+ }
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Don't call the worker function for any page until all of its children
+ * have been visited. This allows the walker function to be used for
+ * the sync method, where reconciling a modified child page modifies the
+ * parent.
+ */
+ WT_RET(work(toc, page, arg));
+
+ return (0);
+}
+
+/*
+ * __wt_walk_begin --
+ * Start a tree walk.
+ */
+int
+__wt_walk_begin(WT_TOC *toc, WT_REF *ref, WT_WALK *walk)
+{
+ ENV *env;
+
+ env = toc->env;
+
+ /*
+ * The caller may be restarting a walk, so the structure may already
+ * be allocated. Allocate 20 slots: it's always going to be enough.
+ */
+ if (walk->tree_len == 0)
+ WT_RET(__wt_realloc(env, &walk->tree_len,
+ 20 * sizeof(WT_WALK_ENTRY), &walk->tree));
+ walk->tree_slot = 0;
+
+ walk->tree[0].ref = ref;
+ walk->tree[0].indx = 0;
+ walk->tree[0].visited = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_walk_end --
+ * End a tree walk.
+ */
+void
+__wt_walk_end(ENV *env, WT_WALK *walk)
+{
+ __wt_free(env, walk->tree, walk->tree_len);
+}
+
+/*
+ * __wt_walk_next --
+ * Return the next WT_REF/WT_PAGE in the tree, in a non-recursive way.
+ */
+int
+__wt_walk_next(WT_TOC *toc, WT_WALK *walk, WT_REF **refp)
+{
+ DB *db;
+ ENV *env;
+ WT_PAGE *page, *child;
+ WT_REF *ref;
+ WT_WALK_ENTRY *e;
+ uint elem;
+
+ env = toc->env;
+ db = toc->db;
+
+ e = &walk->tree[walk->tree_slot];
+ page = e->ref->page;
+
+ /*
+ * Coming into this function we have either a tree internal page (and
+ * we're walking the array of children), or a row-leaf page (and we're
+ * walking the array of off-page duplicate trees).
+ *
+ * If we've reached the end of this page, and haven't yet returned it,
+ * do that now. If the page has been returned, traversal is finished:
+ * pop the stack and call ourselve recursively, unless the entire tree
+ * has been traversed, in which case we return NULL.
+ */
+ if (e->visited) {
+ if (walk->tree_slot == 0) {
+ *refp = NULL;
+ return (0);
+ } else {
+ --walk->tree_slot;
+ return (__wt_walk_next(toc, walk, refp));
+ }
+ } else
+ if (e->indx == page->indx_count) {
+eop: e->visited = 1;
+ *refp = e->ref;
+ return (0);
+ }
+
+ /* Find the next WT_REF/WT_PAGE pair present in the cache. */
+ for (;;) {
+ switch (page->dsk->type) {
+ case WT_PAGE_ROW_LEAF:
+ ref = page->u3.dup[e->indx];
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ ref = &page->u3.ref[e->indx];
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ /*
+ * The row-leaf page off-page duplicates tree array has empty
+ * slots (unlike col/row internal pages), so check for a NULL
+ * ref.
+ *
+ * We only care about pages in the cache.
+ */
+ if (ref != NULL && ref->state == WT_OK)
+ break;
+
+ /*
+ * If we don't find another WT_REF/WT_OFF pair, do the
+ * post-order visit.
+ */
+ if (++e->indx == page->indx_count)
+ goto eop;
+ }
+
+ /*
+ * Check to see if the page has sub-trees associated with it, in which
+ * case we traverse those pages.
+ */
+ child = ref->page;
+ switch (child->dsk->type) {
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Check for off-page duplicates -- if there are any, push them
+ * onto the stack and recursively call ourselves to descend the
+ * tree.
+ */
+ if (!WT_PAGE_DUP_TREES(child))
+ break;
+ /* FALLTHROUGH */
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * The page has children.
+ *
+ * First, move past this child, then push the child onto our
+ * stack, and recursively descend the tree.
+ */
+ ++e->indx;
+
+ /* Check to see if we grew past the end of our stack. */
+ elem = walk->tree_len / sizeof(WT_WALK_ENTRY);
+ if (walk->tree_slot >= elem)
+ WT_RET(__wt_realloc(env, &walk->tree_len,
+ (elem + 20) * sizeof(WT_WALK_ENTRY), &walk->tree));
+
+ e = &walk->tree[++walk->tree_slot];
+ e->ref = ref;
+ e->indx = 0;
+ e->visited = 0;
+ return (__wt_walk_next(toc, walk, refp));
+ default:
+ break;
+ }
+
+ /* Return the child page, it's not interesting for further traversal. */
+ ++e->indx;
+ *refp = ref;
+ return (0);
+}
diff --git a/src/btree/c_drain.c b/src/btree/c_drain.c
new file mode 100644
index 00000000000..c213f652e75
--- /dev/null
+++ b/src/btree/c_drain.c
@@ -0,0 +1,940 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_evict(WT_TOC *);
+static int __wt_evict_compare_lru(const void *a, const void *b);
+static int __wt_evict_compare_page(const void *a, const void *b);
+static void __wt_evict_hazard_check(WT_TOC *);
+static int __wt_evict_hazard_compare(const void *a, const void *b);
+static void __wt_evict_page(WT_TOC *, int);
+static int __wt_evict_page_subtrees(WT_PAGE *);
+static void __wt_evict_set(WT_TOC *);
+static void __wt_evict_state_check(WT_TOC *);
+static int __wt_evict_walk(WT_TOC *);
+static int __wt_evict_walk_single(WT_TOC *, IDB *, uint);
+static void __wt_evict_write(WT_TOC *);
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_evict_hazard_validate(ENV *, WT_PAGE *);
+#endif
+
+/*
+ * Tuning constants -- I hesitate to call this tuning, but we should review some
+ * number of pages from each file's in-memory tree for each page we evict, and
+ * we should amortize the comparison of the hazard references across some number
+ * of eviction candidates.
+ */
+#define WT_EVICT_GROUP 10 /* Evict N pages at a time */
+#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */
+
+/*
+ * WT_EVICT_FOREACH --
+ * Walk a list of eviction candidates.
+ */
+#define WT_EVICT_FOREACH(cache, p, i) \
+ for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p))
+
+/*
+ * WT_EVICT_REF_CLR --
+ * Clear an eviction list entry.
+ */
+#define WT_EVICT_CLR(p) do { \
+ (p)->ref = NULL; \
+ (p)->idb = WT_DEBUG_POINT; \
+} while (0)
+
+/*
+ * __wt_workq_evict_server --
+ * See if the eviction server thread needs to be awakened.
+ */
+void
+__wt_workq_evict_server(ENV *env, int force)
+{
+ WT_CACHE *cache;
+ uint64_t bytes_inuse, bytes_max;
+
+ cache = env->ienv->cache;
+
+ /* If the eviction server is running, there's nothing to do. */
+ if (!cache->evict_sleeping)
+ return;
+
+ /*
+ * If we're locking out reads, or over our cache limit, or forcing the
+ * issue (when closing the environment), run the eviction server.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (!force && !cache->read_lockout && bytes_inuse < bytes_max)
+ return;
+
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "waking eviction server: force %sset, read lockout %sset, "
+ "bytes inuse %s max (%lluMB %s %lluMB), ",
+ force ? "" : "not ", cache->read_lockout ? "" : "not ",
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ (unsigned long long)(bytes_inuse / WT_MEGABYTE),
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ (unsigned long long)(bytes_max / WT_MEGABYTE)));
+
+ cache->evict_sleeping = 0;
+ __wt_unlock(env, cache->mtx_evict);
+}
+
+/*
+ * __wt_cache_evict_server --
+ * Thread to evict pages from the cache.
+ */
+void *
+__wt_cache_evict_server(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_TOC *toc;
+ uint64_t bytes_inuse, bytes_max;
+ int ret;
+
+ env = arg;
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ /* We need a thread of control because we're reading/writing pages. */
+ toc = NULL;
+ WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc));
+
+ /*
+ * Allocate memory for a copy of the hazard references -- it's a fixed
+ * size so doesn't need run-time adjustments.
+ */
+ cache->hazard_elem = env->toc_size * env->hazard_size;
+ WT_ERR(__wt_calloc(
+ env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard));
+ cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *);
+
+ for (;;) {
+ WT_VERBOSE(env,
+ WT_VERB_EVICT, (env, "eviction server sleeping"));
+ cache->evict_sleeping = 1;
+ __wt_lock(env, cache->mtx_evict);
+ WT_VERBOSE(env,
+ WT_VERB_EVICT, (env, "eviction server waking"));
+
+ /*
+ * Check for environment exit; do it here, instead of the top of
+ * the loop because doing it here keeps us from doing a bunch of
+ * worked when simply awakened to quit.
+ */
+ if (!F_ISSET(ienv, WT_SERVER_RUN))
+ break;
+
+ for (;;) {
+ /*
+ * The cache eviction server is a long-running thread;
+ * its TOC must "enter" and "leave" the library
+ * periodically in order to be a good thread citizen.
+ */
+ WT_TOC_GEN_SET(toc);
+
+ /* Single-thread reconciliation. */
+ __wt_lock(env, cache->mtx_reconcile);
+ ret = __wt_evict(toc);
+ __wt_unlock(env, cache->mtx_reconcile);
+ if (ret != 0)
+ goto err;
+
+ WT_TOC_GEN_CLR(toc);
+
+ /*
+ * If we've locked out reads, keep evicting until we
+ * get to at least 5% under the maximum cache. Else,
+ * quit evicting as soon as we get under the maximum
+ * cache.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (cache->read_lockout) {
+ if (bytes_inuse <= bytes_max - (bytes_max / 20))
+ break;
+ } else if (bytes_inuse < bytes_max)
+ break;
+ }
+ }
+
+err: if (cache->evict != NULL)
+ __wt_free(env, cache->evict, cache->evict_len);
+ if (cache->hazard != NULL)
+ __wt_free(env, cache->hazard, cache->hazard_len);
+ if (toc != NULL)
+ WT_TRET(toc->close(toc, 0));
+
+ if (ret != 0)
+ __wt_api_env_err(env, ret, "cache eviction server error");
+
+ WT_VERBOSE(
+ env, WT_VERB_EVICT, (env, "cache eviction server exiting"));
+
+ return (NULL);
+}
+
+/*
+ * __wt_evict --
+ * Evict pages from the cache.
+ */
+static int
+__wt_evict(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint elem, i, j;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /* Get some more pages to consider for eviction. */
+ WT_RET(__wt_evict_walk(toc));
+
+ /*
+ * We have an array of page eviction references that may contain NULLs,
+ * as well as duplicate entries.
+ *
+ * First, sort the array by WT_REF address, then delete any duplicates.
+ * The reason is because we might evict the page but leave a duplicate
+ * entry in the "saved" area of the array, and that would be a NULL
+ * dereference on the next run. (If someone ever tries to remove this
+ * duplicate cleanup for better performance, you can't fix it just by
+ * checking the WT_REF state -- that only works if you are discarding
+ * a page from a single level of the tree; if you are discarding a
+ * page and its parent, the duplicate of the page's WT_REF might have
+ * been free'd before a subsequent review of the eviction array.)
+ */
+ evict = cache->evict;
+ elem = cache->evict_elem;
+ qsort(evict,
+ (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
+ for (i = 0; i < elem; i = j)
+ for (j = i + 1; j < elem; ++j) {
+ /*
+ * If the leading pointer hits a NULL, we're done, the
+ * NULLs all sorted to the top of the array.
+ */
+ if (evict[j].ref == NULL)
+ goto done_duplicates;
+
+ /* Delete the second and any subsequent duplicates. */
+ if (evict[i].ref == evict[j].ref)
+ WT_EVICT_CLR(&evict[j]);
+ else
+ break;
+ }
+done_duplicates:
+
+ /* Second, sort the array by LRU. */
+ qsort(evict,
+ (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru);
+
+ /*
+ * Discarding pages is done in 5 steps:
+ * Set the WT_EVICT state
+ * Check for any hazard references
+ * Discard clean pages
+ * Reconcile dirty pages (making them clean)
+ * Discard clean pages
+ *
+ * The reason we release clean pages, then reconcile dirty pages, then
+ * release clean pages again is because reconciling a dirty page is a
+ * slow operation, and this releases space sooner. (Arguably, we are
+ * going to discard all of the pages anyway, so what does it matter if
+ * we make clean pages wait for the dirty page writes? On the other
+ * hand, it's a small change and benefits any thread waiting to read a
+ * clean page we picked for discarding, unlikely though that may be.)
+ */
+ __wt_evict_set(toc);
+ __wt_evict_hazard_check(toc);
+ __wt_evict_state_check(toc);
+ __wt_evict_page(toc, 0);
+ __wt_evict_write(toc);
+ __wt_evict_page(toc, 1);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_walk --
+ * Fill in the eviction array by walking the next set of pages from
+ * each open file.
+ */
+static int
+__wt_evict_walk(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ uint elem, i;
+ int ret;
+
+ env = toc->env;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Resize the array in which we're tracking pages, as necessary, then
+ * get some pages from each underlying file. We hold a mutex for the
+ * entire time -- it's slow, but (1) how often do new files get added
+ * or removed to/from the system, and (2) it's all in-memory stuff, so
+ * it's not that slow.
+ */
+ ret = 0;
+ __wt_lock(env, ienv->mtx);
+ /* Reserve WT_EVICT_WALK_PER_TABLE slots for each open file. */
+ elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE);
+ if (elem <= cache->evict_elem || (ret = __wt_realloc(env,
+ &cache->evict_len,
+ elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) {
+ cache->evict_elem = elem;
+
+ /* Fill each file's block of slots; stop on the first error. */
+ i = WT_EVICT_WALK_BASE;
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0)
+ break;
+ i += WT_EVICT_WALK_PER_TABLE;
+ }
+ }
+ __wt_unlock(env, ienv->mtx);
+ return (ret);
+}
+
+/*
+ * __wt_evict_walk_single --
+ * Get a few page eviction candidates from a single underlying file,
+ * filling slots [slot, slot + WT_EVICT_WALK_PER_TABLE) of the array.
+ */
+static int
+__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot)
+{
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ int i, restarted_once;
+
+ cache = toc->env->ienv->cache;
+
+ /*
+ * Tricky little loop that restarts the walk as necessary, without
+ * resetting the count of pages retrieved.
+ */
+ i = restarted_once = 0;
+
+ /*
+ * If we haven't yet opened a tree-walk structure, do so. (The restart
+ * label deliberately re-enters here from inside the loop below,
+ * bypassing the NULL check, so the walk begins again from the root.)
+ */
+ if (idb->evict_walk.tree == NULL)
+restart: WT_RET(
+ __wt_bt_walk_begin(toc, &idb->root_page, &idb->evict_walk));
+
+ /* Get the next WT_EVICT_WALK_PER_TABLE entries. */
+ do {
+ evict = &cache->evict[slot];
+ WT_RET(__wt_bt_walk_next(toc, &idb->evict_walk, &evict->ref));
+
+ /*
+ * Restart the walk as necessary, but only once (after one
+ * restart we've already acquired all of the pages, and we
+ * could loop infinitely on a tree with a single, pinned, page).
+ */
+ if (evict->ref == NULL) {
+ if (restarted_once++)
+ break;
+ goto restart;
+ }
+
+ evict->idb = idb;
+ ++slot;
+ } while (++i < WT_EVICT_WALK_PER_TABLE);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_db_clear --
+ * Remove any entries for a file from the eviction list.
+ */
+void
+__wt_evict_db_clear(WT_TOC *toc)
+{
+ ENV *env;
+ IDB *idb;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ uint i;
+
+ env = toc->env;
+ idb = toc->db->idb;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Discard any entries in the eviction list to a file we're closing
+ * (the caller better have locked out the eviction thread).
+ */
+ if (cache->evict == NULL)
+ return;
+ /* Clear every slot that references the closing file's IDB. */
+ WT_EVICT_FOREACH(cache, evict, i)
+ if (evict->ref != NULL && evict->idb == idb)
+ WT_EVICT_CLR(evict);
+}
+
+/*
+ * __wt_evict_set --
+ * Set the WT_EVICT state on every page currently in the eviction list,
+ * shutting out new readers.
+ */
+static void
+__wt_evict_set(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * Set the entry state so readers don't try and use the pages. Once
+ * that's done, any thread searching for a page will either see our
+ * state value, or will have already set a hazard reference to the page.
+ * We don't evict a page with a hazard reference set, so we can't race.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ ref->state = WT_EVICT;
+ }
+}
+
+/*
+ * __wt_evict_hazard_check --
+ * Compare the list of hazard references to the list of pages to be
+ * discarded; any page found on the hazard list is removed from the
+ * eviction list and returned to service.
+ */
+static void
+__wt_evict_hazard_check(WT_TOC *toc)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE **hazard, **end_hazard, *page;
+ WT_REF *ref;
+ WT_STATS *stats;
+ uint i;
+
+ env = toc->env;
+ ienv = env->ienv;
+ cache = ienv->cache;
+ stats = cache->stats;
+
+ /* Sort the eviction candidates by WT_PAGE address. */
+ qsort(cache->evict, (size_t)WT_EVICT_GROUP,
+ sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
+
+ /* Copy the hazard reference array and sort it by WT_PAGE address. */
+ hazard = cache->hazard;
+ end_hazard = hazard + cache->hazard_elem;
+ memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *));
+ qsort(hazard, (size_t)cache->hazard_elem,
+ sizeof(WT_PAGE *), __wt_evict_hazard_compare);
+
+ /*
+ * Walk the lists in parallel and look for matches: both lists are
+ * sorted ascending by page address, so the hazard cursor only ever
+ * moves forward across eviction entries.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+
+ /*
+ * Look for the page in the hazard list until we reach the end
+ * of the list or find a hazard pointer larger than the page.
+ */
+ for (page = ref->page;
+ hazard < end_hazard && *hazard < page; ++hazard)
+ ;
+ /* Hazard list exhausted: no later entry can match either. */
+ if (hazard == end_hazard)
+ break;
+
+ /*
+ * If we find a matching hazard reference, the page is in use:
+ * remove it from the eviction list.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ if (*hazard == page) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (hazard reference)",
+ page->addr));
+ WT_STAT_INCR(stats, CACHE_EVICT_HAZARD);
+
+ /*
+ * A page with a low LRU and a hazard reference?
+ *
+ * Set the page's LRU so we don't select it again.
+ * Return the page to service.
+ * Discard our reference.
+ */
+ ref->page->read_gen = ++cache->read_gen;
+ ref->state = WT_OK;
+ WT_EVICT_CLR(evict);
+ }
+ }
+}
+
+/*
+ * __wt_evict_state_check --
+ * Confirm these are pages we want to evict: drop pinned pages and
+ * pages with in-memory subtrees from the eviction list.
+ */
+static void
+__wt_evict_state_check(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /*
+ * We "own" the pages (we've flagged them for eviction, and there were
+ * no hazard references). Now do checks to see if these are pages we
+ * can evict -- we have to wait until after we own the page because the
+ * page might be updated and race with us.
+ */
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /* Ignore pinned pages. */
+ if (F_ISSET(page, WT_PINNED)) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (pinned)",
+ page->addr));
+ goto skip;
+ }
+
+ /* Ignore pages with in-memory subtrees. */
+ switch (page->hdr->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if (__wt_evict_page_subtrees(page)) {
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "eviction skipped page addr %lu (subtrees)",
+ page->addr));
+ goto skip;
+ }
+ break;
+ default:
+ break;
+ }
+
+ continue;
+
+skip: /*
+ * Set the page's LRU so we don't select it again.
+ * Return the page to service.
+ * Discard our reference.
+ */
+ page->read_gen = ++cache->read_gen;
+ ref->state = WT_OK;
+ WT_EVICT_CLR(evict);
+ }
+}
+
+/*
+ * __wt_evict_write --
+ * Write any modified pages on the eviction list, making them clean.
+ */
+static void
+__wt_evict_write(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /* Skip clean pages; only modified pages need writing. */
+ if (!WT_PAGE_IS_MODIFIED(page))
+ continue;
+
+ /*
+ * We're using our WT_TOC handle, it needs to reference the
+ * correct DB handle.
+ *
+ * XXX
+ * This is pretty sleazy, but I'm hesitant to try and drive
+ * a separate DB/IDB handle down through the reconciliation
+ * code.
+ */
+ toc->db = evict->idb->db;
+ (void)__wt_bt_rec_page(toc, page);
+ }
+}
+
+/*
+ * __wt_evict_page --
+ * Evict cache pages: discard every clean page on the eviction list,
+ * releasing its cache space and freeing its memory.
+ */
+static void
+__wt_evict_page(WT_TOC *toc, int was_dirty)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_PAGE *page;
+ WT_REF *ref;
+ WT_STATS *stats;
+ uint i;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ WT_EVICT_FOREACH(cache, evict, i) {
+ if ((ref = evict->ref) == NULL)
+ continue;
+ page = ref->page;
+
+ /*
+ * The first time we're called, we get rid of the clean pages;
+ * the second time we're called, we get rid of the pages that
+ * were dirty but have since been cleaned. Ignore dirty pages
+ * in all cases, it's simpler.
+ */
+ if (WT_PAGE_IS_MODIFIED(page))
+ continue;
+
+ /* was_dirty only selects which statistic is incremented. */
+ if (was_dirty)
+ WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED);
+ else
+ WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED);
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_evict_hazard_validate(env, page);
+#endif
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "cache evicting page addr %lu", page->addr));
+
+ /*
+ * Copy a page reference, then make the cache entry available
+ * for re-use.
+ *
+ * No memory flush needed, the state field is declared volatile.
+ */
+ ref->page = NULL;
+ ref->state = WT_EMPTY;
+
+ /* Remove the entry from the eviction list. */
+ WT_EVICT_CLR(evict);
+
+ /* We've got more space. */
+ WT_CACHE_PAGE_OUT(cache, page->size);
+
+ /* The page can no longer be found, free the memory. */
+ __wt_bt_page_discard(toc, page);
+ }
+}
+
+/*
+ * __wt_evict_page_subtrees --
+ * Return non-zero if a page has an in-memory subtree (i.e., any child
+ * reference whose state is not WT_EMPTY).
+ */
+static int
+__wt_evict_page_subtrees(WT_PAGE *page)
+{
+ WT_REF *ref, **dupp;
+ uint32_t i;
+
+ /*
+ * Return if a page has an in-memory subtree -- this array search could
+ * be replaced by a reference count in the page, but (1) the eviction
+ * thread isn't where I expect performance problems, (2) I hate to lose
+ * more bytes on every page, (3) how often will an internal page be
+ * evicted anyway?
+ */
+ switch (page->hdr->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ WT_REF_FOREACH(page, ref, i)
+ if (ref->state != WT_EMPTY)
+ return (1);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /* Row-store leaves may root off-page duplicate trees. */
+ if (WT_PAGE_DUP_TREES(page))
+ WT_DUP_FOREACH(page, dupp, i)
+ if (*dupp != NULL && (*dupp)->state != WT_EMPTY)
+ return (1);
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_evict_compare_page --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's address.
+ */
+static int
+__wt_evict_compare_page(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ WT_PAGE *a_page, *b_page;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the page address in ascending order. */
+ a_page = a_ref->page;
+ b_page = b_ref->page;
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+/*
+ * __wt_evict_compare_lru --
+ * Qsort function: sort WT_EVICT_LIST array based on the page's read
+ * generation (lowest read generation -- least recently used -- first).
+ */
+static int
+__wt_evict_compare_lru(const void *a, const void *b)
+{
+ WT_REF *a_ref, *b_ref;
+ uint32_t a_lru, b_lru;
+
+ /*
+ * There may be NULL references in the array; sort them as greater than
+ * anything else so they migrate to the end of the array.
+ */
+ a_ref = ((WT_EVICT_LIST *)a)->ref;
+ b_ref = ((WT_EVICT_LIST *)b)->ref;
+ if (a_ref == NULL)
+ return (b_ref == NULL ? 0 : 1);
+ if (b_ref == NULL)
+ return (-1);
+
+ /* Sort the LRU in ascending order. */
+ a_lru = a_ref->page->read_gen;
+ b_lru = b_ref->page->read_gen;
+ return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0));
+}
+
+/*
+ * __wt_evict_hazard_compare --
+ * Qsort function: sort hazard list based on the page's address
+ * (elements are WT_PAGE pointers, compared directly).
+ */
+static int
+__wt_evict_hazard_compare(const void *a, const void *b)
+{
+ WT_PAGE *a_page, *b_page;
+
+ a_page = *(WT_PAGE **)a;
+ b_page = *(WT_PAGE **)b;
+
+ return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_evict_hazard_validate --
+ * Diagnostic check: abort the process if the page appears on any
+ * thread's hazard list (a page being evicted must have none).
+ */
+static void
+__wt_evict_hazard_validate(ENV *env, WT_PAGE *page)
+{
+ IENV *ienv;
+ WT_PAGE **hp;
+ WT_TOC **tp, *toc;
+
+ ienv = env->ienv;
+
+ /* Scan every WT_TOC's fixed-size hazard slot array. */
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp)
+ for (hp = toc->hazard;
+ hp < toc->hazard + toc->env->hazard_size; ++hp)
+ if (*hp == page) {
+ __wt_api_env_errx(env,
+ "hazard eviction check for page %lu "
+ "failed",
+ (u_long)page->addr);
+ __wt_abort(env);
+ }
+}
+
+/*
+ * __wt_evict_dump --
+ * Display the eviction list (diagnostic only; skips empty slots).
+ */
+void
+__wt_evict_dump(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_EVICT_LIST *evict;
+ WT_MBUF mb;
+ uint n;
+ int sep;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ __wt_mb_init(env, &mb);
+ __wt_mb_add(&mb, "eviction list");
+
+ /* ':' before the first address, ',' before the rest. */
+ for (sep = ':', n = 0; n < cache->evict_elem; ++n) {
+ evict = &cache->evict[n];
+ if (evict->ref == NULL)
+ continue;
+ __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr);
+ sep = ',';
+ }
+ __wt_mb_discard(&mb);
+}
+
+/*
+ * __wt_evict_cache_dump --
+ * Dump the in-memory cache, one tree per open file.
+ */
+int
+__wt_evict_cache_dump(WT_TOC *toc)
+{
+ IDB *idb;
+ IENV *ienv;
+
+ ienv = toc->env->ienv;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_RET(__wt_evict_tree_dump(toc, idb));
+ return (0);
+}
+
+/*
+ * __wt_evict_tree_dump --
+ * Dump an in-memory tree: cache usage summary plus the address of
+ * every in-memory page.
+ */
+int
+__wt_evict_tree_dump(WT_TOC *toc, IDB *idb)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_REF *ref;
+ WT_WALK walk;
+ WT_MBUF mb;
+ int sep;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_VERBOSE(env, WT_VERB_EVICT, (env,
+ "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)",
+ idb->name,
+ __wt_cache_pages_inuse(cache),
+ __wt_cache_bytes_inuse(cache),
+ WT_STAT(cache->stats, CACHE_BYTES_MAX)));
+
+ __wt_mb_init(env, &mb);
+ __wt_mb_add(&mb, "in-memory page list");
+
+ /* Walk the tree from the root, listing each page's address. */
+ WT_CLEAR(walk);
+ WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk));
+ for (sep = ':';;) {
+ WT_RET(__wt_bt_walk_next(toc, &walk, &ref));
+ if (ref == NULL)
+ break;
+ __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr);
+ sep = ',';
+ }
+ __wt_bt_walk_end(env, &walk);
+ __wt_mb_discard(&mb);
+
+ return (0);
+}
+
+/*
+ * __wt_evict_cache_count --
+ * Return the count of nodes in the cache, summed over all open files.
+ */
+int
+__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp)
+{
+ IDB *idb;
+ IENV *ienv;
+ uint64_t nodes;
+
+ ienv = toc->env->ienv;
+
+ *nodesp = 0;
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ WT_RET(__wt_evict_tree_count(toc, idb, &nodes));
+ *nodesp += nodes;
+ }
+ return (0);
+}
+
+/*
+ * __wt_evict_tree_count --
+ * Return a count of in-memory nodes in the tree, via a full walk.
+ */
+int
+__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_REF *ref;
+ WT_WALK walk;
+ uint64_t nodes;
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ WT_CLEAR(walk);
+ WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk));
+ for (nodes = 0;;) {
+ WT_RET(__wt_bt_walk_next(toc, &walk, &ref));
+ if (ref == NULL)
+ break;
+ ++nodes;
+ }
+ *nodesp = nodes;
+ __wt_bt_walk_end(env, &walk);
+
+ return (0);
+}
+#endif
diff --git a/src/btree/c_init.c b/src/btree/c_init.c
new file mode 100644
index 00000000000..641f90d9a56
--- /dev/null
+++ b/src/btree/c_init.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache: allocate the WT_CACHE structure, its
+ * mutexes and statistics, and set the configured maximum cache size.
+ */
+int
+__wt_cache_create(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache));
+ cache = ienv->cache;
+
+ WT_ERR(
+ __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict));
+ WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read));
+ WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile));
+
+ WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats));
+
+ /* env->cache_size is configured in megabytes. */
+ WT_STAT_SET(
+ cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE);
+
+ return (0);
+
+ /* On any allocation failure, tear down whatever was built. */
+err: (void)__wt_cache_destroy(env);
+ return (ret);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use (pages-in minus pages-out, clamped
+ * at zero).
+ */
+inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ uint64_t pages_in, pages_out;
+
+ /*
+ * Reading 64-bit fields, potentially on 32-bit machines, and other
+ * threads of control may be modifying them. Check them for sanity
+ * (although "interesting" corruption is vanishingly unlikely, these
+ * values just increment over time).
+ */
+ pages_in = cache->stat_pages_in;
+ pages_out = cache->stat_pages_out;
+ return (pages_in > pages_out ? pages_in - pages_out : 0);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use (bytes-in minus bytes-out, clamped
+ * at zero).
+ */
+inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ uint64_t bytes_in, bytes_out;
+
+ /*
+ * Reading 64-bit fields, potentially on 32-bit machines, and other
+ * threads of control may be modifying them. Check them for sanity
+ * (although "interesting" corruption is vanishingly unlikely, these
+ * values just increment over time).
+ */
+ bytes_in = cache->stat_bytes_in;
+ bytes_out = cache->stat_bytes_out;
+ return (bytes_in > bytes_out ? bytes_in - bytes_out : 0);
+}
+
+/*
+ * __wt_cache_stats --
+ * Update the cache statistics for return to the application (snapshot
+ * of current bytes/pages in use).
+ */
+void
+__wt_cache_stats(ENV *env)
+{
+ WT_CACHE *cache;
+ WT_STATS *stats;
+
+ cache = env->ienv->cache;
+ stats = cache->stats;
+
+ WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache));
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache: mutexes, statistics, and the WT_CACHE
+ * structure itself. Safe to call on a partially-created cache.
+ */
+int
+__wt_cache_destroy(ENV *env)
+{
+ IENV *ienv;
+ WT_CACHE *cache;
+ int ret;
+
+ ienv = env->ienv;
+ cache = ienv->cache;
+ ret = 0;
+
+ if (cache == NULL)
+ return (0);
+
+ /* Discard mutexes. */
+ if (cache->mtx_evict != NULL)
+ (void)__wt_mtx_destroy(env, cache->mtx_evict);
+ if (cache->mtx_read != NULL)
+ __wt_mtx_destroy(env, cache->mtx_read);
+ if (cache->mtx_reconcile != NULL)
+ __wt_mtx_destroy(env, cache->mtx_reconcile);
+
+ /* Discard allocated memory, and clear. */
+ __wt_free(env, cache->stats, 0);
+ __wt_free(env, ienv->cache, sizeof(WT_CACHE));
+
+ return (ret);
+}
diff --git a/src/btree/c_page.c b/src/btree/c_page.c
new file mode 100644
index 00000000000..cd71c0b4ebf
--- /dev/null
+++ b/src/btree/c_page.c
@@ -0,0 +1,69 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_read --
+ * Read a database page (same as read, but verify the checksum).
+ */
+int
+__wt_page_read(DB *db, WT_PAGE *page)
+{
+ ENV *env;
+ WT_FH *fh;
+ WT_PAGE_HDR *hdr;
+ off_t offset;
+ uint32_t checksum;
+
+ env = db->env;
+ fh = db->idb->fh;
+ hdr = page->hdr;
+
+ offset = WT_ADDR_TO_OFF(db, page->addr);
+ WT_RET(__wt_read(env, fh, offset, page->size, hdr));
+
+ /*
+ * The checksum was computed with the header's checksum field zeroed
+ * (see __wt_page_write); save and zero it before re-computing.
+ */
+ checksum = hdr->checksum;
+ hdr->checksum = 0;
+ if (checksum != __wt_cksum(hdr, page->size)) {
+ __wt_api_env_errx(env,
+ "read checksum error: addr/size %lu/%lu at offset %llu",
+ (u_long)page->addr,
+ (u_long)page->size, (unsigned long long)offset);
+ return (WT_ERROR);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_page_write --
+ * Write a database page, computing its checksum with the checksum
+ * field itself zeroed.
+ */
+int
+__wt_page_write(WT_TOC *toc, WT_PAGE *page)
+{
+ DB *db;
+ ENV *env;
+ WT_FH *fh;
+ WT_PAGE_HDR *hdr;
+
+ db = toc->db;
+ env = toc->env;
+ fh = db->idb->fh;
+
+ /* Diagnostic builds verify the on-disk format before writing. */
+ WT_ASSERT(env, __wt_bt_verify_dsk_page(toc, page) == 0);
+
+ hdr = page->hdr;
+ hdr->checksum = 0;
+ hdr->checksum = __wt_cksum(hdr, page->size);
+
+ return (__wt_write(
+ env, fh, WT_ADDR_TO_OFF(db, page->addr), page->size, hdr));
+}
diff --git a/src/btree/c_read.c b/src/btree/c_read.c
new file mode 100644
index 00000000000..1578b5ee642
--- /dev/null
+++ b/src/btree/c_read.c
@@ -0,0 +1,273 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2010 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_cache_read(WT_READ_REQ *);
+
+/*
+ * __wt_workq_read_server --
+ * See if the read server thread needs to be awakened, maintaining the
+ * read-lockout hysteresis (lock out above 110% of max, release below
+ * 95% of max).
+ */
+void
+__wt_workq_read_server(ENV *env, int force)
+{
+ WT_CACHE *cache;
+ uint64_t bytes_inuse, bytes_max;
+
+ cache = env->ienv->cache;
+
+ /*
+ * If we're 10% over the maximum cache, shut out reads (which include
+ * page allocations) until we evict to at least 5% under the maximum
+ * cache. The idea is that we don't want to run on the edge all the
+ * time -- if we're seriously out of space, get things under control
+ * before opening up for more reads.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
+ if (cache->read_lockout) {
+ if (bytes_inuse <= bytes_max - (bytes_max / 20))
+ cache->read_lockout = 0;
+ } else if (bytes_inuse > bytes_max + (bytes_max / 10)) {
+ WT_VERBOSE(env, WT_VERB_READ, (env,
+ "workQ locks out reads: bytes-inuse %llu of bytes-max %llu",
+ (unsigned long long)bytes_inuse,
+ (unsigned long long)bytes_max));
+ cache->read_lockout = 1;
+ }
+
+ /* If the cache read server is running, there's nothing to do. */
+ if (!cache->read_sleeping)
+ return;
+
+ /*
+ * If reads are locked out and we're not forcing the issue (that's when
+ * closing the environment, or if there's a priority read waiting to be
+ * handled), we're done.
+ */
+ if (!force && cache->read_lockout)
+ return;
+
+ /* Wake the read server by releasing its mutex. */
+ cache->read_sleeping = 0;
+ __wt_unlock(env, cache->mtx_read);
+}
+
+/*
+ * __wt_cache_read_serial_func --
+ * Read/allocation serialization function called when a page-in requires
+ * allocation or a read; queues the request in the first empty slot of
+ * the read-request table.
+ */
+int
+__wt_cache_read_serial_func(WT_TOC *toc)
+{
+ ENV *env;
+ WT_CACHE *cache;
+ WT_OFF *off;
+ WT_PAGE *parent;
+ WT_READ_REQ *rr, *rr_end;
+ WT_REF *ref;
+ int dsk_verify;
+
+ __wt_cache_read_unpack(toc, parent, ref, off, dsk_verify);
+
+ env = toc->env;
+ cache = env->ienv->cache;
+
+ /* Find an empty slot and enter the read request. */
+ rr = cache->read_request;
+ rr_end = rr + WT_ELEMENTS(cache->read_request);
+ for (; rr < rr_end; ++rr)
+ if (WT_READ_REQ_ISEMPTY(rr)) {
+ WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify);
+ return (0);
+ }
+ /* Table full: the caller must retry the operation. */
+ __wt_api_env_errx(env, "read server request table full");
+ return (WT_RESTART);
+}
+
+/*
+ * __wt_cache_read_server --
+ * Thread to do database reads: sleeps on mtx_read until awakened,
+ * then drains the read-request table.
+ */
+void *
+__wt_cache_read_server(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_CACHE *cache;
+ WT_READ_REQ *rr, *rr_end;
+ WT_TOC *toc;
+ int didwork, ret;
+
+ env = arg;
+ ienv = env->ienv;
+ cache = ienv->cache;
+
+ /*
+ * Initialize ret: if the server is awakened only to exit, the final
+ * "if (ret != 0)" check would otherwise read an uninitialized value.
+ */
+ ret = 0;
+
+ rr = cache->read_request;
+ rr_end = rr + WT_ELEMENTS(cache->read_request);
+
+ for (;;) {
+ WT_VERBOSE(env,
+ WT_VERB_READ, (env, "cache read server sleeping"));
+ cache->read_sleeping = 1;
+ __wt_lock(env, cache->mtx_read);
+ WT_VERBOSE(
+ env, WT_VERB_READ, (env, "cache read server waking"));
+
+ /*
+ * Check for environment exit; do it here, instead of the top of
+ * the loop because doing it here keeps us from doing a bunch of
+ * work when simply awakened to quit.
+ */
+ if (!F_ISSET(ienv, WT_SERVER_RUN))
+ break;
+
+ /*
+ * Walk the read-request queue, looking for reads (defined by
+ * a valid WT_TOC handle). If we find a read request, perform
+ * it, flush the result and clear the request slot, then wake
+ * up the requesting thread. The request slot clear doesn't
+ * need to be flushed, but we have to flush the read result,
+ * might as well include it. If we don't find any work, go to
+ * sleep.
+ */
+ do {
+ didwork = 0;
+ for (rr = cache->read_request; rr < rr_end; ++rr) {
+ if ((toc = rr->toc) == NULL)
+ continue;
+ if (cache->read_lockout &&
+ !F_ISSET(toc, WT_READ_PRIORITY))
+ continue;
+
+ /*
+ * The read server thread does both general file
+ * allocation and cache page instantiation. In
+ * a file allocation, there's no pagep field in
+ * in which to return a page.
+ */
+ ret = __wt_cache_read(rr);
+
+ WT_READ_REQ_CLR(rr);
+ __wt_toc_serialize_wrapup(toc, NULL, ret);
+
+ didwork = 1;
+
+ /*
+ * Any error terminates the request; a serious
+ * error causes the read server to exit.
+ */
+ if (ret != 0) {
+ if (ret != WT_RESTART)
+ goto err;
+ ret = 0;
+ }
+ }
+ } while (didwork);
+ }
+
+ if (ret != 0)
+err: __wt_api_env_err(env, ret, "cache read server error");
+
+ WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting"));
+ return (NULL);
+}
+
+/*
+ * __wt_cache_read --
+ * Read a page from the file and instantiate its in-memory form,
+ * leaving the WT_REF in the WT_OK state on success.
+ */
+static int
+__wt_cache_read(WT_READ_REQ *rr)
+{
+ DB *db;
+ ENV *env;
+ WT_CACHE *cache;
+ WT_FH *fh;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_REF *ref;
+ WT_TOC *toc;
+ uint32_t addr, size;
+ int ret;
+
+ toc = rr->toc;
+ ref = rr->ref;
+ off = rr->off;
+ addr = off->addr;
+ size = off->size;
+
+ db = toc->db;
+ env = toc->env;
+ cache = env->ienv->cache;
+ fh = db->idb->fh;
+ ret = 0;
+
+ /*
+ * Check to see if some other thread brought the page into the cache
+ * while our request was in the queue. If the state is anything
+ * other than empty, it's not our problem.
+ */
+ if (ref->state != WT_EMPTY)
+ return (0);
+
+ /*
+ * The page isn't in the cache, and since we're the only path for the
+ * page to get into the cache, we don't have to worry further, and
+ * we might as well get to it.
+ *
+ * Allocate memory for the in-memory page information and for the page
+ * itself. They're two separate allocation calls so we (hopefully) get
+ * better alignment from the underlying heap memory allocator.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page));
+ WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &page->hdr));
+
+ /* Read the page. */
+ WT_VERBOSE(env, WT_VERB_READ,
+ (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size));
+ WT_STAT_INCR(cache->stats, PAGE_READ);
+
+ page->addr = addr;
+ page->size = size;
+ WT_ERR(__wt_page_read(db, page));
+ /*
+ * NOTE(review): the bytes are counted in here, but the error path
+ * below frees the page without a matching WT_CACHE_PAGE_OUT --
+ * presumably a small accounting leak on verify/inmem failure;
+ * confirm against the cache accounting macros.
+ */
+ WT_CACHE_PAGE_IN(cache, size);
+
+ /* If the page needs to be verified, that's next. */
+ if (rr->dsk_verify)
+ WT_ERR(__wt_bt_verify_dsk_page(toc, page));
+
+ /* Build the in-memory version of the page. */
+ WT_ERR(__wt_bt_page_inmem(toc, page));
+
+ /*
+ * Reference the parent's WT_PAGE and parent's WT_OFF structure that
+ * read the page.
+ */
+ page->parent = rr->parent;
+ page->parent_off = off;
+
+ /*
+ * The page is now available -- set the LRU so the page is not selected
+ * for eviction.
+ */
+ page->read_gen = ++cache->read_gen;
+ ref->page = page;
+ ref->state = WT_OK;
+
+ return (0);
+
+err: if (page != NULL) {
+ if (page->hdr != NULL)
+ __wt_free(env, page->hdr, size);
+ __wt_free(env, page, sizeof(WT_PAGE));
+ }
+ return (ret);
+}
diff --git a/src/btree/col_get.c b/src/btree/col_get.c
new file mode 100644
index 00000000000..7ab2f242a35
--- /dev/null
+++ b/src/btree/col_get.c
@@ -0,0 +1,40 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_col_get --
+ * Db.col_get method: retrieve a record by record number from a
+ * column-store database.
+ */
+int
+__wt_db_col_get(WT_TOC *toc, uint64_t recno, DBT *data)
+{
+ DB *db;
+ IDB *idb;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+
+ /* Record-number access only applies to column stores. */
+ if (!F_ISSET(idb, WT_COLUMN)) {
+ __wt_api_db_errx(db,
+ "row database records cannot be retrieved by record "
+ "number");
+ return (WT_ERROR);
+ }
+
+ WT_ERR(__wt_col_search(toc, recno, WT_NOLEVEL, 0));
+ ret = __wt_dbt_return(toc, NULL, data, 0);
+
+ /*
+ * NOTE(review): if the search fails, toc->srch_page may still be NULL
+ * here (it's cleared at the start of __wt_col_search) -- presumably
+ * __wt_hazard_clear tolerates NULL; confirm.
+ */
+err: if (toc->srch_page != idb->root_page.page)
+ __wt_hazard_clear(toc, toc->srch_page);
+ return (ret);
+}
diff --git a/src/btree/col_put.c b/src/btree/col_put.c
new file mode 100644
index 00000000000..e7e76778fe3
--- /dev/null
+++ b/src/btree/col_put.c
@@ -0,0 +1,229 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_col_update(WT_TOC *, uint64_t, DBT *, int);
+
+/*
+ * __wt_db_col_del --
+ * Db.col_del method: delete a record (a delete is an update with NULL
+ * data and no overwrite flag).
+ */
+inline int
+__wt_db_col_del(WT_TOC *toc, uint64_t recno)
+{
+ return (__wt_col_update(toc, recno, NULL, 0));
+}
+
+/*
+ * __wt_db_col_put --
+ * Db.put method: store a record, enforcing the fixed record length when
+ * the database is configured with one.
+ */
+inline int
+__wt_db_col_put(WT_TOC *toc, uint64_t recno, DBT *data)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /* Fixed-length stores require every record be exactly that size. */
+ if (db->fixed_len != 0 && data->size != db->fixed_len)
+ WT_RET(__wt_database_wrong_fixed_size(toc, data->size));
+
+ return (__wt_col_update(toc, recno, data, 1));
+}
+
+/*
+ * __wt_col_update --
+ * Column store delete and update: data == NULL deletes the record,
+ * otherwise the record is replaced with the new data.
+ */
+static int
+__wt_col_update(WT_TOC *toc, uint64_t recno, DBT *data, int data_overwrite)
+{
+ DB *db;
+ ENV *env;
+ WT_PAGE *page;
+ WT_RLE_EXPAND *exp, **new_rleexp;
+ WT_REPL **new_repl, *repl;
+ int ret;
+
+ env = toc->env;
+ db = toc->db;
+
+ page = NULL;
+ exp = NULL;
+ new_rleexp = NULL;
+ new_repl = NULL;
+ repl = NULL;
+
+ /* Search the btree for the key. */
+ WT_RET(__wt_col_search(
+ toc, recno, WT_NOLEVEL, data_overwrite ? WT_DATA_OVERWRITE : 0));
+ page = toc->srch_page;
+
+ /*
+ * Run-length encoded (RLE) column store operations are hard because
+ * each original on-disk index for an RLE can represent large numbers
+ * of records, and we're only deleting a single one of those records,
+ * which means working in the WT_RLE_EXPAND array. All other column
+ * store deletes are simple changes where a new WT_REPL entry is added
+ * to the page's modification array. There are three code paths:
+ *
+ * 1: column store deletes other than RLE column stores: delete an entry
+ * from the on-disk page by creating a new WT_REPL entry, and linking it
+ * into the WT_REPL array.
+ *
+ * 2: an RLE column store delete of an already modified record: create
+ * a new WT_REPL entry, and link it to the WT_RLE_EXPAND entry's WT_REPL
+ * list.
+ *
+ * 3: an RLE column store delete of a record not yet modified: create
+ * a new WT_RLE_EXPAND/WT_REPL pair, and link it into the WT_RLE_EXPAND
+ * array.
+ */
+ switch (page->dsk->type) {
+ case WT_PAGE_COL_FIX: /* #1 */
+ case WT_PAGE_COL_VAR:
+ /* Allocate a page replacement array if necessary. */
+ if (page->u2.repl == NULL)
+ WT_ERR(__wt_calloc(env,
+ page->indx_count, sizeof(WT_REPL *), &new_repl));
+
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* workQ: schedule insert of the WT_REPL structure. */
+ __wt_item_update_serial(toc, page, toc->srch_write_gen,
+ WT_COL_SLOT(page, toc->srch_ip), new_repl, repl, ret);
+ break;
+ case WT_PAGE_COL_RLE:
+ if (toc->srch_repl != NULL) { /* #2 */
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* workQ: schedule insert of the WT_REPL structure. */
+ __wt_rle_expand_repl_serial(toc, page,
+ toc->srch_write_gen, toc->srch_exp, repl, ret);
+ break;
+ }
+ /* #3 */
+ /* Allocate a page expansion array as necessary. */
+ if (page->u2.rleexp == NULL)
+ WT_ERR(__wt_calloc(env, page->indx_count,
+ sizeof(WT_RLE_EXPAND *), &new_rleexp));
+
+ /* Allocate a WT_REPL structure and fill it in. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* Allocate a WT_RLE_EXPAND structure and fill it in. */
+ WT_ERR(__wt_calloc(env, 1, sizeof(WT_RLE_EXPAND), &exp));
+ exp->recno = recno;
+ exp->repl = repl;
+
+ /* Schedule the workQ to link in the WT_RLE_EXPAND structure. */
+ __wt_rle_expand_serial(toc, page, toc->srch_write_gen,
+ WT_COL_SLOT(page, toc->srch_ip), new_rleexp, exp, ret);
+ break;
+ WT_ILLEGAL_FORMAT_ERR(db, ret);
+ }
+
+ if (ret != 0) {
+err: if (exp != NULL)
+ __wt_free(env, exp, sizeof(WT_RLE_EXPAND));
+ if (repl != NULL)
+ __wt_repl_free(toc, repl);
+ }
+
+ /* Free any allocated page expansion array unless the workQ used it. */
+ if (new_rleexp != NULL && new_rleexp != page->u2.rleexp)
+ __wt_free(env,
+ new_rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *));
+
+ /* Free any page replacement array unless the workQ used it. */
+ if (new_repl != NULL && new_repl != page->u2.repl)
+ __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *));
+
+ WT_PAGE_OUT(toc, page);
+
+ /*
+ * Return the operation's result: the original "return (0)" silently
+ * discarded allocation and serialization failures that reached err.
+ */
+ return (ret);
+}
+
+/*
+ * __wt_rle_expand_serial_func --
+ * Server function to expand a run-length encoded column store during a
+ * delete: links a new WT_RLE_EXPAND entry into the page's expansion
+ * array.
+ */
+int
+__wt_rle_expand_serial_func(WT_TOC *toc)
+{
+ WT_PAGE *page;
+ WT_RLE_EXPAND **new_rleexp, *exp;
+ uint32_t slot, write_gen;
+ int ret;
+
+ ret = 0;
+
+ __wt_rle_expand_unpack(toc, page, write_gen, slot, new_rleexp, exp);
+
+ /* Check the page's write-generation. */
+ WT_ERR(__wt_page_write_gen_check(page, write_gen));
+
+ /*
+ * If the page does not yet have an expansion array, our caller passed
+ * us one of the correct size. (It's the caller's responsibility to
+ * detect & free the passed-in expansion array if we don't use it.)
+ */
+ if (page->u2.rleexp == NULL)
+ page->u2.rleexp = new_rleexp;
+
+ /*
+ * Insert the new WT_RLE_EXPAND as the first item in the forward-linked
+ * list of expansion structures. Flush memory to ensure the list is
+ * never broken.
+ */
+ exp->next = page->u2.rleexp[slot];
+ WT_MEMORY_FLUSH;
+ page->u2.rleexp[slot] = exp;
+
+err: __wt_toc_serialize_wrapup(toc, page, ret);
+ return (0);
+}
+
+/*
+ * __wt_rle_expand_repl_serial_func --
+ * Server function to update a WT_REPL entry in an already expanded
+ * run-length encoded column store during a delete: prepends a new
+ * WT_REPL to the WT_RLE_EXPAND entry's replacement list.
+ */
+int
+__wt_rle_expand_repl_serial_func(WT_TOC *toc)
+{
+ WT_PAGE *page;
+ WT_RLE_EXPAND *exp;
+ WT_REPL *repl;
+ uint32_t write_gen;
+ int ret;
+
+ ret = 0;
+
+ __wt_rle_expand_repl_unpack(toc, page, write_gen, exp, repl);
+
+ /* Check the page's write-generation. */
+ WT_ERR(__wt_page_write_gen_check(page, write_gen));
+
+ /*
+ * Insert the new WT_REPL as the first item in the forward-linked list
+ * of replacement structures from the WT_RLE_EXPAND structure. Flush
+ * memory to ensure the list is never broken.
+ */
+ repl->next = exp->repl;
+ WT_MEMORY_FLUSH;
+ exp->repl = repl;
+
+err: __wt_toc_serialize_wrapup(toc, page, ret);
+ return (0);
+}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
new file mode 100644
index 00000000000..81c24e3d54f
--- /dev/null
+++ b/src/btree/col_srch.c
@@ -0,0 +1,211 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_col_search --
 *	Search a column-store tree for a specific record-based key.
 *
 *	On success, results are returned in the WT_TOC srch_* fields: the
 *	pinned page, the matching WT_COL slot, any WT_REPL/WT_RLE_EXPAND
 *	modification found, and the page's write generation at read time.
 */
int
__wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags)
{
	DB *db;
	IDB *idb;
	WT_COL *cip;
	WT_OFF *off;
	WT_PAGE *page;
	WT_PAGE_DISK *dsk;
	WT_RLE_EXPAND *exp;
	WT_REF *ref;
	WT_REPL *repl;
	uint64_t record_cnt;
	uint32_t i, write_gen;
	int ret;

	toc->srch_page = NULL;			/* Return values. */
	toc->srch_ip = NULL;
	toc->srch_repl = NULL;
	toc->srch_exp = NULL;
	toc->srch_write_gen = 0;

	db = toc->db;
	idb = db->idb;

	WT_DB_FCHK(db, "__wt_col_search", flags, WT_APIMASK_BT_SEARCH_COL);

	/* Check for a record past the end of the database. */
	page = idb->root_page.page;
	if (page->records < recno)
		return (WT_NOTFOUND);

	/* Search the tree. */
	for (;;) {
		/* Save the write generation value before the read. */
		write_gen = page->write_gen;

		/* Walk the page looking for the record. */
		dsk = page->dsk;
		switch (dsk->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			/* Fixed-count leaf entries: index directly by recno. */
			cip = page->u.icol + (recno - dsk->start_recno);
			goto done;
		case WT_PAGE_COL_RLE:
			/*
			 * Walk the page, counting records -- do the record
			 * count calculation in a funny way to avoid overflow.
			 */
			record_cnt = recno - dsk->start_recno;
			WT_INDX_FOREACH(page, cip, i) {
				if (record_cnt < WT_RLE_REPEAT_COUNT(cip->data))
					break;
				record_cnt -= WT_RLE_REPEAT_COUNT(cip->data);
			}
			goto done;
		case WT_PAGE_COL_INT:
		default:
			/*
			 * Walk the page, counting records -- do the record
			 * count calculation in a funny way to avoid overflow.
			 */
			record_cnt = recno - dsk->start_recno;
			WT_INDX_FOREACH(page, cip, i) {
				if (record_cnt < WT_COL_OFF_RECORDS(cip))
					break;
				record_cnt -= WT_COL_OFF_RECORDS(cip);
			}
			break;
		}

		/* If a level was set, see if we found the asked-for page. */
		if (level == dsk->level)
			goto done;

		/* cip references the subtree containing the record. */
		ref = WT_COL_REF(page, cip);
		off = WT_COL_OFF(cip);
		WT_ERR(__wt_page_in(toc, page, ref, off, 0));

		/* Swap the parent page for the child page. */
		if (page != idb->root_page.page)
			__wt_hazard_clear(toc, page);
		page = ref->page;
	}

done:	/*
	 * We've found the right on-page WT_COL structure, but that's only the
	 * first step; the record may have been updated since reading the page
	 * into the cache.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		/* Find the item's WT_REPL slot if it exists. */
		repl = WT_COL_REPL(page, cip);

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			toc->srch_repl = repl;
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * or in the original data.
		 */
		if (repl != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				goto notfound;
			toc->srch_repl = repl;
		} else
			if (WT_FIX_DELETE_ISSET(cip->data))
				goto notfound;
		break;
	case WT_PAGE_COL_RLE:
		/* Find the item's WT_COL_EXP slot if it exists. */
		for (exp =
		    WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next)
			if (exp->recno == recno)
				break;

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			if (exp != NULL) {
				toc->srch_exp = exp;
				toc->srch_repl = exp->repl;
			}
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * (referenced by the WT_COL_EXP slot), or in the original data.
		 */
		if (exp != NULL) {
			if (WT_REPL_DELETED_ISSET(exp->repl))
				goto notfound;
			toc->srch_exp = exp;
			toc->srch_repl = exp->repl;
		} else
			if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data)))
				goto notfound;
		break;
	case WT_PAGE_COL_VAR:
		/* Find the item's WT_REPL slot if it exists. */
		repl = WT_COL_REPL(page, cip);

		/*
		 * If overwriting an existing data item, we don't care if the
		 * item was previously deleted, return the gathered information.
		 */
		if (LF_ISSET(WT_DATA_OVERWRITE)) {
			toc->srch_repl = repl;
			break;
		}

		/*
		 * Otherwise, check for deletion, in either the WT_REPL slot
		 * or in the original data.
		 */
		if (repl != NULL) {
			if (WT_REPL_DELETED_ISSET(repl))
				goto notfound;
			toc->srch_repl = repl;
			break;
		} else
			if (WT_ITEM_TYPE(cip->data) == WT_ITEM_DEL)
				goto notfound;
		break;
	case WT_PAGE_COL_INT:
		/*
		 * When returning internal pages, set the item's WT_REPL slot
		 * if it exists, otherwise we're done.
		 */
		toc->srch_repl = WT_COL_REPL(page, cip);
		break;
	WT_ILLEGAL_FORMAT(db);
	}

	toc->srch_page = page;
	toc->srch_ip = cip;
	toc->srch_write_gen = write_gen;
	return (0);

notfound:
	ret = WT_NOTFOUND;

	/* ret was set above or by WT_ERR; release the page reference. */
err:	WT_PAGE_OUT(toc, page);
	return (ret);
}
diff --git a/src/btree/row_get.c b/src/btree/row_get.c
new file mode 100644
index 00000000000..03f2cce44bc
--- /dev/null
+++ b/src/btree/row_get.c
@@ -0,0 +1,61 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_row_get --
+ * Db.row_get method.
+ */
+int
+__wt_db_row_get(WT_TOC *toc, DBT *key, DBT *data)
+{
+ DB *db;
+ IDB *idb;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint32_t type;
+ int ret;
+
+ db = toc->db;
+ idb = db->idb;
+ page = NULL;
+
+ /* Search the btree for the key. */
+ WT_ERR(__wt_row_search(toc, key, WT_NOLEVEL, 0));
+ page = toc->srch_page;
+ rip = toc->srch_ip;
+
+ /*
+ * The Db.get method can only return single key/data pairs.
+ * If that's not what we found, we're done.
+ *
+ * XXX
+ * Checking if page_data is NULL isn't the right thing to do
+ * here. Re-visit this when we figure out how we handle
+ * dup inserts into the tree. Maybe pass NO-DUP flag into the
+ * search function?
+ */
+ if (rip->data != NULL) {
+ type = WT_ITEM_TYPE(rip->data);
+ if (type != WT_ITEM_DATA && type != WT_ITEM_DATA_OVFL) {
+ __wt_api_db_errx(db,
+ "the Db.get method cannot return keys with "
+ "duplicate data items; use the Db.cursor method "
+ "instead");
+ ret = WT_ERROR;
+ goto err;
+ }
+ }
+ ret = __wt_dbt_return(toc, key, data, 0);
+
+err: if (page != idb->root_page.page)
+ __wt_hazard_clear(toc, page);
+ return (ret);
+}
diff --git a/src/btree/row_put.c b/src/btree/row_put.c
new file mode 100644
index 00000000000..3ac4304ccec
--- /dev/null
+++ b/src/btree/row_put.c
@@ -0,0 +1,288 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_row_update(WT_TOC *, DBT *, DBT *, int);
+
+/*
+ * __wt_db_row_del --
+ * Db.row_del method.
+ */
+inline int
+__wt_db_row_del(WT_TOC *toc, DBT *key)
+{
+ return (__wt_row_update(toc, key, NULL, 0));
+}
+
+/*
+ * __wt_db_row_put --
+ * Db.row_put method.
+ */
+inline int
+__wt_db_row_put(WT_TOC *toc, DBT *key, DBT *data)
+{
+ return (__wt_row_update(toc, key, data, 1));
+}
+
+/*
+ * __wt_row_update --
+ * Row store delete and update.
+ */
+static int
+__wt_row_update(WT_TOC *toc, DBT *key, DBT *data, int insert)
+{
+ ENV *env;
+ WT_PAGE *page;
+ WT_REPL **new_repl, *repl;
+ int ret;
+
+ env = toc->env;
+ new_repl = NULL;
+ repl = NULL;
+
+ /* Search the btree for the key. */
+ WT_RET(__wt_row_search(toc, key, WT_NOLEVEL, insert ? WT_INSERT : 0));
+ page = toc->srch_page;
+
+ /* Allocate a page replacement array as necessary. */
+ if (page->u2.repl == NULL)
+ WT_ERR(__wt_calloc(
+ env, page->indx_count, sizeof(WT_REPL *), &new_repl));
+
+ /* Allocate room for the new data item from per-thread memory. */
+ WT_ERR(__wt_repl_alloc(toc, &repl, data));
+
+ /* Schedule the workQ to insert the WT_REPL structure. */
+ __wt_item_update_serial(toc, page, toc->srch_write_gen,
+ WT_ROW_SLOT(page, toc->srch_ip), new_repl, repl, ret);
+
+ if (ret != 0) {
+err: if (repl != NULL)
+ __wt_repl_free(toc, repl);
+ }
+
+ /* Free any replacement array unless the workQ used it. */
+ if (new_repl != NULL && new_repl != page->u2.repl)
+ __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *));
+
+ WT_PAGE_OUT(toc, page);
+
+ return (0);
+}
+
/*
 * __wt_item_update_serial_func --
 *	Server function to update a WT_REPL entry in the modification array.
 */
int
__wt_item_update_serial_func(WT_TOC *toc)
{
	WT_PAGE *page;
	WT_REPL **new_repl, *repl;
	uint32_t slot, write_gen;
	int ret;

	/* Unpack the arguments the requesting thread packed into the TOC. */
	__wt_item_update_unpack(toc, page, write_gen, slot, new_repl, repl);

	ret = 0;

	/* Check the page's write-generation. */
	WT_ERR(__wt_page_write_gen_check(page, write_gen));

	/*
	 * If the page does not yet have a replacement array, our caller passed
	 * us one of the correct size.   (It's the caller's responsibility to
	 * detect & free the passed-in expansion array if we don't use it.)
	 */
	if (page->u2.repl == NULL)
		page->u2.repl = new_repl;

	/*
	 * Insert the new WT_REPL as the first item in the forward-linked list
	 * of replacement structures.  Flush memory to ensure the list is never
	 * broken.
	 */
	repl->next = page->u2.repl[slot];
	WT_MEMORY_FLUSH;
	page->u2.repl[slot] = repl;

	/* The requesting thread retrieves ret via the wrapup call, not here. */
err:	__wt_toc_serialize_wrapup(toc, page, ret);
	return (0);
}
+
/*
 * __wt_repl_alloc --
 *	Allocate a WT_REPL structure and associated data from the TOC's update
 *	memory, and fill it in.  A NULL data DBT marks the entry deleted.
 */
int
__wt_repl_alloc(WT_TOC *toc, WT_REPL **replp, DBT *data)
{
	DB *db;
	ENV *env;
	WT_REPL *repl;
	WT_TOC_UPDATE *update;
	uint32_t align_size, alloc_size, size;
	int single_use;

	env = toc->env;
	db = toc->db;

	/*
	 * Allocate memory for a data insert or change; there's a buffer in the
	 * WT_TOC structure for allocation of chunks of memory to hold changed
	 * or inserted data items.
	 *
	 * We align each allocation because we directly access WT_REPL structure
	 * fields in the memory (the x86 handles unaligned accesses, but I don't
	 * want to have to find and fix this code for a port to a system that
	 * doesn't handle unaligned accesses).  It wastes space, but this memory
	 * is never written to disk and there are fewer concerns about memory
	 * than with on-disk structures.  Any other code allocating memory from
	 * this buffer needs to align its allocations as well.
	 *
	 * The first thing in each chunk of memory is WT_TOC_UPDATE structure
	 * (which we check is a multiple of 4B during initialization); then
	 * there are one or more WT_REPL structure plus data chunk pairs.
	 *
	 * XXX
	 * Figure out how much space we need: this code limits the maximum size
	 * of a data item stored in the database.  In summary, for a big item we
	 * have to store a WT_TOC_UPDATE structure, the WT_REPL structure and
	 * the data, all in an allocated buffer.   We only pass a 32-bit value
	 * to our allocation routine, so we can't store an item bigger than the
	 * maximum 32-bit value minus the sizes of those two structures, where
	 * the WT_REPL structure and data item are aligned to a 32-bit boundary.
	 * We could fix this, but it's unclear it's worth the effort -- document
	 * you can store a (4GB - 20B) item max, and you're done, because it's
	 * insane to store a 4GB item in the database anyway.
	 *
	 * Check first we won't overflow when calculating an aligned size, then
	 * check the total required space for this item.
	 */
	size = data == NULL ? 0 : data->size;
	if (UINT32_MAX - size < sizeof(WT_REPL) + sizeof(uint32_t))
		return (__wt_database_item_too_big(db));
	align_size = WT_ALIGN(size + sizeof(WT_REPL), sizeof(uint32_t));
	if (UINT32_MAX - align_size < sizeof(WT_TOC_UPDATE))
		return (__wt_database_item_too_big(db));

	/*
	 * If we already have a buffer and the data fits, just copy the WT_REPL
	 * structure and data into place, we're done.
	 */
	update = toc->update;
	if (update != NULL && align_size <= update->space_avail)
		goto no_allocation;

	/*
	 * Decide how much memory to allocate: if it's a one-off (that is, the
	 * data is bigger than anything we'll aggregate into these buffers, it's
	 * a one-off.  Otherwise, allocate the next power-of-two larger than 4
	 * times the requested size, and at least the default buffer size.
	 *
	 * XXX
	 * I have no reason for the 4x the request size, I just hate to allocate
	 * a buffer for every change to the database.  A better approach would
	 * be to grow the allocation buffer as the thread makes more changes; if
	 * a thread is doing lots of work, give it lots of memory, otherwise
	 * only allocate as it's necessary.
	 */
	if (align_size > env->data_update_max) {
		alloc_size = sizeof(WT_TOC_UPDATE) + align_size;
		single_use = 1;
	} else {
		alloc_size = __wt_nlpo2(
		    WT_MAX(align_size * 4, env->data_update_initial));
		single_use = 0;
	}
	WT_RET(__wt_calloc(env, 1, alloc_size, &update));

	/* Initialize the buffer: the usable region follows the header. */
	update->len = alloc_size;
	update->space_avail = alloc_size - sizeof(WT_TOC_UPDATE);
	update->first_free = (uint8_t *)update + sizeof(WT_TOC_UPDATE);

	/*
	 * If it's a single use allocation, ignore any current update buffer.
	 * Else, release the old update buffer and replace it with the new one.
	 */
	if (!single_use) {
		/*
		 * The "in" reference count is artificially incremented by 1 as
		 * long as an update buffer is referenced by the WT_TOC thread;
		 * we don't want them freed because a page was evicted and the
		 * count went to 0.  Decrement the reference count on the buffer
		 * as part of releasing it.  There's a similar reference count
		 * decrement when the WT_TOC structure is discarded.
		 *
		 * XXX
		 * There's a race here: if this code, or the WT_TOC structure
		 * close code, and the page discard code race, it's possible
		 * neither will realize the buffer is no longer needed and free
		 * it.  The fix is to involve the eviction or workQ threads:
		 * they may need a linked list of buffers they review to ensure
		 * it never happens.  I'm living with this now: it's unlikely
		 * and it's a memory leak if it ever happens.
		 */
		if (toc->update != NULL)
			--toc->update->in;
		toc->update = update;

		update->in = 1;
	}

no_allocation:
	/* Copy the WT_REPL structure into place. */
	repl = (WT_REPL *)update->first_free;
	repl->update = update;
	if (data == NULL)
		WT_REPL_DELETED_SET(repl);
	else {
		repl->size = data->size;
		memcpy(WT_REPL_DATA(repl), data->data, data->size);
	}

	/* Consume the aligned chunk and count the new entry in the buffer. */
	update->first_free += align_size;
	update->space_avail -= align_size;
	++update->in;

	*replp = repl;
	return (0);
}
+
/*
 * __wt_repl_free --
 *	Free a WT_REPL structure and associated data from the TOC's update
 *	memory.  Called only when the workQ never linked the WT_REPL into a
 *	page (see __wt_row_update's error path).
 */
void
__wt_repl_free(WT_TOC *toc, WT_REPL *repl)
{
	ENV *env;

	env = toc->env;

	/*
	 * It's possible we allocated a WT_REPL structure and associated item
	 * memory from the WT_TOC update buffer, but then an error occurred.
	 * Don't try and clean up the update buffer, it's simpler to decrement
	 * the use count and let the page discard code deal with it during the
	 * page reconciliation process.  (Note we're still in the allocation
	 * path, so we decrement the "in" field, not the "out" field.)
	 */
	--repl->update->in;

	/*
	 * One other thing: if the update buffer was a one-off, we have to free
	 * it here, it's not linked to any WT_PAGE in the system.
	 */
	if (repl->update->in == 0)
		__wt_free(env, repl->update, repl->update->len);
}
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
new file mode 100644
index 00000000000..a8ff78dc380
--- /dev/null
+++ b/src/btree/row_srch.c
@@ -0,0 +1,196 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_key_build(WT_TOC *, WT_PAGE *, WT_ROW *);
+
+/*
+ * __wt_row_search --
+ * Search a row-store tree for a specific key.
+ */
+int
+__wt_row_search(WT_TOC *toc, DBT *key, uint32_t level, uint32_t flags)
+{
+ DB *db;
+ IDB *idb;
+ WT_OFF *off;
+ WT_PAGE *page;
+ WT_PAGE_DISK *dsk;
+ WT_REF *ref;
+ WT_ROW *rip;
+ WT_REPL *repl;
+ uint32_t base, indx, limit, write_gen;
+ int cmp, isleaf, ret;
+
+ toc->srch_page = NULL; /* Return values. */
+ toc->srch_ip = NULL;
+ toc->srch_repl = NULL;
+ toc->srch_exp = NULL;
+ toc->srch_write_gen = 0;
+
+ db = toc->db;
+ idb = db->idb;
+
+ WT_DB_FCHK(db, "__wt_row_search", flags, WT_APIMASK_BT_SEARCH_KEY_ROW);
+
+ /* Search the tree. */
+ for (page = idb->root_page.page;;) {
+ /* Copy the write generation value before the read. */
+ write_gen = page->write_gen;
+
+ dsk = page->dsk;
+ isleaf =
+ dsk->type == WT_PAGE_DUP_LEAF ||
+ dsk->type == WT_PAGE_ROW_LEAF;
+ for (base = 0,
+ limit = page->indx_count; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ /*
+ * If the key is compressed or an overflow, it may not
+ * have been instantiated yet.
+ */
+ rip = page->u.irow + indx;
+ if (__wt_key_process(rip))
+ WT_ERR(__wt_key_build(toc, page, rip));
+
+ /*
+ * If we're about to compare an application key with the
+ * 0th index on an internal page, pretend the 0th index
+ * sorts less than any application key. This test is so
+ * we don't have to update internal pages if the
+ * application stores a new, "smallest" key in the tree.
+ *
+ * For the record, we still maintain the key at the 0th
+ * location because it means tree verification and other
+ * code that processes a level of the tree doesn't need
+ * to know about this hack.
+ */
+ if (indx != 0 || isleaf) {
+ cmp = db->btree_compare(db, key, (DBT *)rip);
+ if (cmp == 0)
+ break;
+ if (cmp < 0)
+ continue;
+ }
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+ * Reference the slot used for next step down the tree. We do
+ * this on leaf pages too, because it's simpler to code, and we
+ * only care if there's an exact match on leaf pages; setting
+ * rip doesn't matter for leaf pages because we always return
+ * WT_NOTFOUND if there's no match.
+ *
+ * Base is the smallest index greater than key and may be the
+ * 0th index or the (last + 1) indx. If base is not the 0th
+ * index (remember, the 0th index always sorts less than any
+ * application key), decrement it to the smallest index less
+ * than or equal to key.
+ */
+ if (cmp != 0)
+ rip = page->u.irow + (base == 0 ? 0 : base - 1);
+
+ /*
+ * If we've reached the leaf page, or we've reached the level
+ * requested by our caller, we're done.
+ */
+ if (isleaf || level == dsk->level)
+ break;
+
+ /* rip references the subtree containing the record. */
+ ref = WT_ROW_REF(page, rip);
+ off = WT_ROW_OFF(rip);
+ WT_ERR(__wt_page_in(toc, page, ref, off, 0));
+
+ /* Swap the parent page for the child page. */
+ if (page != idb->root_page.page)
+ __wt_hazard_clear(toc, page);
+ page = ref->page;
+ }
+
+ /*
+ * We've got the right on-page WT_ROW structure (an exact match in the
+ * case of a lookup, or the smallest key on the page less than or equal
+ * to the specified key in the case of an insert). If it's an insert,
+ * we're done, return the information. Otherwise, check to see if the
+ * item was modified/deleted.
+ */
+ switch (dsk->type) {
+ case WT_PAGE_DUP_LEAF:
+ case WT_PAGE_ROW_LEAF:
+ if (LF_ISSET(WT_INSERT))
+ break;
+ if (cmp != 0) /* No match */
+ goto notfound;
+ /* Deleted match. */
+ if ((repl = WT_ROW_REPL(page, rip)) != NULL) {
+ if (WT_REPL_DELETED_ISSET(repl))
+ goto notfound;
+ toc->srch_repl = repl;
+ }
+ break;
+ case WT_PAGE_DUP_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * When returning internal pages, set the item's WT_REPL slot
+ * if it exists, otherwise we're done.
+ */
+ toc->srch_repl = WT_ROW_REPL(page, rip);
+ break;
+ WT_ILLEGAL_FORMAT(db);
+ }
+
+ toc->srch_page = page;
+ toc->srch_ip = rip;
+ toc->srch_write_gen = write_gen;
+ return (0);
+
+notfound:
+ ret = WT_NOTFOUND;
+
+err: WT_PAGE_OUT(toc, page);
+ return (ret);
+}
+
/*
 * __wt_key_build --
 *	Instantiate an overflow or compressed key into a WT_ROW structure.
 */
static int
__wt_key_build(WT_TOC *toc, WT_PAGE *page, WT_ROW *rip_arg)
{
	DBT *dbt, _dbt;
	WT_ROW *rip;
	WT_ITEM *item;
	uint32_t i;

	WT_CLEAR(_dbt);
	dbt = &_dbt;

	/* Decode (decompress or read in) the on-page key. */
	item = rip_arg->key;
	WT_RET(__wt_item_process(toc, item, dbt));
	/*
	 * NOTE(review): dbt->data is assumed to reference memory that stays
	 * valid for the life of the in-memory page -- confirm against
	 * __wt_item_process's allocation contract.
	 */

	/*
	 * Update the WT_ROW reference with the processed key.  If there are
	 * any duplicates of this item, update them as well.
	 */
	__wt_key_set(rip_arg, dbt->data, dbt->size);
	if (WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP ||
	    WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP_OVFL) {
		/* Duplicates share the key; patch every slot referencing it. */
		WT_INDX_FOREACH(page, rip, i)
			if (rip->key == item)
				__wt_key_set(rip, dbt->data, dbt->size);
	}

	return (0);
}
diff --git a/src/db/db_err.c b/src/db/db_err.c
new file mode 100644
index 00000000000..1ba46e06a69
--- /dev/null
+++ b/src/db/db_err.c
@@ -0,0 +1,64 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * WT_DB_ERR --
 *	Report an error on behalf of a DB handle: route the message to the
 *	application's error callback (if configured), then to the error
 *	stream.  The varargs list is started twice because it is consumed
 *	once per destination.
 */
#define	WT_DB_ERR(db, error, fmt) {					\
	va_list __ap;							\
									\
	/* Application-specified callback function. */			\
	va_start(__ap, fmt);						\
	if ((db)->errcall != NULL)					\
		__wt_msg_call((void *)((db)->errcall),			\
		    (void *)(db), (db)->errpfx,				\
		    (db)->idb == NULL ? NULL : (db)->idb->name,		\
		    error, fmt, __ap);					\
	va_end(__ap);							\
									\
	/*								\
	 * If the application set an error callback function but not an	\
	 * error stream, we're done.  Otherwise, write an error stream.	\
	 */								\
	if ((db)->errcall != NULL && (db)->errfile == NULL)		\
		return;							\
									\
	va_start(__ap, fmt);						\
	__wt_msg_stream((db)->errfile, (db)->errpfx,			\
	    (db)->idb == NULL ? NULL : (db)->idb->name,			\
	    error, fmt, __ap);						\
	va_end(__ap);							\
}
+
/*
 * __wt_api_db_err --
 *	Db.err method: report an error number plus a printf-style message.
 */
void
__wt_api_db_err(DB *db, int error, const char *fmt, ...)
{
	/*
	 * This function may be called before/after the statistics memory
	 * has been allocated/freed; don't increment method statistics here.
	 */
	WT_DB_ERR(db, error, fmt);
}
+
/*
 * __wt_api_db_errx --
 *	Db.errx method: report a printf-style message with no error number.
 */
void
__wt_api_db_errx(DB *db, const char *fmt, ...)
{
	/*
	 * This function may be called before/after the statistics memory
	 * has been allocated/freed; don't increment method statistics here.
	 */
	WT_DB_ERR(db, 0, fmt);
}
diff --git a/src/db/db_getset.c b/src/db/db_getset.c
new file mode 100644
index 00000000000..6c133a0a3fb
--- /dev/null
+++ b/src/db/db_getset.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_db_btree_compare_int_set_verify --
 *	Verify arguments to the Db.btree_compare_int_set method.
 */
int
__wt_db_btree_compare_int_set_verify(DB *db, int btree_compare_int)
{
	/*
	 * NOTE(review): the test accepts 0 but the error message documents a
	 * range of 1 to 8 -- confirm whether 0 (presumably "disabled") is a
	 * legal setting and make the test and the message agree.
	 */
	if (btree_compare_int >= 0 && btree_compare_int <= 8)
		return (0);

	__wt_api_db_errx(db,
	    "The number of bytes must be an integral value between 1 and 8");
	return (WT_ERROR);
}
+
+/*
+ * __wt_db_btree_dup_offpage_set_verify --
+ * Verify arguments to the Db.btree_dup_offpage_set method.
+ */
+int
+__wt_db_btree_dup_offpage_set_verify(DB *db, uint32_t dup_offpage)
+{
+ /*
+ * Limiting this value to something between 10 and 50 is a sanity test,
+ * not a hard constraint (although a value of 100 might fail hard).
+ *
+ * If the value is too large, pages can end up being empty because it
+ * isn't possible for duplicate sets to span pages. So, if you set
+ * the value to 50%, and you have two sequential, large duplicate sets,
+ * you end up with two, half-empty pages.
+ */
+ if (dup_offpage > 10 && dup_offpage <= 50)
+ return (0);
+
+ __wt_api_db_errx(db,
+ "The percent of the page taken up by duplicate entries before "
+ "being moved off-page must must be between 10 and 50");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_db_column_set_verify --
+ * Verify arguments to the Db.column_set method.
+ */
+int
+__wt_db_column_set_verify(
+ DB *db, uint32_t fixed_len, const char *dictionary, uint32_t flags)
+{
+ ENV *env;
+ IDB *idb;
+
+ env = db->env;
+ idb = db->idb;
+
+ /*
+ * The fixed-length number of bytes is stored in a single byte, which
+ * limits the size to 255 bytes.
+ */
+ WT_RET(__wt_api_arg_max(
+ env, "DB.column_set", "fixed_len", fixed_len, 255));
+
+ /* Run-length encoding is incompatible with variable length records. */
+ if (fixed_len == 0 && LF_ISSET(WT_RLE)) {
+ __wt_api_db_errx(db,
+ "Run-length encoding is incompatible with variable length "
+ "column-store records");
+ return (WT_ERROR);
+ }
+
+ if (LF_ISSET(WT_RLE))
+ F_SET(idb, WT_RLE);
+ F_SET(idb, WT_COLUMN);
+ return (0);
+}
diff --git a/src/db/db_handle.c b/src/db/db_handle.c
new file mode 100644
index 00000000000..b9e244d5ea9
--- /dev/null
+++ b/src/db/db_handle.c
@@ -0,0 +1,184 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_db_config(DB *);
+static int __wt_idb_config(DB *);
+static int __wt_idb_destroy(DB *);
+
/*
 * __wt_env_db --
 *	DB constructor: allocate and configure a DB/IDB handle pair.
 */
int
__wt_env_db(ENV *env, DB **dbp)
{
	DB *db;
	IDB *idb;
	int ret;

	db = NULL;
	idb = NULL;

	/* Create the DB and IDB structures. */
	WT_ERR(__wt_calloc(env, 1, sizeof(DB), &db));
	WT_ERR(__wt_calloc(env, 1, sizeof(IDB), &idb));

	/* Connect everything together. */
	db->idb = idb;
	idb->db = db;
	db->env = env;

	/* Configure the DB and the IDB. */
	WT_ERR(__wt_db_config(db));
	WT_ERR(__wt_idb_config(db));

	*dbp = db;
	return (0);

	/*
	 * NOTE(review): if the first allocation fails, db is still NULL here
	 * -- confirm __wt_db_destroy tolerates a NULL handle.
	 */
err:	(void)__wt_db_destroy(db);
	return (ret);
}
+
/*
 * __wt_db_config --
 *	Set configuration for a just-created DB handle: install the default
 *	method tables, lock out methods not legal before open, and move the
 *	handle into its initial state.
 */
static int
__wt_db_config(DB *db)
{
	__wt_methods_db_config_default(db);
	__wt_methods_db_lockout(db);
	__wt_methods_db_init_transition(db);

	return (0);
}
+
/*
 * __wt_idb_config --
 *	Set configuration for a just-created IDB handle: initialize its
 *	fields, link it onto the environment's handle list and allocate its
 *	statistics structures.
 */
static int
__wt_idb_config(DB *db)
{
	ENV *env;
	IDB *idb;
	IENV *ienv;

	env = db->env;
	idb = db->idb;
	ienv = env->ienv;

	idb->db = db;		/* Also set by __wt_env_db; harmless. */
	/* Mark the root and free-list addresses not-yet-allocated. */
	idb->root_off.addr = idb->free_addr = WT_ADDR_INVALID;

	__wt_lock(env, ienv->mtx);		/* Add to the ENV's list */
	TAILQ_INSERT_TAIL(&ienv->dbqh, idb, q);
	++ienv->dbqcnt;
	__wt_unlock(env, ienv->mtx);

	/* Allocate the method- and database-level statistics structures. */
	WT_RET(__wt_stat_alloc_db_stats(env, &idb->stats));
	WT_RET(__wt_stat_alloc_database_stats(env, &idb->dstats));

	return (0);
}
+
+/*
+ * __wt_db_destroy --
+ * DB handle destructor.
+ */
+int
+__wt_db_destroy(DB *db)
+{
+ ENV *env;
+ int ret;
+
+ env = db->env;
+
+ /* Discard the underlying IDB object. */
+ ret = __wt_idb_destroy(db);
+
+ /* Discard the DB object. */
+ __wt_free(env, db, sizeof(DB));
+
+ return (ret);
+}
+
/*
 * __wt_idb_destroy --
 *	IDB handle destructor: release per-handle resources and remove the
 *	handle from the environment's list.
 */
static int
__wt_idb_destroy(DB *db)
{
	ENV *env;
	IDB *idb;
	IENV *ienv;
	int ret;

	env = db->env;
	idb = db->idb;
	ienv = env->ienv;
	ret = 0;

	/* Check that there's something to close. */
	if (idb == NULL)
		return (0);

	/* Diagnostic check: check flags against approved list. */
	WT_ENV_FCHK_RET(env, "Db.close", idb->flags, WT_APIMASK_IDB, ret);

	__wt_free(env, idb->name, 0);

	if (idb->huffman_key != NULL) {
		/* Key and data may use the same table, only close it once. */
		if (idb->huffman_data == idb->huffman_key)
			idb->huffman_data = NULL;
		__wt_huffman_close(env, idb->huffman_key);
		idb->huffman_key = NULL;
	}
	if (idb->huffman_data != NULL) {
		__wt_huffman_close(env, idb->huffman_data);
		idb->huffman_data = NULL;
	}

	/* Discard any in-progress eviction walk for this handle. */
	__wt_walk_end(env, &idb->evict_walk);

	__wt_free(env, idb->stats, 0);
	__wt_free(env, idb->dstats, 0);

	__wt_lock(env, ienv->mtx);		/* Delete from the ENV's list */
	TAILQ_REMOVE(&ienv->dbqh, idb, q);
	--ienv->dbqcnt;
	__wt_unlock(env, ienv->mtx);

	__wt_free(env, idb, sizeof(IDB));
	db->idb = NULL;
	return (ret);
}
+
/*
 * __wt_db_lockout_err --
 *	Method-table stub installed after a fatal handle error: any method
 *	other than Db.close reports the handle is dead.
 */
int
__wt_db_lockout_err(DB *db)
{
	__wt_api_db_errx(db,
	    "This Db handle has failed for some reason, and can no longer "
	    "be used; the only method permitted on it is Db.close which "
	    "discards the handle permanently");
	return (WT_ERROR);
}
+
/*
 * __wt_db_lockout_open --
 *	Method-table stub for methods that are illegal before Db.open.
 */
int
__wt_db_lockout_open(DB *db)
{
	__wt_api_db_errx(db,
	    "This method may not be called until after the Db.open method has "
	    "been called");
	return (WT_ERROR);
}
diff --git a/src/db/db_huffman.c b/src/db/db_huffman.c
new file mode 100644
index 00000000000..ae9fe7fccde
--- /dev/null
+++ b/src/db/db_huffman.c
@@ -0,0 +1,233 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * 7-bit ASCII, with English language frequencies.
 *
 * Based on "Case-sensitive letter and bigram frequency counts from large-scale
 * English corpora"
 *	Michael N. Jones and D.J.K. Mewhort
 *	Queen's University, Kingston, Ontario, Canada
 *	Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
 *
 * Additionally supports space and tab characters; space is the most common
 * character in text where it occurs, and tab appears about as frequently as
 * 'a' and 'n' in text where it occurs.
 *
 * Each byte is a relative frequency weight: 255 is the most common byte
 * (space), 1 marks bytes not expected to occur.  Only the 7-bit range is
 * initialized; entries 128-255 default to 0 -- NOTE(review): confirm
 * __wt_huffman_open treats a 0 weight the same as 1 (effectively absent).
 */
static uint8_t const __wt_huffman_ascii_english[256] = {
	1,	/* 000 nul */
	1,	/* 001 soh */
	1,	/* 002 stx */
	1,	/* 003 etx */
	1,	/* 004 eot */
	1,	/* 005 enq */
	1,	/* 006 ack */
	1,	/* 007 bel */
	1,	/* 010 bs  */
	251,	/* 011 ht  */
	1,	/* 012 nl  */
	1,	/* 013 vt  */
	1,	/* 014 np  */
	1,	/* 015 cr  */
	1,	/* 016 so  */
	1,	/* 017 si  */
	1,	/* 020 dle */
	1,	/* 021 dc1 */
	1,	/* 022 dc2 */
	1,	/* 023 dc3 */
	1,	/* 024 dc4 */
	1,	/* 025 nak */
	1,	/* 026 syn */
	1,	/* 027 etb */
	1,	/* 030 can */
	1,	/* 031 em  */
	1,	/* 032 sub */
	1,	/* 033 esc */
	1,	/* 034 fs  */
	1,	/* 035 gs  */
	1,	/* 036 rs  */
	1,	/* 037 us  */
	255,	/* 040 sp  */
	177,	/* 041  !  */
	223,	/* 042  "  */
	171,	/* 043  #  */
	188,	/* 044  $  */
	176,	/* 045  %  */
	179,	/* 046  &  */
	215,	/* 047  '  */
	189,	/* 050  (  */
	190,	/* 051  )  */
	184,	/* 052  *  */
	175,	/* 053  +  */
	234,	/* 054  ,  */
	219,	/* 055  -  */
	233,	/* 056  .  */
	181,	/* 057  /  */
	230,	/* 060  0  */
	229,	/* 061  1  */
	226,	/* 062  2  */
	213,	/* 063  3  */
	214,	/* 064  4  */
	227,	/* 065  5  */
	210,	/* 066  6  */
	203,	/* 067  7  */
	212,	/* 070  8  */
	222,	/* 071  9  */
	191,	/* 072  :  */
	186,	/* 073  ;  */
	173,	/* 074  <  */
	172,	/* 075  =  */
	174,	/* 076  >  */
	183,	/* 077  ?  */
	170,	/* 100  @  */
	221,	/* 101  A  */
	211,	/* 102  B  */
	218,	/* 103  C  */
	206,	/* 104  D  */
	207,	/* 105  E  */
	199,	/* 106  F  */
	197,	/* 107  G  */
	205,	/* 110  H  */
	217,	/* 111  I  */
	196,	/* 112  J  */
	187,	/* 113  K  */
	201,	/* 114  L  */
	220,	/* 115  M  */
	216,	/* 116  N  */
	200,	/* 117  O  */
	208,	/* 120  P  */
	182,	/* 121  Q  */
	209,	/* 122  R  */
	224,	/* 123  S  */
	225,	/* 124  T  */
	193,	/* 125  U  */
	185,	/* 126  V  */
	202,	/* 127  W  */
	180,	/* 130  X  */
	198,	/* 131  Y  */
	178,	/* 132  Z  */
	1,	/* 133  [  */
	1,	/* 134  \  */
	1,	/* 135  ]  */
	1,	/* 136  ^  */
	1,	/* 137  _  */
	1,	/* 140  `  */
	252,	/* 141  a  */
	232,	/* 142  b  */
	242,	/* 143  c  */
	243,	/* 144  d  */
	254,	/* 145  e  */
	239,	/* 146  f  */
	237,	/* 147  g  */
	245,	/* 150  h  */
	248,	/* 151  i  */
	194,	/* 152  j  */
	228,	/* 153  k  */
	244,	/* 154  l  */
	240,	/* 155  m  */
	249,	/* 156  n  */
	250,	/* 157  o  */
	238,	/* 160  p  */
	192,	/* 161  q  */
	246,	/* 162  r  */
	247,	/* 163  s  */
	253,	/* 164  t  */
	241,	/* 165  u  */
	231,	/* 166  v  */
	235,	/* 167  w  */
	204,	/* 170  x  */
	236,	/* 171  y  */
	195,	/* 172  z  */
	1,	/* 173  {  */
	1,	/* 174  |  */
	1,	/* 175  }  */
	1,	/* 176  ~  */
	1,	/* 177 del */
};
+
/*
 * __wt_db_huffman_set --
 *	DB huffman configuration setter.  Installs a Huffman table for keys
 *	and/or data (WT_HUFFMAN_KEY/WT_HUFFMAN_DATA flags), either from a
 *	built-in frequency table (WT_ASCII_ENGLISH or WT_TELEPHONE -- in
 *	which case huffman_table must be NULL) or elsewhere.
 */
int
__wt_db_huffman_set(DB *db,
    uint8_t const *huffman_table, u_int huffman_table_size, uint32_t flags)
{
	ENV *env;
	IDB *idb;
	uint8_t phone[256];

	env = db->env;
	idb = db->idb;

	switch (LF_ISSET(WT_ASCII_ENGLISH | WT_TELEPHONE)) {
	case WT_ASCII_ENGLISH:
		if (huffman_table != NULL)
			goto err;
		huffman_table = __wt_huffman_ascii_english;
		huffman_table_size = sizeof(__wt_huffman_ascii_english);
		break;
	case WT_TELEPHONE:
		if (huffman_table != NULL)
			goto err;
		/* Phone numbers: only digits and punctuation occur. */
		memset(phone, 0, sizeof(phone));
		phone['('] = 2;
		phone[')'] = 2;
		phone['+'] = 1;
		phone['-'] = 3;
		phone['0'] = 1;
		phone['1'] = 1;
		phone['2'] = 1;
		phone['3'] = 1;
		phone['4'] = 1;
		phone['5'] = 1;
		phone['6'] = 1;
		phone['7'] = 1;
		phone['8'] = 1;
		phone['9'] = 1;
		/*
		 * NOTE(review): phone is a stack buffer -- this is safe only
		 * if __wt_huffman_open copies the table; verify.
		 */
		huffman_table = phone;
		huffman_table_size = sizeof(phone);
		break;
	default:
err:		return (__wt_api_args(env, "Db.huffman_set"));
	}

	/*
	 * If we're using an already-specified table, close it.  It's probably
	 * an application error to set the Huffman table twice, but hey, I just
	 * work here.
	 */
	if (LF_ISSET(WT_HUFFMAN_KEY) && idb->huffman_key != NULL) {
		/* Key and data may use the same table, only close it once. */
		if (idb->huffman_data == idb->huffman_key)
			idb->huffman_data = NULL;
		__wt_huffman_close(env, idb->huffman_key);
		idb->huffman_key = NULL;
	}
	if (LF_ISSET(WT_HUFFMAN_DATA) && idb->huffman_data != NULL) {
		__wt_huffman_close(env, idb->huffman_data);
		idb->huffman_data = NULL;
	}
	if (LF_ISSET(WT_HUFFMAN_KEY)) {
		WT_RET(__wt_huffman_open(env,
		    huffman_table, huffman_table_size, &idb->huffman_key));
		/* Key and data may use the same table. */
		if (LF_ISSET(WT_HUFFMAN_DATA)) {
			idb->huffman_data = idb->huffman_key;
			/* Clear the flag so we don't open a second table. */
			LF_CLR(WT_HUFFMAN_DATA);
		}
	}
	if (LF_ISSET(WT_HUFFMAN_DATA))
		WT_RET(__wt_huffman_open(env,
		    huffman_table, huffman_table_size, &idb->huffman_data));

	return (0);
}
diff --git a/src/db/db_open.c b/src/db/db_open.c
new file mode 100644
index 00000000000..1cdf04c1288
--- /dev/null
+++ b/src/db/db_open.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_db_idb_open(DB *, const char *, mode_t, uint32_t);
+
+/*
+ * __wt_db_open --
+ * Open a DB handle.
+ */
+int
+__wt_db_open(WT_TOC *toc, const char *name, mode_t mode, uint32_t flags)
+{
+ DB *db;
+ ENV *env;
+
+ env = toc->env;
+ db = toc->db;
+
+ WT_STAT_INCR(env->ienv->stats, DATABASE_OPEN);
+
+ /* Initialize the IDB structure. */
+ WT_RET(__wt_db_idb_open(db, name, mode, flags));
+
+ /* Open the underlying Btree. */
+ WT_RET(__wt_bt_open(toc, LF_ISSET(WT_CREATE) ? 1 : 0));
+
+ /* Turn on the methods that require open. */
+ __wt_methods_db_open_transition(db);
+
+ return (0);
+}
+
+/*
+ * __wt_db_idb_open --
+ * Routine to initialize any IDB values based on a DB value during open.
+ */
+static int
+__wt_db_idb_open(DB *db, const char *name, mode_t mode, uint32_t flags)
+{
+ ENV *env;
+ IENV *ienv;
+ IDB *idb;
+
+ env = db->env;
+ ienv = env->ienv;
+ idb = db->idb;
+
+ WT_RET(__wt_strdup(env, name, &idb->name));
+ idb->mode = mode;
+
+ __wt_lock(env, ienv->mtx);
+ idb->file_id = ++ienv->next_file_id;
+ __wt_unlock(env, ienv->mtx);
+
+ /*
+ * XXX
+ * Initialize the root WT_REF/WT_OFF pair to point to the start of
+ * the file. This is all wrong, and we'll get the information from
+ * somewhere else, eventually.
+ */
+ WT_CLEAR(idb->root_page);
+ idb->root_page.state = WT_EMPTY;
+ WT_CLEAR(idb->root_off);
+ idb->root_off.addr = 0;
+ idb->root_off.size = 0;
+
+ if (LF_ISSET(WT_RDONLY))
+ F_SET(idb, WT_RDONLY);
+
+ return (0);
+}
+
+/*
+ * __wt_db_close --
+ * Db.close method (DB close & handle destructor).
+ */
+int
+__wt_db_close(WT_TOC *toc, uint32_t flags)
+{
+ DB *db;
+ int ret;
+
+ db = toc->db;
+ ret = 0;
+
+ /* Flush the underlying Btree. */
+ if (!LF_ISSET(WT_NOWRITE))
+ WT_TRET(__wt_bt_sync(toc));
+
+ /* Close the underlying Btree. */
+ ret = __wt_bt_close(toc);
+
+ WT_TRET(__wt_db_destroy(db));
+
+ return (ret);
+}
diff --git a/src/db/db_stat.c b/src/db/db_stat.c
new file mode 100644
index 00000000000..84ac9960860
--- /dev/null
+++ b/src/db/db_stat.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_stat_print --
+ * Print DB handle statistics to a stream.
+ */
+int
+__wt_db_stat_print(WT_TOC *toc, FILE *stream)
+{
+ DB *db;
+ ENV *env;
+ IDB *idb;
+
+ db = toc->db;
+ env = toc->env;
+ idb = db->idb;
+
+ fprintf(stream, "Database handle statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->stats, stream);
+
+ /* Clear the database stats, then call Btree stat to fill them in. */
+ __wt_stat_clear_database_stats(idb->dstats);
+ WT_STAT_SET(idb->dstats, TREE_LEVEL, idb->root_page.page->dsk->level);
+ WT_RET(__wt_desc_stat(toc));
+
+ /*
+ * Note we do not have a hazard reference for the root page, and that's
+ * safe -- root pages are pinned into memory when a database is opened,
+ * and never re-written until the database is closed.
+ */
+ WT_RET(__wt_tree_walk(toc, NULL, 0, __wt_page_stat, NULL));
+
+ fprintf(stream, "Database statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->dstats, stream);
+
+ /* Underlying file handle statistics. */
+ if (idb->fh != NULL) {
+ fprintf(stream,
+ "Underlying file I/O statistics: %s\n", idb->name);
+ __wt_stat_print(env, idb->fh->stats, stream);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_db_stat_clear --
+ * Clear DB handle statistics.
+ */
+int
+__wt_db_stat_clear(DB *db)
+{
+ IDB *idb;
+
+ idb = db->idb;
+
+ __wt_stat_clear_db_stats(idb->stats);
+ __wt_stat_clear_database_stats(idb->dstats);
+ if (idb->fh != NULL)
+ __wt_stat_clear_fh_stats(idb->fh->stats);
+
+ return (0);
+}
diff --git a/src/db/db_sync.c b/src/db/db_sync.c
new file mode 100644
index 00000000000..eec5026f0c2
--- /dev/null
+++ b/src/db/db_sync.c
@@ -0,0 +1,20 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_db_sync --
+ * Flush a database to the backing file.
+ */
+int
+__wt_db_sync(WT_TOC *toc, void (*f)(const char *, uint64_t), uint32_t flags)
+{
+ return (__wt_bt_sync(toc));
+}
diff --git a/src/env/env_err.c b/src/env/env_err.c
new file mode 100644
index 00000000000..b5bc0ca5966
--- /dev/null
+++ b/src/env/env_err.c
@@ -0,0 +1,83 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+void
+wiredtiger_err_stream(FILE *stream)
+{
+ extern FILE *__wt_err_stream;
+
+ __wt_err_stream = stream;
+}
+
+#define WT_ENV_ERR(env, error, fmt) { \
+ extern FILE *__wt_err_stream; \
+ va_list __ap; \
+ /* \
+ * Support error messages even when we don't yet have an ENV \
+ * handle. \
+ */ \
+ if ((env) == NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_stream( \
+ __wt_err_stream, NULL, NULL, error, fmt, __ap); \
+ va_end(__ap); \
+ return; \
+ } \
+ \
+ /* Application-specified callback function. */ \
+ if ((env)->errcall != NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_call((void *)((env)->errcall), \
+ (void *)(env), env->errpfx, \
+ NULL, error, fmt, __ap); \
+ va_end(__ap); \
+ } \
+ \
+ /* \
+ * If the application set an error callback function but not an \
+ * error stream, we're done. Otherwise, write the stream. \
+ */ \
+ if ((env)->errcall != NULL && (env)->errfile == NULL) \
+ return; \
+ \
+ va_start(__ap, fmt); \
+ __wt_msg_stream((env)->errfile, \
+ (env)->errpfx, NULL, error, fmt, __ap); \
+ va_end(__ap); \
+}
+
+/*
+ * __wt_api_env_err --
+ * Env.err method.
+ */
+void
+__wt_api_env_err(ENV *env, int error, const char *fmt, ...)
+{
+ /*
+ * This function may be called before/after the statistics memory
+ * has been allocated/freed; don't increment method statistics here.
+ */
+ WT_ENV_ERR(env, error, fmt);
+}
+
+/*
+ * __wt_api_env_errx --
+ * Env.errx method.
+ */
+void
+__wt_api_env_errx(ENV *env, const char *fmt, ...)
+{
+ /*
+ * This function may be called before/after the statistics memory
+ * has been allocated/freed; don't increment method statistics here.
+ */
+ WT_ENV_ERR(env, 0, fmt);
+}
diff --git a/src/env/env_getset.c b/src/env/env_getset.c
new file mode 100644
index 00000000000..6786c87b41d
--- /dev/null
+++ b/src/env/env_getset.c
@@ -0,0 +1,70 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_cache_size_set_verify --
+ * Verify an argument to the Env.cache_size_set method.
+ */
+int
+__wt_env_cache_size_set_verify(ENV *env, uint32_t cache_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.cache_size_set", "cache size", cache_size, 1));
+}
+
+/*
+ * __wt_env_cache_hash_size_set_verify --
+ * Verify an argument to the Env.hash_size_set method.
+ */
+int
+__wt_env_cache_hash_size_set_verify(ENV *env, uint32_t hash_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.hash_size_set", "hash size", hash_size, 1));
+}
+
+/*
+ * __wt_env_hazard_size_set_verify --
+ * Verify an argument to the Env.hazard_size_set method.
+ */
+int
+__wt_env_hazard_size_set_verify(ENV *env, uint32_t hazard_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.hazard_size_set", "hazard size", hazard_size, 1));
+}
+
+/*
+ * __wt_env_toc_size_set_verify --
+ * Verify an argument to the Env.toc_size_set method.
+ */
+int
+__wt_env_toc_size_set_verify(ENV *env, uint32_t toc_size)
+{
+ return (__wt_api_arg_min(env,
+ "Env.toc_size_set", "toc size", toc_size, 1));
+}
+
+/*
+ * __wt_env_verbose_set_verify --
+ * Verify an argument to the Env.verbose_set method.
+ */
+int
+__wt_env_verbose_set_verify(ENV *env, uint32_t verbose)
+{
+#ifdef HAVE_VERBOSE
+ WT_ENV_FCHK(env,
+ "Env.verbose_set", verbose, WT_APIMASK_ENV_VERBOSE_SET);
+ return (0);
+#else
+ return (__wt_api_config(env, "Env.verbose_set", "--enable-verbose"));
+#endif
+}
diff --git a/src/env/env_global.c b/src/env/env_global.c
new file mode 100644
index 00000000000..e41a7bccfad
--- /dev/null
+++ b/src/env/env_global.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+void *__wt_addr; /* Memory flush address. */
+FILE *__wt_err_stream; /* Error stream from init. */
+
+/*
+ * __wt_library_init --
+ * Some things to do, before we do anything else.
+ */
+int
+__wt_library_init(void)
+{
+ /*
+ * We need an address for memory flushing -- it doesn't matter which
+ * one we choose.
+ */
+ __wt_addr = &__wt_addr;
+
+ /*
+ * We want to be able to redirect error messages from the very first
+ * instruction.
+ */
+ __wt_err_stream = stderr;
+
+ /*
+ * Check the build & compiler itself before going further.
+ */
+ WT_RET(__wt_bt_build_verify());
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Load debug code the compiler might optimize out. */
+ WT_RET(__wt_breakpoint());
+#endif
+
+ return (0);
+}
+
+/*
+ * __wt_breakpoint --
+ * A simple place to put a breakpoint, if you need one.
+ */
+int
+__wt_breakpoint(void)
+{
+ return (0);
+}
+
+int __wt_debugger_attach;
+
+/*
+ * __wt_attach --
+ * A routine to wait for a debugger to attach.
+ */
+void
+__wt_attach(ENV *env)
+{
+#ifdef HAVE_ATTACH
+ __wt_api_env_errx(env,
+ "process ID %lld: waiting for debugger...", (long long)getpid());
+ while (__wt_debugger_attach == 0)
+ __wt_sleep(10, 0);
+#endif
+}
diff --git a/src/env/env_handle.c b/src/env/env_handle.c
new file mode 100644
index 00000000000..1c02675041f
--- /dev/null
+++ b/src/env/env_handle.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+static int __wt_env_config(ENV *);
+static int __wt_ienv_config(ENV *);
+
+/*
+ * __wt_env_create --
+ * ENV constructor.
+ */
+int
+__wt_env_create(uint32_t flags, ENV **envp)
+{
+ ENV *env;
+ IENV *ienv;
+ int ret;
+
+ /*
+ * !!!
+ * We don't yet have valid ENV/IENV structures to use to call other
+ * functions. The only functions that can handle NULL ENV handles
+ * are the memory allocation and free functions, no other functions
+ * may be called.
+ */
+ WT_RET(__wt_calloc(NULL, 1, sizeof(ENV), &env));
+ WT_ERR(__wt_calloc(NULL, 1, sizeof(IENV), &ienv));
+
+ /* Connect everything together. */
+ env->ienv = ienv;
+
+ /* Set flags. */
+ if (LF_ISSET(WT_MEMORY_CHECK))
+ F_SET(env, WT_MEMORY_CHECK);
+
+ /* Configure the ENV and the IENV. */
+ WT_ERR(__wt_env_config(env));
+ WT_ERR(__wt_ienv_config(env));
+
+ *envp = env;
+ return (0);
+
+err: (void)__wt_env_close(env);
+ return (ret);
+}
+
+/*
+ * __wt_env_config --
+ * Set configuration for a just-created ENV handle.
+ */
+static int
+__wt_env_config(ENV *env)
+{
+ __wt_methods_env_config_default(env);
+ __wt_methods_env_lockout(env);
+ __wt_methods_env_init_transition(env);
+ return (0);
+}
+
+/*
+ * __wt_ienv_config --
+ * Set configuration for a just-created IENV handle.
+ */
+static int
+__wt_ienv_config(ENV *env)
+{
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* If we're tracking memory, initialize those structures first. */
+ if (F_ISSET(env, WT_MEMORY_CHECK))
+ WT_RET(__wt_mtrack_alloc(env));
+#endif
+ /* Global mutex */
+ WT_RET(__wt_mtx_alloc(env, "IENV", 0, &ienv->mtx));
+
+ TAILQ_INIT(&ienv->dbqh); /* DB list */
+ TAILQ_INIT(&ienv->fhqh); /* File list */
+
+ /* Statistics. */
+ WT_RET(__wt_stat_alloc_env_stats(env, &ienv->stats));
+ WT_RET(__wt_stat_alloc_method_stats(env, &ienv->method_stats));
+
+ /* Diagnostic output separator. */
+ ienv->sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=";
+
+ return (0);
+}
+
+/*
+ * __wt_ienv_destroy --
+ * Destroy the ENV's underlying IENV structure.
+ */
+int
+__wt_ienv_destroy(ENV *env)
+{
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* Check there's something to destroy. */
+ if (ienv == NULL)
+ return (0);
+
+ /* Diagnostic check: check flags against approved list. */
+ WT_ENV_FCHK_RET(env, "Env.close", ienv->flags, WT_APIMASK_IENV, ret);
+
+ (void)__wt_mtx_destroy(env, ienv->mtx);
+
+ /* Free allocated memory. */
+ __wt_free(env, ienv->toc, 0);
+ __wt_free(env, ienv->toc_array, 0);
+ __wt_free(env, ienv->hazard, 0);
+ __wt_free(env, ienv->stats, 0);
+ __wt_free(env, ienv->method_stats, 0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /* If we're tracking memory, check to see if everything was free'd. */
+ __wt_mtrack_dump(env);
+ __wt_mtrack_free(env);
+#endif
+
+ __wt_free(NULL, ienv, sizeof(IENV));
+ env->ienv = NULL;
+ return (ret);
+}
diff --git a/src/env/env_init.c b/src/env/env_init.c
new file mode 100644
index 00000000000..26c7062d63f
--- /dev/null
+++ b/src/env/env_init.c
@@ -0,0 +1,41 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_env_init --
+ * Initialize the library, creating an ENV handle.
+ */
+int
+wiredtiger_env_init(ENV **envp, uint32_t flags)
+{
+ static int library_init = 0;
+ ENV *env;
+
+ *envp = NULL;
+
+ /*
+ * We end up here before we do any real work. Check the build itself,
+ * and do some global stuff.
+ */
+ if (library_init == 0) {
+ WT_RET(__wt_library_init());
+ library_init = 1;
+ }
+
+ WT_ENV_FCHK(NULL,
+ "wiredtiger_env_init", flags, WT_APIMASK_WIREDTIGER_ENV_INIT);
+
+ /* Create the ENV handle. */
+ WT_RET(__wt_env_create(flags, &env));
+
+ *envp = env;
+ return (0);
+}
diff --git a/src/env/env_msg.c b/src/env/env_msg.c
new file mode 100644
index 00000000000..9dcfdec9514
--- /dev/null
+++ b/src/env/env_msg.c
@@ -0,0 +1,138 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#define WT_MSG(env, fmt) { \
+ extern FILE *__wt_err_stream; \
+ va_list __ap; \
+ /* \
+ * Support messages even when we don't yet have an ENV handle, \
+ * using the error stream. \
+ */ \
+ if ((env) == NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_stream( \
+ __wt_err_stream, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+ return; \
+ } \
+ \
+ /* Application-specified callback function. */ \
+ if ((env)->msgcall != NULL) { \
+ va_start(__ap, fmt); \
+ __wt_msg_call((void *)((env)->msgcall), \
+ (void *)env, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+ } \
+ \
+ /* \
+ * If the application set a message callback function but not a\
+ * message stream, we're done. Otherwise, write the stream. \
+ */ \
+ if ((env)->msgcall != NULL && (env)->msgfile == NULL) \
+ return; \
+ \
+ va_start(__ap, fmt); \
+ __wt_msg_stream((env)->msgfile, NULL, NULL, 0, fmt, __ap); \
+ va_end(__ap); \
+}
+
+/*
+ * __wt_msg --
+ * Write a message.
+ */
+void
+__wt_msg(ENV *env, const char *fmt, ...)
+{
+ WT_MSG(env, fmt);
+}
+
+/*
+ * __wt_mb_init --
+ * Initialize a WT_MBUF structure for message aggregation.
+ */
+void
+__wt_mb_init(ENV *env, WT_MBUF *mbp)
+{
+ mbp->env = env;
+ mbp->first = mbp->next = NULL;
+ mbp->len = 0;
+}
+
+/*
+ * __wt_mb_discard --
+ * Discard a WT_MBUF structure.
+ */
+void
+__wt_mb_discard(WT_MBUF *mbp)
+{
+ if (mbp->first == NULL)
+ return;
+
+ /* Write any remaining message. */
+ if (mbp->next != mbp->first)
+ __wt_mb_write(mbp);
+
+ __wt_free(mbp->env, mbp->first, mbp->len);
+}
+
+/*
+ * __wt_mb_add --
+ * Append log messages into a WT_MBUF structure.
+ */
+void
+__wt_mb_add(WT_MBUF *mbp, const char *fmt, ...)
+{
+ va_list ap;
+ size_t current, len, remain;
+
+ current = (size_t)(mbp->next - mbp->first);
+ remain = mbp->len - current;
+ len = 64;
+ for (;;) {
+ /*
+ * If we don't have at least "len" bytes, allocate 2x len bytes
+ * more memory.
+ */
+ if (remain <= len) {
+ if (__wt_realloc(mbp->env,
+ &mbp->len, mbp->len + len * 2, &mbp->first))
+ return;
+ mbp->next = mbp->first + current;
+ remain = mbp->len - current;
+ }
+ /*
+ * Format the user's information. If it doesn't fit into the
+ * buffer we have, re-allocate enough memory and try again.
+ *
+ * A va_list may only be consumed once: restart it for each
+ * formatting attempt, and close it when done.
+ */
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(mbp->next, remain, fmt, ap);
+ va_end(ap);
+ if (len < remain) {
+ mbp->next += len;
+ break;
+ }
+ }
+}
+
+/*
+ * __wt_mb_write --
+ * Write the messages from a WT_MBUF structure.
+ */
+void
+__wt_mb_write(WT_MBUF *mbp)
+{
+ if (mbp->first == NULL || mbp->next == mbp->first)
+ return;
+
+ __wt_msg(mbp->env, "%s", mbp->first);
+
+ mbp->next = mbp->first;
+}
diff --git a/src/env/env_open.c b/src/env/env_open.c
new file mode 100644
index 00000000000..a6f95838ede
--- /dev/null
+++ b/src/env/env_open.c
@@ -0,0 +1,132 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_open --
+ * Open an Env handle.
+ */
+int
+__wt_env_open(ENV *env, const char *home, mode_t mode)
+{
+ IENV *ienv;
+ int ret;
+
+ WT_CC_QUIET(home, NULL);
+ WT_CC_QUIET(mode, 0);
+
+ ienv = env->ienv;
+ ret = 0;
+
+ /* WT_TOC and hazard arrays. */
+ WT_RET(__wt_calloc(env, env->toc_size, sizeof(WT_TOC *), &ienv->toc));
+ WT_RET(
+ __wt_calloc(env, env->toc_size, sizeof(WT_TOC), &ienv->toc_array));
+ WT_RET(__wt_calloc(env,
+ env->toc_size * env->hazard_size, sizeof(WT_PAGE *), &ienv->hazard));
+
+ /* Create the cache. */
+ WT_RET(__wt_cache_create(env));
+
+ /* Transition to the open state. */
+ __wt_methods_env_open_transition(env);
+
+ /* Start worker threads. */
+ F_SET(ienv, WT_WORKQ_RUN | WT_SERVER_RUN);
+ WT_MEMORY_FLUSH;
+
+ WT_ERR(__wt_thread_create(
+ &ienv->cache_evict_tid, __wt_cache_evict_server, env));
+ WT_ERR(__wt_thread_create(
+ &ienv->cache_read_tid, __wt_cache_read_server, env));
+ WT_ERR(__wt_thread_create(&ienv->workq_tid, __wt_workq_srvr, env));
+
+ return (0);
+
+err: (void)__wt_env_close(env);
+ return (ret);
+}
+
+/*
+ * __wt_env_close --
+ * Close an Env handle.
+ */
+int
+__wt_env_close(ENV *env)
+{
+ IDB *idb;
+ IENV *ienv;
+ WT_FH *fh;
+ int ret, secondary_err;
+
+ WT_ENV_FCHK_RET(env, "Env.close", env->flags, WT_APIMASK_ENV, ret);
+
+ ienv = env->ienv;
+ ret = secondary_err = 0;
+
+ /* Complain if DB handles weren't closed. */
+ if (TAILQ_FIRST(&ienv->dbqh) != NULL) {
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ __wt_api_env_errx(env,
+ "Env handle has open Db handles: %s",
+ idb->name);
+ WT_TRET(idb->db->close(idb->db, 0));
+ }
+ secondary_err = WT_ERROR;
+ }
+
+ /* Complain if files weren't closed. */
+ if (TAILQ_FIRST(&ienv->fhqh) != NULL) {
+ TAILQ_FOREACH(fh, &ienv->fhqh, q) {
+ __wt_api_env_errx(env,
+ "Env handle has open file handles: %s",
+ fh->name);
+ WT_TRET(__wt_close(env, fh));
+ }
+ secondary_err = WT_ERROR;
+ }
+
+ /* Shut down the server threads. */
+ F_CLR(ienv, WT_SERVER_RUN);
+ WT_MEMORY_FLUSH;
+
+ /*
+ * Force the cache server threads to run and wait for them to exit.
+ * Wait for the cache eviction server first, it potentially schedules
+ * work for the read thread.
+ */
+ __wt_workq_evict_server(env, 1);
+ __wt_thread_join(ienv->cache_evict_tid);
+ __wt_workq_read_server(env, 1);
+ __wt_thread_join(ienv->cache_read_tid);
+
+ /*
+ * Close down and wait for the workQ thread; this only happens after
+ * all other server threads have exited, as they may be waiting on a
+ * request from the workQ, or vice-versa.
+ */
+ F_CLR(ienv, WT_WORKQ_RUN);
+ WT_MEMORY_FLUSH;
+ __wt_thread_join(ienv->workq_tid);
+
+ /* Discard the cache. */
+ WT_TRET(__wt_cache_destroy(env));
+
+ /* Re-cycle the underlying ENV/IENV structures. */
+ WT_TRET(__wt_ienv_destroy(env));
+
+ /* Free the Env structure. */
+ __wt_free(NULL, env, sizeof(ENV));
+
+ /* The primary error, if any, wins over a secondary error. */
+ if (ret == 0)
+ ret = secondary_err;
+ return (ret);
+}
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
new file mode 100644
index 00000000000..997d9080f31
--- /dev/null
+++ b/src/env/env_stat.c
@@ -0,0 +1,86 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_stat_print --
+ * Print ENV handle statistics to a stream.
+ */
+int
+__wt_env_stat_print(ENV *env, FILE *stream)
+{
+ IDB *idb;
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+ fprintf(stream, "Environment handle statistics:\n");
+ __wt_stat_print(env, ienv->stats, stream);
+
+ fprintf(stream, "Environment cache statistics:\n");
+ __wt_cache_stats(env);
+ __wt_stat_print(env, ienv->cache->stats, stream);
+ fprintf(stream, "Environment method statistics:\n");
+ __wt_stat_print(env, ienv->method_stats, stream);
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_RET(idb->db->stat_print(idb->db, stream, 0));
+ return (0);
+}
+
+/*
+ * __wt_env_stat_clear --
+ * Clear ENV handle statistics.
+ */
+int
+__wt_env_stat_clear(ENV *env)
+{
+ IDB *idb;
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_TRET(__wt_db_stat_clear(idb->db));
+
+ __wt_stat_clear_env_stats(ienv->stats);
+ __wt_stat_clear_cache_stats(ienv->cache->stats);
+ __wt_stat_clear_method_stats(ienv->method_stats);
+
+ return (ret);
+}
+
+/*
+ * __wt_stat_print --
+ * Print out a statistics table.
+ */
+void
+__wt_stat_print(ENV *env, WT_STATS *s, FILE *stream)
+{
+ IENV *ienv;
+
+ ienv = env->ienv;
+
+ for (; s->desc != NULL; ++s)
+ if (s->v >= WT_BILLION)
+ fprintf(stream, "%lluB\t%s (%llu bytes)\n",
+ (unsigned long long)s->v / WT_BILLION,
+ s->desc, (unsigned long long)s->v);
+ else if (s->v >= WT_MILLION)
+ fprintf(stream, "%lluM\t%s (%llu bytes)\n",
+ (unsigned long long)s->v / WT_MILLION,
+ s->desc, (unsigned long long)s->v);
+ else
+ fprintf(stream,
+ "%llu\t%s\n", (unsigned long long)s->v, s->desc);
+ fprintf(stream, "%s\n", ienv->sep);
+}
diff --git a/src/env/env_sync.c b/src/env/env_sync.c
new file mode 100644
index 00000000000..4c40b52ad1c
--- /dev/null
+++ b/src/env/env_sync.c
@@ -0,0 +1,30 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_sync --
+ * Flush the environment's cache.
+ */
+int
+__wt_env_sync(ENV *env, void (*f)(const char *, uint64_t))
+{
+ IDB *idb;
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ TAILQ_FOREACH(idb, &ienv->dbqh, q)
+ WT_TRET(idb->db->sync(idb->db, f, 0));
+
+ return (ret);
+}
diff --git a/src/env/env_toc.c b/src/env/env_toc.c
new file mode 100644
index 00000000000..46d132707b5
--- /dev/null
+++ b/src/env/env_toc.c
@@ -0,0 +1,238 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_env_toc --
+ * ENV.toc method.
+ */
+int
+__wt_env_toc(ENV *env, WT_TOC **tocp)
+{
+ IENV *ienv;
+ WT_TOC *toc;
+ uint32_t slot;
+
+ ienv = env->ienv;
+ *tocp = NULL;
+
+ /* Check to see if there's an available WT_TOC slot. */
+ if (ienv->toc_cnt == env->toc_size - 1) {
+ __wt_api_env_errx(env,
+ "WiredTiger only configured to support %lu thread contexts",
+ (u_long)env->toc_size);
+ return (WT_ERROR);
+ }
+
+ /*
+ * The WT_TOC reference list is compact, the WT_TOC array is not. Find
+ * the first empty WT_TOC slot.
+ */
+ for (slot = 0, toc = ienv->toc_array; toc->env != NULL; ++toc, ++slot)
+ ;
+
+ /* Clear previous contents of the WT_TOC entry, they get re-used. */
+ memset(toc, 0, sizeof(WT_TOC));
+
+ toc->env = env;
+ toc->hazard = ienv->hazard + slot * env->hazard_size;
+
+ WT_RET(__wt_mtx_alloc(env, "toc", 1, &toc->mtx));
+
+ __wt_methods_wt_toc_lockout(toc);
+ __wt_methods_wt_toc_init_transition(toc);
+
+ /* Make the entry visible to the workQ. */
+ ienv->toc[ienv->toc_cnt++] = toc;
+ WT_MEMORY_FLUSH;
+
+ *tocp = toc;
+ return (0);
+}
+
+/*
+ * __wt_wt_toc_close --
+ * WT_TOC.close method.
+ */
+int
+__wt_wt_toc_close(WT_TOC *toc)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_TOC **tp;
+ WT_TOC_UPDATE *update;
+ int ret;
+
+ env = toc->env;
+ ienv = env->ienv;
+ ret = 0;
+
+ WT_ENV_FCHK_RET(
+ env, "WT_TOC.close", toc->flags, WT_APIMASK_WT_TOC, ret);
+
+ /*
+ * The "in" reference count is artificially incremented by 1 as
+ * long as an update buffer is referenced by the WT_TOC thread;
+ * we don't want them freed because a page was evicted and their
+ * count went to 0. Decrement the reference count on the buffer
+ * as part of releasing it. There's a similar reference count
+ * decrement when the WT_TOC structure is discarded.
+ *
+ * XXX
+ * There's a race here: if this code, or the WT_TOC structure
+ * close code, and the page discard code race, it's possible
+ * neither will realize the buffer is no longer needed and free
+ * it. The fix is to involve the eviction or workQ threads:
+ * they may need a linked list of buffers they review to ensure
+ * it never happens. I'm living with this now: it's unlikely
+ * and it's a memory leak if it ever happens.
+ */
+ update = toc->update;
+ if (update != NULL && --update->in == update->out)
+ __wt_free(env, update, update->len);
+
+ /* Discard DBT memory. */
+ __wt_free(env, toc->key.data, toc->key.mem_size);
+ __wt_free(env, toc->data.data, toc->data.mem_size);
+ __wt_scr_free(toc);
+
+ /* Unlock and destroy the thread's mutex. */
+ if (toc->mtx != NULL) {
+ __wt_unlock(env, toc->mtx);
+ (void)__wt_mtx_destroy(env, toc->mtx);
+ }
+
+ /*
+ * Replace the WT_TOC reference we're closing with the last entry in
+ * the table, then clear the last entry. As far as the walk of the
+ * workQ is concerned, it's OK if the WT_TOC appears twice, or if it
+ * doesn't appear at all, so these lines can race all they want.
+ */
+ for (tp = ienv->toc; *tp != toc; ++tp)
+ ;
+ --ienv->toc_cnt;
+ *tp = ienv->toc[ienv->toc_cnt];
+ ienv->toc[ienv->toc_cnt] = NULL;
+
+ /* Make the WT_TOC array entry available for re-use. */
+ toc->env = NULL;
+ WT_MEMORY_FLUSH;
+
+ return (ret);
+}
+
+/*
+ * __wt_toc_api_set --
+ * Pair WT_TOC and DB handle, allocating the WT_TOC as necessary.
+ */
+int
+__wt_toc_api_set(ENV *env, const char *name, DB *db, WT_TOC **tocp)
+{
+ WT_TOC *toc;
+
+ /*
+ * We pass around WT_TOCs internally in the Btree, (rather than a DB),
+ * because the DB's are free-threaded, and the WT_TOCs are per-thread.
+ * Lots of the API calls don't require the application to allocate and
+ * manage the WT_TOC, which means we have to do it for them.
+ *
+ * WT_TOCs always reference a DB handle, and we do that here, as well.
+ */
+ if ((toc = *tocp) == NULL) {
+ WT_RET(env->toc(env, 0, tocp));
+ toc = *tocp;
+ }
+ toc->db = db;
+ toc->name = name;
+ return (0);
+}
+
+/*
+ * __wt_toc_api_clr --
+ * Clear the WT_TOC, freeing it if it was allocated by the library.
+ */
+int
+__wt_toc_api_clr(WT_TOC *toc, const char *name, int islocal)
+{
+ /*
+ * The WT_TOC should hold no more hazard references; this is a
+ * diagnostic check, but it's cheap so we do it all the time.
+ */
+ __wt_hazard_empty(toc, name);
+
+ if (islocal)
+ return (toc->close(toc, 0));
+
+ toc->db = NULL;
+ toc->name = NULL;
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+static const char *__wt_toc_print_state(WT_TOC *);
+
+int
+__wt_toc_dump(ENV *env)
+{
+ IENV *ienv;
+ WT_MBUF mb;
+ WT_TOC *toc, **tp;
+ WT_PAGE **hp;
+
+ ienv = env->ienv;
+ __wt_mb_init(env, &mb);
+
+ __wt_mb_add(&mb, "%s\n", ienv->sep);
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) {
+ __wt_mb_add(&mb,
+ "toc: %p {\n\tworkq func: ", toc);
+ if (toc->wq_func == NULL)
+ __wt_mb_add(&mb, "none");
+ else
+ __wt_mb_add(&mb, "%p", toc->wq_func);
+
+ __wt_mb_add(&mb, " state: %s", __wt_toc_print_state(toc));
+
+ __wt_mb_add(&mb, "\n\thazard: ");
+ for (hp = toc->hazard;
+ hp < toc->hazard + env->hazard_size; ++hp)
+ __wt_mb_add(&mb, "%p ", *hp);
+
+ __wt_mb_add(&mb, "\n}");
+ if (toc->name != NULL)
+ __wt_mb_add(&mb, " %s", toc->name);
+ __wt_mb_write(&mb);
+ }
+
+ __wt_mb_discard(&mb);
+ return (0);
+}
+
+/*
+ * __wt_toc_print_state --
+ * Return the WT_TOC state as a string.
+ */
+static const char *
+__wt_toc_print_state(WT_TOC *toc)
+{
+ switch (toc->wq_state) {
+ case WT_WORKQ_READ:
+ return ("read");
+ case WT_WORKQ_READ_SCHED:
+ return ("read scheduled");
+ case WT_WORKQ_FUNC:
+ return ("function");
+ case WT_WORKQ_NONE:
+ return ("none");
+ }
+ return ("unknown");
+ /* NOTREACHED */
+}
+#endif
diff --git a/src/env/env_workq.c b/src/env/env_workq.c
new file mode 100644
index 00000000000..76a00b0dce5
--- /dev/null
+++ b/src/env/env_workq.c
@@ -0,0 +1,94 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_workq_srvr --
+ * Routine to process the WT_TOC work queue.
+ */
+void *
+__wt_workq_srvr(void *arg)
+{
+ ENV *env;
+ IENV *ienv;
+ WT_TOC **tp, *toc;
+ int chk_read, read_force, request;
+
+ env = (ENV *)arg;
+ ienv = env->ienv;
+
+ /* Walk the WT_TOC list and execute requests. */
+ while (F_ISSET(ienv, WT_WORKQ_RUN)) {
+ ++ienv->api_gen;
+ WT_STAT_INCR(ienv->stats, WORKQ_PASSES);
+
+ chk_read = read_force = request = 0;
+ for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) {
+ switch (toc->wq_state) {
+ case WT_WORKQ_NONE:
+ break;
+ case WT_WORKQ_FUNC:
+ request = 1;
+ (void)toc->wq_func(toc);
+ break;
+ case WT_WORKQ_READ:
+ request = 1;
+
+ /*
+ * Call a function which makes a request of the
+ * read server. There are two read states: READ
+ * (the initial request), and READ_SCHED (the
+ * function has been called and we're waiting on
+ * the read to complete). There are two states
+ * because we can race with the server: if the
+ * called function adds itself to the queue just
+ * as the server is going to sleep, the server
+ * might not see the request. So, READ_SCHED
+ * means we don't have to call the function, but
+ * we do have check if the server is running.
+ *
+ * The read state is eventually reset by the
+ * read server, so we set it before we call the
+ * function that will contact the server, so we
+ * can't race on that update.
+ */
+ toc->wq_state = WT_WORKQ_READ_SCHED;
+
+ /*
+ * Call the function (which contacts the read
+ * server). If that call fails, we're done.
+ */
+ if (toc->wq_func(toc) != 0)
+ break;
+
+ /* FALLTHROUGH */
+ case WT_WORKQ_READ_SCHED:
+ chk_read = 1;
+ if (F_ISSET(toc, WT_READ_PRIORITY))
+ read_force = 1;
+ break;
+ }
+ }
+
+ /* If a read is scheduled, check on the read server. */
+ if (chk_read)
+ __wt_workq_read_server(env, read_force);
+
+ /* Check on the cache eviction server. */
+ __wt_workq_evict_server(env, 0);
+
+ /* If we didn't find work, yield the processor. */
+ if (!request) {
+ WT_STAT_INCR(ienv->stats, WORKQ_YIELD);
+ __wt_yield();
+ }
+ }
+ return (NULL);
+}
diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..68106636831
--- /dev/null
+++ b/src/os_posix/os_abort.c
@@ -0,0 +1,25 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_abort --
+ * Abort the process, dropping core.
+ */
+void
+__wt_abort(ENV *env)
+{
+ __wt_msg(env, "aborting WiredTiger library");
+
+ __wt_attach(env);
+
+ abort();
+ /* NOTREACHED */
+}
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..dbbb915822a
--- /dev/null
+++ b/src/os_posix/os_alloc.c
@@ -0,0 +1,359 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __wt_mtrack(
+ ENV *env, const void *, const void *, const char *, int);
+#endif
+
+/*
+ * There's no malloc interface; WiredTiger never calls malloc. The problem is
+ * an application might: allocate memory, write secret stuff into it, free the
+ * memory, we allocate the memory, and then use it for a database page or log
+ * record and write it to disk. That would result in the secret stuff being
+ * protected by the WiredTiger permission mechanisms, potentially inappropriate
+ * for the secret stuff.
+ */
+
+/*
+ * __wt_calloc_func --
+ * ANSI calloc function.
+ */
+int
+__wt_calloc_func(ENV *env, uint32_t number, uint32_t size, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ WT_ASSERT(env, number != 0 && size != 0);
+
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ if ((p = calloc(number, (size_t)size)) == NULL) {
+ __wt_api_env_err(env, errno, "memory allocation");
+ return (WT_ERROR);
+ }
+ *(void **)retp = p;
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_mtrack(env, NULL, p, file, line);
+#endif
+ return (0);
+}
+
+/*
+ * __wt_realloc_func --
+ * ANSI realloc function.
+ */
+int
+__wt_realloc_func(ENV *env,
+ uint32_t *bytes_allocated_ret, uint32_t bytes_to_allocate, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ void *p;
+ uint32_t bytes_allocated;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ WT_ASSERT(env, bytes_to_allocate != 0);
+
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ p = *(void **)retp;
+
+ /*
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ bytes_allocated =
+ bytes_allocated_ret == NULL ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(env, bytes_allocated < bytes_to_allocate);
+
+ if ((p = realloc(p, (size_t)bytes_to_allocate)) == NULL) {
+ __wt_api_env_err(env, errno, "memory allocation");
+ return (WT_ERROR);
+ }
+
+ /*
+ * Clear the allocated memory -- an application might: allocate memory,
+ * write secret stuff into it, free the memory, we re-allocate the
+ * memory, then use it for a database page or log record and write it
+ * to disk. That would result in the secret stuff being protected by
+ * the WiredTiger permission mechanisms, potentially inappropriate for
+ * the secret stuff.
+ */
+ memset((uint8_t *)
+ p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_mtrack(env, *(void **)retp, p, file, line);
+#endif
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_strdup_func --
+ * ANSI strdup function.
+ */
+int
+__wt_strdup_func(ENV *env, const char *str, void *retp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ size_t len;
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMALLOC);
+
+ len = strlen(str) + 1;
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_calloc_func(env, len, 1, &p, file, line));
+#else
+ WT_RET(__wt_calloc_func(env, len, 1, &p));
+#endif
+
+ memcpy(p, str, len);
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_free_func --
+ * ANSI free function.
+ */
+void
+__wt_free_func(ENV *env, void *p_arg
+#ifdef HAVE_DIAGNOSTIC
+ , uint32_t len
+#endif
+ )
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ */
+ if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL)
+ WT_STAT_INCR(env->ienv->stats, MEMFREE);
+
+ /*
+ * If there's a serialization bug we might race with another thread.
+ * We can't avoid the race (and we aren't willing to flush memory),
+ * but we minimize the window by clearing the free address atomically,
+ * hoping a racing thread will see, and won't free, a NULL pointer.
+ */
+ p = *(void **)p_arg;
+ *(void **)p_arg = NULL;
+
+ if (p == NULL) /* ANSI C free semantics */
+ return;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * If we know how long the object is, overwrite it with an easily
+ * recognizable value for debugging.
+ */
+ if (len != 0)
+ memset(p, WT_DEBUG_BYTE, len);
+
+ __wt_mtrack(env, p, NULL, NULL, 0);
+#endif
+
+ free(p);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_mtrack_alloc --
+ * Allocate memory tracking structures.
+ */
+int
+__wt_mtrack_alloc(ENV *env)
+{
+ IENV *ienv;
+ WT_MTRACK *p;
+
+ ienv = env->ienv;
+
+ /*
+ * Use a temporary variable -- assigning memory to ienv->mtrack turns
+ * on memory object tracking, and we need to set up the rest of the
+ * structure first.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_MTRACK), &p));
+ WT_RET(__wt_calloc(env, 1000, sizeof(WT_MEM), &p->list));
+ p->next = p->list;
+ p->slots = 1000;
+ ienv->mtrack = p;
+ return (0);
+}
+
+/*
+ * __wt_mtrack_free --
+ * Free memory tracking structures.
+ */
+void
+__wt_mtrack_free(ENV *env)
+{
+ IENV *ienv;
+ WT_MTRACK *p;
+
+ ienv = env->ienv;
+
+ /*
+ * Clear ienv->mtrack (to turn off memory object tracking) before the
+ * free.
+ */
+ if ((p = ienv->mtrack) == NULL)
+ return;
+ ienv->mtrack = NULL;
+
+ __wt_free(env, p->list, 0);
+ __wt_free(env, p, 0);
+}
+
+/*
+ * __wt_mtrack --
+ * Track memory allocations and frees.
+ */
+static void
+__wt_mtrack(ENV *env, const void *f, const void *a, const char *file, int line)
+{
+ WT_MEM *mp, *t, *mp_end;
+ WT_MTRACK *mtrack;
+ int slot_check;
+
+ if (env == NULL ||
+ env->ienv == NULL || (mtrack = env->ienv->mtrack) == NULL)
+ return;
+
+ /*
+ * Remove freed memory from the list. If it's a free/alloc pair (that
+ * is, if __wt_realloc was called), re-use the slot.
+ */
+ if (f != NULL) {
+ if ((mp = mtrack->next) > mtrack->list)
+ do {
+ if ((--mp)->addr == f)
+ goto enter;
+ } while (mp > mtrack->list);
+
+ __wt_api_env_errx(env, "mtrack: %p: not found", f);
+ __wt_attach(env);
+ }
+
+ if (a == NULL)
+ return;
+
+ /*
+ * Add allocated memory to the list.
+ *
+ * First, see if there's a slot close by we can re-use (the assumption
+ * is that when memory is allocated and quickly freed we re-use the
+ * slots instead of leaving lots of free spots in the array).
+ */
+ if ((mp = mtrack->next) > mtrack->list)
+ for (slot_check = 0; slot_check < 10; ++slot_check) {
+ if ((--mp)->addr == NULL)
+ goto enter;
+ if (mp == mtrack->list)
+ break;
+ }
+
+ mp_end = mtrack->list + mtrack->slots;
+
+ /* If there's an empty slot, use it. */
+ if (mtrack->next < mp_end)
+ goto next;
+
+ /* Try to compress the array. */
+ for (mp = mtrack->list, t = NULL;; ++mp, ++t) {
+ while (mp < mp_end && mp->addr != NULL)
+ ++mp;
+ if (mp == mp_end)
+ break;
+ if (t == NULL)
+ t = mp + 1;
+ while (t < mp_end && t->addr == NULL)
+ ++t;
+ if (t == mp_end)
+ break;
+ *mp++ = *t;
+ t->addr = NULL;
+ }
+ mtrack->next = mp;
+
+ /* If there's an empty slot, use it. */
+ if (mtrack->next < mp_end)
+ goto next;
+
+ /* Re-allocate the array and use the next empty slot. */
+ if ((mtrack->list = realloc(mtrack->list,
+ mtrack->slots * 2 * sizeof(WT_MEM))) == NULL)
+ return;
+ mtrack->next = mtrack->list + mtrack->slots;
+ mtrack->slots *= 2;
+
+next: mp = mtrack->next++;
+enter: mp->addr = a;
+ mp->file = file;
+ mp->line = line;
+}
+
+/*
+ * __wt_mtrack_dump --
+ * Complain about any memory allocated but never freed.
+ */
+void
+__wt_mtrack_dump(ENV *env)
+{
+ WT_MTRACK *mtrack;
+ WT_MEM *mp;
+
+ if ((mtrack = env->ienv->mtrack) == NULL)
+ return;
+
+ for (mp = mtrack->list; mp < mtrack->next; ++mp)
+ if (mp->addr != NULL)
+ __wt_api_env_errx(env,
+ "mtrack: %p {%s/%d}: never freed",
+ mp->addr, mp->file, mp->line);
+}
+#endif
diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c
new file mode 100644
index 00000000000..604d963f8e6
--- /dev/null
+++ b/src/os_posix/os_filesize.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+int
+__wt_filesize(ENV *env, WT_FH *fh, off_t *sizep)
+{
+ struct stat sb;
+
+ WT_VERBOSE(env,
+ WT_VERB_FILEOPS, (env, "fileops: %s: fstat", fh->name));
+
+ if (fstat(fh->fd, &sb) == -1) {
+ __wt_api_env_err(env, errno, "%s: fstat", fh->name);
+ return (WT_ERROR);
+ }
+
+ *sizep = sb.st_size; /* Return size in bytes. */
+ return (0);
+}
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
new file mode 100644
index 00000000000..e6ecfd95a21
--- /dev/null
+++ b/src/os_posix/os_fsync.c
@@ -0,0 +1,29 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(ENV *env, WT_FH *fh)
+{
+
+ WT_STAT_INCR(fh->stats, FSYNC);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: fsync", fh->name));
+
+ if (fsync(fh->fd) == 0)
+ return (0);
+
+ __wt_api_env_err(env, errno, "%s fsync error", fh->name);
+ return (WT_ERROR);
+}
diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c
new file mode 100644
index 00000000000..fb58784ec2d
--- /dev/null
+++ b/src/os_posix/os_mtx.c
@@ -0,0 +1,148 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mtx_alloc --
+ * Allocate and initialize a pthread mutex.
+ */
+int
+__wt_mtx_alloc(ENV *env, const char *name, int is_locked, WT_MTX **mtxp)
+{
+ WT_MTX *mtx;
+ pthread_condattr_t condattr;
+ pthread_mutexattr_t mutexattr;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_MTX), &mtx));
+
+ /*
+ * !!!
+ * This function MUST handle a NULL ENV structure reference.
+ *
+ * Initialize the mutex.
+ * Mutexes are shared between processes.
+ */
+ if (pthread_mutexattr_init(&mutexattr) != 0)
+ goto err;
+#if 0
+ if (pthread_mutexattr_setpshared(
+ &mutexattr, PTHREAD_PROCESS_SHARED) != 0)
+ goto err;
+#endif
+ if (pthread_mutex_init(&mtx->mtx, &mutexattr) != 0)
+ goto err;
+ (void)pthread_mutexattr_destroy(&mutexattr);
+
+ /* Initialize the condition variable (mutexes are self-blocking). */
+ if (pthread_condattr_init(&condattr) != 0)
+ goto err;
+#if 0
+ if (pthread_condattr_setpshared(
+ &condattr, PTHREAD_PROCESS_SHARED) != 0)
+ goto err;
+#endif
+ if (pthread_cond_init(&mtx->cond, &condattr) != 0)
+ goto err;
+ (void)pthread_condattr_destroy(&condattr);
+
+ mtx->name = name;
+
+ /* If the normal state of the mutex is locked, lock it immediately. */
+ if (is_locked)
+ __wt_lock(env, mtx);
+
+ *mtxp = mtx;
+ return (0);
+
+err: __wt_free(env, mtx, sizeof(WT_MTX));
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_lock --
+ * Lock a mutex.
+ */
+void
+__wt_lock(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ WT_VERBOSE(env,
+ WT_VERB_MUTEX, (env, "lock %s mutex (%p)", mtx->name, mtx));
+
+ WT_ERR(pthread_mutex_lock(&mtx->mtx));
+
+ /*
+ * Check pthread_cond_wait() return for EINTR, ETIME and ETIMEDOUT,
+ * it's known to return these errors on some systems.
+ */
+ while (mtx->locked) {
+ ret = pthread_cond_wait(&mtx->cond, &mtx->mtx);
+ if (ret != 0 &&
+ ret != EINTR &&
+#ifdef ETIME
+ ret != ETIME &&
+#endif
+ ret != ETIMEDOUT) {
+ (void)pthread_mutex_unlock(&mtx->mtx);
+ goto err;
+ }
+ }
+
+ mtx->locked = 1;
+ WT_STAT_INCR(env->ienv->stats, MTX_LOCK);
+
+ WT_ERR(pthread_mutex_unlock(&mtx->mtx));
+ return;
+
+err: __wt_api_env_err(env, ret, "mutex lock failed");
+ __wt_abort(env);
+}
+
+/*
+ * __wt_unlock --
+ * Release a mutex.
+ */
+void
+__wt_unlock(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ WT_VERBOSE(env,
+ WT_VERB_MUTEX, (env, "unlock %s mutex (%p)", mtx->name, mtx));
+
+ ret = 0;
+ WT_ERR(pthread_mutex_lock(&mtx->mtx));
+ mtx->locked = 0;
+ WT_ERR(pthread_cond_signal(&mtx->cond));
+
+ WT_ERR(pthread_mutex_unlock(&mtx->mtx));
+ return;
+
+err: __wt_api_env_err(env, ret, "mutex unlock failed");
+ __wt_abort(NULL);
+}
+
+/*
+ * __wt_mtx_destroy --
+ * Destroy a mutex.
+ */
+int
+__wt_mtx_destroy(ENV *env, WT_MTX *mtx)
+{
+ int ret;
+
+ ret = pthread_cond_destroy(&mtx->cond);
+ WT_TRET(pthread_mutex_destroy(&mtx->mtx));
+
+ __wt_free(env, mtx, sizeof(WT_MTX));
+
+ return (ret == 0 ? 0 : WT_ERROR);
+}
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
new file mode 100644
index 00000000000..971fe47f11b
--- /dev/null
+++ b/src/os_posix/os_open.c
@@ -0,0 +1,128 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(ENV *env, const char *name, mode_t mode, int ok_create, WT_FH **fhp)
+{
+ IDB *idb;
+ IENV *ienv;
+ WT_FH *fh;
+ int f, fd, ret;
+
+ fh = NULL;
+ ienv = env->ienv;
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_FOREACH(idb, &ienv->dbqh, q) {
+ if ((fh = idb->fh) == NULL)
+ continue;
+ if (strcmp(name, idb->name) == 0) {
+ ++fh->refcnt;
+ *fhp = fh;
+ break;
+ }
+ }
+ __wt_unlock(env, ienv->mtx);
+ if (fh != NULL)
+ return (0);
+
+ f = O_RDWR;
+#ifdef O_BINARY
+ /* Windows clones: we always want to treat the file as a binary. */
+ f |= O_BINARY;
+#endif
+ if (ok_create)
+ f |= O_CREAT;
+
+ if ((fd = open(name, f, mode)) == -1) {
+ __wt_api_env_err(env, errno, "%s", name);
+ return (WT_ERROR);
+ }
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_stat_alloc_fh_stats(env, &fh->stats));
+ WT_ERR(__wt_strdup(env, name, &fh->name));
+
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC)
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles. There's an obvious
+ * race here...
+ */
+ if ((f = fcntl(fd, F_GETFD)) == -1 ||
+ fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) {
+ __wt_api_env_err(env, errno, "%s: fcntl", name);
+ goto err;
+ }
+#endif
+
+ fh->fd = fd;
+ fh->refcnt = 1;
+ *fhp = fh;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(env, fh, &fh->file_size));
+
+ /* Link onto the environment's list of files. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_INSERT_TAIL(&ienv->fhqh, fh, q);
+ __wt_unlock(env, ienv->mtx);
+
+ return (0);
+
+err: if (fh != NULL) {
+ if (fh->name != NULL)
+ __wt_free(env, fh->name, 0);
+ __wt_free(env, fh, sizeof(WT_FH));
+ }
+ (void)close(fd);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(ENV *env, WT_FH *fh)
+{
+ IENV *ienv;
+ int ret;
+
+ ienv = env->ienv;
+ ret = 0;
+
+ if (fh == NULL || fh->refcnt == 0 || --fh->refcnt > 0)
+ return (0);
+
+ /* Remove from the list and discard the memory. */
+ __wt_lock(env, ienv->mtx);
+ TAILQ_REMOVE(&ienv->fhqh, fh, q);
+ __wt_unlock(env, ienv->mtx);
+
+ if (close(fh->fd) != 0) {
+ __wt_api_env_err(env, errno, "%s", fh->name);
+ ret = WT_ERROR;
+ }
+
+ __wt_free(env, fh->name, 0);
+ __wt_free(env, fh->stats, 0);
+ __wt_free(env, fh, sizeof(WT_FH));
+ return (ret);
+}
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..1ce48f3ec56
--- /dev/null
+++ b/src/os_posix/os_rw.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+ WT_STAT_INCR(fh->stats, READ_IO);
+ WT_STAT_INCR(env->ienv->stats, TOTAL_READ_IO);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS,
+ (env, "fileops: %s: read %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset));
+
+ if (pread(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+ return (0);
+
+ __wt_api_env_err(env, errno,
+ "%s read error: attempt to read %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+ WT_STAT_INCR(fh->stats, WRITE_IO);
+ WT_STAT_INCR(env->ienv->stats, TOTAL_WRITE_IO);
+
+ WT_VERBOSE(env, WT_VERB_FILEOPS,
+ (env, "fileops: %s: write %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset));
+
+ if (pwrite(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+ return (0);
+
+ __wt_api_env_err(env, errno,
+ "%s write error: attempt to write %lu bytes at offset %lu",
+ fh->name, (u_long)bytes, (u_long)offset);
+ return (WT_ERROR);
+}
diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..74b86a30d42
--- /dev/null
+++ b/src/os_posix/os_sleep.c
@@ -0,0 +1,25 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ struct timeval t;
+
+ t.tv_sec = (long)seconds + micro_seconds / 1000000;
+ t.tv_usec = (long)micro_seconds % 1000000;
+
+ (void)select(0, NULL, NULL, NULL, &t);
+}
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..3fb62a482d8
--- /dev/null
+++ b/src/os_posix/os_thread.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(pthread_t *tidret, void *(*func)(void *), void *arg)
+{
+ /* Spawn a new thread of control. */
+ return (pthread_create(tidret, NULL, func, arg) == 0 ? 0 : WT_ERROR);
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+void
+__wt_thread_join(pthread_t tid)
+{
+ (void)pthread_join(tid, NULL);
+}
diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c
new file mode 100644
index 00000000000..a13b407150d
--- /dev/null
+++ b/src/os_posix/os_yield.c
@@ -0,0 +1,24 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+#ifdef HAVE_PTHREAD_YIELD
+ pthread_yield();
+#else
+ sched_yield();
+#endif
+}
diff --git a/src/support/api.c b/src/support/api.c
new file mode 100644
index 00000000000..cb7b48a7d69
--- /dev/null
+++ b/src/support/api.c
@@ -0,0 +1,1597 @@
+/* DO NOT EDIT: automatically built by dist/api.py. */
+
+#include "wt_internal.h"
+
+static int __wt_api_db_btree_compare_dup_get(
+ DB *db,
+ int (**btree_compare_dup)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_dup_get(
+ DB *db,
+ int (**btree_compare_dup)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_GET);
+ *btree_compare_dup = db->btree_compare_dup;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_dup_set(
+ DB *db,
+ int (*btree_compare_dup)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_dup_set(
+ DB *db,
+ int (*btree_compare_dup)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_SET);
+ db->btree_compare_dup = btree_compare_dup;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_get(
+ DB *db,
+ int (**btree_compare)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_get(
+ DB *db,
+ int (**btree_compare)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_GET);
+ *btree_compare = db->btree_compare;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_int_get(
+ DB *db,
+ int *btree_compare_int);
+static int __wt_api_db_btree_compare_int_get(
+ DB *db,
+ int *btree_compare_int)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_GET);
+ *btree_compare_int = db->btree_compare_int;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_int_set(
+ DB *db,
+ int btree_compare_int);
+static int __wt_api_db_btree_compare_int_set(
+ DB *db,
+ int btree_compare_int)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_RET((__wt_db_btree_compare_int_set_verify(
+ db, btree_compare_int)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_SET);
+ db->btree_compare_int = btree_compare_int;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_compare_set(
+ DB *db,
+ int (*btree_compare)(DB *, const DBT *, const DBT *));
+static int __wt_api_db_btree_compare_set(
+ DB *db,
+ int (*btree_compare)(DB *, const DBT *, const DBT *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_SET);
+ db->btree_compare = btree_compare;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_dup_offpage_get(
+ DB *db,
+ uint32_t *btree_dup_offpage);
+static int __wt_api_db_btree_dup_offpage_get(
+ DB *db,
+ uint32_t *btree_dup_offpage)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_GET);
+ *btree_dup_offpage = db->btree_dup_offpage;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_dup_offpage_set(
+ DB *db,
+ uint32_t btree_dup_offpage);
+static int __wt_api_db_btree_dup_offpage_set(
+ DB *db,
+ uint32_t btree_dup_offpage)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_RET((__wt_db_btree_dup_offpage_set_verify(
+ db, btree_dup_offpage)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_SET);
+ db->btree_dup_offpage = btree_dup_offpage;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_itemsize_get(
+ DB *db,
+ uint32_t *intlitemsize,
+ uint32_t *leafitemsize);
+static int __wt_api_db_btree_itemsize_get(
+ DB *db,
+ uint32_t *intlitemsize,
+ uint32_t *leafitemsize)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_GET);
+ *intlitemsize = db->intlitemsize;
+ *leafitemsize = db->leafitemsize;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_itemsize_set(
+ DB *db,
+ uint32_t intlitemsize,
+ uint32_t leafitemsize);
+static int __wt_api_db_btree_itemsize_set(
+ DB *db,
+ uint32_t intlitemsize,
+ uint32_t leafitemsize)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_SET);
+ db->intlitemsize = intlitemsize;
+ db->leafitemsize = leafitemsize;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_pagesize_get(
+ DB *db,
+ uint32_t *allocsize,
+ uint32_t *intlmin,
+ uint32_t *intlmax,
+ uint32_t *leafmin,
+ uint32_t *leafmax);
+static int __wt_api_db_btree_pagesize_get(
+ DB *db,
+ uint32_t *allocsize,
+ uint32_t *intlmin,
+ uint32_t *intlmax,
+ uint32_t *leafmin,
+ uint32_t *leafmax)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_GET);
+ *allocsize = db->allocsize;
+ *intlmin = db->intlmin;
+ *intlmax = db->intlmax;
+ *leafmin = db->leafmin;
+ *leafmax = db->leafmax;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_btree_pagesize_set(
+ DB *db,
+ uint32_t allocsize,
+ uint32_t intlmin,
+ uint32_t intlmax,
+ uint32_t leafmin,
+ uint32_t leafmax);
+static int __wt_api_db_btree_pagesize_set(
+ DB *db,
+ uint32_t allocsize,
+ uint32_t intlmin,
+ uint32_t intlmax,
+ uint32_t leafmin,
+ uint32_t leafmax)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_SET);
+ db->allocsize = allocsize;
+ db->intlmin = intlmin;
+ db->intlmax = intlmax;
+ db->leafmin = leafmin;
+ db->leafmax = leafmax;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_bulk_load(
+ DB *db,
+ uint32_t flags,
+ void (*progress)(const char *, uint64_t),
+ int (*cb)(DB *, DBT **, DBT **));
+static int __wt_api_db_bulk_load(
+ DB *db,
+ uint32_t flags,
+ void (*progress)(const char *, uint64_t),
+ int (*cb)(DB *, DBT **, DBT **))
+{
+ const char *method_name = "DB.bulk_load";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_BULK_LOAD);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_BULK_LOAD);
+ ret = __wt_db_bulk_load(toc, flags, progress, cb);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_close(
+ DB *db,
+ uint32_t flags);
+static int __wt_api_db_close(
+ DB *db,
+ uint32_t flags)
+{
+ const char *method_name = "DB.close";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_CLOSE);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_CLOSE);
+ ret = __wt_db_close(toc, flags);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_col_del(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ uint32_t flags);
+static int __wt_api_db_col_del(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_del";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_DEL);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_DEL);
+ while ((ret = __wt_db_col_del(toc, recno)) == WT_RESTART)
+ WT_STAT_INCR(ienv->method_stats, DB_COL_DEL_RESTART);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_col_get(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags);
+static int __wt_api_db_col_get(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_get";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_GET);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_GET);
+ ret = __wt_db_col_get(toc, recno, data);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_col_put(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags);
+static int __wt_api_db_col_put(
+ DB *db,
+ WT_TOC *toc,
+ uint64_t recno,
+ DBT *data,
+ uint32_t flags)
+{
+ const char *method_name = "DB.col_put";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ int ret;
+
+ WT_DB_COL_ONLY(db, method_name);
+ WT_DB_RDONLY(db, method_name);
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_PUT);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_COL_PUT);
+ while ((ret = __wt_db_col_put(toc, recno, data)) == WT_RESTART)
+ WT_STAT_INCR(ienv->method_stats, DB_COL_PUT_RESTART);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
+ return (ret);
+}
+
+static int __wt_api_db_column_set(
+ DB *db,
+ uint32_t fixed_len,
+ const char *dictionary,
+ uint32_t flags);
+static int __wt_api_db_column_set(
+ DB *db,
+ uint32_t fixed_len,
+ const char *dictionary,
+ uint32_t flags)
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ WT_ENV_FCHK(env, "DB.column_set",
+ flags, WT_APIMASK_DB_COLUMN_SET);
+
+ WT_RET((__wt_db_column_set_verify(
+ db, fixed_len, dictionary, flags)));
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_COLUMN_SET);
+ db->fixed_len = fixed_len;
+ db->dictionary = dictionary;
+ db->flags = flags;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
+static int __wt_api_db_dump(
+ DB *db,
+ FILE *stream,
+ void (*progress)(const char *, uint64_t),
+ uint32_t flags);
+static int __wt_api_db_dump(
+ DB *db,
+ FILE *stream,
+ void (*progress)(const char *, uint64_t),
+ uint32_t flags)
+{
+ const char *method_name = "DB.dump";
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+ WT_TOC *toc = NULL;
+ int ret;
+
+ WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_DUMP);
+ WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
+ WT_STAT_INCR(ienv->method_stats, DB_DUMP);
+ ret = __wt_db_dump(toc, stream, progress, flags);
+ WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
+ return (ret);
+}
+
+static int __wt_api_db_errcall_get(
+ DB *db,
+ void (**errcall)(const DB *, const char *));
+static int __wt_api_db_errcall_get(
+ DB *db,
+ void (**errcall)(const DB *, const char *))
+{
+ ENV *env = db->env;
+ IENV *ienv = env->ienv;
+
+ __wt_lock(env, ienv->mtx);
+ WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_GET);
+ *errcall = db->errcall;
+ __wt_unlock(env, ienv->mtx);
+ return (0);
+}
+
/*
 * __wt_api_db_errcall_set --
 *	DB.errcall_set method: install the DB handle's error-message callback.
 */
static int __wt_api_db_errcall_set(
	DB *db,
	void (*errcall)(const DB *, const char *));
static int __wt_api_db_errcall_set(
	DB *db,
	void (*errcall)(const DB *, const char *))
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	/* Field reads/writes are serialized on the global IENV mutex. */
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_SET);
	db->errcall = errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errfile_get --
 *	DB.errfile_get method: return the DB handle's error stream.
 */
static int __wt_api_db_errfile_get(
	DB *db,
	FILE **errfile);
static int __wt_api_db_errfile_get(
	DB *db,
	FILE **errfile)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_GET);
	*errfile = db->errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errfile_set --
 *	DB.errfile_set method: set the DB handle's error stream.
 */
static int __wt_api_db_errfile_set(
	DB *db,
	FILE *errfile);
static int __wt_api_db_errfile_set(
	DB *db,
	FILE *errfile)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_SET);
	db->errfile = errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errpfx_get --
 *	DB.errpfx_get method: return the DB handle's error-message prefix.
 */
static int __wt_api_db_errpfx_get(
	DB *db,
	const char **errpfx);
static int __wt_api_db_errpfx_get(
	DB *db,
	const char **errpfx)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_GET);
	*errpfx = db->errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_db_errpfx_set --
 *	DB.errpfx_set method: set the DB handle's error-message prefix.
 *	NOTE(review): the string is stored by reference, not copied — the
 *	caller's buffer must presumably outlive the handle; confirm.
 */
static int __wt_api_db_errpfx_set(
	DB *db,
	const char *errpfx);
static int __wt_api_db_errpfx_set(
	DB *db,
	const char *errpfx)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_SET);
	db->errpfx = errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_db_huffman_set --
 *	DB.huffman_set method: validate the flags, then hand the Huffman
 *	table to the underlying implementation while holding the IENV mutex.
 */
static int __wt_api_db_huffman_set(
	DB *db,
	uint8_t const *huffman_table,
	u_int huffman_table_size,
	uint32_t huffman_flags);
static int __wt_api_db_huffman_set(
	DB *db,
	uint8_t const *huffman_table,
	u_int huffman_table_size,
	uint32_t huffman_flags)
{
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	/* Reject flag bits outside the method's mask before taking the lock. */
	WT_ENV_FCHK(env, "DB.huffman_set",
	    huffman_flags, WT_APIMASK_DB_HUFFMAN_SET);

	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, DB_HUFFMAN_SET);
	ret = __wt_db_huffman_set(
	    db, huffman_table, huffman_table_size, huffman_flags);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
/*
 * __wt_api_db_open --
 *	DB.open method: acquire a WT_TOC for the call, run the real open,
 *	then release the WT_TOC (the trailing 1 argument's semantics are
 *	defined by __wt_toc_api_clr — confirm there).
 */
static int __wt_api_db_open(
	DB *db,
	const char *name,
	mode_t mode,
	uint32_t flags);
static int __wt_api_db_open(
	DB *db,
	const char *name,
	mode_t mode,
	uint32_t flags)
{
	const char *method_name = "DB.open";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_OPEN);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_OPEN);
	ret = __wt_db_open(toc, name, mode, flags);
	/* WT_TRET preserves the first error while still clearing the TOC. */
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}
+
/*
 * __wt_api_db_row_del --
 *	DB.row_del method: delete a key from a row-store database,
 *	retrying (and counting) WT_RESTART returns.
 */
static int __wt_api_db_row_del(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	uint32_t flags);
static int __wt_api_db_row_del(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	uint32_t flags)
{
	const char *method_name = "DB.row_del";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);	/* row-store handles only */
	WT_DB_RDONLY(db, method_name);		/* fail on read-only handles */
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_DEL);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL);
	/* WT_RESTART means "try again"; keep a statistic of the restarts. */
	while ((ret = __wt_db_row_del(toc, key)) == WT_RESTART)
		WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL_RESTART);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}

/*
 * __wt_api_db_row_get --
 *	DB.row_get method: search a row-store database for a key.
 */
static int __wt_api_db_row_get(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags);
static int __wt_api_db_row_get(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags)
{
	const char *method_name = "DB.row_get";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_GET);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_GET);
	ret = __wt_db_row_get(toc, key, data);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}

/*
 * __wt_api_db_row_put --
 *	DB.row_put method: store a key/data pair in a row-store database,
 *	retrying (and counting) WT_RESTART returns.
 */
static int __wt_api_db_row_put(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags);
static int __wt_api_db_row_put(
	DB *db,
	WT_TOC *toc,
	DBT *key,
	DBT *data,
	uint32_t flags)
{
	const char *method_name = "DB.row_put";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_DB_ROW_ONLY(db, method_name);
	WT_DB_RDONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_PUT);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT);
	while ((ret = __wt_db_row_put(toc, key, data)) == WT_RESTART)
		WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT_RESTART);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 0));
	return (ret);
}
+
/*
 * __wt_api_db_stat_clear --
 *	DB.stat_clear method: clear the handle's statistics.  Unlike the
 *	other DB methods, no lock or WT_TOC is taken here.
 */
static int __wt_api_db_stat_clear(
	DB *db,
	uint32_t flags);
static int __wt_api_db_stat_clear(
	DB *db,
	uint32_t flags)
{
	const char *method_name = "DB.stat_clear";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_CLEAR);
	WT_STAT_INCR(ienv->method_stats, DB_STAT_CLEAR);
	ret = __wt_db_stat_clear(db);
	return (ret);
}

/*
 * __wt_api_db_stat_print --
 *	DB.stat_print method: print the handle's statistics to a stream.
 */
static int __wt_api_db_stat_print(
	DB *db,
	FILE *stream,
	uint32_t flags);
static int __wt_api_db_stat_print(
	DB *db,
	FILE *stream,
	uint32_t flags)
{
	const char *method_name = "DB.stat_print";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_PRINT);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_STAT_PRINT);
	ret = __wt_db_stat_print(toc, stream);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}

/*
 * __wt_api_db_sync --
 *	DB.sync method: flush the database, reporting progress through
 *	the optional callback.
 */
static int __wt_api_db_sync(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_db_sync(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "DB.sync";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_DB_RDONLY(db, method_name);
	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_SYNC);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_SYNC);
	ret = __wt_db_sync(toc, progress, flags);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}

/*
 * __wt_api_db_verify --
 *	DB.verify method: verify the database, reporting progress through
 *	the optional callback.
 */
static int __wt_api_db_verify(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_db_verify(
	DB *db,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "DB.verify";
	ENV *env = db->env;
	IENV *ienv = env->ienv;
	WT_TOC *toc = NULL;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_VERIFY);
	WT_RET(__wt_toc_api_set(env, method_name, db, &toc));
	WT_STAT_INCR(ienv->method_stats, DB_VERIFY);
	ret = __wt_db_verify(toc, progress);
	WT_TRET(__wt_toc_api_clr(toc, method_name, 1));
	return (ret);
}
+
/*
 * __wt_api_env_cache_size_get --
 *	ENV.cache_size_get method: return the configured cache size.
 */
static int __wt_api_env_cache_size_get(
	ENV *env,
	uint32_t *cache_size);
static int __wt_api_env_cache_size_get(
	ENV *env,
	uint32_t *cache_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_GET);
	*cache_size = env->cache_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_cache_size_set --
 *	ENV.cache_size_set method: validate, then set the cache size.
 */
static int __wt_api_env_cache_size_set(
	ENV *env,
	uint32_t cache_size);
static int __wt_api_env_cache_size_set(
	ENV *env,
	uint32_t cache_size)
{
	IENV *ienv = env->ienv;
	/* Reject out-of-range values before taking the lock. */
	WT_RET((__wt_env_cache_size_set_verify(
	    env, cache_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_SET);
	env->cache_size = cache_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_close --
 *	ENV.close method: close the environment handle.
 */
static int __wt_api_env_close(
	ENV *env,
	uint32_t flags);
static int __wt_api_env_close(
	ENV *env,
	uint32_t flags)
{
	const char *method_name = "ENV.close";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_CLOSE);
	WT_STAT_INCR(ienv->method_stats, ENV_CLOSE);
	ret = __wt_env_close(env);
	return (ret);
}

/*
 * __wt_api_env_data_update_initial_get --
 *	ENV.data_update_initial_get method: return the initial update
 *	buffer size.
 */
static int __wt_api_env_data_update_initial_get(
	ENV *env,
	uint32_t *data_update_initial);
static int __wt_api_env_data_update_initial_get(
	ENV *env,
	uint32_t *data_update_initial)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_GET);
	*data_update_initial = env->data_update_initial;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_initial_set --
 *	ENV.data_update_initial_set method: set the initial update
 *	buffer size (no verify step, unlike cache_size_set).
 */
static int __wt_api_env_data_update_initial_set(
	ENV *env,
	uint32_t data_update_initial);
static int __wt_api_env_data_update_initial_set(
	ENV *env,
	uint32_t data_update_initial)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_SET);
	env->data_update_initial = data_update_initial;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_max_get --
 *	ENV.data_update_max_get method: return the maximum update
 *	buffer size.
 */
static int __wt_api_env_data_update_max_get(
	ENV *env,
	uint32_t *data_update_max);
static int __wt_api_env_data_update_max_get(
	ENV *env,
	uint32_t *data_update_max)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_GET);
	*data_update_max = env->data_update_max;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_data_update_max_set --
 *	ENV.data_update_max_set method: set the maximum update buffer size.
 */
static int __wt_api_env_data_update_max_set(
	ENV *env,
	uint32_t data_update_max);
static int __wt_api_env_data_update_max_set(
	ENV *env,
	uint32_t data_update_max)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_SET);
	env->data_update_max = data_update_max;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_db --
 *	ENV.db method: create a DB handle in this environment.
 */
static int __wt_api_env_db(
	ENV *env,
	uint32_t flags,
	DB **dbp);
static int __wt_api_env_db(
	ENV *env,
	uint32_t flags,
	DB **dbp)
{
	const char *method_name = "ENV.db";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_DB);
	WT_STAT_INCR(ienv->method_stats, ENV_DB);
	ret = __wt_env_db(env, dbp);
	return (ret);
}

/*
 * __wt_api_env_errcall_get --
 *	ENV.errcall_get method: return the error-message callback.
 */
static int __wt_api_env_errcall_get(
	ENV *env,
	void (**errcall)(const ENV *, const char *));
static int __wt_api_env_errcall_get(
	ENV *env,
	void (**errcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_GET);
	*errcall = env->errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errcall_set --
 *	ENV.errcall_set method: set the error-message callback.
 */
static int __wt_api_env_errcall_set(
	ENV *env,
	void (*errcall)(const ENV *, const char *));
static int __wt_api_env_errcall_set(
	ENV *env,
	void (*errcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_SET);
	env->errcall = errcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errfile_get --
 *	ENV.errfile_get method: return the error stream.
 */
static int __wt_api_env_errfile_get(
	ENV *env,
	FILE **errfile);
static int __wt_api_env_errfile_get(
	ENV *env,
	FILE **errfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_GET);
	*errfile = env->errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errfile_set --
 *	ENV.errfile_set method: set the error stream.
 */
static int __wt_api_env_errfile_set(
	ENV *env,
	FILE *errfile);
static int __wt_api_env_errfile_set(
	ENV *env,
	FILE *errfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_SET);
	env->errfile = errfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errpfx_get --
 *	ENV.errpfx_get method: return the error-message prefix.
 */
static int __wt_api_env_errpfx_get(
	ENV *env,
	const char **errpfx);
static int __wt_api_env_errpfx_get(
	ENV *env,
	const char **errpfx)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_GET);
	*errpfx = env->errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_errpfx_set --
 *	ENV.errpfx_set method: set the error-message prefix (stored by
 *	reference, not copied).
 */
static int __wt_api_env_errpfx_set(
	ENV *env,
	const char *errpfx);
static int __wt_api_env_errpfx_set(
	ENV *env,
	const char *errpfx)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_SET);
	env->errpfx = errpfx;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_env_hazard_size_get --
 *	ENV.hazard_size_get method: return the hazard-reference array size.
 */
static int __wt_api_env_hazard_size_get(
	ENV *env,
	uint32_t *hazard_size);
static int __wt_api_env_hazard_size_get(
	ENV *env,
	uint32_t *hazard_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_GET);
	*hazard_size = env->hazard_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_hazard_size_set --
 *	ENV.hazard_size_set method: validate, then set the hazard-reference
 *	array size.
 */
static int __wt_api_env_hazard_size_set(
	ENV *env,
	uint32_t hazard_size);
static int __wt_api_env_hazard_size_set(
	ENV *env,
	uint32_t hazard_size)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_hazard_size_set_verify(
	    env, hazard_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_SET);
	env->hazard_size = hazard_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgcall_get --
 *	ENV.msgcall_get method: return the informational-message callback.
 */
static int __wt_api_env_msgcall_get(
	ENV *env,
	void (**msgcall)(const ENV *, const char *));
static int __wt_api_env_msgcall_get(
	ENV *env,
	void (**msgcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_GET);
	*msgcall = env->msgcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgcall_set --
 *	ENV.msgcall_set method: set the informational-message callback.
 */
static int __wt_api_env_msgcall_set(
	ENV *env,
	void (*msgcall)(const ENV *, const char *));
static int __wt_api_env_msgcall_set(
	ENV *env,
	void (*msgcall)(const ENV *, const char *))
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_SET);
	env->msgcall = msgcall;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgfile_get --
 *	ENV.msgfile_get method: return the informational-message stream.
 */
static int __wt_api_env_msgfile_get(
	ENV *env,
	FILE **msgfile);
static int __wt_api_env_msgfile_get(
	ENV *env,
	FILE **msgfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_GET);
	*msgfile = env->msgfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_msgfile_set --
 *	ENV.msgfile_set method: set the informational-message stream.
 */
static int __wt_api_env_msgfile_set(
	ENV *env,
	FILE *msgfile);
static int __wt_api_env_msgfile_set(
	ENV *env,
	FILE *msgfile)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_SET);
	env->msgfile = msgfile;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_env_open --
 *	ENV.open method: open the environment.
 */
static int __wt_api_env_open(
	ENV *env,
	const char *home,
	mode_t mode,
	uint32_t flags);
static int __wt_api_env_open(
	ENV *env,
	const char *home,
	mode_t mode,
	uint32_t flags)
{
	const char *method_name = "ENV.open";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_OPEN);
	WT_STAT_INCR(ienv->method_stats, ENV_OPEN);
	/* Note: flags are validated above but not passed to __wt_env_open. */
	ret = __wt_env_open(env, home, mode);
	return (ret);
}

/*
 * __wt_api_env_stat_clear --
 *	ENV.stat_clear method: clear the environment's statistics.
 */
static int __wt_api_env_stat_clear(
	ENV *env,
	uint32_t flags);
static int __wt_api_env_stat_clear(
	ENV *env,
	uint32_t flags)
{
	const char *method_name = "ENV.stat_clear";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_CLEAR);
	WT_STAT_INCR(ienv->method_stats, ENV_STAT_CLEAR);
	ret = __wt_env_stat_clear(env);
	return (ret);
}

/*
 * __wt_api_env_stat_print --
 *	ENV.stat_print method: print the environment's statistics.
 */
static int __wt_api_env_stat_print(
	ENV *env,
	FILE *stream,
	uint32_t flags);
static int __wt_api_env_stat_print(
	ENV *env,
	FILE *stream,
	uint32_t flags)
{
	const char *method_name = "ENV.stat_print";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_PRINT);
	WT_STAT_INCR(ienv->method_stats, ENV_STAT_PRINT);
	ret = __wt_env_stat_print(env, stream);
	return (ret);
}

/*
 * __wt_api_env_sync --
 *	ENV.sync method: flush the environment, reporting progress through
 *	the optional callback.
 */
static int __wt_api_env_sync(
	ENV *env,
	void (*progress)(const char *, uint64_t),
	uint32_t flags);
static int __wt_api_env_sync(
	ENV *env,
	void (*progress)(const char *, uint64_t),
	uint32_t flags)
{
	const char *method_name = "ENV.sync";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_SYNC);
	WT_STAT_INCR(ienv->method_stats, ENV_SYNC);
	ret = __wt_env_sync(env, progress);
	return (ret);
}

/*
 * __wt_api_env_toc --
 *	ENV.toc method: allocate a WT_TOC (thread-of-control) handle,
 *	holding the IENV mutex across the allocation.
 */
static int __wt_api_env_toc(
	ENV *env,
	uint32_t flags,
	WT_TOC **tocp);
static int __wt_api_env_toc(
	ENV *env,
	uint32_t flags,
	WT_TOC **tocp)
{
	const char *method_name = "ENV.toc";
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_TOC);
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC);
	ret = __wt_env_toc(env, tocp);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
/*
 * __wt_api_env_toc_size_get --
 *	ENV.toc_size_get method: return the maximum number of WT_TOC handles.
 */
static int __wt_api_env_toc_size_get(
	ENV *env,
	uint32_t *toc_size);
static int __wt_api_env_toc_size_get(
	ENV *env,
	uint32_t *toc_size)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_GET);
	*toc_size = env->toc_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_toc_size_set --
 *	ENV.toc_size_set method: validate, then set the maximum number of
 *	WT_TOC handles.
 */
static int __wt_api_env_toc_size_set(
	ENV *env,
	uint32_t toc_size);
static int __wt_api_env_toc_size_set(
	ENV *env,
	uint32_t toc_size)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_toc_size_set_verify(
	    env, toc_size)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_SET);
	env->toc_size = toc_size;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_verbose_get --
 *	ENV.verbose_get method: return the verbosity configuration.
 */
static int __wt_api_env_verbose_get(
	ENV *env,
	uint32_t *verbose);
static int __wt_api_env_verbose_get(
	ENV *env,
	uint32_t *verbose)
{
	IENV *ienv = env->ienv;
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_GET);
	*verbose = env->verbose;
	__wt_unlock(env, ienv->mtx);
	return (0);
}

/*
 * __wt_api_env_verbose_set --
 *	ENV.verbose_set method: validate, then set the verbosity
 *	configuration.
 */
static int __wt_api_env_verbose_set(
	ENV *env,
	uint32_t verbose);
static int __wt_api_env_verbose_set(
	ENV *env,
	uint32_t verbose)
{
	IENV *ienv = env->ienv;
	WT_RET((__wt_env_verbose_set_verify(
	    env, verbose)));
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_SET);
	env->verbose = verbose;
	__wt_unlock(env, ienv->mtx);
	return (0);
}
+
/*
 * __wt_api_wt_toc_close --
 *	WT_TOC.close method: discard a thread-of-control handle, holding
 *	the IENV mutex across the teardown.
 */
static int __wt_api_wt_toc_close(
	WT_TOC *wt_toc,
	uint32_t flags);
static int __wt_api_wt_toc_close(
	WT_TOC *wt_toc,
	uint32_t flags)
{
	const char *method_name = "WT_TOC.close";
	ENV *env = wt_toc->env;
	IENV *ienv = env->ienv;
	int ret;

	WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_WT_TOC_CLOSE);
	__wt_lock(env, ienv->mtx);
	WT_STAT_INCR(ienv->method_stats, WT_TOC_CLOSE);
	ret = __wt_wt_toc_close(wt_toc);
	__wt_unlock(env, ienv->mtx);
	return (ret);
}
+
+void
+__wt_methods_db_config_default(DB *db)
+{
+ db->btree_compare_dup = __wt_bt_lex_compare;
+ db->btree_compare = __wt_bt_lex_compare;
+}
+
/*
 * __wt_methods_db_lockout --
 *	Point every configurable DB method at __wt_db_lockout (cast to the
 *	method's exact signature), locking the handle's API down.
 *	NOTE(review): __wt_db_lockout's behavior isn't visible here —
 *	presumably it reports an error; confirm in its definition.  The
 *	casts rely on calling __wt_db_lockout through mismatched function
 *	types, a long-standing convention of this generated file.
 */
void
__wt_methods_db_lockout(DB *db)
{
	db->btree_compare_dup_get = (int (*)
	    (DB *, int (**)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_dup_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_get = (int (*)
	    (DB *, int (**)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_int_get = (int (*)
	    (DB *, int *))
	    __wt_db_lockout;
	db->btree_compare_int_set = (int (*)
	    (DB *, int ))
	    __wt_db_lockout;
	db->btree_compare_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_dup_offpage_get = (int (*)
	    (DB *, uint32_t *))
	    __wt_db_lockout;
	db->btree_dup_offpage_set = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->btree_itemsize_get = (int (*)
	    (DB *, uint32_t *, uint32_t *))
	    __wt_db_lockout;
	db->btree_itemsize_set = (int (*)
	    (DB *, uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->btree_pagesize_get = (int (*)
	    (DB *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *))
	    __wt_db_lockout;
	db->btree_pagesize_set = (int (*)
	    (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->bulk_load = (int (*)
	    (DB *, uint32_t , void (*)(const char *, uint64_t), int (*)(DB *, DBT **, DBT **)))
	    __wt_db_lockout;
	db->col_del = (int (*)
	    (DB *, WT_TOC *, uint64_t , uint32_t ))
	    __wt_db_lockout;
	db->col_get = (int (*)
	    (DB *, WT_TOC *, uint64_t , DBT *, uint32_t ))
	    __wt_db_lockout;
	db->col_put = (int (*)
	    (DB *, WT_TOC *, uint64_t , DBT *, uint32_t ))
	    __wt_db_lockout;
	db->column_set = (int (*)
	    (DB *, uint32_t , const char *, uint32_t ))
	    __wt_db_lockout;
	db->dump = (int (*)
	    (DB *, FILE *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
	db->err = (void (*)
	    (DB *, int , const char *, ...))
	    __wt_db_lockout;
	db->errcall_get = (int (*)
	    (DB *, void (**)(const DB *, const char *)))
	    __wt_db_lockout;
	db->errcall_set = (int (*)
	    (DB *, void (*)(const DB *, const char *)))
	    __wt_db_lockout;
	db->errfile_get = (int (*)
	    (DB *, FILE **))
	    __wt_db_lockout;
	db->errfile_set = (int (*)
	    (DB *, FILE *))
	    __wt_db_lockout;
	db->errpfx_get = (int (*)
	    (DB *, const char **))
	    __wt_db_lockout;
	db->errpfx_set = (int (*)
	    (DB *, const char *))
	    __wt_db_lockout;
	db->errx = (void (*)
	    (DB *, const char *, ...))
	    __wt_db_lockout;
	db->huffman_set = (int (*)
	    (DB *, uint8_t const *, u_int , uint32_t ))
	    __wt_db_lockout;
	db->open = (int (*)
	    (DB *, const char *, mode_t , uint32_t ))
	    __wt_db_lockout;
	db->row_del = (int (*)
	    (DB *, WT_TOC *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->row_get = (int (*)
	    (DB *, WT_TOC *, DBT *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->row_put = (int (*)
	    (DB *, WT_TOC *, DBT *, DBT *, uint32_t ))
	    __wt_db_lockout;
	db->stat_clear = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->stat_print = (int (*)
	    (DB *, FILE *, uint32_t ))
	    __wt_db_lockout;
	db->sync = (int (*)
	    (DB *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
	db->verify = (int (*)
	    (DB *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_db_lockout;
}
+
/*
 * __wt_methods_db_init_transition --
 *	Install the DB methods that are legal as soon as the handle is
 *	created (configuration, error reporting, open/close).  Data-access
 *	methods are installed later by __wt_methods_db_open_transition.
 */
void
__wt_methods_db_init_transition(DB *db)
{
	db->btree_compare_dup_get = __wt_api_db_btree_compare_dup_get;
	db->btree_compare_dup_set = __wt_api_db_btree_compare_dup_set;
	db->btree_compare_get = __wt_api_db_btree_compare_get;
	db->btree_compare_int_get = __wt_api_db_btree_compare_int_get;
	db->btree_compare_int_set = __wt_api_db_btree_compare_int_set;
	db->btree_compare_set = __wt_api_db_btree_compare_set;
	db->btree_dup_offpage_get = __wt_api_db_btree_dup_offpage_get;
	db->btree_dup_offpage_set = __wt_api_db_btree_dup_offpage_set;
	db->btree_itemsize_get = __wt_api_db_btree_itemsize_get;
	db->btree_itemsize_set = __wt_api_db_btree_itemsize_set;
	db->btree_pagesize_get = __wt_api_db_btree_pagesize_get;
	db->btree_pagesize_set = __wt_api_db_btree_pagesize_set;
	db->close = __wt_api_db_close;
	db->column_set = __wt_api_db_column_set;
	db->err = __wt_api_db_err;
	db->errcall_get = __wt_api_db_errcall_get;
	db->errcall_set = __wt_api_db_errcall_set;
	db->errfile_get = __wt_api_db_errfile_get;
	db->errfile_set = __wt_api_db_errfile_set;
	db->errpfx_get = __wt_api_db_errpfx_get;
	db->errpfx_set = __wt_api_db_errpfx_set;
	db->errx = __wt_api_db_errx;
	db->huffman_set = __wt_api_db_huffman_set;
	db->open = __wt_api_db_open;
}
+
/*
 * __wt_methods_db_open_transition --
 *	Method switch performed when a DB handle is opened: lock out the
 *	configuration methods that may no longer be changed, and install
 *	the data-access methods that are now legal.
 */
void
__wt_methods_db_open_transition(DB *db)
{
	/* Configuration that's fixed once the file is open: lock it out. */
	db->btree_compare_dup_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_compare_int_set = (int (*)
	    (DB *, int ))
	    __wt_db_lockout;
	db->btree_compare_set = (int (*)
	    (DB *, int (*)(DB *, const DBT *, const DBT *)))
	    __wt_db_lockout;
	db->btree_dup_offpage_set = (int (*)
	    (DB *, uint32_t ))
	    __wt_db_lockout;
	db->btree_itemsize_set = (int (*)
	    (DB *, uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->btree_pagesize_set = (int (*)
	    (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t ))
	    __wt_db_lockout;
	db->column_set = (int (*)
	    (DB *, uint32_t , const char *, uint32_t ))
	    __wt_db_lockout;
	db->huffman_set = (int (*)
	    (DB *, uint8_t const *, u_int , uint32_t ))
	    __wt_db_lockout;
	/* Data-access methods become legal once the file is open. */
	db->bulk_load = __wt_api_db_bulk_load;
	db->col_del = __wt_api_db_col_del;
	db->col_get = __wt_api_db_col_get;
	db->col_put = __wt_api_db_col_put;
	db->dump = __wt_api_db_dump;
	db->row_del = __wt_api_db_row_del;
	db->row_get = __wt_api_db_row_get;
	db->row_put = __wt_api_db_row_put;
	db->stat_clear = __wt_api_db_stat_clear;
	db->stat_print = __wt_api_db_stat_print;
	db->sync = __wt_api_db_sync;
	db->verify = __wt_api_db_verify;
}
+
/*
 * __wt_methods_env_config_default --
 *	Set the ENV handle's default configuration values.
 *	NOTE(review): units aren't visible here — cache_size is presumably
 *	in MB and the data_update values in bytes; confirm against the
 *	setters' documentation.
 */
void
__wt_methods_env_config_default(ENV *env)
{
	env->cache_size = 20;
	env->data_update_initial = 8 * 1024;
	env->data_update_max = 32 * 1024;
	env->hazard_size = 15;
	env->toc_size = 50;
}
+
/*
 * __wt_methods_env_lockout --
 *	Point every configurable ENV method at __wt_env_lockout (cast to
 *	the method's exact signature), locking the handle's API down.
 *	NOTE(review): __wt_env_lockout's behavior isn't visible here —
 *	presumably it reports an error; confirm in its definition.
 */
void
__wt_methods_env_lockout(ENV *env)
{
	env->cache_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->cache_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->data_update_initial_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->data_update_initial_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->data_update_max_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->data_update_max_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->db = (int (*)
	    (ENV *, uint32_t , DB **))
	    __wt_env_lockout;
	env->err = (void (*)
	    (ENV *, int , const char *, ...))
	    __wt_env_lockout;
	env->errcall_get = (int (*)
	    (ENV *, void (**)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->errcall_set = (int (*)
	    (ENV *, void (*)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->errfile_get = (int (*)
	    (ENV *, FILE **))
	    __wt_env_lockout;
	env->errfile_set = (int (*)
	    (ENV *, FILE *))
	    __wt_env_lockout;
	env->errpfx_get = (int (*)
	    (ENV *, const char **))
	    __wt_env_lockout;
	env->errpfx_set = (int (*)
	    (ENV *, const char *))
	    __wt_env_lockout;
	env->errx = (void (*)
	    (ENV *, const char *, ...))
	    __wt_env_lockout;
	env->hazard_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->hazard_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->msgcall_get = (int (*)
	    (ENV *, void (**)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->msgcall_set = (int (*)
	    (ENV *, void (*)(const ENV *, const char *)))
	    __wt_env_lockout;
	env->msgfile_get = (int (*)
	    (ENV *, FILE **))
	    __wt_env_lockout;
	env->msgfile_set = (int (*)
	    (ENV *, FILE *))
	    __wt_env_lockout;
	env->open = (int (*)
	    (ENV *, const char *, mode_t , uint32_t ))
	    __wt_env_lockout;
	env->stat_clear = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->stat_print = (int (*)
	    (ENV *, FILE *, uint32_t ))
	    __wt_env_lockout;
	env->sync = (int (*)
	    (ENV *, void (*)(const char *, uint64_t), uint32_t ))
	    __wt_env_lockout;
	env->toc = (int (*)
	    (ENV *, uint32_t , WT_TOC **))
	    __wt_env_lockout;
	env->toc_size_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->toc_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->verbose_get = (int (*)
	    (ENV *, uint32_t *))
	    __wt_env_lockout;
	env->verbose_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
}
+
/*
 * __wt_methods_env_init_transition --
 *	Install the ENV methods that are legal as soon as the handle is
 *	created (configuration, error reporting, open/close).  db/sync/toc
 *	are installed later by __wt_methods_env_open_transition.
 */
void
__wt_methods_env_init_transition(ENV *env)
{
	env->cache_size_get = __wt_api_env_cache_size_get;
	env->cache_size_set = __wt_api_env_cache_size_set;
	env->close = __wt_api_env_close;
	env->data_update_initial_get = __wt_api_env_data_update_initial_get;
	env->data_update_initial_set = __wt_api_env_data_update_initial_set;
	env->data_update_max_get = __wt_api_env_data_update_max_get;
	env->data_update_max_set = __wt_api_env_data_update_max_set;
	env->err = __wt_api_env_err;
	env->errcall_get = __wt_api_env_errcall_get;
	env->errcall_set = __wt_api_env_errcall_set;
	env->errfile_get = __wt_api_env_errfile_get;
	env->errfile_set = __wt_api_env_errfile_set;
	env->errpfx_get = __wt_api_env_errpfx_get;
	env->errpfx_set = __wt_api_env_errpfx_set;
	env->errx = __wt_api_env_errx;
	env->hazard_size_get = __wt_api_env_hazard_size_get;
	env->hazard_size_set = __wt_api_env_hazard_size_set;
	env->msgcall_get = __wt_api_env_msgcall_get;
	env->msgcall_set = __wt_api_env_msgcall_set;
	env->msgfile_get = __wt_api_env_msgfile_get;
	env->msgfile_set = __wt_api_env_msgfile_set;
	env->open = __wt_api_env_open;
	env->stat_clear = __wt_api_env_stat_clear;
	env->stat_print = __wt_api_env_stat_print;
	env->toc_size_get = __wt_api_env_toc_size_get;
	env->toc_size_set = __wt_api_env_toc_size_set;
	env->verbose_get = __wt_api_env_verbose_get;
	env->verbose_set = __wt_api_env_verbose_set;
}
+
/*
 * __wt_methods_env_open_transition --
 *	Method switch performed when the ENV handle is opened: lock out
 *	the configuration methods fixed at open time, and install the
 *	handle-creation methods that are now legal.
 */
void
__wt_methods_env_open_transition(ENV *env)
{
	/* Configuration fixed once the environment is open: lock it out. */
	env->cache_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->hazard_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	env->open = (int (*)
	    (ENV *, const char *, mode_t , uint32_t ))
	    __wt_env_lockout;
	env->toc_size_set = (int (*)
	    (ENV *, uint32_t ))
	    __wt_env_lockout;
	/* Handle creation becomes legal once the environment is open. */
	env->db = __wt_api_env_db;
	env->sync = __wt_api_env_sync;
	env->toc = __wt_api_env_toc;
}
+
/*
 * __wt_methods_wt_toc_lockout --
 *	WT_TOC method lockout: nothing to lock out, the macro only quiets
 *	the "unused parameter" compiler warning.
 */
void
__wt_methods_wt_toc_lockout(WT_TOC *wt_toc)
{
	WT_CC_QUIET(wt_toc, NULL);
}

/*
 * __wt_methods_wt_toc_init_transition --
 *	Install the single WT_TOC method legal after handle creation.
 */
void
__wt_methods_wt_toc_init_transition(WT_TOC *wt_toc)
{
	wt_toc->close = __wt_api_wt_toc_close;
}
+
diff --git a/src/support/cksum.c b/src/support/cksum.c
new file mode 100644
index 00000000000..06b0e625b0d
--- /dev/null
+++ b/src/support/cksum.c
@@ -0,0 +1,134 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
/*
 * __wt_cksum --
 *	Return a checksum for a chunk of memory.
 *
 * Algorithm 3 from Richard Black's discussion of CRC32.
 * http://www.cl.cam.ac.uk/research/srg/
 *	bluebook/21/crc/node6.html#SECTION00060000000000000000
 *
 * The crctab below was generated from the polynomial 0x04c11db7 by
 * the small program from that page (for each i in 0..255, shift i<<24
 * left 8 times, XORing in the polynomial whenever the top bit is set).
 */
uint32_t
__wt_cksum(void *chunk, uint32_t bytes)
{
	static const uint32_t crctab[256] = {
	0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
	0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
	0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
	0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
	0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
	0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
	0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
	0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
	0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
	0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
	0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
	0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
	0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
	0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
	0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
	0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
	0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
	0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
	0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
	0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
	0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
	0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
	0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
	0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
	0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
	0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
	0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
	0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
	0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
	0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
	0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
	0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
	0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
	0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
	0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
	0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
	0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
	0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
	0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
	0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
	0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
	0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
	0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
	0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
	0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
	0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
	0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
	0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
	0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
	0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
	0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
	0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
	0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
	0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
	0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
	0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
	0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
	0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
	0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
	0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
	0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
	0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
	0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
	0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
	};
	uint32_t i, result;
	uint8_t *data;

	data = chunk;

	/*
	 * The historic seed reads the first 4 bytes of the chunk.  For a
	 * chunk shorter than that, "bytes -= 4" would wrap the unsigned
	 * count and the loop below would read far past the buffer; fold
	 * short chunks byte-at-a-time from an all-ones seed instead (an
	 * empty chunk checksums to 0).
	 */
	if (bytes < 4) {
		result = 0xffffffff;
		for (i = 0; i < bytes; ++i)
			result =
			    (result << 8 | *data++) ^ crctab[result >> 24];
		return (~result);
	}

	/*
	 * Seed from the first 4 bytes, big-endian.  Cast each byte to
	 * uint32_t before shifting: a promoted int shifted left by 24 is
	 * undefined behavior when the byte's top bit is set.
	 */
	result = (uint32_t)*data++ << 24;
	result |= (uint32_t)*data++ << 16;
	result |= (uint32_t)*data++ << 8;
	result |= *data++;
	result = ~result;
	bytes -= 4;

	for (i = 0; i < bytes; ++i)
		result = (result << 8 | *data++) ^ crctab[result >> 24];

	return (~result);
}
diff --git a/src/support/err.c b/src/support/err.c
new file mode 100644
index 00000000000..dc8eac01189
--- /dev/null
+++ b/src/support/err.c
@@ -0,0 +1,247 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_msg_call --
+ * Pass a message to a callback function.
+ *
+ * Formats "pfx1: pfx2: <message>[: error string]" into a stack buffer
+ * and hands the result to the callback, which takes (handle, string).
+ */
+void
+__wt_msg_call(void *cb, void *handle,
+ const char *pfx1, const char *pfx2,
+ int error, const char *fmt, va_list ap)
+{
+ size_t len;
+ int separator;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ len = 0;
+ separator = 0;
+ s[0] = '\0';
+ if (pfx1 != NULL) {
+ len += (size_t)snprintf(s + len, sizeof(s) - len, "%s", pfx1);
+ separator = 1;
+ }
+ /*
+ * snprintf returns the would-be length, so on truncation len can
+ * exceed sizeof(s); the len < sizeof(s) - 1 guards below keep the
+ * size argument from wrapping in that case.
+ */
+ if (pfx2 != NULL && len < sizeof(s) - 1) {
+ len += (size_t)snprintf(s + len, sizeof(s) - len,
+ "%s%s", separator ? ": " : "", pfx2);
+ separator = 1;
+ }
+ if (separator && len < sizeof(s) - 1)
+ len += (size_t)snprintf(s + len, sizeof(s) - len, ": ");
+ if (len < sizeof(s) - 1)
+ len += (size_t)vsnprintf(s + len, sizeof(s) - len, fmt, ap);
+ if (error != 0 && len < sizeof(s) - 1)
+ (void)snprintf(s + len,
+ sizeof(s) - len, ": %s", wiredtiger_strerror(error));
+
+ /* Hand the assembled message to the caller's callback. */
+ ((void (*)(void *, const char *))cb)(handle, s);
+}
+
+/*
+ * __wt_msg_stream --
+ * Write a message to a FILE stream.
+ *
+ * A NULL stream means stderr. Output is "pfx1: pfx2: <message>",
+ * optionally followed by the error string, then a newline and a flush.
+ */
+void
+__wt_msg_stream(FILE *fp,
+ const char *pfx1, const char *pfx2, int error, const char *fmt, va_list ap)
+{
+ FILE *stream;
+ const char *prefixes[2];
+ int i;
+
+ /* Default to stderr when the caller doesn't name a stream. */
+ stream = fp == NULL ? stderr : fp;
+
+ /* Each optional prefix is followed by ": ". */
+ prefixes[0] = pfx1;
+ prefixes[1] = pfx2;
+ for (i = 0; i < 2; ++i)
+ if (prefixes[i] != NULL)
+ (void)fprintf(stream, "%s: ", prefixes[i]);
+
+ (void)vfprintf(stream, fmt, ap);
+ if (error != 0)
+ (void)fprintf(stream, ": %s", wiredtiger_strerror(error));
+ (void)fprintf(stream, "\n");
+ (void)fflush(stream);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_assert --
+ * Internal version of assert function: report the failed expression
+ * and its source location, then abort the process. Does not return.
+ */
+void
+__wt_assert(ENV *env, const char *check, const char *file_name, int line_number)
+{
+ __wt_api_env_errx(env,
+ "assertion failure: %s/%d: \"%s\"", file_name, line_number, check);
+
+ __wt_abort(env);
+ /* NOTREACHED */
+}
+#endif
+
+/*
+ * __wt_api_args --
+ * Report that an API function was passed illegal arguments or flag
+ * values; always returns WT_ERROR so callers can return the result
+ * directly.
+ */
+int
+__wt_api_args(ENV *env, const char *name)
+{
+ __wt_api_env_errx(
+ env, "%s: illegal API arguments or flag values specified", name);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_api_arg_min --
+ * Print a standard error message when an API function is passed a
+ * too-small argument. Returns 0 when the value is acceptable,
+ * WT_ERROR otherwise.
+ */
+int
+__wt_api_arg_min(ENV *env,
+ const char *name, const char *arg_name, uint32_t v, uint32_t min)
+{
+ /* Complain only when the value falls below the minimum. */
+ if (v < min) {
+ __wt_api_env_errx(env,
+ "%s: %s argument less than minimum value of %lu",
+ name, arg_name, (u_long)min);
+ return (WT_ERROR);
+ }
+ return (0);
+}
+
+/*
+ * __wt_api_arg_max --
+ * Print a standard error message when an API function is passed a
+ * too-large argument. Returns 0 when the value is acceptable,
+ * WT_ERROR otherwise.
+ */
+int
+__wt_api_arg_max(ENV *env,
+ const char *name, const char *arg_name, uint32_t v, uint32_t max)
+{
+ /* Complain only when the value exceeds the maximum. */
+ if (v > max) {
+ __wt_api_env_errx(env,
+ "%s: %s argument larger than maximum value of %lu",
+ name, arg_name, (u_long)max);
+ return (WT_ERROR);
+ }
+ return (0);
+}
+
+/*
+ * __wt_database_method_type --
+ * Print a standard error message on attempts to call methods
+ * inappropriate for a database type; always returns WT_ERROR.
+ */
+int
+__wt_database_method_type(DB *db, const char *name, int column_err)
+{
+ const char *type;
+
+ /* Name the store type the method is incompatible with. */
+ type = column_err ? "column store" : "row store";
+ __wt_api_db_errx(db,
+ "%s: this method is not supported for a %s database", name, type);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_wrong_fixed_size --
+ * Print a standard error message on attempts to put the wrong size element
+ * into a fixed-size database. Always returns WT_ERROR.
+ */
+int
+__wt_database_wrong_fixed_size(WT_TOC *toc, uint32_t len)
+{
+ DB *db;
+
+ db = toc->db;
+
+ /* Report both the offered length and the configured fixed length. */
+ __wt_api_db_errx(db,
+ "%s: length of %lu does not match fixed-length database "
+ "configuration of %lu",
+ toc->name, (u_long)len, (u_long)db->fixed_len);
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_readonly --
+ * Print a standard error message on attempts to modify a read-only
+ * database. Returns WT_READONLY (not WT_ERROR) so callers can
+ * distinguish this case.
+ */
+int
+__wt_database_readonly(DB *db, const char *name)
+{
+ __wt_api_db_errx(db,
+ "%s: the database was opened read-only and may not be modified",
+ name);
+ return (WT_READONLY);
+}
+
+/*
+ * __wt_database_format --
+ * Print a standard error message when a database format error is
+ * suddenly discovered. Always returns WT_ERROR.
+ */
+int
+__wt_database_format(DB *db)
+{
+ __wt_api_db_errx(db, "the database is corrupted; use the Db.salvage"
+ " method or the db_salvage utility to repair the database");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_database_item_too_big --
+ * Print a standard error message when an element is too large to store.
+ * Always returns WT_ERROR.
+ */
+int
+__wt_database_item_too_big(DB *db)
+{
+ __wt_api_db_errx(db, "the item is too large for the database to store");
+ return (WT_ERROR);
+}
+
+/*
+ * __wt_wt_toc_lockout --
+ * Standard WT_TOC handle lockout error message: convenience wrapper
+ * that reports against the WT_TOC's enclosing environment.
+ */
+int
+__wt_wt_toc_lockout(WT_TOC *toc)
+{
+ return (__wt_env_lockout(toc->env));
+}
+
+/*
+ * __wt_db_lockout --
+ * Standard DB handle lockout error message: convenience wrapper that
+ * reports against the DB's enclosing environment.
+ */
+int
+__wt_db_lockout(DB *db)
+{
+ return (__wt_env_lockout(db->env));
+}
+
+/*
+ * __wt_env_lockout --
+ * Standard ENV handle lockout error message. Always returns WT_ERROR;
+ * the other handle lockout functions funnel here.
+ */
+int
+__wt_env_lockout(ENV *env)
+{
+ __wt_api_env_errx(env,
+ "An unavailable handle method was called; the handle method is "
+ "not available for some reason, for example, handle methods are "
+ "restricted after an error, or configuration methods may be "
+ "restricted after the database or environment have been opened, "
+ "or operational methods may be restricted until the database or "
+ "environment has been opened.");
+ return (WT_ERROR);
+}
diff --git a/src/support/hazard.c b/src/support/hazard.c
new file mode 100644
index 00000000000..5bef0731aa5
--- /dev/null
+++ b/src/support/hazard.c
@@ -0,0 +1,133 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_hazard_set --
+ * Set a hazard reference.
+ *
+ * Returns 1 if the hazard reference was set and the page may be used;
+ * returns 0 when the eviction server owns the page or when no hazard
+ * slot is available.
+ */
+int
+__wt_hazard_set(WT_TOC *toc, WT_REF *ref)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ /*
+ * Do the dance:
+ *
+ * The memory location making a page "real" is the WT_REF's state which
+ * can be reset from WT_OK to WT_EVICT at any time by the page eviction
+ * server.
+ *
+ * Add the WT_REF reference to the WT_TOC's hazard list and flush the
+ * write, then see if the state field is still WT_OK. If it's still
+ * WT_OK, we know we can use the page because the page eviction server
+ * will see our hazard reference before it discards the buffer (the
+ * eviction server sets the WT_REF state to WT_EVICT, flushes memory,
+ * and then checks the hazard references).
+ */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp) {
+ /* Find the first empty slot in the hazard array. */
+ if (*hp != NULL)
+ continue;
+
+ /*
+ * Memory flush needed; the hazard array isn't declared volatile
+ * and an explicit memory flush is necessary.
+ */
+ *hp = ref->page;
+ WT_MEMORY_FLUSH;
+
+ /*
+ * If the cache entry is set, check to see if it's still valid.
+ * Valid means the state is WT_OK, or the state is WT_EVICT and
+ * this thread is allowed to see pages flagged for eviction.
+ */
+ if (ref->state == WT_OK ||
+ (ref->state == WT_EVICT && F_ISSET(toc, WT_READ_EVICT))) {
+ WT_VERBOSE(env, WT_VERB_HAZARD,
+ (env, "toc %p hazard %p: set", toc, ref->page));
+ return (1);
+ }
+
+ /* The cache eviction server owns the page, we can't have it. */
+ *hp = NULL;
+ return (0);
+ }
+
+ /*
+ * No free slot: report it. The assertion below is intentionally
+ * false at this point (hp == end), so diagnostic builds abort.
+ */
+ __wt_api_env_errx(env, "WT_TOC has no more hazard reference slots");
+ WT_ASSERT(env, hp < toc->hazard + env->hazard_size);
+ return (0);
+}
+
+/*
+ * __wt_hazard_clear --
+ * Clear a hazard reference.
+ */
+void
+__wt_hazard_clear(WT_TOC *toc, WT_PAGE *page)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ WT_VERBOSE(env,
+ WT_VERB_HAZARD, (env, "toc %p hazard %p: clr", toc, page));
+
+ /* Clear the caller's hazard pointer. */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp)
+ if (*hp == page) {
+ *hp = NULL;
+ /*
+ * We don't have to flush memory here for correctness;
+ * it would give the page server thread faster access
+ * to the block were the block selected to be evicted,
+ * but the generation number was just set which makes
+ * it unlikely to be selected for eviction.
+ */
+ return;
+ }
+ /*
+ * Falling out of the loop means the reference wasn't found, which
+ * is a caller bug; the assertion is intentionally false here so
+ * diagnostic builds abort.
+ */
+ __wt_api_env_errx(env, "WT_TOC hazard reference not found");
+ WT_ASSERT(env, hp < toc->hazard + env->hazard_size);
+}
+
+/*
+ * __wt_hazard_empty --
+ * Verify that no hazard references are set; complain about and clear
+ * any that are found.
+ */
+void
+__wt_hazard_empty(WT_TOC *toc, const char *name)
+{
+ ENV *env;
+ WT_PAGE **hp;
+
+ env = toc->env;
+
+ /*
+ * Check for a set hazard reference and complain if we find one. Clear
+ * any we find because it's not a correctness problem (any hazard ref
+ * we find can't be real because the WT_TOC is being closed when we're
+ * called). We do this work because it's not expensive, and we don't
+ * want to let a hazard reference lie around, keeping a page from being
+ * flushed. The flush isn't necessary for correctness, but gives the
+ * cache eviction thread immediate access to any page our reference
+ * blocks.
+ */
+ for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp)
+ if (*hp != NULL) {
+ __wt_api_env_errx(env,
+ "%s: returned with a hazard reference set (%p)",
+ name, *hp);
+ *hp = NULL;
+ WT_MEMORY_FLUSH;
+ }
+}
diff --git a/src/support/huffman.c b/src/support/huffman.c
new file mode 100644
index 00000000000..2a0fcfde218
--- /dev/null
+++ b/src/support/huffman.c
@@ -0,0 +1,692 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ *
+ * Huffman Encoder/Decoder v1.0
+ * Author Brian Pollack <brian@brians.com>
+ */
+
+#include "wt_internal.h"
+
+typedef struct __wt_freqtree_node {
+ /*
+ * Data structure representing a node of the huffman tree. It holds a
+ * 32-bit weight and pointers to the left and right child nodes.
+ * The node either has two child nodes or none.
+ */
+ uint16_t symbol; /* only used in leaf nodes */
+ uint32_t weight;
+ uint16_t codeword_length; /* set only for leaves, 0 for inner nodes */
+ struct __wt_freqtree_node *left; /* bit 0 */
+ struct __wt_freqtree_node *right; /* bit 1 */
+} WT_FREQTREE_NODE;
+
+typedef struct __wt_static_huffman_node {
+ /*
+ * This data structure is used to represent the huffman tree in a
+ * static array, after it has been created (using a dynamic tree
+ * representation with WT_FREQTREE_NODE nodes).
+ *
+ * In the binary tree's array representation if a node's index is i,
+ * then its left child node is 2i+1 and its right child node is 2i+2.
+ */
+ uint8_t valid; /* 0 means this array slot is unused */
+ uint16_t symbol;
+ uint16_t codeword_length;
+} WT_STATIC_HUFFMAN_NODE;
+
+typedef struct __wt_huffman_obj {
+ ENV *env; /* Enclosing environment */
+ /*
+ * Data structure here defines specific instance of the encoder/decoder.
+ * This contains the frequency table (tree) used to produce optimal
+ * results. This version of the encoder supports 1- and 2-byte symbols.
+ */
+ uint32_t numSymbols;
+ uint8_t numBytes; /* 1 or 2 */
+ /* The tree in static array representation; has 2^max_depth slots. */
+ WT_STATIC_HUFFMAN_NODE *nodes;
+ uint16_t max_depth; /* tree levels; deepest code is max_depth - 1 bits */
+} WT_HUFFMAN_OBJ;
+
+/*
+ * Queue element data structure.
+ *
+ * Consists of a pointer to a huffman tree node, and a pointer to the next
+ * element in the queue.
+ */
+typedef struct node_queue_elem {
+ WT_FREQTREE_NODE *node;
+ struct node_queue_elem *next;
+} NODE_QUEUE_ELEM;
+
+/*
+ * Queue of huffman tree nodes.
+ *
+ * Contains a pointer to the beginning and the end of the queue, which is
+ * implemented as a linked list.
+ */
+typedef struct node_queue {
+ NODE_QUEUE_ELEM *first;
+ NODE_QUEUE_ELEM *last;
+} NODE_QUEUE;
+
+/* A queue is empty when it's NULL or its linked list has no head. */
+#define node_queue_is_empty(queue) \
+ (((queue) == NULL || (queue)->first == NULL) ? 1 : 0)
+
+static void node_queue_close(ENV *, NODE_QUEUE *);
+static void node_queue_dequeue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE **);
+static int node_queue_enqueue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static void recursive_free_node(ENV *env, WT_FREQTREE_NODE *node);
+
+/*
+ * The following macros are used by the encoder to write the buffer with bit
+ * addressing.
+ */
+#undef SET_BIT
+#define SET_BIT(ptr, pos) \
+ *((ptr) + ((pos) / 8)) |= 1 << (7 - ((pos) % 8))
+#undef CLEAR_BIT
+#define CLEAR_BIT(ptr, pos) \
+ *((ptr) + ((pos) / 8)) &= ~(uint8_t)(1 << (7 - ((pos) % 8)))
+#undef MODIFY_BIT
+#define MODIFY_BIT(ptr, pos, bit) \
+ if (bit) \
+ SET_BIT(ptr, pos); \
+ else \
+ CLEAR_BIT(ptr, pos);
+
+/*
+ * Internal data structure used to preserve the symbol when rearranging the
+ * frequency array.
+ */
+typedef struct __indexed_byte {
+ uint8_t frequency; /* rank 1-255; 0 means the symbol never occurs */
+ uint16_t symbol; /* original index in the caller's table */
+} INDEXED_BYTE;
+
+/*
+ * indexed_byte_comparator --
+ * qsort comparator ordering the frequency table by ascending frequency
+ * (most frequent symbols end up at the end of the array).
+ */
+static int
+indexed_byte_comparator(const void *elem1, const void *elem2)
+{
+ const INDEXED_BYTE *a, *b;
+
+ a = elem1;
+ b = elem2;
+ return ((int)a->frequency - (int)b->frequency);
+}
+
+/*
+ * traverse_tree --
+ * Recursive function with dual functionality:
+ * - It sets the codeword_length field of each leaf node to the
+ * appropriate value.
+ * - It finds the maximum depth of the tree.
+ */
+static void
+traverse_tree(
+ WT_FREQTREE_NODE *node, uint16_t current_length, uint16_t *max_depth)
+{
+ /* Recursively traverse the tree */
+ if (node->left != NULL)
+ traverse_tree(node->left, current_length + 1, max_depth);
+ if (node->right != NULL)
+ traverse_tree(node->right, current_length + 1, max_depth);
+
+ /* If this is a leaf: */
+ if (node->left == NULL && node->right == NULL) {
+ /*
+ * Setting the leaf's codeword length (for inner nodes, it
+ * is always 0!)
+ */
+ node->codeword_length = current_length;
+
+ /*
+ * Store the new maximal depth: depth is counted in levels,
+ * one more than the leaf's codeword length.
+ */
+ if (*max_depth < current_length + 1)
+ *max_depth = current_length + 1;
+ }
+}
+
+/*
+ * fill_static_representation --
+ * Recursive function that converts the huffman tree from its dynamic
+ * representation to static tree representation, to a preallocated array.
+ *
+ * To know the required size of the array the traverse_tree function can be
+ * used, determining the maximum depth N. Then the required array size is 2^N.
+ */
+static void
+fill_static_representation(
+ WT_STATIC_HUFFMAN_NODE *target, WT_FREQTREE_NODE *node, int idx)
+{
+ WT_STATIC_HUFFMAN_NODE *current_target;
+
+ /* Copy this node into its array slot and mark the slot used. */
+ current_target = &target[idx];
+ current_target->symbol = node->symbol;
+ current_target->codeword_length = node->codeword_length;
+ current_target->valid = 1;
+
+ /* Children go to slots 2i+1 and 2i+2 (binary-heap layout). */
+ if (node->left != NULL)
+ fill_static_representation(target, node->left, idx * 2 + 1);
+ if (node->right != NULL)
+ fill_static_representation(target, node->right, idx * 2 + 2);
+}
+
+/*
+ * recursive_free_node --
+ * Recursively free the huffman frequency tree's nodes (post-order, so
+ * children are released before their parent). A NULL node is a no-op.
+ */
+static void
+recursive_free_node(ENV *env, WT_FREQTREE_NODE *node)
+{
+ if (node != NULL) {
+ recursive_free_node(env, node->left);
+ recursive_free_node(env, node->right);
+ __wt_free(env, node, sizeof(WT_FREQTREE_NODE));
+ }
+}
+
+/*
+ * __wt_huffman_open --
+ * Take a frequency table and return a pointer to a descriptor object.
+ *
+ * The frequency table must be the full range of valid values. For 1 byte
+ * tables there are 256 values in 8 bits. The highest rank is 255, and the
+ * lowest rank is 1 (0 means the byte never appears in the input), so 1 byte
+ * is needed to hold the rank and the input table must be 1 byte x 256 values.
+ *
+ * For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535.
+ * The table should be 2 bytes x 65536 values.
+ */
+int
+__wt_huffman_open(ENV *env,
+ uint8_t const *byte_frequency_array, u_int nbytes, void *retp)
+{
+ INDEXED_BYTE *indexed_freqs;
+ NODE_QUEUE *combined_nodes, *leaves;
+ WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+ WT_HUFFMAN_OBJ *huffman;
+ uint32_t w1, w2;
+ u_int i; /* BUG FIX: was uint16_t, an infinite loop for nbytes == 65536 */
+ int ret;
+
+ indexed_freqs = NULL;
+ combined_nodes = leaves = NULL;
+ node = node2 = tempnode = NULL;
+ ret = 0;
+
+ WT_RET(__wt_calloc(env, 1, sizeof(WT_HUFFMAN_OBJ), &huffman));
+ WT_ERR(__wt_calloc(env, nbytes, sizeof(INDEXED_BYTE), &indexed_freqs));
+ huffman->env = env;
+
+ /*
+ * The frequency array must be sorted to be able to use linear time
+ * construction algorithm.
+ */
+ for (i = 0; i < nbytes; ++i) {
+ indexed_freqs[i].frequency = byte_frequency_array[i];
+ indexed_freqs[i].symbol = (uint16_t)i;
+ }
+
+ qsort(indexed_freqs,
+ nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator);
+
+ /* We need two node queues to build the tree. */
+ WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &leaves));
+ WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &combined_nodes));
+
+ /* Adding the leaves to the queue */
+ for (i = 0; i < nbytes; ++i) {
+ /*
+ * We are leaving out symbols with a frequency of 0. This
+ * assumes these symbols will NEVER occur in the source stream,
+ * and the purpose is to reduce the huffman tree's size.
+ *
+ * NOTE: Even if this behavior is not desired, the frequencies
+ * should have a range between 1 - 255, otherwise the algorithm
+ * cannot produce well balanced tree; so this can be treated as
+ * an optional feature.
+ */
+ if (indexed_freqs[i].frequency > 0) {
+ WT_ERR(__wt_calloc(
+ env, 1, sizeof(WT_FREQTREE_NODE), &tempnode));
+ tempnode->symbol = indexed_freqs[i].symbol;
+ tempnode->weight = indexed_freqs[i].frequency;
+ WT_ERR(node_queue_enqueue(env, leaves, tempnode));
+ tempnode = NULL;
+ }
+ }
+
+ while (!node_queue_is_empty(leaves) ||
+ !node_queue_is_empty(combined_nodes)) {
+ /*
+ * We have to get the node with the smaller weight, examining
+ * both queues first element. We are collecting pairs of these
+ * items, by alternating between node and node2:
+ */
+ refnode = !node ? &node : &node2;
+
+ /*
+ * To decide which queue must be used, we get the weights of
+ * the first items from both:
+ */
+ w1 = node_queue_is_empty(leaves) ?
+ UINT32_MAX : leaves->first->node->weight;
+ w2 = node_queue_is_empty(combined_nodes) ?
+ UINT32_MAX : combined_nodes->first->node->weight;
+
+ /*
+ * Based on the two weights we finally can dequeue the smaller
+ * element and place it to the alternating target node pointer:
+ */
+ if (w1 < w2)
+ node_queue_dequeue(env, leaves, refnode);
+ else
+ node_queue_dequeue(env, combined_nodes, refnode);
+
+ /*
+ * In every second run, we have both node and node2 initialized.
+ */
+ if (node != NULL && node2 != NULL) {
+ WT_ERR(__wt_calloc(
+ env, 1, sizeof(WT_FREQTREE_NODE), &tempnode));
+
+ /* The new weight is the sum of the two weights. */
+ tempnode->weight = node->weight + node2->weight;
+ tempnode->left = node;
+ tempnode->right = node2;
+
+ /* Enqueue it to the combined nodes queue */
+ WT_ERR(
+ node_queue_enqueue(env, combined_nodes, tempnode));
+ tempnode = NULL;
+
+ /* Reset the state pointers */
+ node = node2 = NULL;
+ }
+ }
+
+ /*
+ * BUG FIX: an all-zero frequency table builds no tree at all; fail
+ * instead of dereferencing a NULL root below.
+ */
+ if (node == NULL) {
+ ret = WT_ERROR;
+ goto err;
+ }
+
+ /*
+ * The remaining node is in the node variable, this is the root of the
+ * tree. Calculate the number of bytes it takes to hold nbytes bits.
+ */
+ huffman->numSymbols = nbytes;
+ huffman->numBytes = nbytes > 256 ? 2 : 1;
+
+ /* Traverse the tree and set the code word length for each node. */
+ traverse_tree(node, 0, &huffman->max_depth);
+
+ /*
+ * Converting the tree to a static array representation.
+ * NOTE(review): 1 << max_depth assumes max_depth < 31 — a severely
+ * unbalanced tree would overflow; confirm frequencies are ranks.
+ */
+ WT_ERR(__wt_calloc(env, 1 << huffman->max_depth,
+ sizeof(WT_STATIC_HUFFMAN_NODE), &huffman->nodes));
+ fill_static_representation(huffman->nodes, node, 0);
+
+ *(void **)retp = huffman;
+
+err: if (leaves != NULL)
+ node_queue_close(env, leaves);
+ if (combined_nodes != NULL)
+ node_queue_close(env, combined_nodes);
+ if (indexed_freqs != NULL)
+ __wt_free(env, indexed_freqs, 0);
+ if (node != NULL)
+ recursive_free_node(env, node);
+ if (node2 != NULL)
+ recursive_free_node(env, node2);
+ if (tempnode != NULL)
+ __wt_free(env, tempnode, sizeof(WT_FREQTREE_NODE));
+ if (ret != 0) {
+ if (huffman->nodes != NULL)
+ __wt_free(env, huffman->nodes, 0);
+ __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ * Discard a Huffman descriptor object: frees the static node array and
+ * the descriptor itself (the ENV is not owned and is left alone).
+ */
+void
+__wt_huffman_close(ENV *env, void *huffman_arg)
+{
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ __wt_free(env, huffman->nodes, 0);
+ __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_print_huffman_code --
+ * Prints a symbol's huffman code. Can be used for debugging purposes.
+ */
+int
+__wt_print_huffman_code(ENV *env, void *huffman_arg, uint16_t symbol)
+{
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE *node;
+ u_int i, n;
+ int p;
+ char *buffer;
+
+ huffman = huffman_arg;
+
+ /* Check if the symbol is in valid range. */
+ if (symbol >= huffman->numSymbols) {
+ (void)printf("Symbol out of range: %lu >= %lu\n",
+ (u_long)symbol, (u_long)huffman->numSymbols);
+ return (0);
+ }
+
+ /*
+ * The deepest codeword is max_depth - 1 bits, so max_depth zeroed
+ * bytes leave room for the terminating nul.
+ */
+ WT_RET(__wt_calloc(env, huffman->max_depth, 1, &buffer));
+
+ /* Search the static tree for the symbol's leaf node. */
+ node = NULL;
+ for (i = 0, n = 1 << huffman->max_depth; i < n; ++i) {
+ node = &huffman->nodes[i];
+ if (node->valid &&
+ node->symbol == symbol && node->codeword_length > 0)
+ break;
+ }
+
+ /*
+ * BUG FIX: the original tested node != NULL, which is always true
+ * once the loop has run; test whether the search actually found a
+ * leaf. Also free the buffer on the error path (it leaked).
+ */
+ if (i == n) {
+ (void)printf(
+ "Symbol is not in the huffman tree: %x\n", symbol);
+ __wt_free(env, buffer, 0);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We've got the leaf node, at index 'i'. Now we fill the output
+ * buffer in back order; odd indexes are left children, i.e. bit 0.
+ */
+ for (p = node->codeword_length - 1; p >= 0; --p) {
+ buffer[p] = (i % 2) == 1 ? '0' : '1';
+ i = (i - 1) / 2;
+ }
+
+ (void)printf("%s\n", buffer);
+ __wt_free(env, buffer, 0);
+ return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ * Take a byte string, encode it into the target.
+ */
+int
+__wt_huffman_encode(void *huffman_arg,
+ uint8_t *from, uint32_t from_len,
+ void *top, uint32_t *to_len, uint32_t *out_bytes_used)
+{
+ ENV *env;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE *node;
+ uint32_t bitpos, i, n, j;
+ uint16_t symbol;
+ uint8_t padding_info, *to;
+ int p;
+
+ huffman = huffman_arg;
+ env = huffman->env;
+
+ /*
+ * We need N+1 bytes to encode N bytes, re-allocate as necessary.
+ *
+ * If the initial target pointer, or the initial target buffer length,
+ * aren't set, it's an allocation. Clear the initial target pointer,
+ * our caller may have only set the initial target buffer length, not
+ * the initial pointer value.
+ */
+ if (to_len == NULL || *to_len < from_len + 1) {
+ if (to_len == NULL)
+ *(void **)top = NULL;
+ WT_RET(__wt_realloc(env, to_len, from_len + 1, top));
+ }
+
+ to = *(uint8_t **)top;
+ memset(to, 0, from_len + 1);
+
+ /*
+ * Leave the first 3 bits of the encoded value empty, it holds the
+ * number of bits actually used in the last byte of the encoded value.
+ */
+ bitpos = 3;
+ n = 1 << huffman->max_depth;
+ for (i = 0; i < from_len; i += huffman->numBytes) {
+ /* Getting the next symbol, either 1 or 2 bytes */
+ if (huffman->numBytes == 1)
+ symbol = *from++;
+ else {
+ symbol = ((uint16_t)(*from++)) << 8;
+ symbol |= *from++;
+ }
+
+ /* Getting the symbol's huffman code from the table */
+ node = NULL;
+ for (j = 0; j < n; ++j) {
+ node = &huffman->nodes[j];
+ if (node->valid &&
+ node->symbol == symbol && node->codeword_length > 0)
+ break;
+ }
+
+ /*
+ * BUG FIX: the original tested node != NULL, which is always
+ * true once the loop has run, so the undefined-symbol error
+ * below was unreachable and such symbols silently corrupted
+ * the output; test whether the search actually found a leaf.
+ * Also report through env rather than a NULL handle.
+ */
+ if (j == n) {
+ __wt_api_env_errx(env,
+ "Huffman compression: there was a symbol in the "
+ "source originally declared with zero frequency; "
+ "undefined source symbol: %lu", (u_long)symbol);
+ return (WT_ERROR);
+ }
+
+ /*
+ * We've got the leaf node, at index 'j'. Now we fill the
+ * output buffer in back order; odd indexes are left children,
+ * i.e. bit 0.
+ */
+ for (p = node->codeword_length - 1; p >= 0; --p) {
+ MODIFY_BIT(to, bitpos + (u_int)p, (j % 2) ^ 1);
+ j = (j - 1) / 2;
+ }
+
+ bitpos += node->codeword_length;
+ }
+
+ /*
+ * At this point, bitpos is the total number of used bits (including
+ * the 3 bits at the beginning of the buffer, which we'll set now to
+ * the number of bits used in the last byte). Note if the number of
+ * bits used in the last byte is 8, we set the 3 bits to 0, in other
+ * words, the first 3 bits of the encoded value are the number of bits
+ * used in the last byte, unless they're 0, in which case there are 8
+ * bits used in the last byte.
+ */
+ padding_info = (bitpos % 8) << 5;
+ *to |= padding_info;
+
+ *out_bytes_used = bitpos / 8 + ((bitpos % 8) ? 1 : 0);
+
+ return (0);
+}
+
+/*
+ * __wt_huffman_decode --
+ * Take a byte string, decode it into the target.
+ */
+int
+__wt_huffman_decode(void *huffman_arg,
+ uint8_t *from, uint32_t from_len,
+ void *top, uint32_t *to_len, uint32_t *out_bytes_used)
+{
+ ENV *env;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_STATIC_HUFFMAN_NODE* node;
+ uint32_t bytes, i, from_len_bits, node_idx;
+ uint8_t bitpos, mask, bit, padding_info, *to;
+
+ huffman = huffman_arg;
+ env = huffman->env;
+
+ /*
+ * We need 2N+1 bytes to decode N bytes, re-allocate as necessary.
+ *
+ * If the initial target pointer, or the initial target buffer length,
+ * aren't set, it's an allocation. Clear the initial target pointer,
+ * our caller may have only set the initial target buffer length, not
+ * the initial pointer value.
+ */
+ if (to_len == NULL || *to_len < 2 * from_len + 1) {
+ if (to_len == NULL)
+ *(void **)top = NULL;
+ WT_RET(__wt_realloc(env, to_len, 2 * from_len + 1, top));
+ }
+
+ to = *(uint8_t **)top;
+
+ /*
+ * Bits are read via mask = 1 << bitpos, so within a byte we walk
+ * from bit 7 down to bit 0; starting at 4 skips the 3 header bits
+ * (bits 7, 6 and 5) of the first byte.
+ */
+ bitpos = 4; /* Skipping the first 3 bits. */
+ bytes = 0;
+ node_idx = 0;
+
+ /*
+ * The first 3 bits are the number of used bits in the last byte, unless
+ * they're 0, in which case there are 8 bits used in the last byte.
+ */
+ padding_info = (*from & 0xE0) >> 5;
+ from_len_bits = from_len * 8;
+ if (padding_info != 0)
+ from_len_bits -= 8 - padding_info;
+
+ /*
+ * The loop will go through each bit of the source stream, its length
+ * is given in BITS! (i starts at 3 because the 3 header bits are
+ * counted in from_len_bits.)
+ */
+ for (i = 3; i < from_len_bits; i++) {
+ /* Extracting the current bit */
+ mask = (uint8_t)(1 << bitpos);
+ bit = (*from & mask);
+
+ /*
+ * As we go through the bits, we also make steps in the huffman
+ * tree, originated from the root, toward the leaves.
+ */
+ if (bit)
+ node_idx = (node_idx * 2) + 2;
+ else
+ node_idx = (node_idx * 2) + 1;
+
+ node = &huffman->nodes[node_idx];
+
+ /* If this is a leaf, we've found a complete symbol. */
+ if (node->valid && node->codeword_length > 0) {
+ /* Emit the symbol, high byte first for 2-byte mode. */
+ if (huffman->numBytes == 1)
+ *to++ = (uint8_t)node->symbol;
+ else {
+ *to++ = (node->symbol & 0xFF00) >> 8;
+ *to++ = node->symbol & 0xFF;
+ }
+
+ bytes += huffman->numBytes;
+ node_idx = 0; /* restart at the root */
+ }
+
+ /* Moving forward one bit in the source stream. */
+ if (bitpos > 0)
+ bitpos--;
+ else {
+ bitpos = 7;
+ from++;
+ }
+ }
+
+ /* Return the number of bytes used. */
+ *out_bytes_used = bytes;
+
+ return (0);
+}
+
+/*
+ * node_queue_close --
+ * Delete a queue from memory.
+ *
+ * It does not delete the pointed huffman tree nodes! The queue pointer
+ * itself must be non-NULL (callers check before calling).
+ */
+static void
+node_queue_close(ENV *env, NODE_QUEUE *queue)
+{
+ NODE_QUEUE_ELEM *elem, *next_elem;
+
+ /* Freeing each element of the queue's linked list. */
+ for (elem = queue->first; elem != NULL; elem = next_elem) {
+ next_elem = elem->next;
+ __wt_free(env, elem, sizeof(NODE_QUEUE_ELEM));
+ }
+
+ /* Freeing the queue record itself. */
+ __wt_free(env, queue, sizeof(NODE_QUEUE));
+}
+
+/*
+ * node_queue_enqueue --
+ * Push a tree node to the end of the queue. Returns 0 on success or
+ * the allocation error.
+ */
+static int
+node_queue_enqueue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+ NODE_QUEUE_ELEM *elem;
+
+ /*
+ * Allocate the list element holding the tree node; calloc leaves the
+ * next pointer NULL, which is what a new tail needs.
+ */
+ WT_RET(__wt_calloc(env, 1, sizeof(NODE_QUEUE_ELEM), &elem));
+ elem->node = node;
+
+ /*
+ * Link it in: an empty queue (no tail) gets a new head, otherwise
+ * the current tail points at the new element; either way the new
+ * element becomes the tail.
+ */
+ if (queue->last == NULL)
+ queue->first = elem;
+ else
+ queue->last->next = elem;
+ queue->last = elem;
+
+ return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ * Removes a node from the beginning of the queue and copies the node's
+ * pointer to the location referred by the retp parameter.
+ *
+ * NOTE(review): assumes the queue is non-empty; callers gate on
+ * node_queue_is_empty before dequeuing.
+ */
+static void
+node_queue_dequeue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+ NODE_QUEUE_ELEM *first_elem;
+
+ /*
+ * Getting the first element of the queue and updating it to point to
+ * the next element as first.
+ */
+ first_elem = queue->first;
+ *retp = first_elem->node;
+ queue->first = first_elem->next;
+
+ /*
+ * If the last element was the dequeued element, we have to update it
+ * to NULL.
+ */
+ if (queue->last == first_elem)
+ queue->last = NULL;
+
+ /* Freeing the linked list element that has been dequeued */
+ __wt_free(env, first_elem, sizeof(NODE_QUEUE_ELEM));
+}
diff --git a/src/support/pow.c b/src/support/pow.c
new file mode 100644
index 00000000000..3a6b6b1d686
--- /dev/null
+++ b/src/support/pow.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_nlpo2 --
+ * Return the next-largest power-of-two for a 32-bit unsigned value.
+ *
+ * Classic bit trick: decrement, then OR the value with itself shifted
+ * right by every power of two up to half the word width, which smears
+ * the highest set bit into all lower positions; incrementing then
+ * carries up to the next power of two. A value that was already a
+ * power of two is returned unchanged (the decrement compensates), and
+ * the edge case v == 0 returns 0, which isn't a power of two.
+ * Devised by Sean Anderson, September 14, 2001; see
+ * http://graphics.stanford.edu/~seander/bithacks.html
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+ u_int shift;
+
+ v--;
+ for (shift = 1; shift < 32; shift <<= 1)
+ v |= v >> shift;
+ v++;
+ return (v);
+}
+
+/*
+ * __wt_ispo2 --
+ * Return 1 if a number is a power-of-two, else 0. A power of two has
+ * a single bit set, so clearing the lowest set bit (v & (v - 1))
+ * leaves zero; note zero itself also yields 1 here.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+ return ((v & (v - 1)) == 0);
+}
diff --git a/src/support/prime.c b/src/support/prime.c
new file mode 100644
index 00000000000..8abe43158b2
--- /dev/null
+++ b/src/support/prime.c
@@ -0,0 +1,75 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_prime --
+ * Return a prime number relatively close to a value: the prime paired
+ * with the first table value strictly greater than n. Values below 32
+ * return 37; values at or above the last entry return 1073741827.
+ */
+uint32_t
+__wt_prime(uint32_t n)
+{
+ /*
+ * Ref: the hash functions section of "Algorithms in C", by Sedgewick.
+ *
+ * The table is the same as the one in Berkeley DB -- check at each
+ * power-of-two up to 2^18, then mid-points between each power-of-two
+ * to a maximum of 2^30.
+ */
+ static const struct {
+ uint32_t value;
+ uint32_t prime;
+ } t[] = {
+ { 32, 37 }, /* 2^5 */
+ { 64, 67 }, /* 2^6 */
+ { 128, 131 }, /* 2^7 */
+ { 256, 257 }, /* 2^8 */
+ { 512, 521 }, /* 2^9 */
+ { 1024, 1031 }, /* 2^10 */
+ { 2048, 2053 }, /* 2^11 */
+ { 4096, 4099 }, /* 2^12 */
+ { 8192, 8191 }, /* 2^13 */
+ { 16384, 16381 }, /* 2^14 */
+ { 32768, 32771 }, /* 2^15 */
+ { 65536, 65537 }, /* 2^16 */
+ { 131072, 131071 }, /* 2^17 */
+ { 262144, 262147 }, /* 2^18 */
+ { 393216, 393209 }, /* 2^18 + 2^18/2 */
+ { 524288, 524287 }, /* 2^19 */
+ { 786432, 786431 }, /* 2^19 + 2^19/2 */
+ { 1048576, 1048573 }, /* 2^20 */
+ { 1572864, 1572869 }, /* 2^20 + 2^20/2 */
+ { 2097152, 2097169 }, /* 2^21 */
+ { 3145728, 3145721 }, /* 2^21 + 2^21/2 */
+ { 4194304, 4194301 }, /* 2^22 */
+ { 6291456, 6291449 }, /* 2^22 + 2^22/2 */
+ { 8388608, 8388617 }, /* 2^23 */
+ { 12582912, 12582917 }, /* 2^23 + 2^23/2 */
+ { 16777216, 16777213 }, /* 2^24 */
+ { 25165824, 25165813 }, /* 2^24 + 2^24/2 */
+ { 33554432, 33554393 }, /* 2^25 */
+ { 50331648, 50331653 }, /* 2^25 + 2^25/2 */
+ { 67108864, 67108859 }, /* 2^26 */
+ { 100663296, 100663291 }, /* 2^26 + 2^26/2 */
+ { 134217728, 134217757 }, /* 2^27 */
+ { 201326592, 201326611 }, /* 2^27 + 2^27/2 */
+ { 268435456, 268435459 }, /* 2^28 */
+ { 402653184, 402653189 }, /* 2^28 + 2^28/2 */
+ { 536870912, 536870909 }, /* 2^29 */
+ { 805306368, 805306357 }, /* 2^29 + 2^29/2 */
+ { 1073741824, 1073741827 }, /* 2^30 */
+ };
+ u_int i;
+
+ /* First entry whose value exceeds n wins; clamp at the table end. */
+ for (i = 0; i < WT_ELEMENTS(t); ++i)
+ if (t[i].value > n)
+ return (t[i].prime);
+ return (t[WT_ELEMENTS(t) - 1].prime);
+}
diff --git a/src/support/progress.c b/src/support/progress.c
new file mode 100644
index 00000000000..480699cbdd1
--- /dev/null
+++ b/src/support/progress.c
@@ -0,0 +1,17 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_progress --
+ *	Display a running progress counter: a carriage return (no newline)
+ *	rewrites the same terminal line each call, and the explicit flush
+ *	makes the update visible immediately.
+ */
+void
+__wt_progress(const char *s, uint64_t v)
+{
+	(void)fprintf(stdout, "\r\t%s: %llu", s, (unsigned long long)v);
+	(void)fflush(stdout);
+}
diff --git a/src/support/scratch.c b/src/support/scratch.c
new file mode 100644
index 00000000000..9b20ea963f3
--- /dev/null
+++ b/src/support/scratch.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_scr_alloc --
+ *	Scratch buffer allocation function.
+ *
+ *	Hand back the first free scratch buffer from the WT_TOC's array,
+ *	optionally grown to at least "size" bytes; if every buffer is in
+ *	use, grow the array by 10 slots and retry.  On success *dbtp
+ *	references a buffer flagged WT_SCRATCH_INUSE -- the caller must
+ *	return it with __wt_scr_release.
+ */
+int
+__wt_scr_alloc(WT_TOC *toc, uint32_t size, DBT **dbtp)
+{
+	DBT *scratch;
+	ENV *env;
+	uint32_t allocated;
+	u_int i;
+	int ret;
+
+	env = toc->env;
+
+	*dbtp = NULL;	/* Don't risk the caller not catching the error. */
+
+	/*
+	 * There's an array of scratch buffers in each WT_TOC that can be used
+	 * by any function.  We use DBTs for scratch buffers because we already
+	 * have to have functions that do variable-length allocation on DBTs.
+	 * Scratch buffers are allocated only by a single thread of control, so
+	 * no locking is necessary.
+	 */
+	for (i = 0,
+	    scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch)
+		if (!F_ISSET(scratch, WT_SCRATCH_INUSE)) {
+			*dbtp = scratch;
+			F_SET(scratch, WT_SCRATCH_INUSE);
+
+			/*
+			 * If the caller has a minimum size, grow the scratch
+			 * buffer as necessary.
+			 */
+			if (size != 0 && scratch->mem_size < size)
+				WT_RET(__wt_realloc(env,
+				    &scratch->mem_size, size, &scratch->data));
+			return (0);
+		}
+
+	/*
+	 * Resize the array, we need more scratch buffers.  The recursive call
+	 * below is guaranteed to find a free slot in the new, larger array.
+	 */
+	allocated = toc->scratch_alloc * sizeof(DBT);
+	WT_ERR(__wt_realloc(env, &allocated,
+	    (toc->scratch_alloc + 10) * sizeof(DBT), &toc->scratch));
+	toc->scratch_alloc += 10;
+	return (__wt_scr_alloc(toc, size, dbtp));
+
+	/* NOTE(review): ret is assigned inside the WT_ERR macro -- confirm. */
+err:	__wt_api_env_errx(env,
+	    "WT_TOC unable to allocate more scratch buffers");
+	return (ret);
+}
+
+/*
+ * __wt_scr_release --
+ *	Return a scratch buffer to the WT_TOC's free pool by clearing its
+ *	in-use flag, and NULL out the caller's reference so it can't be
+ *	used again by accident.
+ */
+void
+__wt_scr_release(DBT **dbt)
+{
+	DBT *buf = *dbt;
+
+	F_CLR(buf, WT_SCRATCH_INUSE);
+	*dbt = NULL;
+}
+
+/*
+ * __wt_scr_free --
+ *	Free all memory associated with the scratch buffers.
+ *
+ *	Releases each buffer's data allocation and then the array itself;
+ *	called when the WT_TOC is being discarded.  The allocation size is
+ *	passed back to __wt_free, matching this allocator's free protocol.
+ */
+void
+__wt_scr_free(WT_TOC *toc)
+{
+	DBT *scratch;
+	ENV *env;
+	u_int i;
+
+	env = toc->env;
+
+	for (i = 0,
+	    scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch)
+		if (scratch->data != NULL)
+			__wt_free(env, scratch->data, scratch->mem_size);
+
+	__wt_free(env, toc->scratch, toc->scratch_alloc * sizeof(DBT));
+}
diff --git a/src/support/serial.c b/src/support/serial.c
new file mode 100644
index 00000000000..9974f1f6b38
--- /dev/null
+++ b/src/support/serial.c
@@ -0,0 +1,123 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Serialization:
+ *
+ * Serialization support allows scheduling operations that require serialized
+ * access to a piece of data, where the data (1) is accessed only by serialized
+ * code, or where the data, when accessed by non-serialized code, can either
+ * (2) be read atomically, or (3) it doesn't matter if it's read incorrectly.
+ * In other words, the readers are key, and they are known to be indifferent
+ * to the serialization code modifying the data.
+ *
+ * An example of #1 is updating the size of a database file. The size is only
+ * changed in serialized code, and never read by anything else. An example of
+ * #2 is updating a 32-bit value, because readers by definition get consistent
+ * views of 32-bit memory locations. An example of #3 is updating a 64-bit
+ * value (such as the bytes allocated in the cache). While there is a small
+ * possibility a reader will see a corrupted value, the value is only used for
+ * advisory actions, such as waking the cache thread to see if there's work to
+ * do.
+ */
+
+/*
+ * __wt_toc_serialize_func --
+ *	Schedule a serialization request, and block or spin until it completes.
+ *
+ *	Returns the serialized function's result (toc->wq_ret, set by
+ *	__wt_toc_serialize_wrapup).
+ */
+int
+__wt_toc_serialize_func(
+    WT_TOC *toc, wq_state_t op, int spin, int (*func)(WT_TOC *), void *args)
+{
+	int done;
+
+	/*
+	 * Threads serializing access to data using a function:
+	 *	set a function/argument pair in the WT_TOC handle,
+	 *	flush memory,
+	 *	update the WT_TOC workq state, and
+	 *	spin or block.
+	 *
+	 * The workQ thread notices the state change and calls the serialization
+	 * function.
+	 *
+	 * The first memory flush ensures all supporting information is written
+	 * before the wq_state field (which makes the entry visible to the workQ
+	 * thread).  No second memory flush is required, the wq_state field is
+	 * declared volatile.
+	 */
+	toc->wq_args = args;
+	toc->wq_func = func;
+	toc->wq_sleeping = spin ? 0 : 1;
+	WT_MEMORY_FLUSH;
+	toc->wq_state = op;	/* Publication point: workQ may act from here. */
+
+	/*
+	 * Callers can spin on the WT_TOC state (implying the call is quickly
+	 * satisfied), or block until its mutex is unlocked by another thread
+	 * when the operation has completed.
+	 */
+	if (spin) {
+		/*
+		 * !!!
+		 * Don't do arithmetic comparisons (even equality) on enum's,
+		 * it makes some compilers/lint tools angry.
+		 */
+		for (done = 0; !done;) {
+			switch (toc->wq_state) {
+			case WT_WORKQ_NONE:	/* Request has completed. */
+				done = 1;
+				break;
+			case WT_WORKQ_FUNC:	/* Still pending; yield. */
+			case WT_WORKQ_READ:
+			case WT_WORKQ_READ_SCHED:
+				__wt_yield();
+				break;
+			}
+		}
+	} else
+		__wt_lock(toc->env, toc->mtx);	/* Sleep until unlocked. */
+
+	return (toc->wq_ret);
+}
+
+/*
+ * __wt_toc_serialize_wrapup --
+ *	Server function cleanup.
+ *
+ *	Called by the workQ when a serialized function finishes: record
+ *	the result, mark the request complete, and wake the requester if
+ *	it is blocked on its mutex.
+ */
+void
+__wt_toc_serialize_wrapup(WT_TOC *toc, WT_PAGE *page, int ret)
+{
+	ENV *env;
+
+	env = toc->env;
+
+	/*
+	 * If passed a page and the return value is good, we modified the page;
+	 * no need for a memory flush, we'll use the one below.
+	 */
+	if (page != NULL && ret == 0)
+		WT_PAGE_SET_MODIFIED(page);
+
+	/*
+	 * Set the return value and reset the state -- the workQ no longer needs
+	 * to worry about us.
+	 *
+	 * The return value isn't volatile, so requires an explicit flush.
+	 * Ordering matters: wq_ret must be visible before the requester can
+	 * observe WT_WORKQ_NONE and read it.
+	 */
+	toc->wq_ret = ret;
+	toc->wq_state = WT_WORKQ_NONE;
+	WT_MEMORY_FLUSH;
+
+	/* If the calling thread is sleeping, wake it up. */
+	if (toc->wq_sleeping)
+		__wt_unlock(env, toc->mtx);
+}
diff --git a/src/support/simple_setup.c b/src/support/simple_setup.c
new file mode 100644
index 00000000000..a4464fead69
--- /dev/null
+++ b/src/support/simple_setup.c
@@ -0,0 +1,94 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include <stdlib.h>
+
+#include "wiredtiger.h"
+
+extern const char *progname;
+
+static ENV *__env;
+
+/*
+ * wiredtiger_simple_setup --
+ *	Standard setup for simple applications.
+ *
+ *	Creates an ENV (cached in the file-static __env for the matching
+ *	teardown call), optionally sets the cache size, opens the
+ *	environment and returns an open DB handle through dbp.  Returns 0
+ *	(EXIT_SUCCESS) on success, a WiredTiger error code on failure.
+ *
+ *	NOTE(review): the progname parameter shadows the file-scope
+ *	extern "progname" declared above -- confirm that's intentional.
+ */
+int
+wiredtiger_simple_setup(
+    const char *progname, DB **dbp, u_int32_t cache_size, u_int32_t flags)
+{
+	DB *db;
+	ENV *env;
+	int ret;
+
+	db = *dbp = NULL;
+
+	if ((ret = wiredtiger_env_init(&env, flags)) != 0) {
+		fprintf(stderr,
+		    "%s: wiredtiger_env_init: %s\n",
+		    progname, wiredtiger_strerror(ret));
+		return (ret);
+	}
+	__env = env;	/* Remember the handle for teardown. */
+
+	/* A cache_size of 0 means "use the default". */
+	if (cache_size != 0 &&
+	    (ret = env->cache_size_set(env, cache_size)) != 0) {
+		env->err(env, ret, "Env.cache_size_set");
+		goto err;
+	}
+
+	if ((ret = env->open(env, NULL, 0, 0)) != 0) {
+		env->err(env, ret, "%s: Env.open", progname);
+		goto err;
+	}
+	if ((ret = env->db(env, 0, &db)) != 0) {
+		env->err(env, ret, "%s: Env.db", progname);
+		goto err;
+	}
+	if ((ret = db->errpfx_set(db, progname)) != 0) {
+		db->err(db, ret, "%s: Db.errpfx_set", progname);
+		goto err;
+	}
+
+	*dbp = db;
+	return (EXIT_SUCCESS);
+
+	/* Teardown closes both the DB handle (if created) and __env. */
+err:	wiredtiger_simple_teardown(progname, db);
+	return (ret);
+}
+
+/*
+ * wiredtiger_simple_teardown --
+ *	Standard teardown for simple applications.
+ *
+ *	Close the DB handle (if any), then the cached ENV handle; both
+ *	closes are always attempted and the first failure is returned.
+ *	Returns EXIT_SUCCESS or EXIT_FAILURE.
+ */
+int
+wiredtiger_simple_teardown(const char *progname, DB *db)
+{
+	int ret, tret;
+
+	ret = 0;
+	if (db != NULL && (tret = db->close(db, 0)) != 0) {
+		/*
+		 * Bug fix: report the close call's own error (tret), not
+		 * ret -- ret is still 0 here, so the message used to print
+		 * "Successful return" for a failing close.
+		 */
+		fprintf(stderr,
+		    "%s: Db.close: %s\n", progname, wiredtiger_strerror(tret));
+		if (ret == 0)
+			ret = tret;
+	}
+
+	if (__env != NULL) {
+		if ((tret = __env->close(__env, 0)) != 0) {
+			/* Same fix: tret, not ret. */
+			fprintf(stderr, "%s: Env.close: %s\n",
+			    progname, wiredtiger_strerror(tret));
+			if (ret == 0)
+				ret = tret;
+		}
+		__env = NULL;
+	}
+
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/src/support/stat.c b/src/support/stat.c
new file mode 100644
index 00000000000..bf08a95b12f
--- /dev/null
+++ b/src/support/stat.c
@@ -0,0 +1,370 @@
+/* DO NOT EDIT: automatically built by dist/stat.py. */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_stat_alloc_cache_stats --
+ *	Allocate and describe the cache statistics array.  Generated by
+ *	dist/stat.py -- regenerate rather than hand-editing.  The array has
+ *	one extra zero'd slot beyond the named entries; NOTE(review):
+ *	presumably a terminator for the stat-print code -- confirm.
+ */
+int
+__wt_stat_alloc_cache_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 10, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_CACHE_BYTES_INUSE].desc = "bytes in the cache";
+	stats[WT_STAT_CACHE_BYTES_MAX].desc =
+	    "maximum bytes configured for the cache";
+	stats[WT_STAT_CACHE_EVICT_HAZARD].desc =
+	    "pages selected for eviction not evicted because of a hazard reference";
+	stats[WT_STAT_CACHE_EVICT_MODIFIED].desc =
+	    "modified pages selected for eviction";
+	stats[WT_STAT_CACHE_EVICT_UNMODIFIED].desc =
+	    "unmodified pages selected for eviction";
+	stats[WT_STAT_CACHE_PAGES_INUSE].desc = "pages in the cache";
+	stats[WT_STAT_OVERFLOW_READ].desc =
+	    "overflow pages read from the file";
+	stats[WT_STAT_PAGE_READ].desc = "pages read from a file";
+	stats[WT_STAT_PAGE_WRITE].desc = "pages written to a file";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_cache_stats --
+ *	Zero the clearable cache counters.  Generated by dist/stat.py.
+ *	The bytes-in-use, bytes-max and pages-in-use entries are not
+ *	reset here.
+ */
+void
+__wt_stat_clear_cache_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_CACHE_EVICT_HAZARD].v = 0;
+	stats[WT_STAT_CACHE_EVICT_MODIFIED].v = 0;
+	stats[WT_STAT_CACHE_EVICT_UNMODIFIED].v = 0;
+	stats[WT_STAT_OVERFLOW_READ].v = 0;
+	stats[WT_STAT_PAGE_READ].v = 0;
+	stats[WT_STAT_PAGE_WRITE].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_database_stats --
+ *	Allocate and describe the per-database statistics array.
+ *	Generated by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_database_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 27, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_BASE_RECNO].desc = "base record number";
+	stats[WT_STAT_DUP_TREE].desc = "duplicate data off-page trees";
+	stats[WT_STAT_FIXED_LEN].desc = "database fixed-record size";
+	stats[WT_STAT_INTLMAX].desc = "maximum internal page size";
+	stats[WT_STAT_INTLMIN].desc = "minimum internal page size";
+	stats[WT_STAT_ITEM_COL_DELETED].desc =
+	    "column store deleted data items";
+	stats[WT_STAT_ITEM_DATA_OVFL].desc = "total overflow data items";
+	stats[WT_STAT_ITEM_DUP_DATA].desc = "total duplicate data items";
+	stats[WT_STAT_ITEM_KEY_OVFL].desc = "total overflow keys";
+	stats[WT_STAT_ITEM_TOTAL_DATA].desc = "total data items";
+	stats[WT_STAT_ITEM_TOTAL_KEY].desc = "total keys";
+	stats[WT_STAT_LEAFMAX].desc = "maximum leaf page size";
+	stats[WT_STAT_LEAFMIN].desc = "minimum leaf page size";
+	stats[WT_STAT_MAGIC].desc = "magic number";
+	stats[WT_STAT_MAJOR].desc = "major version number";
+	stats[WT_STAT_MINOR].desc = "minor version number";
+	stats[WT_STAT_PAGE_COL_FIX].desc =
+	    "column-store fixed-size leaf pages";
+	stats[WT_STAT_PAGE_COL_INTERNAL].desc = "column-store internal pages";
+	stats[WT_STAT_PAGE_COL_RLE].desc =
+	    "column-store repeat-count compressed fixed-size leaf pages";
+	stats[WT_STAT_PAGE_COL_VARIABLE].desc =
+	    "column-store variable-size leaf pages";
+	stats[WT_STAT_PAGE_DUP_INTERNAL].desc = "duplicate internal pages";
+	stats[WT_STAT_PAGE_DUP_LEAF].desc = "duplicate leaf pages";
+	stats[WT_STAT_PAGE_OVERFLOW].desc = "overflow pages";
+	stats[WT_STAT_PAGE_ROW_INTERNAL].desc = "row-store internal pages";
+	stats[WT_STAT_PAGE_ROW_LEAF].desc = "row-store leaf pages";
+	stats[WT_STAT_TREE_LEVEL].desc = "number of levels in the btree";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_database_stats --
+ *	Zero all per-database statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_database_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_BASE_RECNO].v = 0;
+	stats[WT_STAT_DUP_TREE].v = 0;
+	stats[WT_STAT_FIXED_LEN].v = 0;
+	stats[WT_STAT_INTLMAX].v = 0;
+	stats[WT_STAT_INTLMIN].v = 0;
+	stats[WT_STAT_ITEM_COL_DELETED].v = 0;
+	stats[WT_STAT_ITEM_DATA_OVFL].v = 0;
+	stats[WT_STAT_ITEM_DUP_DATA].v = 0;
+	stats[WT_STAT_ITEM_KEY_OVFL].v = 0;
+	stats[WT_STAT_ITEM_TOTAL_DATA].v = 0;
+	stats[WT_STAT_ITEM_TOTAL_KEY].v = 0;
+	stats[WT_STAT_LEAFMAX].v = 0;
+	stats[WT_STAT_LEAFMIN].v = 0;
+	stats[WT_STAT_MAGIC].v = 0;
+	stats[WT_STAT_MAJOR].v = 0;
+	stats[WT_STAT_MINOR].v = 0;
+	stats[WT_STAT_PAGE_COL_FIX].v = 0;
+	stats[WT_STAT_PAGE_COL_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_COL_RLE].v = 0;
+	stats[WT_STAT_PAGE_COL_VARIABLE].v = 0;
+	stats[WT_STAT_PAGE_DUP_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_DUP_LEAF].v = 0;
+	stats[WT_STAT_PAGE_OVERFLOW].v = 0;
+	stats[WT_STAT_PAGE_ROW_INTERNAL].v = 0;
+	stats[WT_STAT_PAGE_ROW_LEAF].v = 0;
+	stats[WT_STAT_TREE_LEVEL].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_db_stats --
+ *	Allocate and describe the DB-handle statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_db_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 11, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DB_ALLOC].desc = "database allocations";
+	stats[WT_STAT_DB_ALLOC_FILE].desc = "database extensions";
+	stats[WT_STAT_DB_FREE].desc = "database frees";
+	stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].desc =
+	    "duplicate key/data pairs inserted";
+	stats[WT_STAT_HUFFMAN_DATA].desc = "huffman data compression in bytes";
+	stats[WT_STAT_HUFFMAN_KEY].desc = "huffman key compression in bytes";
+	stats[WT_STAT_ITEMS_INSERTED].desc = "key/data pairs inserted";
+	stats[WT_STAT_OVERFLOW_DATA].desc = "overflow data items inserted";
+	stats[WT_STAT_OVERFLOW_KEY].desc = "overflow key items inserted";
+	stats[WT_STAT_REPEAT_COUNT].desc = "repeat value compression count";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_db_stats --
+ *	Zero the DB-handle statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_db_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DB_ALLOC].v = 0;
+	stats[WT_STAT_DB_ALLOC_FILE].v = 0;
+	stats[WT_STAT_DB_FREE].v = 0;
+	stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].v = 0;
+	stats[WT_STAT_HUFFMAN_DATA].v = 0;
+	stats[WT_STAT_HUFFMAN_KEY].v = 0;
+	stats[WT_STAT_ITEMS_INSERTED].v = 0;
+	stats[WT_STAT_OVERFLOW_DATA].v = 0;
+	stats[WT_STAT_OVERFLOW_KEY].v = 0;
+	stats[WT_STAT_REPEAT_COUNT].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_env_stats --
+ *	Allocate and describe the environment statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_env_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 9, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DATABASE_OPEN].desc = "database open";
+	stats[WT_STAT_MEMALLOC].desc = "memory allocations";
+	stats[WT_STAT_MEMFREE].desc = "memory frees";
+	stats[WT_STAT_MTX_LOCK].desc = "mutex lock calls";
+	stats[WT_STAT_TOTAL_READ_IO].desc = "total read I/Os";
+	stats[WT_STAT_TOTAL_WRITE_IO].desc = "total write I/Os";
+	stats[WT_STAT_WORKQ_PASSES].desc = "workQ queue passes";
+	stats[WT_STAT_WORKQ_YIELD].desc = "workQ yields";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_env_stats --
+ *	Zero the environment statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_env_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DATABASE_OPEN].v = 0;
+	stats[WT_STAT_MEMALLOC].v = 0;
+	stats[WT_STAT_MEMFREE].v = 0;
+	stats[WT_STAT_MTX_LOCK].v = 0;
+	stats[WT_STAT_TOTAL_READ_IO].v = 0;
+	stats[WT_STAT_TOTAL_WRITE_IO].v = 0;
+	stats[WT_STAT_WORKQ_PASSES].v = 0;
+	stats[WT_STAT_WORKQ_YIELD].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_fh_stats --
+ *	Allocate and describe the file-handle statistics array.  Generated
+ *	by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_fh_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 4, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_FSYNC].desc = "fsyncs";
+	stats[WT_STAT_READ_IO].desc = "read I/Os";
+	stats[WT_STAT_WRITE_IO].desc = "write I/Os";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_fh_stats --
+ *	Zero the file-handle statistics values.  Generated by dist/stat.py.
+ */
+void
+__wt_stat_clear_fh_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_FSYNC].v = 0;
+	stats[WT_STAT_READ_IO].v = 0;
+	stats[WT_STAT_WRITE_IO].v = 0;
+}
+
+/*
+ * __wt_stat_alloc_method_stats --
+ *	Allocate and describe the per-method call-count statistics array.
+ *	Generated by dist/stat.py -- regenerate rather than hand-editing.
+ */
+int
+__wt_stat_alloc_method_stats(ENV *env, WT_STATS **statsp)
+{
+	WT_STATS *stats;
+
+	WT_RET(__wt_calloc(env, 69, sizeof(WT_STATS), &stats));
+
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].desc =
+	    "db.btree_compare_dup_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].desc =
+	    "db.btree_compare_dup_set";
+	stats[WT_STAT_DB_BTREE_COMPARE_GET].desc = "db.btree_compare_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].desc =
+	    "db.btree_compare_int_get";
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].desc =
+	    "db.btree_compare_int_set";
+	stats[WT_STAT_DB_BTREE_COMPARE_SET].desc = "db.btree_compare_set";
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].desc =
+	    "db.btree_dup_offpage_get";
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].desc =
+	    "db.btree_dup_offpage_set";
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].desc = "db.btree_itemsize_get";
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].desc = "db.btree_itemsize_set";
+	stats[WT_STAT_DB_BTREE_PAGESIZE_GET].desc = "db.btree_pagesize_get";
+	stats[WT_STAT_DB_BTREE_PAGESIZE_SET].desc = "db.btree_pagesize_set";
+	stats[WT_STAT_DB_BULK_LOAD].desc = "db.bulk_load";
+	stats[WT_STAT_DB_CLOSE].desc = "db.close";
+	stats[WT_STAT_DB_COLUMN_SET].desc = "db.column_set";
+	stats[WT_STAT_DB_COL_DEL].desc = "db.col_del";
+	stats[WT_STAT_DB_COL_DEL_RESTART].desc = "db.col_del method restarts";
+	stats[WT_STAT_DB_COL_GET].desc = "db.col_get";
+	stats[WT_STAT_DB_COL_PUT].desc = "db.col_put";
+	stats[WT_STAT_DB_COL_PUT_RESTART].desc = "db.col_put method restarts";
+	stats[WT_STAT_DB_DUMP].desc = "db.dump";
+	stats[WT_STAT_DB_ERRCALL_GET].desc = "db.errcall_get";
+	stats[WT_STAT_DB_ERRCALL_SET].desc = "db.errcall_set";
+	stats[WT_STAT_DB_ERRFILE_GET].desc = "db.errfile_get";
+	stats[WT_STAT_DB_ERRFILE_SET].desc = "db.errfile_set";
+	stats[WT_STAT_DB_ERRPFX_GET].desc = "db.errpfx_get";
+	stats[WT_STAT_DB_ERRPFX_SET].desc = "db.errpfx_set";
+	stats[WT_STAT_DB_HUFFMAN_SET].desc = "db.huffman_set";
+	stats[WT_STAT_DB_OPEN].desc = "db.open";
+	stats[WT_STAT_DB_ROW_DEL].desc = "db.row_del";
+	stats[WT_STAT_DB_ROW_DEL_RESTART].desc = "db.row_del method restarts";
+	stats[WT_STAT_DB_ROW_GET].desc = "db.row_get";
+	stats[WT_STAT_DB_ROW_PUT].desc = "db.row_put";
+	stats[WT_STAT_DB_ROW_PUT_RESTART].desc = "db.row_put method restarts";
+	stats[WT_STAT_DB_STAT_CLEAR].desc = "db.stat_clear";
+	stats[WT_STAT_DB_STAT_PRINT].desc = "db.stat_print";
+	stats[WT_STAT_DB_SYNC].desc = "db.sync";
+	stats[WT_STAT_DB_VERIFY].desc = "db.verify";
+	stats[WT_STAT_ENV_CACHE_SIZE_GET].desc = "env.cache_size_get";
+	stats[WT_STAT_ENV_CACHE_SIZE_SET].desc = "env.cache_size_set";
+	stats[WT_STAT_ENV_CLOSE].desc = "env.close";
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].desc =
+	    "env.data_update_initial_get";
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].desc =
+	    "env.data_update_initial_set";
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].desc =
+	    "env.data_update_max_get";
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].desc =
+	    "env.data_update_max_set";
+	stats[WT_STAT_ENV_DB].desc = "env.db";
+	stats[WT_STAT_ENV_ERRCALL_GET].desc = "env.errcall_get";
+	stats[WT_STAT_ENV_ERRCALL_SET].desc = "env.errcall_set";
+	stats[WT_STAT_ENV_ERRFILE_GET].desc = "env.errfile_get";
+	stats[WT_STAT_ENV_ERRFILE_SET].desc = "env.errfile_set";
+	stats[WT_STAT_ENV_ERRPFX_GET].desc = "env.errpfx_get";
+	stats[WT_STAT_ENV_ERRPFX_SET].desc = "env.errpfx_set";
+	stats[WT_STAT_ENV_HAZARD_SIZE_GET].desc = "env.hazard_size_get";
+	stats[WT_STAT_ENV_HAZARD_SIZE_SET].desc = "env.hazard_size_set";
+	stats[WT_STAT_ENV_MSGCALL_GET].desc = "env.msgcall_get";
+	stats[WT_STAT_ENV_MSGCALL_SET].desc = "env.msgcall_set";
+	stats[WT_STAT_ENV_MSGFILE_GET].desc = "env.msgfile_get";
+	stats[WT_STAT_ENV_MSGFILE_SET].desc = "env.msgfile_set";
+	stats[WT_STAT_ENV_OPEN].desc = "env.open";
+	stats[WT_STAT_ENV_STAT_CLEAR].desc = "env.stat_clear";
+	stats[WT_STAT_ENV_STAT_PRINT].desc = "env.stat_print";
+	stats[WT_STAT_ENV_SYNC].desc = "env.sync";
+	stats[WT_STAT_ENV_TOC].desc = "env.toc";
+	stats[WT_STAT_ENV_TOC_SIZE_GET].desc = "env.toc_size_get";
+	stats[WT_STAT_ENV_TOC_SIZE_SET].desc = "env.toc_size_set";
+	stats[WT_STAT_ENV_VERBOSE_GET].desc = "env.verbose_get";
+	stats[WT_STAT_ENV_VERBOSE_SET].desc = "env.verbose_set";
+	stats[WT_STAT_WT_TOC_CLOSE].desc = "wt_toc.close";
+
+	*statsp = stats;
+	return (0);
+}
+
+/*
+ * __wt_stat_clear_method_stats --
+ *	Zero the per-method call-count statistics values.  Generated by
+ *	dist/stat.py -- regenerate rather than hand-editing.
+ */
+void
+__wt_stat_clear_method_stats(WT_STATS *stats)
+{
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_COMPARE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].v = 0;
+	stats[WT_STAT_DB_BTREE_PAGESIZE_GET].v = 0;
+	stats[WT_STAT_DB_BTREE_PAGESIZE_SET].v = 0;
+	stats[WT_STAT_DB_BULK_LOAD].v = 0;
+	stats[WT_STAT_DB_CLOSE].v = 0;
+	stats[WT_STAT_DB_COLUMN_SET].v = 0;
+	stats[WT_STAT_DB_COL_DEL].v = 0;
+	stats[WT_STAT_DB_COL_DEL_RESTART].v = 0;
+	stats[WT_STAT_DB_COL_GET].v = 0;
+	stats[WT_STAT_DB_COL_PUT].v = 0;
+	stats[WT_STAT_DB_COL_PUT_RESTART].v = 0;
+	stats[WT_STAT_DB_DUMP].v = 0;
+	stats[WT_STAT_DB_ERRCALL_GET].v = 0;
+	stats[WT_STAT_DB_ERRCALL_SET].v = 0;
+	stats[WT_STAT_DB_ERRFILE_GET].v = 0;
+	stats[WT_STAT_DB_ERRFILE_SET].v = 0;
+	stats[WT_STAT_DB_ERRPFX_GET].v = 0;
+	stats[WT_STAT_DB_ERRPFX_SET].v = 0;
+	stats[WT_STAT_DB_HUFFMAN_SET].v = 0;
+	stats[WT_STAT_DB_OPEN].v = 0;
+	stats[WT_STAT_DB_ROW_DEL].v = 0;
+	stats[WT_STAT_DB_ROW_DEL_RESTART].v = 0;
+	stats[WT_STAT_DB_ROW_GET].v = 0;
+	stats[WT_STAT_DB_ROW_PUT].v = 0;
+	stats[WT_STAT_DB_ROW_PUT_RESTART].v = 0;
+	stats[WT_STAT_DB_STAT_CLEAR].v = 0;
+	stats[WT_STAT_DB_STAT_PRINT].v = 0;
+	stats[WT_STAT_DB_SYNC].v = 0;
+	stats[WT_STAT_DB_VERIFY].v = 0;
+	stats[WT_STAT_ENV_CACHE_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_CACHE_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_CLOSE].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].v = 0;
+	stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].v = 0;
+	stats[WT_STAT_ENV_DB].v = 0;
+	stats[WT_STAT_ENV_ERRCALL_GET].v = 0;
+	stats[WT_STAT_ENV_ERRCALL_SET].v = 0;
+	stats[WT_STAT_ENV_ERRFILE_GET].v = 0;
+	stats[WT_STAT_ENV_ERRFILE_SET].v = 0;
+	stats[WT_STAT_ENV_ERRPFX_GET].v = 0;
+	stats[WT_STAT_ENV_ERRPFX_SET].v = 0;
+	stats[WT_STAT_ENV_HAZARD_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_HAZARD_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_MSGCALL_GET].v = 0;
+	stats[WT_STAT_ENV_MSGCALL_SET].v = 0;
+	stats[WT_STAT_ENV_MSGFILE_GET].v = 0;
+	stats[WT_STAT_ENV_MSGFILE_SET].v = 0;
+	stats[WT_STAT_ENV_OPEN].v = 0;
+	stats[WT_STAT_ENV_STAT_CLEAR].v = 0;
+	stats[WT_STAT_ENV_STAT_PRINT].v = 0;
+	stats[WT_STAT_ENV_SYNC].v = 0;
+	stats[WT_STAT_ENV_TOC].v = 0;
+	stats[WT_STAT_ENV_TOC_SIZE_GET].v = 0;
+	stats[WT_STAT_ENV_TOC_SIZE_SET].v = 0;
+	stats[WT_STAT_ENV_VERBOSE_GET].v = 0;
+	stats[WT_STAT_ENV_VERBOSE_SET].v = 0;
+	stats[WT_STAT_WT_TOC_CLOSE].v = 0;
+}
diff --git a/src/support/strerror.c b/src/support/strerror.c
new file mode 100644
index 00000000000..17a4653438a
--- /dev/null
+++ b/src/support/strerror.c
@@ -0,0 +1,41 @@
+/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ *	Return a string for any error value.
+ *
+ *	WiredTiger's own (negative) error codes get fixed strings;
+ *	positive values fall through to the system strerror.  Generated by
+ *	dist/api_err.py -- regenerate rather than hand-editing.
+ */
+char *
+wiredtiger_strerror(int error)
+{
+	static char errbuf[64];
+	char *p;
+
+	if (error == 0)
+		return ("Successful return: 0");
+
+	switch (error) {
+	case WT_ERROR:
+		return ("WT_ERROR: non-specific WiredTiger error");
+	case WT_NOTFOUND:
+		return ("WT_NOTFOUND: database item not found");
+	case WT_READONLY:
+		return ("WT_READONLY: modification attempted of a read-only database");
+	case WT_RESTART:
+		return ("WT_RESTART: restart the operation (internal)");
+	case WT_TOOSMALL:
+		return ("WT_TOOSMALL: buffer too small");
+	default:
+		if (error > 0 && (p = strerror(error)) != NULL)
+			return (p);
+		break;
+	}
+
+	/*
+	 * !!!
+	 * Not thread-safe (static buffer), but this is never supposed to
+	 * happen.
+	 */
+	(void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+	return (errbuf);
+}
diff --git a/src/support/version.c b/src/support/version.c
new file mode 100644
index 00000000000..dbd60162c16
--- /dev/null
+++ b/src/support/version.c
@@ -0,0 +1,26 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_version --
+ *	Return library version information: the version string, plus the
+ *	numeric components through the optional output parameters.
+ */
+char *
+wiredtiger_version(int *majorp, int *minorp, int *patchp)
+{
+	/* Each output pointer is optional; skip the NULLs. */
+	if (majorp)
+		*majorp = WIREDTIGER_VERSION_MAJOR;
+	if (minorp)
+		*minorp = WIREDTIGER_VERSION_MINOR;
+	if (patchp)
+		*patchp = WIREDTIGER_VERSION_PATCH;
+	return ((char *)WIREDTIGER_VERSION_STRING);
+}
diff --git a/src/utilities/db_dump/util_dump.c b/src/utilities/db_dump/util_dump.c
new file mode 100644
index 00000000000..68cc6d69061
--- /dev/null
+++ b/src/utilities/db_dump/util_dump.c
@@ -0,0 +1,83 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+/*
+ * main --
+ *	db_dump: dump a database to stdout (or the -f output file),
+ *	optionally in debug (-d) or printable (-p) format.
+ */
+int
+main(int argc, char *argv[])
+{
+	extern char *optarg;
+	extern int optind;
+	DB *db;
+	u_int32_t flags;
+	int ch, ret, tret;
+
+	WT_UTILITY_INTRO(progname, argv);
+
+	flags = 0;
+	/*
+	 * Bug fix: 'V' was handled in the switch below but missing from the
+	 * getopt option string, so -V always fell through to usage().
+	 */
+	while ((ch = getopt(argc, argv, "df:pV")) != EOF)
+		switch (ch) {
+		case 'd':
+			flags = WT_DEBUG;
+			break;
+		case 'f':			/* output file */
+			if (freopen(optarg, "w", stdout) == NULL) {
+				fprintf(stderr, "%s: %s: reopen: %s\n",
+				    progname, optarg, strerror(errno));
+				return (EXIT_FAILURE);
+			}
+			break;
+		case 'p':
+			flags = WT_PRINTABLES;
+			break;
+		case 'V':			/* version */
+			printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+			return (EXIT_SUCCESS);
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= optind;
+	argv += optind;
+
+	/* The remaining argument is the database name. */
+	if (argc != 1)
+		return (usage());
+
+	if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+		if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+			db->err(db, ret, "Db.open: %s", *argv);
+			goto err;
+		}
+		if ((ret = db->dump(db, stdout, NULL, flags)) != 0) {
+			db->err(db, ret, "Db.dump");
+			goto err;
+		}
+	}
+
+	if (0) {
+err:		ret = 1;
+	}
+	if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+		ret = tret;
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * usage --
+ *	Print a usage message for db_dump and return a failing exit status.
+ */
+int
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: %s [-dpV] [-f output-file] database\n", progname);
+	return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_load/util_load.c b/src/utilities/db_load/util_load.c
new file mode 100644
index 00000000000..6ededed7c28
--- /dev/null
+++ b/src/utilities/db_load/util_load.c
@@ -0,0 +1,292 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int bulk_callback(DB *, DBT **, DBT **);
+int bulk_read(DBT *dbt, int);
+int config_read(char **);
+int config_read_single(char *);
+int config_set(DB *);
+int usage(void);
+
+struct {
+ int pagesize_set;
+ u_long allocsize, intlmin, intlmax, leafmin, leafmax;
+} config;
+
+/*
+ * main --
+ *	db_load: bulk-load key/data lines from stdin (or the -f input
+ *	file) into a newly-created database.
+ */
+int
+main(int argc, char *argv[])
+{
+	extern char *optarg;
+	extern int optind;
+	DB *db;
+	int ch, ret, text_input, tret, verbose;
+	char **config_list, **config_next;
+
+	WT_UTILITY_INTRO(progname, argv);
+
+	/*
+	 * Bug fix: db was passed to wiredtiger_simple_teardown while still
+	 * uninitialized when config_read failed and jumped to err before
+	 * wiredtiger_simple_setup ever assigned it.  Start it out NULL.
+	 */
+	db = NULL;
+
+	/*
+	 * We can't handle configuration-line information until we've opened
+	 * the DB handle, so we need a place to store it for now.
+	 */
+	if ((config_next =
+	    config_list = calloc(argc + 1, sizeof(char *))) == NULL) {
+		fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+		return (EXIT_FAILURE);
+	}
+
+	text_input = verbose = 0;
+	while ((ch = getopt(argc, argv, "c:f:TVv")) != EOF)
+		switch (ch) {
+		case 'c':			/* command-line option */
+			*config_next++ = optarg;
+			break;
+		case 'f':			/* input file */
+			if (freopen(optarg, "r", stdin) == NULL) {
+				fprintf(stderr, "%s: %s: reopen: %s\n",
+				    progname, optarg, strerror(errno));
+				return (EXIT_FAILURE);
+			}
+			break;
+		case 'T':
+			text_input = 1;
+			break;
+		case 'V':			/* version */
+			printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+			return (EXIT_SUCCESS);
+		case 'v':
+			verbose = 1;
+			break;
+		case '?':
+		default:
+			return (usage());
+		}
+	argc -= optind;
+	argv += optind;
+
+	/* The remaining argument is the database name. */
+	if (argc != 1)
+		return (usage());
+
+	/*
+	 * Read through the command-line configuration options and convert
+	 * to the config structure.  The list (which holds pointers into
+	 * argv, not copies) is no longer needed afterward; free it.
+	 */
+	ret = config_read(config_list);
+	free(config_list);
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * Right now, we only support text input -- require the T option to
+	 * match Berkeley DB's API.
+	 */
+	if (text_input == 0) {
+		fprintf(stderr,
+		    "%s: the -T option is currently required\n", progname);
+		return (EXIT_FAILURE);
+	}
+
+	if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+		if (config_set(db) != 0)
+			goto err;
+
+		(void)remove(*argv);
+
+		if ((ret = db->open(db, *argv, 0600, WT_CREATE)) != 0) {
+			db->err(db, ret, "Db.open: %s", *argv);
+			goto err;
+		}
+
+		if ((ret = db->bulk_load(db, WT_DUPLICATES,
+		    verbose ? __wt_progress : NULL, bulk_callback)) != 0) {
+			db->err(db, ret, "Db.bulk_load");
+			goto err;
+		}
+		if (verbose)
+			printf("\n");
+	}
+
+	if (0) {
+err:		ret = 1;
+	}
+	if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+		ret = tret;
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * config_read --
+ *	Convert each command-line option in the NULL-terminated list into
+ *	the config structure; stop and return the first failure.
+ */
+int
+config_read(char **list)
+{
+	char **p;
+	int ret;
+
+	for (p = list; *p != NULL; ++p) {
+		ret = config_read_single(*p);
+		if (ret != 0)
+			return (ret);
+	}
+	return (0);
+}
+
+/*
+ * config_read_single --
+ *	Process a single command-line configuration option, converting it
+ *	into the config structure.
+ *
+ *	Options are keyword=value, where value is an unsigned decimal
+ *	number.  Returns 0 on success, non-zero on a malformed option or
+ *	unknown keyword.
+ */
+int
+config_read_single(char *opt)
+{
+	u_long v;
+	char *p, *ep;
+
+	/* Get pointers to the two parts of an X=Y format string. */
+	if ((p = strchr(opt, '=')) == NULL || p[1] == '\0')
+		goto format;
+	*p++ = '\0';
+
+	/*
+	 * Bug fix: clear errno before strtoul -- a leftover ERANGE from an
+	 * earlier call could otherwise trigger the error check -- and
+	 * reject trailing garbage after the number (e.g. "intlmin=10k",
+	 * previously accepted as 10).
+	 */
+	errno = 0;
+	v = strtoul(p, &ep, 10);
+	if (*ep != '\0' || (v == ULONG_MAX && errno == ERANGE)) {
+format:		fprintf(stderr,
+		    "%s: -c option %s is not correctly formatted\n",
+		    progname, opt);
+		return (1);
+	}
+	if (strcmp(opt, "allocsize") == 0) {
+		config.allocsize = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "intlmin") == 0) {
+		config.intlmin = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "intlmax") == 0) {
+		config.intlmax = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "leafmin") == 0) {
+		config.leafmin = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+	if (strcmp(opt, "leafmax") == 0) {
+		config.leafmax = v;
+		config.pagesize_set = 1;
+		return (0);
+	}
+
+	fprintf(stderr,
+	    "%s: -c option %s has an unknown keyword\n", progname, opt);
+	return (1);
+}
+
+/*
+ * config_set --
+ *	Set the command-line configuration options on the database handle.
+ *
+ *	Reads the current page-size settings, overlays any non-zero values
+ *	collected from -c options, and writes the result back.  A no-op
+ *	unless at least one page-size option was given.  Returns 0 on
+ *	success, 1 on failure.
+ */
+int
+config_set(DB *db)
+{
+	u_int32_t allocsize, intlmin, intlmax, leafmin, leafmax;
+	int ret;
+
+	if (config.pagesize_set) {
+		/* Start from the handle's current values. */
+		if ((ret = db->btree_pagesize_get(db,
+		    &allocsize, &intlmin, &intlmax, &leafmin, &leafmax)) != 0) {
+			db->err(db, ret, "Db.btree_pagesize_get");
+			return (1);
+		}
+		/* Zero means "not specified on the command line". */
+		if (config.allocsize != 0)
+			allocsize = config.allocsize;
+		if (config.intlmin != 0)
+			intlmin = config.intlmin;
+		if (config.intlmax != 0)
+			intlmax = config.intlmax;
+		if (config.leafmin != 0)
+			leafmin = config.leafmin;
+		if (config.leafmax != 0)
+			leafmax = config.leafmax;
+		if ((ret = db->btree_pagesize_set(db,
+		    allocsize, intlmin, intlmax, leafmin, leafmax)) != 0) {
+			db->err(db, ret, "Db.btree_pagesize_set");
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * bulk_read --
+ *	Read a newline-terminated record from stdin into a DBT, growing
+ *	the DBT's buffer as necessary.
+ *
+ *	Returns 0 on success, 1 on a clean EOF before any key bytes (end
+ *	of input), WT_ERROR on a mid-record EOF, or errno if allocation
+ *	fails.
+ */
+int
+bulk_read(DBT *dbt, int iskey)
+{
+	static u_int64_t line = 0;
+	size_t len;
+	void *tmp;
+	int ch;
+
+	++line;
+	for (len = 0;; ++len) {
+		if ((ch = getchar()) == EOF) {
+			if (iskey && len == 0)
+				return (1);
+			/*
+			 * Bug fix: cast for %llu -- u_int64_t need not be
+			 * unsigned long long on every platform.
+			 */
+			fprintf(stderr, "%s: corrupted input at line %llu\n",
+			    progname, (unsigned long long)line);
+			return (WT_ERROR);
+		}
+		if (ch == '\n')
+			break;
+		if (len >= dbt->mem_size) {
+			/*
+			 * Bug fix: don't overwrite dbt->data with realloc's
+			 * return value -- on failure that lost (leaked) the
+			 * original buffer.
+			 */
+			if ((tmp = realloc(dbt->data, len + 128)) == NULL)
+				return (errno);
+			dbt->data = tmp;
+			dbt->mem_size = len + 128;
+		}
+		((u_int8_t *)(dbt->data))[len] = ch;
+	}
+	dbt->size = len;
+	return (0);
+}
+
+/*
+ * bulk_callback --
+ *	Bulk-load callback function: read the next key/data pair from
+ *	stdin and hand the pairs back through keyp/datap.
+ *
+ *	The DBTs are function-static so their buffers persist between
+ *	calls (and are reused by each call); the caller must consume the
+ *	pair before the next invocation.
+ */
+int
+bulk_callback(DB *db, DBT **keyp, DBT **datap)
+{
+	static DBT key, data;
+	int ret;
+
+	WT_CC_QUIET(db, NULL);		/* db is unused; quiet the compiler. */
+
+	/* A non-zero return here includes the normal end-of-input case. */
+	if ((ret = bulk_read(&key, 1)) != 0)
+		return (ret);
+	if ((ret = bulk_read(&data, 0)) != 0)
+		return (ret);
+
+	*keyp = &key;
+	*datap = &data;
+	return (0);
+}
+
+/*
+ * usage --
+ *	Print a usage message for db_load and return a failing exit status.
+ */
+int
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: %s [-TVv] [-c configuration] [-f input-file] database\n",
+	    progname);
+	return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_stat/util_stat.c b/src/utilities/db_stat/util_stat.c
new file mode 100644
index 00000000000..afb2f94cba8
--- /dev/null
+++ b/src/utilities/db_stat/util_stat.c
@@ -0,0 +1,67 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ extern char *optarg;
+ extern int optind;
+ DB *db;
+ int ch, ret, tret;
+
+ WT_UTILITY_INTRO(progname, argv);
+
+ while ((ch = getopt(argc, argv, "V")) != EOF)
+ switch (ch) {
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /* The remaining argument is the database name. */
+ if (argc != 1)
+ return (usage());
+
+ if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+ if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+ db->err(db, ret, "Db.open: %s", *argv);
+ goto err;
+ }
+ if ((ret = db->stat_print(db, stdout, 0)) != 0) {
+ db->err(db, ret, "Db.stat: %s", *argv);
+ goto err;
+ }
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+ ret = tret;
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+int
+usage()
+{
+ (void)fprintf(stderr, "usage: %s [-V] database\n", progname);
+ return (EXIT_FAILURE);
+}
diff --git a/src/utilities/db_verify/util_verify.c b/src/utilities/db_verify/util_verify.c
new file mode 100644
index 00000000000..5c5bd02407f
--- /dev/null
+++ b/src/utilities/db_verify/util_verify.c
@@ -0,0 +1,74 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2009 WiredTiger Software.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+#include "util.h"
+
+const char *progname;
+
+int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ extern char *optarg;
+ extern int optind;
+ DB *db;
+ int ch, ret, tret, verbose;
+
+ WT_UTILITY_INTRO(progname, argv);
+
+ verbose = 0;
+ while ((ch = getopt(argc, argv, "Vv")) != EOF)
+ switch (ch) {
+ case 'v': /* verbose */
+ verbose = 1;
+ break;
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= optind;
+ argv += optind;
+
+ /* The remaining argument is the database name. */
+ if (argc != 1)
+ return (usage());
+
+ if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) {
+ if ((ret = db->open(db, *argv, 0, 0)) != 0) {
+ db->err(db, ret, "Db.open: %s", *argv);
+ goto err;
+ }
+ if ((ret =
+ db->verify(db, verbose ? __wt_progress : NULL, 0)) != 0) {
+ db->err(db, ret, "Db.verify: %s", *argv);
+ goto err;
+ }
+ if (verbose)
+ printf("\n");
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0)
+ ret = tret;
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+int
+usage()
+{
+ (void)fprintf(stderr, "usage: %s [-Vv] database\n", progname);
+ return (EXIT_FAILURE);
+}