diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2011-02-01 09:24:17 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2011-02-01 09:24:17 +1100 |
commit | 7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63 (patch) | |
tree | 0e0fd0f6b190dbcd283ca3c4040b5dcd89a94014 /src | |
parent | 6f87637341366fb90f890a5ef860e90c57b36d1f (diff) | |
download | mongo-7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63.tar.gz |
Restructure the tree to ease merge.
refs #27
--HG--
branch : keith
rename : lint/fl => dist/lint/fl
rename : lint/lint.current => dist/lint/lint.current
rename : inc_posix/bitstring.h => include/bitstring.h
rename : inc_posix/btree.h => include/btree.h
rename : inc_posix/cache.h => include/cache.h
rename : inc_posix/debug.h => include/debug.h
rename : inc_posix/extern.h => include/extern.h
rename : inc_posix/fh.h => include/fh.h
rename : inc_posix/mem.h => include/mem.h
rename : inc_posix/misc.h => include/misc.h
rename : inc_posix/mutex.h => include/mutex.h
rename : inc_posix/queue.h => include/queue.h
rename : inc_posix/serial.h => include/serial.h
rename : inc_posix/stat.h => include/stat.h
rename : inc_posix/util.h => include/util.h
rename : inc_posix/walk.h => include/walk.h
rename : inc_posix/wiredtiger.in => include/wiredtiger.in
rename : inc_posix/wt_internal.in => include/wt_internal.in
rename : btree/bt_alloc.c => src/btree/bt_alloc.c
rename : btree/bt_bulk.c => src/btree/bt_bulk.c
rename : btree/bt_cache.c => src/btree/bt_cache.c
rename : btree/bt_close.c => src/btree/bt_close.c
rename : btree/bt_cmp.c => src/btree/bt_cmp.c
rename : btree/bt_debug.c => src/btree/bt_debug.c
rename : btree/bt_desc.c => src/btree/bt_desc.c
rename : btree/bt_discard.c => src/btree/bt_discard.c
rename : btree/bt_dump.c => src/btree/bt_dump.c
rename : btree/bt_evict.c => src/btree/bt_evict.c
rename : btree/bt_misc.c => src/btree/bt_misc.c
rename : btree/bt_open.c => src/btree/bt_open.c
rename : btree/bt_ovfl.c => src/btree/bt_ovfl.c
rename : btree/bt_page.c => src/btree/bt_page.c
rename : btree/bt_read.c => src/btree/bt_read.c
rename : btree/bt_reconcile.c => src/btree/bt_reconcile.c
rename : btree/bt_ret.c => src/btree/bt_ret.c
rename : btree/bt_rw.c => src/btree/bt_rw.c
rename : btree/bt_stat.c => src/btree/bt_stat.c
rename : btree/bt_sync.c => src/btree/bt_sync.c
rename : btree/bt_vrfy.c => src/btree/bt_vrfy.c
rename : btree/bt_walk.c => src/btree/bt_walk.c
rename : btree/c_drain.c => src/btree/c_drain.c
rename : btree/c_init.c => src/btree/c_init.c
rename : btree/c_page.c => src/btree/c_page.c
rename : btree/c_read.c => src/btree/c_read.c
rename : btree/col_get.c => src/btree/col_get.c
rename : btree/col_put.c => src/btree/col_put.c
rename : btree/col_srch.c => src/btree/col_srch.c
rename : btree/row_get.c => src/btree/row_get.c
rename : btree/row_put.c => src/btree/row_put.c
rename : btree/row_srch.c => src/btree/row_srch.c
rename : db/db_err.c => src/db/db_err.c
rename : db/db_getset.c => src/db/db_getset.c
rename : db/db_handle.c => src/db/db_handle.c
rename : db/db_huffman.c => src/db/db_huffman.c
rename : db/db_open.c => src/db/db_open.c
rename : db/db_stat.c => src/db/db_stat.c
rename : db/db_sync.c => src/db/db_sync.c
rename : env/env_err.c => src/env/env_err.c
rename : env/env_getset.c => src/env/env_getset.c
rename : env/env_global.c => src/env/env_global.c
rename : env/env_handle.c => src/env/env_handle.c
rename : env/env_init.c => src/env/env_init.c
rename : env/env_msg.c => src/env/env_msg.c
rename : env/env_open.c => src/env/env_open.c
rename : env/env_stat.c => src/env/env_stat.c
rename : env/env_sync.c => src/env/env_sync.c
rename : env/env_toc.c => src/env/env_toc.c
rename : env/env_workq.c => src/env/env_workq.c
rename : os_posix/os_abort.c => src/os_posix/os_abort.c
rename : os_posix/os_alloc.c => src/os_posix/os_alloc.c
rename : os_posix/os_filesize.c => src/os_posix/os_filesize.c
rename : os_posix/os_fsync.c => src/os_posix/os_fsync.c
rename : os_posix/os_mtx.c => src/os_posix/os_mtx.c
rename : os_posix/os_open.c => src/os_posix/os_open.c
rename : os_posix/os_rw.c => src/os_posix/os_rw.c
rename : os_posix/os_sleep.c => src/os_posix/os_sleep.c
rename : os_posix/os_thread.c => src/os_posix/os_thread.c
rename : os_posix/os_yield.c => src/os_posix/os_yield.c
rename : support/api.c => src/support/api.c
rename : support/cksum.c => src/support/cksum.c
rename : support/err.c => src/support/err.c
rename : support/hazard.c => src/support/hazard.c
rename : support/huffman.c => src/support/huffman.c
rename : support/pow.c => src/support/pow.c
rename : support/prime.c => src/support/prime.c
rename : support/progress.c => src/support/progress.c
rename : support/scratch.c => src/support/scratch.c
rename : support/serial.c => src/support/serial.c
rename : support/simple_setup.c => src/support/simple_setup.c
rename : support/stat.c => src/support/stat.c
rename : support/strerror.c => src/support/strerror.c
rename : support/version.c => src/support/version.c
rename : utilities/db_dump/util_dump.c => src/utilities/db_dump/util_dump.c
rename : utilities/db_load/util_load.c => src/utilities/db_load/util_load.c
rename : utilities/db_stat/util_stat.c => src/utilities/db_stat/util_stat.c
rename : utilities/db_verify/util_verify.c => src/utilities/db_verify/util_verify.c
Diffstat (limited to 'src')
78 files changed, 18464 insertions, 0 deletions
diff --git a/src/btree/bt_alloc.c b/src/btree/bt_alloc.c new file mode 100644 index 00000000000..4477ce4e0f9 --- /dev/null +++ b/src/btree/bt_alloc.c @@ -0,0 +1,106 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static void __wt_file_extend(WT_TOC *, uint32_t *, uint32_t); + +#ifdef HAVE_DIAGNOSTIC +static int __wt_file_free_write(WT_TOC *, uint32_t, uint32_t); +#endif + +/* + * __wt_file_alloc -- + * Alloc a chunk of space from the underlying file. + */ +int +__wt_file_alloc(WT_TOC *toc, uint32_t *addrp, uint32_t size) +{ + IDB *idb; + + idb = toc->db->idb; + + __wt_file_extend(toc, addrp, size); + + WT_STAT_INCR(idb->stats, DB_ALLOC); + + return (0); +} + +/* + * __wt_file_extend -- + * Extend the file to allocate space. + */ +static void +__wt_file_extend(WT_TOC *toc, uint32_t *addrp, uint32_t size) +{ + DB *db; + IDB *idb; + WT_FH *fh; + + db = toc->db; + idb = db->idb; + fh = idb->fh; + + /* Extend the file. */ + *addrp = WT_OFF_TO_ADDR(db, fh->file_size); + fh->file_size += size; + + WT_STAT_INCR(idb->stats, DB_ALLOC_FILE); +} + +/* + * __wt_file_free -- + * Free a chunk of space to the underlying file. + */ +int +__wt_file_free(WT_TOC *toc, uint32_t addr, uint32_t size) +{ + WT_STATS *stats; + + stats = toc->db->idb->stats; + +#ifdef HAVE_DIAGNOSTIC + WT_RET(__wt_file_free_write(toc, addr, size)); +#endif + + WT_STAT_INCR(stats, DB_FREE); + + return (0); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_file_free_write -- + * Overwrite the space in the file so future reads don't get fooled. + * DIAGNOSTIC only. 
+ */ +static int +__wt_file_free_write(WT_TOC *toc, uint32_t addr, uint32_t size) +{ + DBT *tmp; + WT_PAGE_DISK *dsk; + uint32_t allocsize; + int ret; + + allocsize = toc->db->allocsize; + ret = 0; + + WT_RET(__wt_scr_alloc(toc, allocsize, &tmp)); + memset(tmp->data, 0, allocsize); + + dsk = tmp->data; + dsk->type = WT_PAGE_FREE; + for (; size >= allocsize; size -= allocsize) + WT_ERR(__wt_page_disk_write(toc, dsk, addr++, allocsize)); + +err: __wt_scr_release(&tmp); + return (ret); +} +#endif diff --git a/src/btree/bt_bulk.c b/src/btree/bt_bulk.c new file mode 100644 index 00000000000..f88c0d5e8ae --- /dev/null +++ b/src/btree/bt_bulk.c @@ -0,0 +1,1467 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * WT_STACK -- + * We maintain a stack of parent pages as we build the tree, encapsulated + * in this structure. + */ +typedef struct { + WT_PAGE *page; /* page header */ + uint8_t *first_free; /* page's first free byte */ + uint32_t space_avail; /* page's space available */ + + DBT *tmp; /* page-in-a-buffer */ + void *data; /* last on-page WT_COL/WT_ROW */ +} WT_STACK_ELEM; +typedef struct { + WT_STACK_ELEM *elem; /* stack */ + u_int size; /* stack size */ +} WT_STACK; + +static int __wt_bulk_dbt_copy(ENV *, DBT *, DBT *); +static int __wt_bulk_dup_offpage(WT_TOC *, DBT **, DBT **, DBT *, WT_ITEM *, + uint32_t, uint32_t, WT_OFF *, int (*)(DB *, DBT **, DBT **)); +static int __wt_bulk_fix(WT_TOC *, void (*)(const char *, + uint64_t), int (*)(DB *, DBT **, DBT **)); +static int __wt_bulk_ovfl_copy(WT_TOC *, WT_OVFL *, WT_OVFL *); +static int __wt_bulk_ovfl_write(WT_TOC *, DBT *, WT_OVFL *); +static int __wt_bulk_promote( + WT_TOC *, WT_PAGE *, uint64_t, WT_STACK *, u_int, uint32_t *); +static int __wt_bulk_scratch_page( + WT_TOC *, uint32_t, uint32_t, uint32_t, WT_PAGE **, DBT **); +static int __wt_bulk_stack_put(WT_TOC *, 
WT_STACK *); +static int __wt_bulk_var(WT_TOC *, uint32_t, void (*)(const char *, + uint64_t), int (*)(DB *, DBT **, DBT **)); +static int __wt_item_build_key(WT_TOC *, DBT *, WT_ITEM *, WT_OVFL *); + +/* + * __wt_db_bulk_load -- + * Db.bulk_load method. + */ +int +__wt_db_bulk_load(WT_TOC *toc, uint32_t flags, + void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **)) +{ + DB *db; + IDB *idb; + uint32_t addr; + + db = toc->db; + idb = db->idb; + + /* + * XXX + * Write out the description record -- this goes away when we figure + * out how the table schema is going to work, but for now, we use the + * first sector, and this file extend makes sure we don't allocate it + * as a table page. + */ + WT_RET(__wt_file_alloc(toc, &addr, 512)); + + if (F_ISSET(idb, WT_COLUMN)) + WT_DB_FCHK(db, "DB.bulk_load", flags, 0); + + /* + * There are two styles of bulk-load: variable length pages or + * fixed-length pages. + */ + if (F_ISSET(idb, WT_COLUMN) && db->fixed_len != 0) + WT_RET(__wt_bulk_fix(toc, f, cb)); + else + WT_RET(__wt_bulk_var(toc, flags, f, cb)); + + /* Get a permanent root page reference. */ + return (__wt_root_pin(toc)); +} + +/* + * __wt_bulk_fix + * Db.bulk_load method for column-store, fixed-length database pages. + */ +static int +__wt_bulk_fix(WT_TOC *toc, + void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **)) +{ + DB *db; + DBT *key, *data, *tmp; + IDB *idb; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_STACK stack; + uint64_t insert_cnt; + uint32_t len, space_avail; + uint16_t *last_repeat; + uint8_t *first_free, *last_data; + int rle, ret; + + db = toc->db; + tmp = NULL; + idb = db->idb; + insert_cnt = 0; + WT_CLEAR(stack); + + rle = F_ISSET(idb, WT_RLE) ? 1 : 0; + + /* Figure out how large is the chunk we're storing on the page. */ + len = db->fixed_len; + if (rle) + len += sizeof(uint16_t); + + /* Get a scratch buffer and make it look like our work page. */ + WT_ERR(__wt_bulk_scratch_page(toc, db->leafmin, + rle ? 
WT_PAGE_COL_RLE : WT_PAGE_COL_FIX, WT_LLEAF, &page, &tmp)); + dsk = page->dsk; + dsk->start_recno = 1; + __wt_set_ff_and_sa_from_offset( + page, WT_PAGE_BYTE(page), &first_free, &space_avail); + + while ((ret = cb(db, &key, &data)) == 0) { + if (key != NULL) { + __wt_api_db_errx(db, + "column database keys are implied and so should " + "not be set by the bulk load input routine"); + ret = WT_ERROR; + goto err; + } + if (data->size != db->fixed_len) + WT_ERR(__wt_database_wrong_fixed_size(toc, data->size)); + + /* + * We use the high bit of the data field as a "deleted" value, + * make sure the user's data doesn't set it. + */ + if (WT_FIX_DELETE_ISSET(data->data)) { + __wt_api_db_errx(db, + "the first bit may not be stored in fixed-length " + "column-store database items"); + ret = WT_ERROR; + goto err; + } + + /* Report on progress every 100 inserts. */ + if (f != NULL && ++insert_cnt % 100 == 0) + f(toc->name, insert_cnt); + WT_STAT_INCR(idb->stats, ITEMS_INSERTED); + + /* + * If doing run-length encoding, check to see if this record + * matches the last data inserted. If there's a match try + * and increment that item's repeat count instead of entering + * new data. + */ + if (rle && dsk->u.entries != 0) + if (*last_repeat < UINT16_MAX && + memcmp(last_data, data->data, data->size) == 0) { + ++*last_repeat; + ++page->records; + WT_STAT_INCR(idb->stats, REPEAT_COUNT); + continue; + } + + /* + * We now have the data item to store on the page. If there + * is insufficient space on the current page, allocate a new + * one. + */ + if (len > space_avail) { + /* + * We've finished with the page: promote its first key + * to its parent and discard it, then switch to the new + * page. 
+ */ + WT_ERR(__wt_bulk_promote( + toc, page, page->records, &stack, 0, NULL)); + WT_ERR(__wt_page_write(toc, page)); + dsk->u.entries = 0; + page->records = 0; + dsk->start_recno = insert_cnt; + WT_ERR( + __wt_file_alloc(toc, &page->addr, db->leafmin)); + __wt_set_ff_and_sa_from_offset(page, + WT_PAGE_BYTE(page), &first_free, &space_avail); + } + + ++dsk->u.entries; + ++page->records; + + /* + * Copy the data item onto the page -- if doing run-length + * encoding, track the location of the item for comparison. + */ + if (rle) { + last_repeat = (uint16_t *)first_free; + *last_repeat = 1; + first_free += sizeof(uint16_t); + space_avail -= sizeof(uint16_t); + last_data = first_free; + } + memcpy(first_free, data->data, data->size); + first_free += data->size; + space_avail -= data->size; + } + + /* A ret of 1 just means we've reached the end of the input. */ + if (ret != 1) + goto err; + ret = 0; + + /* Promote a key from any partially-filled page and write it. */ + if (dsk->u.entries != 0) { + ret = __wt_bulk_promote( + toc, page, page->records, &stack, 0, NULL); + WT_ERR(__wt_page_write(toc, page)); + } + + /* Wrap up reporting. */ + if (f != NULL) + f(toc->name, insert_cnt); + +err: WT_TRET(__wt_bulk_stack_put(toc, &stack)); + if (tmp != NULL) + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_bulk_var -- + * Db.bulk_load method for row or column-store variable-length database + * pages. 
+ */ +static int +__wt_bulk_var(WT_TOC *toc, uint32_t flags, + void (*f)(const char *, uint64_t), int (*cb)(DB *, DBT **, DBT **)) +{ + DB *db; + DBT *key, *data, key_copy, data_copy; + DBT *lastkey, *lastkey_copy, lastkey_std; + DBT *tmp1, *tmp2; + ENV *env; + IDB *idb; + WT_ITEM key_item, data_item, *dup_key, *dup_data; + WT_OFF off; + WT_OVFL key_ovfl, data_ovfl; + WT_PAGE *page, *next; + WT_STACK stack; + uint64_t insert_cnt; + uint32_t dup_count, dup_space, len, next_space_avail, space_avail; + uint8_t *first_free, *next_first_free, *p, type; + int ret; + + db = toc->db; + tmp1 = tmp2 = NULL; + env = toc->env; + idb = db->idb; + ret = 0; + + WT_CLEAR(stack); + dup_space = dup_count = 0; + insert_cnt = 0; + type = F_ISSET(idb, WT_COLUMN) ? WT_PAGE_COL_VAR : WT_PAGE_ROW_LEAF; + + lastkey = &lastkey_std; + WT_CLEAR(data_copy); + WT_CLEAR(key_copy); + WT_CLEAR(key_item); + WT_CLEAR(lastkey_std); + WT_ERR(__wt_scr_alloc(toc, 0, &lastkey_copy)); + + /* Get a scratch buffer and make it look like our work page. */ + WT_ERR(__wt_bulk_scratch_page( + toc, db->leafmin, type, WT_LLEAF, &page, &tmp1)); + __wt_set_ff_and_sa_from_offset( + page, WT_PAGE_BYTE(page), &first_free, &space_avail); + if (type == WT_PAGE_COL_VAR) + page->dsk->start_recno = 1; + + while ((ret = cb(db, &key, &data)) == 0) { + if (F_ISSET(idb, WT_COLUMN) ) { + if (key != NULL) { + __wt_api_db_errx(db, + "column database keys are implied and " + "so should not be returned by the bulk " + "load input routine"); + ret = WT_ERROR; + goto err; + } + } else { + if (key == NULL && !LF_ISSET(WT_DUPLICATES)) { + __wt_api_db_errx(db, + "keys must be specified unless duplicates " + "are configured"); + ret = WT_ERROR; + goto err; + } + if (key != NULL && key->size == 0) { + __wt_api_db_errx(db, + "zero-length keys are not supported"); + ret = WT_ERROR; + goto err; + } + } + + /* Report on progress every 100 inserts. 
*/ + if (f != NULL && ++insert_cnt % 100 == 0) + f(toc->name, insert_cnt); + WT_STAT_INCR(idb->stats, ITEMS_INSERTED); + + /* + * We don't have a key to store on the page if we're building a + * column-store, and we don't store the key on the page in the + * case of a row-store duplicate data item. The check from here + * on is if "key == NULL" for both cases, that is, there's no + * key to store. + */ + +skip_read: /* + * We pushed a set of duplicates off-page, and that routine + * returned an ending key/data pair to us. + */ + + /* + * Copy the caller's DBTs, we don't want to modify them. But, + * copy them carefully, all we want is a pointer and a length. + */ + if (key != NULL) { + key_copy.data = key->data; + key_copy.size = key->size; + key = &key_copy; + } + data_copy.data = data->data; + data_copy.size = data->size; + data = &data_copy; + + /* Build the data item we're going to store on the page. */ + WT_ERR(__wt_item_build_data( + toc, data, &data_item, &data_ovfl, 0)); + + /* + * Check for duplicate keys; we don't store the key on the page + * in the case of a duplicate. + * + * !!! + * Do a fast check of the old and new sizes -- note checking + * lastkey->size is safe -- it's initialized to 0, and we do + * not allow zero-length keys. + */ + if (LF_ISSET(WT_DUPLICATES) && + (key == NULL || + (lastkey->size == key->size && + db->btree_compare(db, lastkey, key) == 0))) { + /* + * The first duplicate in the set is already on the + * page, but with an item type set to WT_ITEM_DATA or + * WT_ITEM_DATA_OVFL. Correct the type and dup_count. + */ + if (++dup_count == 1) { + dup_count = 2; + WT_ITEM_SET_TYPE(dup_data, + WT_ITEM_TYPE(dup_data) == WT_ITEM_DATA ? + WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL); + } + + /* Reset the type of the current item to a duplicate. */ + WT_ITEM_SET_TYPE(&data_item, + WT_ITEM_TYPE(&data_item) == WT_ITEM_DATA ? 
+ WT_ITEM_DATA_DUP : WT_ITEM_DATA_DUP_OVFL); + + WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED); + + key = NULL; + } else { + /* + * It's a new key, but if duplicates are possible we'll + * need a copy of the key for comparison with the next + * key. If the key is Huffman encoded or an overflow + * object, we can't use the on-page version, we have to + * save a copy. + */ + if (LF_ISSET(WT_DUPLICATES) && + (key->size > db->leafitemsize || + idb->huffman_key != NULL)) { + WT_ERR( + __wt_bulk_dbt_copy(env, key, lastkey_copy)); + lastkey = lastkey_copy; + } else + lastkey = NULL; + + dup_count = 0; + } + + /* Build the key item we're going to store on the page. */ + if (key != NULL) + WT_ERR(__wt_item_build_key( + toc, key, &key_item, &key_ovfl)); + + /* + * We now have the key/data items to store on the page. If + * there is insufficient space on the current page, allocate + * a new one. + */ + if ((key == NULL ? 0 : WT_ITEM_SPACE_REQ(key->size)) + + WT_ITEM_SPACE_REQ(data->size) > space_avail) { + WT_ERR(__wt_bulk_scratch_page(toc, + db->leafmin, type, WT_LLEAF, &next, &tmp2)); + __wt_set_ff_and_sa_from_offset(next, + WT_PAGE_BYTE(next), + &next_first_free, &next_space_avail); + if (type == WT_PAGE_COL_VAR) + next->dsk->start_recno = insert_cnt; + + /* + * If in the middle of loading a set of duplicates, but + * the set hasn't yet reached the boundary where we'd + * push them offpage, we can't split them across the two + * pages. Move the entire set to the new page. This + * can waste up to 25% of the old page, but it would be + * difficult and messy to move them and then go back + * and fix things up if and when they moved offpage. + * + * We use a check of dup_count instead of checking the + * WT_DUPLICATES flag, since we have to check it anyway. + */ + if (dup_count != 0) { + /* + * Reset the page entry and record counts -- we + * are moving a single key plus the duplicate + * set. 
+ * + * Since dup_count was already incremented to + * reflect the data item we're loading now, it + * is the right number of elements to move, that + * is, move (dup_count - 1) + 1 for the key. + */ + page->dsk->u.entries -= dup_count; + page->records -= dup_count - 1; + next->dsk->u.entries += dup_count; + next->records += dup_count - 1; + + /* + * Move the duplicate set and adjust the page + * information for "next" -- we don't have to + * fix up "page", we're never going to use it + * again. + */ + len = + (uint32_t)(first_free - (uint8_t *)dup_key); + memcpy(next_first_free, dup_key, len); + next_first_free += len; + next_space_avail -= len; + + /* + * We'll never have to move this dup set to + * another primary page -- if the dup set + * continues to grow, it will be moved + * off-page. We still need to know where + * the dup set starts, though, for the + * possible move off-page: it's the second + * entry on the page, where the first entry + * is the dup set's key. + */ + dup_key = (WT_ITEM *)WT_PAGE_BYTE(next); + dup_data = (WT_ITEM *)((uint8_t *)dup_key + + WT_ITEM_SPACE_REQ(WT_ITEM_LEN(dup_key))); + + /* + * The "lastkey" value just moved to a new page. + * If it's an overflow item, we have a copy; if + * it's not, then we need to reset it. + */ + if (lastkey == &lastkey_std) { + lastkey_std.data = + WT_ITEM_BYTE(dup_key); + lastkey_std.size = WT_ITEM_LEN(dup_key); + } + } + + /* + * We've finished with the page: promote its first key + * to its parent and discard it, then switch to the new + * page. + */ + WT_ERR(__wt_bulk_promote( + toc, page, page->records, &stack, 0, NULL)); + WT_ERR(__wt_page_write(toc, page)); + __wt_scr_release(&tmp1); + + /* + * Discard the last page, and switch to the next page. + * + * XXX + * The obvious speed-up here is to re-initialize page + * instead of discarding it and acquiring it again as + * as soon as the just-allocated page fills up. 
I am + * not doing that deliberately: eventually we'll use + * asynchronous I/O in bulk load, which means the page + * won't be reusable until the I/O completes. + */ + page = next; + first_free = next_first_free; + space_avail = next_space_avail; + next = NULL; + next_first_free = NULL; + next_space_avail = 0; + tmp1 = tmp2; + tmp2 = NULL; + } + + ++page->records; + + /* Copy the key item onto the page. */ + if (key != NULL) { + ++page->dsk->u.entries; + + memcpy(first_free, &key_item, sizeof(key_item)); + memcpy(first_free + + sizeof(key_item), key->data, key->size); + space_avail -= WT_ITEM_SPACE_REQ(key->size); + + /* + * If processing duplicates we'll need a copy of the key + * for comparison with the next key. If the key was an + * overflow or Huffman encoded item, we already have a + * copy -- otherwise, use the copy we just put on the + * page. + * + * We also save the location for the key of any current + * duplicate set in case we have to move the set to a + * different page (the case where a duplicate set isn't + * large enough to move offpage, but doesn't entirely + * fit on this page). + */ + if (LF_ISSET(WT_DUPLICATES)) { + if (lastkey == NULL) { + lastkey = &lastkey_std; + lastkey_std.data = + WT_ITEM_BYTE(first_free); + lastkey_std.size = key->size; + } + dup_key = (WT_ITEM *)first_free; + } + first_free += WT_ITEM_SPACE_REQ(key->size); + } + + /* Copy the data item onto the page. */ + ++page->dsk->u.entries; + memcpy(first_free, &data_item, sizeof(data_item)); + memcpy(first_free + sizeof(data_item), data->data, data->size); + space_avail -= WT_ITEM_SPACE_REQ(data->size); + + /* + * If duplicates: if this isn't a duplicate data item, save + * the item location, since it's potentially the first of a + * duplicate data set, and we need to know where duplicate + * data sets start. Additionally, reset the counter and + * space calculation. 
+ */ + if (LF_ISSET(WT_DUPLICATES) && dup_count == 0) { + dup_space = data->size; + dup_data = (WT_ITEM *)first_free; + } + first_free += WT_ITEM_SPACE_REQ(data->size); + + /* + * If duplicates: check to see if the duplicate set crosses + * the (roughly) 25% of the page space boundary. If it does, + * move it offpage. + */ + if (LF_ISSET(WT_DUPLICATES) && dup_count != 0) { + dup_space += data->size; + + if (dup_space < db->leafmin / db->btree_dup_offpage) + continue; + + /* + * Move the duplicate set off our page, and read in the + * rest of the off-page duplicate set. + */ + WT_ERR(__wt_bulk_dup_offpage(toc, &key, &data, lastkey, + dup_data, + (uint32_t)(first_free - (uint8_t *)dup_data), + dup_count, &off, cb)); + + /* Reset the page entry and record counts. */ + page->dsk->u.entries -= (dup_count - 1); + page->records -= dup_count; + page->records += WT_RECORDS(&off); + + /* + * Replace the duplicate set with a WT_OFF structure, + * that is, we've replaced dup_count entries with a + * single entry. + */ + WT_ITEM_SET(&data_item, WT_ITEM_OFF, sizeof(WT_OFF)); + p = (uint8_t *)dup_data; + memcpy(p, &data_item, sizeof(data_item)); + memcpy(p + sizeof(data_item), &off, sizeof(WT_OFF)); + __wt_set_ff_and_sa_from_offset(page, + (uint8_t *)p + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)), + &first_free, &space_avail); + + /* Reset local counters. */ + dup_count = dup_space = 0; + + goto skip_read; + } + } + + /* A ret of 1 just means we've reached the end of the input. */ + if (ret != 1) + goto err; + ret = 0; + + /* Promote a key from any partially-filled page and write it. */ + if (page->dsk->u.entries != 0) { + WT_ERR(__wt_bulk_promote( + toc, page, page->records, &stack, 0, NULL)); + WT_ERR(__wt_page_write(toc, page)); + } + + /* Wrap up reporting. 
*/ + if (f != NULL) + f(toc->name, insert_cnt); + +err: WT_TRET(__wt_bulk_stack_put(toc, &stack)); + if (lastkey_copy != NULL) + __wt_scr_release(&lastkey_copy); + if (tmp1 != NULL) + __wt_scr_release(&tmp1); + if (tmp2 != NULL) + __wt_scr_release(&tmp2); + + return (ret); +} + +/* + * __wt_bulk_dup_offpage -- + * Move the last set of duplicates on the page to a page of their own, + * then load the rest of the duplicate set. + */ +static int +__wt_bulk_dup_offpage(WT_TOC *toc, DBT **keyp, DBT **datap, DBT *lastkey, + WT_ITEM *dup_data, uint32_t dup_len, uint32_t dup_count, WT_OFF *off, + int (*cb)(DB *, DBT **, DBT **)) +{ + DB *db; + DBT *key, *data, *tmp; + IDB *idb; + WT_ITEM data_item; + WT_OVFL data_ovfl; + WT_PAGE *page; + WT_STACK stack; + uint32_t root_addr, space_avail; + uint8_t *first_free; + int ret, success_return; + + db = toc->db; + idb = db->idb; + success_return = 0; + + /* + * This routine is the same as the bulk load routine, except it loads + * only data items into off-page duplicate trees. It's passed a lot + * of state from the bulk load routine, and updates that state as a + * side-effect. + * + * In summary, the bulk load routine stops loading a primary btree leaf + * page, calls us to load a set of duplicate data items into a separate + * btree, and then continues on with its primary leaf page when we + * return. The arguments are complex enough that it's worth describing + * them: + * + * keyp/datap -- + * The key and data pairs the application is filling in -- we + * get them passed to us because we get additional key/data + * pairs returned to us, and the last one we get is likely to + * be consumed by our caller. + * lastkey -- + * The last key pushed onto the caller's page -- we use this to + * compare against future keys we read. + * dup_data -- + * On-page reference to the first duplicate data item in the set. + * dup_count -- + * Count of duplicates in the set. + * off -- + * Callers WT_OFF structure, which we have to fill in. 
+ * cb -- + * User's callback function. + */ + + WT_CLEAR(data_item); + WT_CLEAR(stack); + ret = 0; + + /* Get a scratch buffer and make it look like our work page. */ + WT_ERR(__wt_bulk_scratch_page(toc, + db->leafmin, WT_PAGE_DUP_LEAF, WT_LLEAF, &page, &tmp)); + __wt_set_ff_and_sa_from_offset( + page, WT_PAGE_BYTE(page), &first_free, &space_avail); + + /* Move the duplicates onto the newly allocated page. */ + page->records = dup_count; + page->dsk->u.entries = dup_count; + memcpy(first_free, dup_data, (size_t)dup_len); + first_free += dup_len; + space_avail -= dup_len; + + /* + * Unless we have enough duplicates to split this page, it will be the + * "root" of the offpage duplicates. + */ + root_addr = page->addr; + + /* Read in new duplicate records until the key changes. */ + while ((ret = cb(db, &key, &data)) == 0) { + if (key->size == 0) { + __wt_api_db_errx( + db, "zero-length keys are not supported"); + return (WT_ERROR); + } + WT_STAT_INCR(idb->stats, ITEMS_INSERTED); + WT_STAT_INCR(idb->stats, DUPLICATE_ITEMS_INSERTED); + + /* Loading duplicates, so a key change means we're done. */ + if (lastkey->size != key->size || + db->btree_compare_dup(db, lastkey, key) != 0) { + *keyp = key; + *datap = data; + break; + } + + /* Build the data item we're going to store on the page. */ + WT_ERR(__wt_item_build_data( + toc, data, &data_item, &data_ovfl, WT_IS_DUP)); + + /* + * If there's insufficient space available, allocate a new + * page. + */ + if (WT_ITEM_SPACE_REQ(data->size) > space_avail) { + /* + * We've finished with the page: promote its first key + * to its parent and discard it, then switch to the new + * page. + * + * If we promoted a key, we might have split, and so + * there may be a new offpage duplicates root page. 
+ */ + WT_RET(__wt_bulk_promote(toc, + page, page->records, &stack, 0, &root_addr)); + WT_ERR(__wt_page_write(toc, page)); + page->records = 0; + page->dsk->u.entries = 0; + __wt_set_ff_and_sa_from_offset(page, + WT_PAGE_BYTE(page), &first_free, &space_avail); + } + + ++dup_count; /* Total duplicate count */ + ++page->records; /* On-page key/data count */ + ++page->dsk->u.entries; /* On-page entry count */ + + /* Copy the data item onto the page. */ + WT_ITEM_SET_LEN(&data_item, data->size); + memcpy(first_free, &data_item, sizeof(data_item)); + memcpy(first_free + sizeof(data_item), data->data, data->size); + space_avail -= WT_ITEM_SPACE_REQ(data->size); + first_free += WT_ITEM_SPACE_REQ(data->size); + } + + /* + * Ret values of 1 and 0 are both "OK", the ret value of 1 means we + * reached the end of the bulk input. Save the successful return + * for our final return value. + */ + if (ret != 0 && ret != 1) + goto err; + success_return = ret; + + /* Promote a key from the partially-filled page and write it. */ + WT_ERR( + __wt_bulk_promote(toc, page, page->records, &stack, 0, &root_addr)); + WT_ERR(__wt_page_write(toc, page)); + + /* Fill in the caller's WT_OFF structure. */ + WT_RECORDS(off) = dup_count; + off->addr = root_addr; + off->size = db->intlmin; + +err: WT_TRET(__wt_bulk_stack_put(toc, &stack)); + if (tmp != NULL) + __wt_scr_release(&tmp); + + return (ret == 0 ? success_return : ret); +} + +/* + * __wt_bulk_promote -- + * Promote the first entry on a page to its parent. 
+ */ +static int +__wt_bulk_promote(WT_TOC *toc, WT_PAGE *page, uint64_t incr, + WT_STACK *stack, u_int level, uint32_t *dup_root_addrp) +{ + DB *db; + DBT *key, key_build, *next_tmp; + ENV *env; + WT_ITEM *key_item, item; + WT_OFF off; + WT_OVFL tmp_ovfl; + WT_PAGE *next, *parent; + WT_PAGE_DISK *dsk; + WT_STACK_ELEM *elem; + uint32_t next_space_avail; + uint8_t *next_first_free; + u_int type; + int need_promotion, ret; + void *parent_data; + + db = toc->db; + env = toc->env; + dsk = page->dsk; + WT_CLEAR(item); + next_tmp = NULL; + next = parent = NULL; + ret = 0; + + /* + * If it's a row-store, get a copy of the first item on the page -- it + * might be an overflow item, in which case we need to make a copy for + * the database. Most versions of Berkeley DB tried to reference count + * overflow items if they were promoted to internal pages. That turned + * out to be hard to get right, so I'm not doing it again. + * + * If it's a column-store page, we don't promote a key at all. + */ + switch (dsk->type) { + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + key = &key_build; + WT_CLEAR(key_build); + + key_item = (WT_ITEM *)WT_PAGE_BYTE(page); + switch (WT_ITEM_TYPE(key_item)) { + case WT_ITEM_KEY: + case WT_ITEM_DATA_DUP: + key->data = WT_ITEM_BYTE(key_item); + key->size = WT_ITEM_LEN(key_item); + switch (dsk->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ITEM_SET(&item, WT_ITEM_KEY, key->size); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + WT_ITEM_SET(&item, WT_ITEM_KEY_DUP, key->size); + break; + default: /* Not possible */ + break; + } + break; + case WT_ITEM_KEY_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + /* + * Assume overflow keys remain overflow keys when they + * are promoted; not necessarily true if internal nodes + * are larger than leaf nodes), but that's unlikely. 
+ */ + WT_CLEAR(tmp_ovfl); + WT_RET(__wt_bulk_ovfl_copy(toc, + WT_ITEM_BYTE_OVFL(key_item), &tmp_ovfl)); + key->data = &tmp_ovfl; + key->size = sizeof(tmp_ovfl); + switch (dsk->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ITEM_SET(&item, + WT_ITEM_KEY_OVFL, sizeof(WT_OVFL)); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + WT_ITEM_SET(&item, + WT_ITEM_KEY_DUP_OVFL, sizeof(WT_OVFL)); + break; + default: /* Not possible */ + break; + } + break; + WT_ILLEGAL_FORMAT(db); + } + break; + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + key = NULL; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * There are two paths into this code based on whether the page already + * has a parent. + * + * If we have a page with no parent page, create the parent page. In + * this path, there's not much to do -- allocate a parent page, copy + * reference information from the page to the parent, and we're done. + * This is a modified root-split: we're putting a single key on an + * internal page, which is illegal, but we know another page on this + * page's level will be created, and it will be promoted to the parent + * at some point. This is case #1. + * + * The second path into this code is if we have a page and its parent, + * but the page's reference information doesn't fit on the parent and + * we have to split the parent. This path has two different cases, + * based on whether the page's parent itself has a parent. + * + * Here's a diagram of case #2, where the parent also has a parent: + * + * P2 -> P1 -> L (case #2) + * + * The promoted key from leaf L won't fit onto P1, and so we split P1: + * + * P2 -> P1 + * -> P3 -> L + * + * In case #2, allocate P3 and copy reference information from the leaf + * page to it, then recursively call the promote code to promote the + * first entry from P3 to P2. 
+ * + * Here's a diagram of case #3, where the parent does not have a parent, + * in other words, a root split: + * + * P1 -> L (case #3) + * + * The promoted key from leaf L won't fit onto P1, and so we split P1: + * + * P1 -> + * P2 -> L + * + * In case #3, we allocate P2, copy reference information from the page + * to it, and then recursively call the promote code twice: first to + * promote the first entry from P1 to a new page, and again to promote + * the first entry from P2 to a new page, creating a new root level of + * the tree: + * + * P3 -> P1 + * -> P2 -> L + */ + /* + * To simplify the rest of the code, check to see if there's room for + * another entry in our stack structure. Allocate the stack in groups + * of 20, which is probably big enough for any tree we'll ever see in + * the field, we'll never test the realloc code unless we work at it. + */ +#ifdef HAVE_DIAGNOSTIC +#define WT_STACK_ALLOC_INCR 2 +#else +#define WT_STACK_ALLOC_INCR 20 +#endif + if (stack->size == 0 || level == stack->size - 1) { + uint32_t bytes_allocated = stack->size * sizeof(WT_STACK_ELEM); + WT_RET(__wt_realloc(env, &bytes_allocated, + (stack->size + WT_STACK_ALLOC_INCR) * sizeof(WT_STACK_ELEM), + &stack->elem)); + stack->size += WT_STACK_ALLOC_INCR; + /* + * Note, the stack structure may be entirely uninitialized here, + * that is, everything set to 0 bytes. That's OK: the level of + * the stack starts out at 0, that is, the 0th element of the + * stack is the 1st level of internal/parent pages in the tree. 
+ */ + } + + elem = &stack->elem[level]; + parent = elem->page; + if (parent == NULL) { +split: switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + type = WT_PAGE_COL_INT; + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + type = WT_PAGE_DUP_INT; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + type = WT_PAGE_ROW_INT; + break; + WT_ILLEGAL_FORMAT(db); + } + + WT_ERR(__wt_bulk_scratch_page( + toc, db->intlmin, type, dsk->level + 1, &next, &next_tmp)); + __wt_set_ff_and_sa_from_offset(next, + WT_PAGE_BYTE(next), &next_first_free, &next_space_avail); + + /* + * Column stores set the starting record number to the starting + * record number of the promoted leaf -- the new leaf is always + * the first record in the new parent's page. Ignore the type + * of the database, it's simpler ot just promote 0 up the tree + * in row store databases. + */ + next->dsk->start_recno = page->dsk->start_recno; + + /* + * If we don't have a parent page, it's case #1 -- allocate the + * parent page immediately. + */ + if (parent == NULL) { + /* + * Case #1 -- there's no parent, it's a root split. No + * additional work in the main tree. In an off-page + * duplicates tree, return the new root of the off-page + * tree. + */ + if (type == WT_PAGE_DUP_INT) + *dup_root_addrp = next->addr; + need_promotion = 0; + } else { + /* + * Case #2 and #3. + * + * Case #3: a root split, so we have to promote a key + * from both of the parent pages: promote the key from + * the existing parent page. + */ + if (stack->elem[level + 1].page == NULL) + WT_ERR(__wt_bulk_promote(toc, parent, + incr, stack, level + 1, dup_root_addrp)); + need_promotion = 1; + + /* Write the last parent page, we have a new one. */ + WT_ERR(__wt_page_write(toc, parent)); + __wt_scr_release(&stack->elem[level].tmp); + } + + /* There's a new parent page, reset the stack. 
*/ + elem = &stack->elem[level]; + elem->page = parent = next; + elem->first_free = next_first_free; + elem->space_avail = next_space_avail; + elem->tmp = next_tmp; + next = NULL; + next_first_free = NULL; + next_space_avail = 0; + next_tmp = NULL; + } else + need_promotion = 0; + + /* + * See if the promoted data will fit (if they don't, we have to split). + * We don't need to check for overflow keys: if the key was an overflow, + * we already created a smaller, on-page version of it. + * + * If there's room, copy the promoted data onto the parent's page. + */ + switch (parent->dsk->type) { + case WT_PAGE_COL_INT: + if (elem->space_avail < sizeof(WT_OFF)) + goto split; + + /* Create the WT_OFF reference. */ + WT_RECORDS(&off) = page->records; + off.addr = page->addr; + off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin; + + /* Store the data item. */ + ++parent->dsk->u.entries; + parent_data = elem->first_free; + memcpy(elem->first_free, &off, sizeof(off)); + elem->first_free += sizeof(WT_OFF); + elem->space_avail -= sizeof(WT_OFF); + + /* Track the last entry on the page for record count updates. */ + stack->elem[level].data = parent_data; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_DUP_INT: + if (elem->space_avail < + WT_ITEM_SPACE_REQ(sizeof(WT_OFF)) + + WT_ITEM_SPACE_REQ(key->size)) + goto split; + + /* Store the key. */ + ++parent->dsk->u.entries; + memcpy(elem->first_free, &item, sizeof(item)); + memcpy(elem->first_free + sizeof(item), key->data, key->size); + elem->first_free += WT_ITEM_SPACE_REQ(key->size); + elem->space_avail -= WT_ITEM_SPACE_REQ(key->size); + + /* Create the WT_ITEM(WT_OFF) reference. */ + WT_ITEM_SET(&item, WT_ITEM_OFF, sizeof(WT_OFF)); + WT_RECORDS(&off) = page->records; + off.addr = page->addr; + off.size = dsk->level == WT_LLEAF ? db->leafmin : db->intlmin; + + /* Store the data item. 
*/ + ++parent->dsk->u.entries; + parent_data = elem->first_free; + memcpy(elem->first_free, &item, sizeof(item)); + memcpy(elem->first_free + sizeof(item), &off, sizeof(off)); + elem->first_free += WT_ITEM_SPACE_REQ(sizeof(WT_OFF)); + elem->space_avail -= WT_ITEM_SPACE_REQ(sizeof(WT_OFF)); + + /* Track the last entry on the page for record count updates. */ + stack->elem[level].data = parent_data; + break; + WT_ILLEGAL_FORMAT(db); + } + + parent->records += page->records; + + /* + * The promotion for case #2 and the second part of case #3 -- promote + * the key from the newly allocated internal page to its parent. + */ + if (need_promotion) + WT_RET(__wt_bulk_promote( + toc, parent, incr, stack, level + 1, dup_root_addrp)); + else { + /* + * We've finished promoting the new page's key into the tree. + * What remains is to push the new record counts all the way + * to the root. We've already corrected our current "parent" + * page, so proceed from there to the root. + */ + for (elem = + &stack->elem[level + 1]; elem->page != NULL; ++elem) { + switch (elem->page->dsk->type) { + case WT_PAGE_COL_INT: + WT_RECORDS((WT_OFF *)elem->data) += incr; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_DUP_INT: + WT_RECORDS( + (WT_OFF *)WT_ITEM_BYTE(elem->data)) += incr; + break; + WT_ILLEGAL_FORMAT(db); + } + elem->page->records += incr; + } + } + +err: if (next_tmp != NULL) + __wt_scr_release(&next_tmp); + + return (ret); +} + +/* + * __wt_item_build_key -- + * Process an inserted key item and return an WT_ITEM structure and byte + * string to be stored on the page. + */ +static int +__wt_item_build_key(WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl) +{ + DB *db; + IDB *idb; + WT_STATS *stats; + + db = toc->db; + idb = db->idb; + stats = idb->stats; + + /* + * We're called with a DBT that references a data/size pair. We can + * re-point that DBT's data and size fields to other memory, but we + * cannot allocate memory in that DBT -- all we can do is re-point it. 
+ * + * For Huffman-encoded key/data items, we need a chunk of new space; + * use the WT_TOC key/data return memory: this routine is called during + * bulk insert and reconciliation, we aren't returning key/data pairs. + */ + + /* Optionally compress the data using the Huffman engine. */ + if (idb->huffman_key != NULL) { + WT_RET(__wt_huffman_encode( + idb->huffman_key, dbt->data, dbt->size, + &toc->key.data, &toc->key.mem_size, &toc->key.size)); + if (toc->key.size > dbt->size) + WT_STAT_INCRV(stats, + HUFFMAN_KEY, toc->key.size - dbt->size); + dbt->data = toc->key.data; + dbt->size = toc->key.size; + } + + /* Create an overflow object if the data won't fit. */ + if (dbt->size > db->leafitemsize) { + WT_STAT_INCR(stats, OVERFLOW_KEY); + + WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl)); + + dbt->data = ovfl; + dbt->size = sizeof(*ovfl); + WT_ITEM_SET(item, WT_ITEM_KEY_OVFL, dbt->size); + } else + WT_ITEM_SET(item, WT_ITEM_KEY, dbt->size); + return (0); +} + +/* + * __wt_item_build_data -- + * Process an inserted data item and return an WT_ITEM structure and byte + * string to be stored on the page. + */ +int +__wt_item_build_data( + WT_TOC *toc, DBT *dbt, WT_ITEM *item, WT_OVFL *ovfl, u_int flags) +{ + DB *db; + IDB *idb; + WT_STATS *stats; + + WT_ENV_FCHK(toc->env, + "__wt_item_build_data", flags, WT_APIMASK_BT_BUILD_DATA_ITEM); + + db = toc->db; + idb = db->idb; + stats = idb->stats; + + /* + * We're called with a DBT that references a data/size pair. We can + * re-point that DBT's data and size fields to other memory, but we + * cannot allocate memory in that DBT -- all we can do is re-point it. + * + * For Huffman-encoded key/data items, we need a chunk of new space; + * use the WT_TOC key/data return memory: this routine is called during + * bulk insert and reconciliation, we aren't returning key/data pairs. + */ + WT_CLEAR(*item); + WT_ITEM_SET_TYPE( + item, LF_ISSET(WT_IS_DUP) ? 
WT_ITEM_DATA_DUP : WT_ITEM_DATA); + + /* + * Handle zero-length items quickly -- this is a common value, it's + * a deleted column-store variable length item. + */ + if (dbt->size == 0) { + WT_ITEM_SET_LEN(item, 0); + return (0); + } + + /* Optionally compress the data using the Huffman engine. */ + if (idb->huffman_data != NULL) { + WT_RET(__wt_huffman_encode( + idb->huffman_data, dbt->data, dbt->size, + &toc->data.data, &toc->data.mem_size, &toc->data.size)); + if (toc->data.size > dbt->size) + WT_STAT_INCRV(stats, + HUFFMAN_DATA, toc->data.size - dbt->size); + dbt->data = toc->data.data; + dbt->size = toc->data.size; + } + + /* Create an overflow object if the data won't fit. */ + if (dbt->size > db->leafitemsize) { + WT_RET(__wt_bulk_ovfl_write(toc, dbt, ovfl)); + + dbt->data = ovfl; + dbt->size = sizeof(*ovfl); + WT_ITEM_SET_TYPE(item, LF_ISSET(WT_IS_DUP) ? + WT_ITEM_DATA_DUP_OVFL : WT_ITEM_DATA_OVFL); + WT_STAT_INCR(stats, OVERFLOW_DATA); + } + + WT_ITEM_SET_LEN(item, dbt->size); + return (0); +} + +/* + * __wt_bulk_ovfl_copy -- + * Copy bulk-loaded overflow items in the database, returning the WT_OVFL + * structure, filled in. + */ +static int +__wt_bulk_ovfl_copy(WT_TOC *toc, WT_OVFL *from, WT_OVFL *to) +{ + DB *db; + DBT *tmp; + WT_PAGE *page; + uint32_t size; + int ret; + + db = toc->db; + tmp = NULL; + + /* Get a scratch buffer and make it look like an overflow page. */ + size = WT_ALIGN(sizeof(WT_PAGE_DISK) + from->size, db->allocsize); + WT_RET(__wt_bulk_scratch_page( + toc, size, WT_PAGE_OVFL, WT_LLEAF, &page, &tmp)); + page->dsk->u.datalen = from->size; + + /* Fill in the return information. */ + to->addr = page->addr; + to->size = from->size; + + /* + * Read the page into our scratch buffer, then write it out to the + * new location. 
+ */ + if ((ret = + __wt_page_disk_read(toc, page->dsk, from->addr, from->size)) == 0) + ret = + __wt_page_disk_write(toc, page->dsk, to->addr, from->size); + + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_bulk_ovfl_write -- + * Store bulk-loaded overflow items in the database, returning the page + * addr. + */ +static int +__wt_bulk_ovfl_write(WT_TOC *toc, DBT *dbt, WT_OVFL *to) +{ + DB *db; + DBT *tmp; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + uint32_t size; + int ret; + + db = toc->db; + tmp = NULL; + + /* Get a scratch buffer and make it look like our work page. */ + size = WT_ALIGN(sizeof(WT_PAGE_DISK) + dbt->size, db->allocsize); + WT_ERR(__wt_bulk_scratch_page( + toc, size, WT_PAGE_OVFL, WT_LLEAF, &page, &tmp)); + + /* Fill in the return information. */ + to->addr = page->addr; + to->size = dbt->size; + + /* Initialize the page header and copy the record into place. */ + dsk = page->dsk; + dsk->u.datalen = dbt->size; + memcpy((uint8_t *)dsk + sizeof(WT_PAGE_DISK), dbt->data, dbt->size); + + ret = __wt_page_write(toc, page); + +err: if (tmp != NULL) + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_bulk_scratch_page -- + * Allocate a scratch buffer and make it look like a database page. + */ +static int +__wt_bulk_scratch_page(WT_TOC *toc, uint32_t page_size, + uint32_t page_type, uint32_t page_level, WT_PAGE **page_ret, DBT **tmp_ret) +{ + DBT *tmp; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + uint32_t size; + int ret; + + ret = 0; + + /* + * Allocate a scratch buffer and make sure it's big enough to hold a + * WT_PAGE structure plus the page itself, and clear the memory so + * it's never random bytes. + */ + size = page_size + sizeof(WT_PAGE); + WT_ERR(__wt_scr_alloc(toc, size, &tmp)); + memset(tmp->data, 0, size); + + /* + * Set up the page and allocate a file address. + * + * We don't run the leaf pages through the cache -- that means passing + * a lot of messages we don't want to bother with. 
We're the only user + * of the file, which means we can grab file space whenever we want. + */ + page = tmp->data; + page->dsk = dsk = + (WT_PAGE_DISK *)((uint8_t *)tmp->data + sizeof(WT_PAGE)); + WT_ERR(__wt_file_alloc(toc, &page->addr, page_size)); + page->size = page_size; + dsk->type = (uint8_t)page_type; + dsk->level = (uint8_t)page_level; + + *page_ret = page; + *tmp_ret = tmp; + return (0); + +err: if (tmp != NULL) + __wt_scr_release(&tmp); + return (ret); +} + +/* + * __wt_bulk_stack_put -- + * Push out the tree's stack of pages. + */ +static int +__wt_bulk_stack_put(WT_TOC *toc, WT_STACK *stack) +{ + ENV *env; + IDB *idb; + WT_STACK_ELEM *elem; + int ret; + + env = toc->env; + idb = toc->db->idb; + ret = 0; + + for (elem = stack->elem; elem->page != NULL; ++elem) { + WT_TRET(__wt_page_write(toc, elem->page)); + + /* + * If we've reached the last element in the stack, it's the + * root page of the tree. Update the in-memory root address + * and the descriptor record. + */ + if ((elem + 1)->page == NULL) { + idb->root_off.addr = elem->page->addr; + idb->root_off.size = elem->page->size; + WT_RECORDS(&idb->root_off) = elem->page->records; + WT_TRET(__wt_desc_write(toc)); + } + + __wt_scr_release(&elem->tmp); + } + __wt_free(env, stack->elem, stack->size * sizeof(WT_STACK_ELEM)); + + return (0); +} + +/* + * __wt_bulk_dbt_copy -- + * Get a copy of DBT referenced object. + */ +static int +__wt_bulk_dbt_copy(ENV *env, DBT *orig, DBT *copy) +{ + if (copy->mem_size < orig->size) + WT_RET(__wt_realloc( + env, ©->mem_size, orig->size, ©->data)); + memcpy(copy->data, orig->data, orig->size); + copy->size = orig->size; + + return (0); +} diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c new file mode 100644 index 00000000000..43d4f7e6596 --- /dev/null +++ b/src/btree/bt_cache.c @@ -0,0 +1,133 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. 
+ * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_cache_create -- + * Create the underlying cache. + */ +int +__wt_cache_create(ENV *env) +{ + IENV *ienv; + WT_CACHE *cache; + int ret; + + ienv = env->ienv; + ret = 0; + + WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache)); + cache = ienv->cache; + + WT_ERR( + __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict)); + WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read)); + WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile)); + + WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats)); + + WT_STAT_SET( + cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE); + + return (0); + +err: (void)__wt_cache_destroy(env); + return (ret); +} + +/* + * __wt_cache_pages_inuse -- + * Return the number of pages in use. + */ +inline uint64_t +__wt_cache_pages_inuse(WT_CACHE *cache) +{ + uint64_t pages_in, pages_out; + + /* + * Reading 64-bit fields, potentially on 32-bit machines, and other + * threads of control may be modifying them. Check them for sanity + * (although "interesting" corruption is vanishingly unlikely, these + * values just increment over time). + */ + pages_in = cache->stat_pages_in; + pages_out = cache->stat_pages_out; + return (pages_in > pages_out ? pages_in - pages_out : 0); +} + +/* + * __wt_cache_bytes_inuse -- + * Return the number of bytes in use. + */ +inline uint64_t +__wt_cache_bytes_inuse(WT_CACHE *cache) +{ + uint64_t bytes_in, bytes_out; + + /* + * Reading 64-bit fields, potentially on 32-bit machines, and other + * threads of control may be modifying them. Check them for sanity + * (although "interesting" corruption is vanishingly unlikely, these + * values just increment over time). + */ + bytes_in = cache->stat_bytes_in; + bytes_out = cache->stat_bytes_out; + return (bytes_in > bytes_out ? bytes_in - bytes_out : 0); +} + +/* + * __wt_cache_stats -- + * Update the cache statistics for return to the application. 
+ */ +void +__wt_cache_stats(ENV *env) +{ + WT_CACHE *cache; + WT_STATS *stats; + + cache = env->ienv->cache; + stats = cache->stats; + + WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache)); + WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache)); +} + +/* + * __wt_cache_destroy -- + * Discard the underlying cache. + */ +int +__wt_cache_destroy(ENV *env) +{ + IENV *ienv; + WT_CACHE *cache; + int ret; + + ienv = env->ienv; + cache = ienv->cache; + ret = 0; + + if (cache == NULL) + return (0); + + /* Discard mutexes. */ + if (cache->mtx_evict != NULL) + (void)__wt_mtx_destroy(env, cache->mtx_evict); + if (cache->mtx_read != NULL) + (void)__wt_mtx_destroy(env, cache->mtx_read); + if (cache->mtx_reconcile != NULL) + (void)__wt_mtx_destroy(env, cache->mtx_reconcile); + + /* Discard allocated memory, and clear. */ + __wt_free(env, cache->stats, 0); + __wt_free(env, ienv->cache, sizeof(WT_CACHE)); + + return (ret); +} diff --git a/src/btree/bt_close.c b/src/btree/bt_close.c new file mode 100644 index 00000000000..6bf58e98d7e --- /dev/null +++ b/src/btree/bt_close.c @@ -0,0 +1,86 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_bt_close_page(WT_TOC *, WT_PAGE *, void *); + +/* + * __wt_bt_close -- + * Close the tree. + */ +int +__wt_bt_close(WT_TOC *toc) +{ + ENV *env; + IDB *idb; + WT_CACHE *cache; + int ret; + + env = toc->env; + idb = toc->db->idb; + cache = env->ienv->cache; + ret = 0; + + /* + * XXX + * We assume two threads can't call the close method at the same time, + * nor can close be called while other threads are in the tree -- the + * higher level API has to ensure this. 
+ */ + + if (WT_UNOPENED_DATABASE(idb)) + return (0); + + /* + * The tree walk is depth first, that is, the worker function is not + * called on internal pages until all children have been visited; so, + * we don't have to worry about a page being dirtied after the visit. + * + * Lock out the cache evictions thread, though, we don't want it trying + * to evict pages we're flushing. + */ + __wt_lock(env, cache->mtx_reconcile); + WT_TRET(__wt_tree_walk(toc, NULL, + WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_close_page, NULL)); + __wt_evict_db_clear(toc); + __wt_unlock(env, cache->mtx_reconcile); + + /* There's no root page any more, kill the pointer to catch mistakes. */ + idb->root_page.page = NULL; + + /* Close the underlying file handle. */ + WT_TRET(__wt_close(env, idb->fh)); + idb->fh = NULL; + + return (ret); +} + +/* + * __wt_bt_close_page -- + * Close a page. + */ +static int +__wt_bt_close_page(WT_TOC *toc, WT_PAGE *page, void *arg) +{ + WT_CC_QUIET(arg, NULL); + + /* Reconcile any dirty pages, then discard the page. */ + if (WT_PAGE_IS_MODIFIED(page)) + WT_RET(__wt_page_reconcile(toc, page)); + + /* + * The tree walk is depth first, that is, the worker function is not + * called on internal pages until all children have been visited; so, + * we don't have to worry about reading a page after we discard it. + */ + __wt_page_discard(toc, page); + + return (0); +} diff --git a/src/btree/bt_cmp.c b/src/btree/bt_cmp.c new file mode 100644 index 00000000000..8cfddc0496a --- /dev/null +++ b/src/btree/bt_cmp.c @@ -0,0 +1,74 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_bt_lex_compare -- + * Lexicographic comparison routine. 
+ */ +int +__wt_bt_lex_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt) +{ + uint32_t len; + uint8_t *userp, *treep; + + /* + * The DB * argument isn't used by the default routine, but is + * a standard argument for user-specified comparison functions. + */ + WT_CC_QUIET(db, NULL); + + /* + * Return: + * < 0 if user_dbt is lexicographically < tree_dbt + * = 0 if user_dbt is lexicographically = tree_dbt + * > 0 if user_dbt is lexicographically > tree_dbt + * + * We use the names "user" and "tree" so it's clear which the + * application is looking at when we call its comparison func. + */ + if ((len = user_dbt->size) > tree_dbt->size) + len = tree_dbt->size; + for (userp = user_dbt->data, + treep = tree_dbt->data; len > 0; --len, ++userp, ++treep) + if (*userp != *treep) + return (*userp < *treep ? -1 : 1); + + /* Contents are equal up to the smallest length. */ + return (user_dbt->size == tree_dbt->size ? 0 : + (user_dbt->size < tree_dbt->size ? -1 : 1)); +} + +/* + * __wt_bt_int_compare -- + * Integer comparison routine. + */ +int +__wt_bt_int_compare(DB *db, const DBT *user_dbt, const DBT *tree_dbt) +{ + uint64_t user_int, tree_int; + + /* + * The DBT must hold the low-order bits in machine integer order. + * + * Return: + * < 0 if user_dbt is < tree_dbt + * = 0 if user_dbt is = tree_dbt + * > 0 if user_dbt is > tree_dbt + * + * We use the names "user" and "tree" so it's clear which the + * application is looking at when we call its comparison func. + */ + user_int = tree_int = 0; + memcpy(&user_int, user_dbt->data, (size_t)db->btree_compare_int); + memcpy(&tree_int, tree_dbt->data, (size_t)db->btree_compare_int); + + return (user_int == tree_int ? 0 : (user_int < tree_int ? -1 : 1)); +} diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c new file mode 100644 index 00000000000..e27607aba6a --- /dev/null +++ b/src/btree/bt_debug.c @@ -0,0 +1,661 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +#ifdef HAVE_DIAGNOSTIC +static void __wt_debug_dsk_col_fix(DB *, WT_PAGE_DISK *, FILE *); +static void __wt_debug_dsk_col_int(WT_PAGE_DISK *, FILE *); +static void __wt_debug_dsk_col_rle(DB *, WT_PAGE_DISK *, FILE *); +static int __wt_debug_dsk_item(WT_TOC *, WT_PAGE_DISK *, FILE *); +static void __wt_debug_inmem_col_fix(WT_TOC *, WT_PAGE *, FILE *); +static void __wt_debug_inmem_col_int(WT_PAGE *, FILE *); +static void __wt_debug_inmem_col_rle(WT_TOC *, WT_PAGE *, FILE *); +static int __wt_debug_inmem_col_var(WT_TOC *, WT_PAGE *, FILE *); +static void __wt_debug_inmem_row_int(WT_PAGE *, FILE *); +static int __wt_debug_inmem_row_leaf(WT_TOC *, WT_PAGE *, FILE *); +static int __wt_debug_item(WT_TOC *, WT_ITEM *, FILE *); +static int __wt_debug_item_data(WT_TOC *, WT_ITEM *, FILE *fp); +static void __wt_debug_off(WT_OFF *, const char *, FILE *); +static void __wt_debug_page_hdr(WT_TOC *, WT_PAGE *, FILE *); +static void __wt_debug_pair(const char *, void *, uint32_t, FILE *); +static void __wt_debug_repl(WT_REPL *, FILE *); +static void __wt_debug_rleexp(WT_RLE_EXPAND *, FILE *); +static int __wt_debug_set_fp(const char *, FILE **, int *); + +static int +__wt_debug_set_fp(const char *ofile, FILE **fpp, int *close_varp) +{ + FILE *fp; + + *close_varp = 0; + + /* If we were giving a stream, use it. */ + if ((fp = *fpp) != NULL) + return (0); + + /* If we were given a file, use it. */ + if (ofile != NULL) { + if ((fp = fopen(ofile, "w")) == NULL) + return (WT_ERROR); + *fpp = fp; + *close_varp = 1; + return (0); + } + + /* Default to stdout. */ + *fpp = stdout; + return (0); +} + +/* + * __wt_debug_dump -- + * Dump a database in debugging mode. 
+ */ +int +__wt_debug_dump(WT_TOC *toc, char *ofile, FILE *fp) +{ + int do_close, ret; + + WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close)); + + /* + * We use the verification code to do debugging dumps because if we're + * dumping in debugging mode, we want to confirm the page is OK before + * walking it. + */ + ret = __wt_verify(toc, NULL, fp); + + if (do_close) + (void)fclose(fp); + + return (ret); +} + +/* + * __wt_debug_page -- + * Dump a page in debugging mode. + */ +int +__wt_debug_page(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp) +{ + WT_PAGE_DISK *dsk; + DB *db; + int do_close, ret; + + db = toc->db; + dsk = page->dsk; + ret = 0; + + WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close)); + + __wt_debug_page_hdr(toc, page, fp); + + switch (dsk->type) { + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_INT: + fprintf(fp, + "\trecords %llu, starting recno %llu, level %lu, " + "entries %lu, lsn %lu/%lu\n", + (unsigned long long)page->records, + (unsigned long long)dsk->start_recno, + (u_long)dsk->level, (u_long)dsk->u.entries, + (u_long)dsk->lsn[0], (u_long)dsk->lsn[1]); + break; + case WT_PAGE_OVFL: + fprintf(fp, "size %lu\n", (u_long)dsk->u.datalen); + break; + WT_ILLEGAL_FORMAT(db); + } + + switch (dsk->type) { + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + ret = __wt_debug_dsk_item(toc, dsk, fp); + break; + case WT_PAGE_COL_FIX: + __wt_debug_dsk_col_fix(db, dsk, fp); + break; + case WT_PAGE_COL_RLE: + __wt_debug_dsk_col_rle(db, dsk, fp); + break; + case WT_PAGE_COL_INT: + __wt_debug_dsk_col_int(dsk, fp); + break; + default: + break; + } + + fprintf(fp, "}\n"); + + if (do_close) + (void)fclose(fp); + + return (ret); +} + +/* + * __wt_debug_inmem -- + * Dump the in-memory information for a page. 
+ */ +int +__wt_debug_inmem(WT_TOC *toc, WT_PAGE *page, char *ofile, FILE *fp) +{ + DB *db; + int do_close; + + db = toc->db; + + WT_RET(__wt_debug_set_fp(ofile, &fp, &do_close)); + + __wt_debug_page_hdr(toc, page, fp); + + /* Dump the WT_{ROW,COL}_INDX array. */ + switch (page->dsk->type) { + case WT_PAGE_COL_FIX: + __wt_debug_inmem_col_fix(toc, page, fp); + break; + case WT_PAGE_COL_INT: + __wt_debug_inmem_col_int(page, fp); + break; + case WT_PAGE_COL_RLE: + __wt_debug_inmem_col_rle(toc, page, fp); + break; + case WT_PAGE_COL_VAR: + WT_RET(__wt_debug_inmem_col_var(toc, page, fp)); + break; + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_LEAF: + WT_RET(__wt_debug_inmem_row_leaf(toc, page, fp)); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + __wt_debug_inmem_row_int(page, fp); + break; + case WT_PAGE_OVFL: + break; + WT_ILLEGAL_FORMAT(db); + } + + fprintf(fp, "}\n"); + + if (do_close) + (void)fclose(fp); + + return (0); +} + +/* + * __wt_debug_inmem_col_fix -- + * Dump an in-memory WT_PAGE_COL_FIX page. + */ +static void +__wt_debug_inmem_col_fix(WT_TOC *toc, WT_PAGE *page, FILE *fp) +{ + WT_COL *cip; + WT_REPL *repl; + uint32_t fixed_len, i; + + fixed_len = toc->db->fixed_len; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, cip, i) { + fprintf(fp, "\tdata {"); + if (WT_FIX_DELETE_ISSET(cip->data)) + fprintf(fp, "deleted"); + else + __wt_print_byte_string(cip->data, fixed_len, fp); + fprintf(fp, "}\n"); + + if ((repl = WT_COL_REPL(page, cip)) != NULL) + __wt_debug_repl(repl, fp); + } +} + +/* + * __wt_debug_inmem_col_int -- + * Dump an in-memory WT_PAGE_COL_INT page. + */ +static void +__wt_debug_inmem_col_int(WT_PAGE *page, FILE *fp) +{ + WT_COL *cip; + uint32_t i; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, cip, i) + __wt_debug_off(cip->data, "\t", fp); +} + +/* + * __wt_debug_inmem_col_rle -- + * Dump an in-memory WT_PAGE_COL_RLE page. 
+ */ +static void +__wt_debug_inmem_col_rle(WT_TOC *toc, WT_PAGE *page, FILE *fp) +{ + WT_COL *cip; + WT_RLE_EXPAND *exp; + uint32_t fixed_len, i; + + fixed_len = toc->db->fixed_len; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, cip, i) { + fprintf(fp, + "\trepeat %lu {", (u_long)WT_RLE_REPEAT_COUNT(cip->data)); + if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data))) + fprintf(fp, "deleted"); + else + __wt_print_byte_string( + WT_RLE_REPEAT_DATA(cip->data), fixed_len, fp); + fprintf(fp, "}\n"); + + if ((exp = WT_COL_RLEEXP(page, cip)) != NULL) + __wt_debug_rleexp(exp, fp); + } +} + +/* + * __wt_debug_inmem_col_var -- + * Dump an in-memory WT_PAGE_COL_VAR page. + */ +static int +__wt_debug_inmem_col_var(WT_TOC *toc, WT_PAGE *page, FILE *fp) +{ + WT_COL *cip; + WT_REPL *repl; + uint32_t i; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, cip, i) { + fprintf(fp, "\tdata {"); + WT_RET(__wt_debug_item_data(toc, cip->data, fp)); + fprintf(fp, "}\n"); + + if ((repl = WT_COL_REPL(page, cip)) != NULL) + __wt_debug_repl(repl, fp); + } + return (0); +} + +/* + * __wt_debug_inmem_row_leaf -- + * Dump an in-memory WT_PAGE_DUP_LEAF or WT_PAGE_ROW_LEAF page. + */ +static int +__wt_debug_inmem_row_leaf(WT_TOC *toc, WT_PAGE *page, FILE *fp) +{ + WT_REPL *repl; + WT_ROW *rip; + uint32_t i; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, rip, i) { + if (__wt_key_process(rip)) + fprintf(fp, "\tkey: {requires processing}\n"); + else + __wt_debug_dbt("\tkey", rip, fp); + + fprintf(fp, "\tdata: {"); + WT_RET(__wt_debug_item_data(toc, rip->data, fp)); + fprintf(fp, "}\n"); + + if ((repl = WT_ROW_REPL(page, rip)) != NULL) + __wt_debug_repl(repl, fp); + } + + return (0); +} + +/* + * __wt_debug_inmem_row_int -- + * Dump an in-memory WT_PAGE_DUP_INT or WT_PAGE_ROW_INT page. 
+ */ +static void +__wt_debug_inmem_row_int(WT_PAGE *page, FILE *fp) +{ + WT_ROW *rip; + uint32_t i; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_INDX_FOREACH(page, rip, i) { + if (__wt_key_process(rip)) + fprintf(fp, "\tkey: {requires processing}\n"); + else + __wt_debug_dbt("\tkey", rip, fp); + + __wt_debug_off(rip->data, "\t", fp); + } +} + +/* + * __wt_debug_repl -- + * Dump a replacement array. + */ +static void +__wt_debug_repl(WT_REPL *repl, FILE *fp) +{ + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + for (; repl != NULL; repl = repl->next) + if (WT_REPL_DELETED_ISSET(repl)) + fprintf(fp, "\trepl: {deleted}\n"); + else + __wt_debug_pair( + "\trepl", WT_REPL_DATA(repl), repl->size, fp); +} + +/* + * __wt_debug_rleexp -- + * Dump a column store expansion array. + */ +static void +__wt_debug_rleexp(WT_RLE_EXPAND *exp, FILE *fp) +{ + WT_REPL *repl; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + for (; exp != NULL; exp = exp->next) { + repl = exp->repl; + if (WT_REPL_DELETED_ISSET(repl)) + fprintf(fp, "\trepl: {deleted}\n"); + else + __wt_debug_pair( + "\trepl", WT_REPL_DATA(repl), repl->size, fp); + } +} + +/* + * __wt_debug_dsk_item -- + * Dump a page of WT_ITEM's. + */ +static int +__wt_debug_dsk_item(WT_TOC *toc, WT_PAGE_DISK *dsk, FILE *fp) +{ + WT_ITEM *item; + uint32_t i; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + WT_ITEM_FOREACH(dsk, item, i) + WT_RET(__wt_debug_item(toc, item, fp)); + return (0); +} + +/* + * __wt_debug_item -- + * Dump a single WT_ITEM. 
 */
static int
__wt_debug_item(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    WT_OVFL *ovfl;

    if (fp == NULL)                 /* Default to stderr */
        fp = stderr;

    db = toc->db;

    fprintf(fp, "\t%s: len %lu",
        __wt_item_type_string(item), (u_long)WT_ITEM_LEN(item));

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Overflow items carry an on-disk address/size reference. */
        ovfl = WT_ITEM_BYTE_OVFL(item);
        fprintf(fp, ", addr %lu, size %lu",
            (u_long)ovfl->addr, (u_long)ovfl->size);
        break;
    case WT_ITEM_DEL:
        /* Deleted items have no data to dump. */
        fprintf(fp, "\n");
        return (0);
    case WT_ITEM_OFF:
        __wt_debug_off(WT_ITEM_BYTE_OFF(item), ", ", fp);
        return (0);
    WT_ILLEGAL_FORMAT(db);
    }

    fprintf(fp, "\n\t{");
    WT_RET(__wt_debug_item_data(toc, item, fp));
    fprintf(fp, "}\n");
    return (0);
}

/*
 * __wt_debug_dsk_col_int --
 *    Dump a WT_PAGE_COL_INT page.
 */
static void
__wt_debug_dsk_col_int(WT_PAGE_DISK *dsk, FILE *fp)
{
    WT_OFF *off;
    uint32_t i;

    if (fp == NULL)                 /* Default to stderr */
        fp = stderr;

    WT_OFF_FOREACH(dsk, off, i)
        __wt_debug_off(off, "\t", fp);
}

/*
 * __wt_debug_dsk_col_fix --
 *    Dump a WT_PAGE_COL_FIX page.
 */
static void
__wt_debug_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                 /* Default to stderr */
        fp = stderr;

    /* Fixed-length entries: db->fixed_len bytes each, no length prefix. */
    WT_FIX_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\t{");
        if (WT_FIX_DELETE_ISSET(p))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(p, db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}

/*
 * __wt_debug_dsk_col_rle --
 *    Dump a WT_PAGE_COL_RLE page.
 */
static void
__wt_debug_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, FILE *fp)
{
    uint32_t i;
    uint8_t *p;

    if (fp == NULL)                 /* Default to stderr */
        fp = stderr;

    /* RLE entries: a repeat count followed by a fixed-length data item. */
    WT_RLE_REPEAT_FOREACH(db, dsk, p, i) {
        fprintf(fp, "\trepeat %lu {",
            (u_long)WT_RLE_REPEAT_COUNT(p));
        if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(p)))
            fprintf(fp, "deleted");
        else
            __wt_print_byte_string(
                WT_RLE_REPEAT_DATA(p), db->fixed_len, fp);
        fprintf(fp, "}\n");
    }
}

/*
 * __wt_debug_item_data --
 *    Dump a single item's data in debugging mode.
 */
static int
__wt_debug_item_data(WT_TOC *toc, WT_ITEM *item, FILE *fp)
{
    DB *db;
    DBT *tmp;
    IDB *idb;
    uint32_t size;
    uint8_t *p;
    int ret;

    if (fp == NULL)                 /* Default to stderr */
        fp = stderr;

    db = toc->db;
    tmp = NULL;                     /* NULL so the err path can test it */
    idb = db->idb;
    ret = 0;

    switch (WT_ITEM_TYPE(item)) {
    case WT_ITEM_KEY:
        /* Huffman-compressed keys must be expanded before printing. */
        if (idb->huffman_key != NULL)
            goto process;
        goto onpage;
    case WT_ITEM_KEY_DUP:
    case WT_ITEM_DATA:
    case WT_ITEM_DATA_DUP:
        if (idb->huffman_data != NULL)
            goto process;
onpage: p = WT_ITEM_BYTE(item);
        size = WT_ITEM_LEN(item);
        break;
    case WT_ITEM_KEY_OVFL:
    case WT_ITEM_KEY_DUP_OVFL:
    case WT_ITEM_DATA_OVFL:
    case WT_ITEM_DATA_DUP_OVFL:
        /* Overflow/compressed items: expand into a scratch buffer. */
process: WT_ERR(__wt_scr_alloc(toc, 0, &tmp));
        WT_ERR(__wt_item_process(toc, item, tmp));
        p = tmp->data;
        size = tmp->size;
        break;
    case WT_ITEM_DEL:
        p = (uint8_t *)"deleted";
        size = 7;
        break;
    case WT_ITEM_OFF:
        p = (uint8_t *)"offpage";
        size = 7;
        break;
    WT_ILLEGAL_FORMAT_ERR(db, ret);
    }

    __wt_print_byte_string(p, size, fp);

err: if (tmp != NULL)
        __wt_scr_release(&tmp);
    return (ret);
}

/*
 * __wt_debug_off --
 *    Dump a WT_OFF structure.
+ */ +static void +__wt_debug_off(WT_OFF *off, const char *prefix, FILE *fp) +{ + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + fprintf(fp, "%soffpage: addr %lu, size %lu, records %llu\n", + prefix, (u_long)off->addr, (u_long)off->size, + (unsigned long long)WT_RECORDS(off)); +} + +/* + * __wt_debug_dbt -- + * Dump a single DBT in debugging mode, with an optional tag. + */ +void +__wt_debug_dbt(const char *tag, void *arg_dbt, FILE *fp) +{ + DBT *dbt; + + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + /* + * The argument isn't necessarily a DBT structure, but the first two + * fields of the argument are always a void *data/uint32_t size pair. + */ + dbt = arg_dbt; + __wt_debug_pair(tag, dbt->data, dbt->size, fp); +} + +/* + * __wt_debug_pair -- + * Dump a single data/size pair, with an optional tag. + */ +static void +__wt_debug_pair(const char *tag, void *data, uint32_t size, FILE *fp) +{ + if (fp == NULL) /* Default to stderr */ + fp = stderr; + + if (tag != NULL) + fprintf(fp, "%s: ", tag); + fprintf(fp, "%lu {", (u_long)size); + __wt_print_byte_string(data, size, fp); + fprintf(fp, "}\n"); +} +#endif + +/* + * __wt_debug_page_hdr -- + * Standard debug page-header output. + */ +static void +__wt_debug_page_hdr(WT_TOC *toc, WT_PAGE *page, FILE *fp) +{ + DB *db; + + db = toc->db; + + fprintf(fp, + "addr: %lu-%lu {\n\t%s: size %lu\n", + (u_long)page->addr, + (u_long)page->addr + (WT_OFF_TO_ADDR(db, page->size) - 1), + __wt_page_type_string(page->dsk), (u_long)page->size); + +} diff --git a/src/btree/bt_desc.c b/src/btree/bt_desc.c new file mode 100644 index 00000000000..2fc024d1e8c --- /dev/null +++ b/src/btree/bt_desc.c @@ -0,0 +1,132 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. 
 *
 * $Id$
 */

#include "wt_internal.h"

static int __wt_desc_io(WT_TOC *, void *, int);

/*
 * __wt_desc_stat --
 *    Fill in the statistics from the database description.
 */
int
__wt_desc_stat(WT_TOC *toc)
{
    WT_PAGE_DESC desc;
    WT_STATS *stats;

    stats = toc->db->idb->dstats;

    /*
     * NOTE(review): __wt_desc_io transfers a fixed 512 bytes into this
     * stack-allocated WT_PAGE_DESC -- confirm sizeof(WT_PAGE_DESC) is
     * at least 512, otherwise this is a stack buffer overrun.
     */
    WT_RET(__wt_desc_io(toc, &desc, 1));

    WT_STAT_SET(stats, MAGIC, desc.magic);
    WT_STAT_SET(stats, MAJOR, desc.majorv);
    WT_STAT_SET(stats, MINOR, desc.minorv);
    WT_STAT_SET(stats, INTLMAX, desc.intlmax);
    WT_STAT_SET(stats, INTLMIN, desc.intlmin);
    WT_STAT_SET(stats, LEAFMAX, desc.leafmax);
    WT_STAT_SET(stats, LEAFMIN, desc.leafmin);
    WT_STAT_SET(stats, BASE_RECNO, desc.recno_offset);
    WT_STAT_SET(stats, FIXED_LEN, desc.fixed_len);

    return (0);
}

/*
 * __wt_desc_read --
 *    Read the descriptor structure from page 0.
 */
int
__wt_desc_read(WT_TOC *toc)
{
    DB *db;
    WT_PAGE_DESC desc;

    db = toc->db;

    /* See the buffer-size note in __wt_desc_stat: 512 bytes are read. */
    WT_RET(__wt_desc_io(toc, &desc, 1));

    db->intlmax = desc.intlmax;     /* Update DB handle */
    db->intlmin = desc.intlmin;
    db->leafmax = desc.leafmax;
    db->leafmin = desc.leafmin;
    db->idb->root_off.addr = desc.root_addr;
    db->idb->root_off.size = desc.root_size;
    WT_RECORDS(&db->idb->root_off) = desc.records;
    db->idb->free_addr = desc.free_addr;
    db->idb->free_size = desc.free_size;
    db->fixed_len = desc.fixed_len;

    /*
     * XXX
     * This is the wrong place to do this -- need to think about how
     * to update open/configuration information in a reasonable way.
     */
    if (db->fixed_len != 0)
        F_SET(db->idb, WT_COLUMN);

    return (0);
}

/*
 * __wt_desc_write --
 *    Update the description page.
+ */ +int +__wt_desc_write(WT_TOC *toc) +{ + DB *db; + IDB *idb; + WT_PAGE_DESC desc; + int ret; + + db = toc->db; + idb = db->idb; + ret = 0; + + desc.magic = WT_BTREE_MAGIC; + desc.majorv = WT_BTREE_MAJOR_VERSION; + desc.minorv = WT_BTREE_MINOR_VERSION; + desc.intlmax = db->intlmax; + desc.intlmin = db->intlmin; + desc.leafmax = db->leafmax; + desc.leafmin = db->leafmin; + desc.recno_offset = 0; + desc.root_addr = idb->root_off.addr; + desc.root_size = idb->root_off.size; + desc.records = WT_RECORDS(&idb->root_off); + desc.free_addr = idb->free_addr; + desc.free_size = idb->free_size; + desc.fixed_len = (uint8_t)db->fixed_len; + desc.flags = 0; + if (F_ISSET(idb, WT_RLE)) + F_SET(&desc, WT_PAGE_DESC_RLE); + + WT_RET(__wt_desc_io(toc, &desc, 0)); + + return (ret); +} + +/* + * __wt_desc_io -- + * Read/write the WT_DESC sector. + */ +static int +__wt_desc_io(WT_TOC *toc, void *p, int is_read) +{ + WT_FH *fh; + ENV *env; + + fh = toc->db->idb->fh; + env = toc->env; + + return (is_read ? + __wt_read(env, fh, (off_t)0, 512, p) : + __wt_write(env, fh, (off_t)0, 512, p)); +} diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c new file mode 100644 index 00000000000..8e189204ce0 --- /dev/null +++ b/src/btree/bt_discard.c @@ -0,0 +1,234 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static void __wt_page_discard_dup(ENV *, WT_PAGE *); +static void __wt_page_discard_rleexp(ENV *, WT_PAGE *); +static void __wt_page_discard_repl(ENV *, WT_PAGE *); +static void __wt_page_discard_repl_list(ENV *, WT_REPL *); +static inline int __wt_row_key_on_page(WT_PAGE *, WT_ROW *); + +/* + * __wt_page_discard -- + * Free all memory associated with a page. 
 */
void
__wt_page_discard(WT_TOC *toc, WT_PAGE *page)
{
    ENV *env;
    WT_ROW *rip;
    uint32_t i, type;
    void *last_key;

    env = toc->env;
    type = page->dsk->type;

    /* Never discard a dirty page. */
    WT_ASSERT(env, !WT_PAGE_IS_MODIFIED(page));

    /* Free the in-memory index array. */
    switch (type) {
    case WT_PAGE_DUP_INT:
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        /*
         * For each entry, see if the key was an allocation (that is,
         * if it points somewhere other than the original page), and
         * if so, free the memory.  This test is a superset of the
         * __wt_key_process test, that is, any key requiring processing
         * but not yet processed, must reference on-page information.
         */
        last_key = NULL;
        WT_INDX_FOREACH(page, rip, i) {
            if (__wt_row_key_on_page(page, rip))
                continue;

            /*
             * Only test the first entry for duplicate key/data
             * pairs, the others reference the same memory.  (This
             * test only makes sense for WT_PAGE_ROW_LEAF pages,
             * but there is no cost in doing the test for duplicate
             * leaf pages as well.)
             */
            if (rip->key == last_key)
                continue;
            last_key = rip->key;
            __wt_free(env, rip->key, rip->size);
        }
        __wt_free(env, page->u.irow, page->indx_count * sizeof(WT_ROW));
        break;
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_COL_RLE:
    case WT_PAGE_COL_VAR:
        /* Column-store entries never own their key/data memory. */
        __wt_free(env, page->u.icol, page->indx_count * sizeof(WT_COL));
        break;
    default:
        break;
    }

    /* Free the modified/deletion replacements array. */
    switch (type) {
    case WT_PAGE_DUP_LEAF:
    case WT_PAGE_ROW_LEAF:
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
        if (page->u2.repl != NULL)
            __wt_page_discard_repl(env, page);
        break;
    default:
        break;
    }

    /* Free the run-length encoded column store expansion array. */
    switch (type) {
    case WT_PAGE_COL_RLE:
        if (page->u2.rleexp != NULL)
            __wt_page_discard_rleexp(env, page);
        break;
    default:
        break;
    }

    /* Free the subtree-reference array. */
    switch (type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        if (page->u3.ref != NULL)
            __wt_free(env, page->u3.ref,
                page->indx_count * sizeof(WT_REF));
        break;
    case WT_PAGE_ROW_LEAF:
        /* Row leaf pages may reference off-page duplicate trees. */
        if (WT_PAGE_DUP_TREES(page))
            __wt_page_discard_dup(env, page);
        break;
    default:
        break;
    }

    /* Finally, free the disk image and the page structure itself. */
    if (page->dsk != NULL)
        __wt_free(env, page->dsk, page->size);
    __wt_free(env, page, sizeof(WT_PAGE));
}

/*
 * __wt_page_discard_repl --
 *    Discard the replacement array.
 */
static void
__wt_page_discard_repl(ENV *env, WT_PAGE *page)
{
    WT_REPL **replp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of replacements, free the
     * linked list anchored in that slot.
     */
    WT_REPL_FOREACH(page, replp, i)
        if (*replp != NULL)
            __wt_page_discard_repl_list(env, *replp);

    /* Free the page's array of replacements. */
    __wt_free(env, page->u2.repl, page->indx_count * sizeof(WT_REPL *));
}

/*
 * __wt_page_discard_rleexp --
 *    Discard the run-length encoded column store expansion array.
 */
static void
__wt_page_discard_rleexp(ENV *env, WT_PAGE *page)
{
    WT_RLE_EXPAND **expp, *exp, *a;
    u_int i;

    /*
     * For each non-NULL slot in the page's run-length encoded column
     * store expansion array, free the linked list of WT_RLE_EXPAND
     * structures anchored in that slot.
     */
    WT_RLE_EXPAND_FOREACH(page, expp, i) {
        if ((exp = *expp) == NULL)
            continue;
        /*
         * Free the linked list of WT_REPL structures anchored in the
         * WT_RLE_EXPAND entry.
         */
        __wt_page_discard_repl_list(env, exp->repl);
        do {
            a = exp->next;
            __wt_free(env, exp, sizeof(WT_RLE_EXPAND));
        } while ((exp = a) != NULL);
    }

    /* Free the page's expansion array. */
    __wt_free(
        env, page->u2.rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *));
}

/*
 * __wt_page_discard_repl_list --
 *    Walk a WT_REPL forward-linked list and free the per-thread combination
 *    of a WT_REPL structure and its associated data.
 */
static void
__wt_page_discard_repl_list(ENV *env, WT_REPL *repl)
{
    WT_REPL *a;
    WT_TOC_UPDATE *update;

    /*
     * WT_REPL structures are carved out of per-thread update buffers;
     * the buffer is freed when the last entry allocated from it is
     * discarded (the out counter catches up with the in counter).
     *
     * NOTE(review): repl is dereferenced without a NULL check -- confirm
     * no caller can pass a NULL list head.
     */
    do {
        a = repl->next;

        update = repl->update;
        WT_ASSERT(env, update->out < update->in);
        if (++update->out == update->in)
            __wt_free(env, update, update->len);
    } while ((repl = a) != NULL);
}

/*
 * __wt_page_discard_dup --
 *    Walk the off-page duplicates tree array.
 */
static void
__wt_page_discard_dup(ENV *env, WT_PAGE *page)
{
    WT_REF **dupp;
    u_int i;

    /*
     * For each non-NULL slot in the page's array of off-page duplicate
     * references, free the reference.
     */
    WT_DUP_FOREACH(page, dupp, i)
        if (*dupp != NULL)
            __wt_free(env, *dupp, sizeof(WT_REF));

    /* Free the page's array of off-page duplicate references. */
    __wt_free(env, page->u3.dup, page->indx_count * sizeof(WT_REF *));
}

/*
 * __wt_row_key_on_page --
 *    Return if a WT_ROW structure's key references on-page data.
 */
static inline int
__wt_row_key_on_page(WT_PAGE *page, WT_ROW *rip)
{
    uint8_t *p;

    /* The key is on-page if it points inside the page's disk image. */
    p = rip->key;
    return (p >= (uint8_t *)page->dsk &&
        p < (uint8_t *)page->dsk + page->size ? 1 : 0);
}
diff --git a/src/btree/bt_dump.c b/src/btree/bt_dump.c
new file mode 100644
index 00000000000..4d46fceff27
--- /dev/null
+++ b/src/btree/bt_dump.c
@@ -0,0 +1,472 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2008-2011 WiredTiger, Inc.
 * All rights reserved.
 *
 * $Id$
 */

#include "wt_internal.h"

/* Per-dump state passed through the tree-walk callback. */
typedef struct {
    void (*p)                       /* Print function */
        (uint8_t *, uint32_t, FILE *);
    FILE *stream;                   /* Dump stream */

    void (*f)(const char *, uint64_t);  /* Progress callback */
    uint64_t fcnt;                  /* Progress counter */

    DBT *dupkey;                    /* Offpage duplicate tree key */
} WT_DSTUFF;

static int __wt_dump_page(WT_TOC *, WT_PAGE *, void *);
static void __wt_dump_page_col_fix(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
static int __wt_dump_page_col_rle(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
static int __wt_dump_page_col_var(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
static int __wt_dump_page_dup_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
static int __wt_dump_page_row_leaf(WT_TOC *, WT_PAGE *, WT_DSTUFF *);
static void __wt_print_byte_string_hex(uint8_t *, uint32_t, FILE *);
static void __wt_print_byte_string_nl(uint8_t *, uint32_t, FILE *);

/*
 * __wt_db_dump --
 *    Db.dump method.
 */
int
__wt_db_dump(WT_TOC *toc,
    FILE *stream, void (*f)(const char *, uint64_t), uint32_t flags)
{
    WT_DSTUFF dstuff;
    int ret;

    if (LF_ISSET(WT_DEBUG)) {
        /*
         * We use the verification code to do debugging dumps because
         * if we're dumping in debugging mode, we want to confirm the
         * page is OK before blindly reading it.
         */
        return (__wt_verify(toc, f, stream));
    }

    /* Choose printable or hex output format. */
    dstuff.p = flags == WT_PRINTABLES ?
        __wt_print_byte_string_nl : __wt_print_byte_string_hex;
    dstuff.stream = stream;
    dstuff.f = f;
    dstuff.fcnt = 0;
    dstuff.dupkey = NULL;

    /*
     * Note we do not have a hazard reference for the root page, and that's
     * safe -- root pages are pinned into memory when a database is opened,
     * and never re-written until the database is closed.
     */
    fprintf(stream, "VERSION=1\n");
    fprintf(stream, "HEADER=END\n");
    ret = __wt_tree_walk(toc, NULL, 0, __wt_dump_page, &dstuff);
    fprintf(stream, "DATA=END\n");

    /* Wrap up reporting. */
    if (f != NULL)
        f(toc->name, dstuff.fcnt);

    return (ret);
}

/*
 * __wt_dump_page --
 *    Depth-first recursive walk of a btree.
 */
static int
__wt_dump_page(WT_TOC *toc, WT_PAGE *page, void *arg)
{
    DB *db;
    WT_DSTUFF *dp;

    db = toc->db;
    dp = arg;

    /* Internal pages carry no data; only leaf pages are dumped. */
    switch (page->dsk->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_DUP_INT:
    case WT_PAGE_ROW_INT:
        break;
    case WT_PAGE_COL_FIX:
        __wt_dump_page_col_fix(toc, page, dp);
        break;
    case WT_PAGE_COL_RLE:
        WT_RET(__wt_dump_page_col_rle(toc, page, dp));
        break;
    case WT_PAGE_COL_VAR:
        WT_RET(__wt_dump_page_col_var(toc, page, dp));
        break;
    case WT_PAGE_DUP_LEAF:
        WT_RET(__wt_dump_page_dup_leaf(toc, page, dp));
        break;
    case WT_PAGE_ROW_LEAF:
        WT_RET(__wt_dump_page_row_leaf(toc, page, dp));
        break;
    WT_ILLEGAL_FORMAT(db);
    }

    /* Report progress every 10 pages. */
    if (dp->f != NULL && ++dp->fcnt % 10 == 0)
        dp->f(toc->name, dp->fcnt);

    return (0);
}

/*
 * __wt_dump_page_col_fix --
 *    Dump a WT_PAGE_COL_FIX page.
 */
static void
__wt_dump_page_col_fix(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp)
{
    DB *db;
    WT_COL *cip;
    WT_REPL *repl;
    uint32_t i;

    db = toc->db;

    /* Walk the page, dumping data items. */
    WT_INDX_FOREACH(page, cip, i) {
        /* A replacement entry, when present, overrides the original. */
        if ((repl = WT_COL_REPL(page, cip)) == NULL) {
            if (!WT_FIX_DELETE_ISSET(cip->data))
                dp->p(cip->data, db->fixed_len, dp->stream);
        } else
            if (!WT_REPL_DELETED_ISSET(repl))
                dp->p(WT_REPL_DATA(repl),
                    db->fixed_len, dp->stream);
    }
}

/*
 * __wt_dump_page_col_rle --
 *    Dump a WT_PAGE_COL_RLE page.
+ */ +static int +__wt_dump_page_col_rle(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) +{ + DB *db; + ENV *env; + WT_COL *cip; + WT_RLE_EXPAND *exp, **expsort, **expp; + WT_REPL *repl; + uint64_t recno; + uint32_t i, n_expsort; + uint16_t n_repeat; + + db = toc->db; + env = toc->env; + expsort = NULL; + n_expsort = 0; + + recno = page->dsk->start_recno; + WT_INDX_FOREACH(page, cip, i) { + /* + * Get a sorted list of any expansion entries we've created for + * this set of records. The sort function returns a NULL- + * terminated array of references to WT_RLE_EXPAND structures, + * sorted by record number. + */ + WT_RET(__wt_rle_expand_sort( + env, page, cip, &expsort, &n_expsort)); + + /* + * Dump the records. We use the WT_REPL entry for records in + * in the WT_RLE_EXPAND array, and original data otherwise. + */ + for (expp = expsort, + n_repeat = WT_RLE_REPEAT_COUNT(cip->data); + n_repeat > 0; --n_repeat, ++recno) + if ((exp = *expp) != NULL && exp->recno == recno) { + ++expp; + repl = exp->repl; + if (WT_REPL_DELETED_ISSET(repl)) + continue; + dp->p( + WT_REPL_DATA(repl), repl->size, dp->stream); + } else + dp->p(WT_RLE_REPEAT_DATA(cip->data), + db->fixed_len, dp->stream); + } + /* Free the sort array. */ + if (expsort != NULL) + __wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *)); + + return (0); +} + +/* + * __wt_dump_page_col_var -- + * Dump a WT_PAGE_COL_VAR page. + */ +static int +__wt_dump_page_col_var(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) +{ + DB *db; + DBT *tmp; + WT_COL *cip; + WT_ITEM *item; + WT_REPL *repl; + uint32_t i; + int ret; + void *huffman; + + db = toc->db; + huffman = db->idb->huffman_data; + ret = 0; + + WT_RET(__wt_scr_alloc(toc, 0, &tmp)); + WT_INDX_FOREACH(page, cip, i) { + /* Check for replace or deletion. */ + if ((repl = WT_COL_REPL(page, cip)) != NULL) { + if (!WT_REPL_DELETED_ISSET(repl)) + dp->p( + WT_REPL_DATA(repl), repl->size, dp->stream); + continue; + } + + /* Process the original data. 
*/ + item = cip->data; + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA: + if (huffman == NULL) { + dp->p(WT_ITEM_BYTE(item), + WT_ITEM_LEN(item), dp->stream); + break; + } + /* FALLTHROUGH */ + case WT_ITEM_DATA_OVFL: + WT_ERR(__wt_item_process(toc, item, tmp)); + dp->p(tmp->data, tmp->size, dp->stream); + break; + case WT_ITEM_DEL: + break; + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + } + +err: __wt_scr_release(&tmp); + return (ret); +} + +/* + * __wt_dump_page_dup_leaf -- + * Dump a WT_PAGE_DUP_LEAF page. + */ +static int +__wt_dump_page_dup_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) +{ + DB *db; + DBT *dupkey, *tmp; + WT_ITEM *item; + WT_REPL *repl; + WT_ROW *rip; + uint32_t i; + int ret; + void *huffman; + + db = toc->db; + dupkey = dp->dupkey; + huffman = db->idb->huffman_data; + ret = 0; + + WT_ERR(__wt_scr_alloc(toc, 0, &tmp)); + WT_INDX_FOREACH(page, rip, i) { + /* Check for deletion. */ + if ((repl = WT_ROW_REPL( + page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + + /* Output the key, we're going to need it. */ + dp->p(dupkey->data, dupkey->size, dp->stream); + + /* Output the replacement item. */ + if (repl != NULL) { + dp->p(WT_REPL_DATA(repl), repl->size, dp->stream); + continue; + } + + /* Process the original data. */ + item = rip->data; + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA_DUP: + if (huffman == NULL) { + dp->p(WT_ITEM_BYTE(item), + WT_ITEM_LEN(item), dp->stream); + break; + } + /* FALLTHROUGH */ + case WT_ITEM_DATA_DUP_OVFL: + WT_ERR(__wt_item_process(toc, item, tmp)); + dp->p(tmp->data, tmp->size, dp->stream); + break; + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + } + +err: __wt_scr_release(&tmp); + return (ret); +} + +/* + * __wt_dump_page_row_leaf -- + * Dump a WT_PAGE_ROW_LEAF page. 
+ */ +static int +__wt_dump_page_row_leaf(WT_TOC *toc, WT_PAGE *page, WT_DSTUFF *dp) +{ + DB *db; + DBT *key, *data, *key_tmp, *data_tmp, key_local, data_local; + WT_ITEM *item; + WT_OFF *off; + WT_REF *ref; + WT_REPL *repl; + WT_ROW *rip; + uint32_t i; + int ret; + void *huffman; + + db = toc->db; + key = data = key_tmp = data_tmp = NULL; + huffman = db->idb->huffman_data; + ret = 0; + + WT_ERR(__wt_scr_alloc(toc, 0, &key_tmp)); + WT_ERR(__wt_scr_alloc(toc, 0, &data_tmp)); + WT_CLEAR(key_local); + WT_CLEAR(data_local); + + WT_INDX_FOREACH(page, rip, i) { + /* Check for deletion. */ + if ((repl = WT_ROW_REPL( + page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + + /* + * The key and data variables reference the DBT's we'll print. + * Set the key. + */ + if (__wt_key_process(rip)) { + WT_ERR(__wt_item_process(toc, rip->key, key_tmp)); + key = key_tmp; + } else + key = (DBT *)rip; + + /* + * If the item was ever replaced, we're done: it can't be an + * off-page tree, and we don't care what kind of item it was + * originally. Dump the data from the replacement entry. + * + * XXX + * This is wrong -- if an off-page dup tree is reconciled, + * the off-page reference will change underfoot. + */ + if (repl != NULL) { + dp->p(key->data, key->size, dp->stream); + dp->p(WT_REPL_DATA(repl), repl->size, dp->stream); + continue; + } + + /* Set data to reference the data we'll dump. */ + item = rip->data; + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + if (huffman == NULL) { + data_local.data = WT_ITEM_BYTE(item); + data_local.size = WT_ITEM_LEN(item); + data = &data_local; + break; + } + /* FALLTHROUGH */ + case WT_ITEM_DATA_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + WT_ERR(__wt_item_process(toc, item, data_tmp)); + data = data_tmp; + break; + case WT_ITEM_OFF: + /* + * Set the key and recursively call the tree-walk code + * for any off-page duplicate trees. 
(Check for any + * off-page duplicate trees locally because we already + * have to walk the page, so it's faster than walking + * the page both here and in the tree-walk function.) + */ + dp->dupkey = key; + + ref = WT_ROW_DUP(page, rip); + off = WT_ROW_OFF(rip); + WT_RET(__wt_page_in(toc, page, ref, off, 0)); + ret = __wt_tree_walk(toc, ref, 0, __wt_dump_page, dp); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + goto err; + continue; + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + + dp->p(key->data, key->size, dp->stream); + dp->p(data->data, data->size, dp->stream); + } + +err: /* Discard any space allocated to hold off-page key/data items. */ + if (key_tmp != NULL) + __wt_scr_release(&key_tmp); + if (data_tmp != NULL) + __wt_scr_release(&data_tmp); + + return (ret); +} + +static const char hex[] = "0123456789abcdef"; + +/* + * __wt_print_byte_string_nl -- + * Output a single byte stringin printable characters, where possible. + * In addition, terminate with a <newline> character, unless the entry + * is itself terminated with a <newline> character. + */ +static void +__wt_print_byte_string_nl(uint8_t *data, uint32_t size, FILE *stream) +{ + if (data[size - 1] == '\n') + --size; + __wt_print_byte_string(data, size, stream); + fprintf(stream, "\n"); +} + +/* + * __wt_print_byte_string -- + * Output a single byte string in printable characters, where possible. + */ +void +__wt_print_byte_string(uint8_t *data, uint32_t size, FILE *stream) +{ + int ch; + + for (; size > 0; --size, ++data) { + ch = data[0]; + if (isprint(ch)) + fprintf(stream, "%c", ch); + else + fprintf(stream, "%x%x", + hex[(data[0] & 0xf0) >> 4], hex[data[0] & 0x0f]); + } +} + +/* + * __wt_print_byte_string_hex -- + * Output a single byte string in hexadecimal characters. 
+ */ +static void +__wt_print_byte_string_hex(uint8_t *data, uint32_t size, FILE *stream) +{ + for (; size > 0; --size, ++data) + fprintf(stream, "%x%x", + hex[(data[0] & 0xf0) >> 4], hex[data[0] & 0x0f]); + fprintf(stream, "\n"); +} diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c new file mode 100644 index 00000000000..cd4cb87bfb4 --- /dev/null +++ b/src/btree/bt_evict.c @@ -0,0 +1,944 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_evict(WT_TOC *); +static int __wt_evict_compare_lru(const void *a, const void *b); +static int __wt_evict_compare_page(const void *a, const void *b); +static void __wt_evict_hazard_check(WT_TOC *); +static int __wt_evict_hazard_compare(const void *a, const void *b); +static void __wt_evict_page(WT_TOC *, int); +static int __wt_evict_page_subtrees(WT_PAGE *); +static void __wt_evict_set(WT_TOC *); +static void __wt_evict_state_check(WT_TOC *); +static int __wt_evict_walk(WT_TOC *); +static int __wt_evict_walk_single(WT_TOC *, IDB *, uint); +static void __wt_evict_write(WT_TOC *); + +#ifdef HAVE_DIAGNOSTIC +static void __wt_evict_hazard_validate(ENV *, WT_PAGE *); +#endif + +/* + * Tuning constants -- I hesitate to call this tuning, but we should review some + * number of pages from each file's in-memory tree for each page we evict, and + * we should amortize the comparison of the hazard references across some number + * of eviction candidates. + */ +#define WT_EVICT_GROUP 10 /* Evict N pages at a time */ +#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */ +#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */ + +/* + * WT_EVICT_FOREACH -- + * Walk a list of eviction candidates. 
 */
#define WT_EVICT_FOREACH(cache, p, i)                                   \
    for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p))

/*
 * WT_EVICT_CLR --
 *    Clear an eviction list entry.
 */
#define WT_EVICT_CLR(p) do {                                            \
    (p)->ref = NULL;                                                    \
    (p)->idb = WT_DEBUG_POINT;                                          \
} while (0)

/*
 * __wt_workq_evict_server --
 *    See if the eviction server thread needs to be awakened.
 */
void
__wt_workq_evict_server(ENV *env, int force)
{
    WT_CACHE *cache;
    uint64_t bytes_inuse, bytes_max;

    cache = env->ienv->cache;

    /* If the eviction server is running, there's nothing to do. */
    if (!cache->evict_sleeping)
        return;

    /*
     * If we're locking out reads, or over our cache limit, or forcing the
     * issue (when closing the environment), run the eviction server.
     */
    bytes_inuse = __wt_cache_bytes_inuse(cache);
    bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
    if (!force && !cache->read_lockout && bytes_inuse < bytes_max)
        return;

    WT_VERBOSE(env, WT_VERB_EVICT, (env,
        "waking eviction server: force %sset, read lockout %sset, "
        "bytes inuse %s max (%lluMB %s %lluMB), ",
        force ? "" : "not ", cache->read_lockout ? "" : "not ",
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_inuse / WT_MEGABYTE),
        bytes_inuse <= bytes_max ? "<=" : ">",
        (unsigned long long)(bytes_max / WT_MEGABYTE)));

    /* Wake the server by releasing the mutex it sleeps on. */
    cache->evict_sleeping = 0;
    __wt_unlock(env, cache->mtx_evict);
}

/*
 * __wt_cache_evict_server --
 *    Thread to evict pages from the cache.
 */
void *
__wt_cache_evict_server(void *arg)
{
    ENV *env;
    IENV *ienv;
    WT_CACHE *cache;
    WT_TOC *toc;
    uint64_t bytes_inuse, bytes_max;
    int ret;

    env = arg;
    ienv = env->ienv;
    cache = ienv->cache;
    ret = 0;

    /* We need a thread of control because we're reading/writing pages. */
    toc = NULL;
    WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc));

    /*
     * Multiple pages are marked for eviction by the eviction server, which
     * means nobody can read them -- but, this thread of control has to
     * update higher pages in the tree when it writes this page, which
     * requires reading other pages, which might themselves be marked for
     * eviction.  Set a flag to allow this thread of control to see pages
     * marked for eviction -- we know it's safe, because only this thread
     * is writing pages.
     *
     * Reconciliation is probably running because the cache is full, which
     * means reads are locked out -- reconciliation can read, regardless.
     */
    F_SET(toc, WT_READ_EVICT | WT_READ_PRIORITY);

    /*
     * Allocate memory for a copy of the hazard references -- it's a fixed
     * size so doesn't need run-time adjustments.
     */
    cache->hazard_elem = env->toc_size * env->hazard_size;
    WT_ERR(__wt_calloc(
        env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard));
    cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *);

    for (;;) {
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server sleeping"));
        cache->evict_sleeping = 1;
        __wt_lock(env, cache->mtx_evict);
        WT_VERBOSE(env,
            WT_VERB_EVICT, (env, "eviction server waking"));

        /*
         * Check for environment exit; do it here, instead of the top of
         * the loop because doing it here keeps us from doing a bunch of
         * worked when simply awakened to quit.
         */
        if (!F_ISSET(ienv, WT_SERVER_RUN))
            break;

        for (;;) {
            /* Single-thread reconciliation. */
            __wt_lock(env, cache->mtx_reconcile);
            ret = __wt_evict(toc);
            __wt_unlock(env, cache->mtx_reconcile);
            if (ret != 0)
                goto err;

            /*
             * If we've locked out reads, keep evicting until we
             * get to at least 5% under the maximum cache.  Else,
             * quit evicting as soon as we get under the maximum
             * cache.
             */
            bytes_inuse = __wt_cache_bytes_inuse(cache);
            bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX);
            if (cache->read_lockout) {
                if (bytes_inuse <= bytes_max - (bytes_max / 20))
                    break;
            } else if (bytes_inuse < bytes_max)
                break;
        }
    }

err: if (cache->evict != NULL)
        __wt_free(env, cache->evict, cache->evict_len);
    if (cache->hazard != NULL)
        __wt_free(env, cache->hazard, cache->hazard_len);
    if (toc != NULL)
        WT_TRET(toc->close(toc, 0));

    if (ret != 0)
        __wt_api_env_err(env, ret, "cache eviction server error");

    WT_VERBOSE(
        env, WT_VERB_EVICT, (env, "cache eviction server exiting"));

    return (NULL);
}

/*
 * __wt_evict --
 *    Evict pages from the cache.
 */
static int
__wt_evict(WT_TOC *toc)
{
    ENV *env;
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    uint elem, i, j;

    env = toc->env;
    cache = env->ienv->cache;

    /* Get some more pages to consider for eviction. */
    WT_RET(__wt_evict_walk(toc));

    /*
     * We have an array of page eviction references that may contain NULLs,
     * as well as duplicate entries.
     *
     * First, sort the array by WT_REF address, then delete any duplicates.
     * The reason is because we might evict the page but leave a duplicate
     * entry in the "saved" area of the array, and that would be a NULL
     * dereference on the next run.  (If someone ever tries to remove this
     * duplicate cleanup for better performance, you can't fix it just by
     * checking the WT_REF state -- that only works if you are discarding
     * a page from a single level of the tree; if you are discarding a
     * page and its parent, the duplicate of the page's WT_REF might have
     * been free'd before a subsequent review of the eviction array.)
     */
    evict = cache->evict;
    elem = cache->evict_elem;
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page);
    for (i = 0; i < elem; i = j)
        for (j = i + 1; j < elem; ++j) {
            /*
             * If the leading pointer hits a NULL, we're done, the
             * NULLs all sorted to the top of the array.
             */
            if (evict[j].ref == NULL)
                goto done_duplicates;

            /* Delete the second and any subsequent duplicates. */
            if (evict[i].ref == evict[j].ref)
                WT_EVICT_CLR(&evict[j]);
            else
                break;
        }
done_duplicates:

    /* Second, sort the array by LRU. */
    qsort(evict,
        (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru);

    /*
     * Discarding pages is done in 5 steps:
     *    Set the WT_EVICT state
     *    Check for any hazard references
     *    Discard clean pages
     *    Reconcile dirty pages (making them clean)
     *    Discard clean pages
     *
     * The reason we release clean pages, then reconcile dirty pages, then
     * release clean pages again is because reconciling a dirty page is a
     * slow operation, and this releases space sooner.  (Arguably, we are
     * going to discard all of the pages anyway, so what does it matter if
     * we make clean pages wait for the dirty page writes?  On the other
     * hand, it's a small change and benefits any thread waiting to read a
     * clean page we picked for discarding, unlikely though that may be.)
     */
    __wt_evict_set(toc);
    __wt_evict_hazard_check(toc);
    __wt_evict_state_check(toc);
    __wt_evict_page(toc, 0);
    __wt_evict_write(toc);
    __wt_evict_page(toc, 1);

    return (0);
}

/*
 * __wt_evict_walk --
 *    Fill in the array by walk the next set of pages.
 */
static int
__wt_evict_walk(WT_TOC *toc)
{
    ENV *env;
    IDB *idb;
    IENV *ienv;
    WT_CACHE *cache;
    uint elem, i;
    int ret;

    env = toc->env;
    ienv = env->ienv;
    cache = ienv->cache;

    /*
     * Resize the array in which we're tracking pages, as necessary, then
     * get some pages from each underlying file.  We hold a mutex for the
     * entire time -- it's slow, but (1) how often do new files get added
     * or removed to/from the system, and (2) it's all in-memory stuff, so
     * it's not that slow.
     */
    ret = 0;
    __wt_lock(env, ienv->mtx);
    elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE);
    if (elem <= cache->evict_elem || (ret = __wt_realloc(env,
        &cache->evict_len,
        elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) {
        /*
         * NOTE(review): evict_elem is also overwritten when elem
         * shrank (a file was removed) without shrinking the
         * allocation -- confirm that's intentional.
         */
        cache->evict_elem = elem;

        i = WT_EVICT_WALK_BASE;
        TAILQ_FOREACH(idb, &ienv->dbqh, q) {
            if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0)
                break;
            i += WT_EVICT_WALK_PER_TABLE;
        }
    }
    __wt_unlock(env, ienv->mtx);
    return (ret);
}

/*
 * __wt_evict_walk_single --
 *    Get a few page eviction candidates from a single underlying file.
 */
static int
__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot)
{
    WT_CACHE *cache;
    WT_EVICT_LIST *evict;
    int i, restarted_once;

    cache = toc->env->ienv->cache;

    /*
     * Tricky little loop that restarts the walk as necessary, without
     * resetting the count of pages retrieved.
     */
    i = restarted_once = 0;

    /* If we haven't yet opened a tree-walk structure, do so. */
    if (idb->evict_walk.tree == NULL)
restart:    WT_RET(__wt_walk_begin(toc, &idb->root_page, &idb->evict_walk));

    /* Get the next WT_EVICT_WALK_PER_TABLE entries. */
    do {
        evict = &cache->evict[slot];
        WT_RET(__wt_walk_next(toc, &idb->evict_walk, &evict->ref));

        /*
         * Restart the walk as necessary, but only once (after one
         * restart we've already acquired all of the pages, and we
         * could loop infinitely on a tree with a single, pinned, page).
         */
        if (evict->ref == NULL) {
            if (restarted_once++)
                break;
            goto restart;
        }

        evict->idb = idb;
        ++slot;
    } while (++i < WT_EVICT_WALK_PER_TABLE);

    return (0);
}

/*
 * __wt_evict_db_clear --
 *    Remove any entries for a file from the eviction list.
+ */ +void +__wt_evict_db_clear(WT_TOC *toc) +{ + ENV *env; + IDB *idb; + IENV *ienv; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + uint i; + + env = toc->env; + idb = toc->db->idb; + ienv = env->ienv; + cache = ienv->cache; + + /* + * Discard any entries in the eviction list to a file we're closing + * (the caller better have locked out the eviction thread). + */ + if (cache->evict == NULL) + return; + WT_EVICT_FOREACH(cache, evict, i) + if (evict->ref != NULL && evict->idb == idb) + WT_EVICT_CLR(evict); +} + +/* + * __wt_evict_set -- + * Set the WT_EVICT flag on a set of pages. + */ +static void +__wt_evict_set(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + /* + * Set the entry state so readers don't try and use the pages. Once + * that's done, any thread searching for a page will either see our + * state value, or will have already set a hazard reference to the page. + * We don't evict a page with a hazard reference set, so we can't race. + * + * No memory flush needed, the state field is declared volatile. + */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + ref->state = WT_EVICT; + } +} + +/* + * __wt_evict_hazard_check -- + * Compare the list of hazard references to the list of pages to be + * discarded. + */ +static void +__wt_evict_hazard_check(WT_TOC *toc) +{ + ENV *env; + IENV *ienv; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE **hazard, **end_hazard, *page; + WT_REF *ref; + WT_STATS *stats; + uint i; + + env = toc->env; + ienv = env->ienv; + cache = ienv->cache; + stats = cache->stats; + + /* Sort the eviction candidates by WT_PAGE address. */ + qsort(cache->evict, (size_t)WT_EVICT_GROUP, + sizeof(WT_EVICT_LIST), __wt_evict_compare_page); + + /* Copy the hazard reference array and sort it by WT_PAGE address. 
*/ + hazard = cache->hazard; + end_hazard = hazard + cache->hazard_elem; + memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *)); + qsort(hazard, (size_t)cache->hazard_elem, + sizeof(WT_PAGE *), __wt_evict_hazard_compare); + + /* Walk the lists in parallel and look for matches. */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + + /* + * Look for the page in the hazard list until we reach the end + * of the list or find a hazard pointer larger than the page. + */ + for (page = ref->page; + hazard < end_hazard && *hazard < page; ++hazard) + ; + if (hazard == end_hazard) + break; + + /* + * If we find a matching hazard reference, the page is in use: + * remove it from the eviction list. + * + * No memory flush needed, the state field is declared volatile. + */ + if (*hazard == page) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (hazard reference)", + page->addr)); + WT_STAT_INCR(stats, CACHE_EVICT_HAZARD); + + /* + * A page with a low LRU and a hazard reference? + * + * Set the page's LRU so we don't select it again. + * Return the page to service. + * Discard our reference. + */ + ref->page->read_gen = ++cache->read_gen; + ref->state = WT_OK; + WT_EVICT_CLR(evict); + } + } +} + +/* + * __wt_evict_state_check -- + * Confirm these are pages we want to evict. + */ +static void +__wt_evict_state_check(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + /* + * We "own" the pages (we've flagged them for eviction, and there were + * no hazard references). Now do checks to see if these are pages we + * can evict -- we have to wait until after we own the page because the + * page might be updated and race with us. + */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* Ignore pinned pages. 
*/ + if (F_ISSET(page, WT_PINNED)) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (pinned)", + page->addr)); + goto skip; + } + + /* Ignore pages with in-memory subtrees. */ + switch (page->dsk->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + if (__wt_evict_page_subtrees(page)) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (subtrees)", + page->addr)); + goto skip; + } + break; + default: + break; + } + + continue; + +skip: /* + * Set the page's LRU so we don't select it again. + * Return the page to service. + * Discard our reference. + */ + page->read_gen = ++cache->read_gen; + ref->state = WT_OK; + WT_EVICT_CLR(evict); + } +} + +/* + * __wt_evict_write -- + * Write any modified pages. + */ +static void +__wt_evict_write(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* Ignore dirty pages. */ + if (!WT_PAGE_IS_MODIFIED(page)) + continue; + + /* + * We're using our WT_TOC handle, it needs to reference the + * correct DB handle. + * + * XXX + * This is pretty sleazy, but I'm hesitant to try and drive + * a separate DB/IDB handle down through the reconciliation + * code. + */ + toc->db = evict->idb->db; + (void)__wt_page_reconcile(toc, page); + } +} + +/* + * __wt_evict_page -- + * Evict cache pages. 
+ */ +static void +__wt_evict_page(WT_TOC *toc, int was_dirty) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + WT_STATS *stats; + uint i; + + env = toc->env; + cache = env->ienv->cache; + stats = cache->stats; + + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* + * The first time we're called, we get rid of the clean pages; + * the second time we're called, we get rid of the pages that + * were dirty but have since been cleaned. Ignore dirty pages + * in all cases, it's simpler. + */ + if (WT_PAGE_IS_MODIFIED(page)) + continue; + + if (was_dirty) + WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED); + else + WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED); + +#ifdef HAVE_DIAGNOSTIC + __wt_evict_hazard_validate(env, page); +#endif + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "cache evicting page addr %lu", page->addr)); + + /* + * Copy a page reference, then make the cache entry available + * for re-use. + * + * No memory flush needed, the state field is declared volatile. + */ + ref->page = NULL; + ref->state = WT_EMPTY; + + /* Remove the entry from the eviction list. */ + WT_EVICT_CLR(evict); + + /* We've got more space. */ + WT_CACHE_PAGE_OUT(cache, page->size); + + /* The page can no longer be found, free the memory. */ + __wt_page_discard(toc, page); + } +} + +/* + * __wt_evict_page_subtrees -- + * Return if a page has an in-memory subtree. + */ +static int +__wt_evict_page_subtrees(WT_PAGE *page) +{ + WT_REF *ref, **dupp; + uint32_t i; + + /* + * Return if a page has an in-memory subtree -- this array search could + * be replaced by a reference count in the page, but (1) the eviction + * thread isn't where I expect performance problems, (2) I hate to lose + * more bytes on every page, (3) how often will an internal page be + * evicted anyway? 
+ */ + switch (page->dsk->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_REF_FOREACH(page, ref, i) + if (ref->state != WT_EMPTY) + return (1); + break; + case WT_PAGE_ROW_LEAF: + if (WT_PAGE_DUP_TREES(page)) + WT_DUP_FOREACH(page, dupp, i) + if (*dupp != NULL && (*dupp)->state != WT_EMPTY) + return (1); + break; + default: + break; + } + + return (0); +} + +/* + * __wt_evict_compare_page -- + * Qsort function: sort WT_EVICT_LIST array based on the page's address. + */ +static int +__wt_evict_compare_page(const void *a, const void *b) +{ + WT_REF *a_ref, *b_ref; + WT_PAGE *a_page, *b_page; + + /* + * There may be NULL references in the array; sort them as greater than + * anything else so they migrate to the end of the array. + */ + a_ref = ((WT_EVICT_LIST *)a)->ref; + b_ref = ((WT_EVICT_LIST *)b)->ref; + if (a_ref == NULL) + return (b_ref == NULL ? 0 : 1); + if (b_ref == NULL) + return (-1); + + /* Sort the page address in ascending order. */ + a_page = a_ref->page; + b_page = b_ref->page; + return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0)); +} + +/* + * __wt_evict_compare_lru -- + * Qsort function: sort WT_EVICT_LIST array based on the page's read + * generation. + */ +static int +__wt_evict_compare_lru(const void *a, const void *b) +{ + WT_REF *a_ref, *b_ref; + uint32_t a_lru, b_lru; + + /* + * There may be NULL references in the array; sort them as greater than + * anything else so they migrate to the end of the array. + */ + a_ref = ((WT_EVICT_LIST *)a)->ref; + b_ref = ((WT_EVICT_LIST *)b)->ref; + if (a_ref == NULL) + return (b_ref == NULL ? 0 : 1); + if (b_ref == NULL) + return (-1); + + /* Sort the LRU in ascending order. */ + a_lru = a_ref->page->read_gen; + b_lru = b_ref->page->read_gen; + return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0)); +} + +/* + * __wt_evict_hazard_compare -- + * Qsort function: sort hazard list based on the page's address. 
+ */ +static int +__wt_evict_hazard_compare(const void *a, const void *b) +{ + WT_PAGE *a_page, *b_page; + + a_page = *(WT_PAGE **)a; + b_page = *(WT_PAGE **)b; + + return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0)); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_evict_hazard_validate -- + * Return if a page is or isn't on the hazard list. + */ +static void +__wt_evict_hazard_validate(ENV *env, WT_PAGE *page) +{ + IENV *ienv; + WT_PAGE **hp; + WT_TOC **tp, *toc; + + ienv = env->ienv; + + for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) + for (hp = toc->hazard; + hp < toc->hazard + toc->env->hazard_size; ++hp) + if (*hp == page) { + __wt_api_env_errx(env, + "hazard eviction check for page %lu " + "failed", + (u_long)page->addr); + __wt_abort(env); + } +} + +/* + * __wt_evict_dump -- + * Display the eviction list. + */ +void +__wt_evict_dump(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_MBUF mb; + uint n; + int sep; + + env = toc->env; + cache = env->ienv->cache; + + __wt_mb_init(env, &mb); + __wt_mb_add(&mb, "eviction list"); + + for (sep = ':', n = 0; n < cache->evict_elem; ++n) { + evict = &cache->evict[n]; + if (evict->ref == NULL) + continue; + __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr); + sep = ','; + } + __wt_mb_discard(&mb); +} + +/* + * __wt_evict_dump_cache + * Dump the in-memory cache. + */ +int +__wt_evict_cache_dump(WT_TOC *toc) +{ + IDB *idb; + IENV *ienv; + + ienv = toc->env->ienv; + + TAILQ_FOREACH(idb, &ienv->dbqh, q) + WT_RET(__wt_evict_tree_dump(toc, idb)); + return (0); +} + +/* + * __wt_evict_tree_dump + * Dump an in-memory tree. 
+ */ +int +__wt_evict_tree_dump(WT_TOC *toc, IDB *idb) +{ + ENV *env; + WT_CACHE *cache; + WT_REF *ref; + WT_WALK walk; + WT_MBUF mb; + int sep; + + env = toc->env; + cache = env->ienv->cache; + + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)", + idb->name, + __wt_cache_pages_inuse(cache), + __wt_cache_bytes_inuse(cache), + WT_STAT(cache->stats, CACHE_BYTES_MAX))); + + __wt_mb_init(env, &mb); + __wt_mb_add(&mb, "in-memory page list"); + + WT_CLEAR(walk); + WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk)); + for (sep = ':';;) { + WT_RET(__wt_walk_next(toc, &walk, &ref)); + if (ref == NULL) + break; + __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr); + sep = ','; + } + __wt_walk_end(env, &walk); + __wt_mb_discard(&mb); + + return (0); +} + +/* + * __wt_evict_cache_count + * Retrun the count of nodes in the cache. + */ +int +__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp) +{ + IDB *idb; + IENV *ienv; + uint64_t nodes; + + ienv = toc->env->ienv; + + *nodesp = 0; + TAILQ_FOREACH(idb, &ienv->dbqh, q) { + WT_RET(__wt_evict_tree_count(toc, idb, &nodes)); + *nodesp += nodes; + } + return (0); +} + +/* + * __wt_evict_tree_count + * Return a count of nodes in the tree. + */ +int +__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp) +{ + ENV *env; + WT_REF *ref; + WT_WALK walk; + uint64_t nodes; + + env = toc->env; + + WT_CLEAR(walk); + WT_RET(__wt_walk_begin(toc, &idb->root_page, &walk)); + for (nodes = 0;;) { + WT_RET(__wt_walk_next(toc, &walk, &ref)); + if (ref == NULL) + break; + ++nodes; + } + *nodesp = nodes; + __wt_walk_end(env, &walk); + + return (0); +} +#endif diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c new file mode 100644 index 00000000000..c0f58002522 --- /dev/null +++ b/src/btree/bt_misc.c @@ -0,0 +1,175 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. 
+ * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_bt_build_verify -- + * Verify the Btree build itself. + */ +int +__wt_bt_build_verify(void) +{ + static const struct { + char *name; + u_int size, expected; + } size_check[] = { + { "WT_COL", sizeof(WT_COL), WT_COL_SIZE }, + { "WT_ITEM", sizeof(WT_ITEM), WT_ITEM_SIZE }, + { "WT_OFF", sizeof(WT_OFF), WT_OFF_SIZE }, + { "WT_OVFL", sizeof(WT_OVFL), WT_OVFL_SIZE }, + { "WT_PAGE", sizeof(WT_PAGE), WT_PAGE_SIZE }, + { "WT_PAGE_DESC", sizeof(WT_PAGE_DESC), WT_PAGE_DESC_SIZE }, + { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), WT_PAGE_DISK_SIZE }, + { "WT_ROW", sizeof(WT_ROW), WT_ROW_SIZE } + }; + static const struct { + char *name; + u_int size, align; + } align_check[] = { + { "WT_OFF", sizeof(WT_OFF), sizeof(uint32_t) }, + { "WT_OVFL", sizeof(WT_OVFL), sizeof(uint32_t) }, + { "WT_PAGE_DISK", sizeof(WT_PAGE_DISK), sizeof(uint32_t) }, + { "WT_TOC_UPDATE", sizeof(WT_TOC_UPDATE), sizeof(uint32_t) } + }; + u_int i; + + /* + * The compiler had better not have padded our structures -- make + * sure the page header structure is exactly what we expect. + */ + for (i = 0; i < WT_ELEMENTS(size_check); ++i) { + if (size_check[i].size == size_check[i].expected) + continue; + __wt_api_env_errx(NULL, + "WiredTiger build failed, the %s header structure is not " + "the correct size (expected %u, got %u)", + size_check[i].name, + size_check[i].expected, size_check[i].size); + return (WT_ERROR); + } + + /* There are also structures that must be aligned correctly. 
*/ + for (i = 0; i < WT_ELEMENTS(align_check); ++i) { + if (WT_ALIGN(align_check[i].size, + align_check[i].align) == align_check[i].size) + continue; + __wt_api_env_errx(NULL, + "Build verification failed, the %s structure is not" + " correctly aligned", align_check[i].name); + return (WT_ERROR); + } + + /* + * We mix-and-match 32-bit unsigned values and size_t's, mostly because + * we allocate and handle 32-bit objects, and lots of the underlying C + * library expects size_t values for the length of memory objects. We + * check, just to be sure. + */ + if (sizeof(size_t) < sizeof(uint32_t)) { + __wt_api_env_errx(NULL, "%s", + "Build verification failed, a size_t is smaller than " + "4-bytes"); + return (WT_ERROR); + } + + return (0); +} + +/* + * __wt_set_ff_and_sa_from_offset -- + * Set first-free and space-available values from an address positioned + * one past the last used byte on the page. + */ +inline void +__wt_set_ff_and_sa_from_offset(WT_PAGE *page, + void *p, uint8_t **first_freep, uint32_t *space_availp) +{ + *first_freep = (uint8_t *)p; + *space_availp = + page->size - (uint32_t)((uint8_t *)p - (uint8_t *)page->dsk); +} + +/* + * __wt_page_write_gen_check -- + * Confirm the page's write generation number is correct. + */ +inline int +__wt_page_write_gen_check(WT_PAGE *page, uint32_t write_gen) +{ + return (page->write_gen == write_gen ? 0 : WT_RESTART); +} + +/* + * __wt_page_type_string -- + * Return a string representing the page type. 
+ */ +const char * +__wt_page_type_string(WT_PAGE_DISK *dsk) +{ + switch (dsk->type) { + case WT_PAGE_INVALID: + return ("invalid"); + case WT_PAGE_COL_FIX: + return ("column-store fixed-length leaf"); + case WT_PAGE_COL_INT: + return ("column-store internal"); + case WT_PAGE_COL_RLE: + return ("column-store fixed-length run-length encoded leaf"); + case WT_PAGE_COL_VAR: + return ("column-store variable-length leaf"); + case WT_PAGE_DUP_INT: + return ("duplicate tree internal"); + case WT_PAGE_DUP_LEAF: + return ("duplicate tree leaf"); + case WT_PAGE_OVFL: + return ("overflow"); + case WT_PAGE_ROW_INT: + return ("row-store internal"); + case WT_PAGE_ROW_LEAF: + return ("row-store leaf"); + default: + break; + } + return ("unknown"); +} + +/* + * __wt_item_type_string -- + * Return a string representing the item type. + */ +const char * +__wt_item_type_string(WT_ITEM *item) +{ + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_KEY: + return ("key"); + case WT_ITEM_KEY_OVFL: + return ("key-overflow"); + case WT_ITEM_KEY_DUP: + return ("key-duplicate"); + case WT_ITEM_KEY_DUP_OVFL: + return ("key-duplicate-overflow"); + case WT_ITEM_DATA: + return ("data"); + case WT_ITEM_DATA_OVFL: + return ("data-overflow"); + case WT_ITEM_DATA_DUP: + return ("data-duplicate"); + case WT_ITEM_DATA_DUP_OVFL: + return ("data-duplicate-overflow"); + case WT_ITEM_DEL: + return ("deleted"); + case WT_ITEM_OFF: + return ("off-page"); + default: + break; + } + return ("unknown"); +} diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c new file mode 100644 index 00000000000..c746782221e --- /dev/null +++ b/src/btree/bt_open.c @@ -0,0 +1,279 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_open_verify(DB *); +static int __wt_open_verify_page_sizes(DB *); + +/* + * __wt_bt_open -- + * Open a Btree. 
+ */ +int +__wt_bt_open(WT_TOC *toc, int ok_create) +{ + DB *db; + ENV *env; + IDB *idb; + + db = toc->db; + env = toc->env; + idb = db->idb; + + /* Check page size configuration. */ + WT_RET(__wt_open_verify(db)); + + /* Open the fle. */ + WT_RET(__wt_open(env, idb->name, idb->mode, ok_create, &idb->fh)); + + /* + * If the file size is 0, write a description page; if the file size + * is non-zero, update the DB handle based on the on-disk description + * page. (If the file isn't empty, there must be a description page.) + */ + if (idb->fh->file_size == 0) + WT_RET(__wt_desc_write(toc)); + else { + WT_RET(__wt_desc_read(toc)); + + /* If there's a root page, pin it. */ + if (idb->root_off.addr != WT_ADDR_INVALID) + WT_RET(__wt_root_pin(toc)); + } + + return (0); +} + +/* + * __wt_open_verify -- + * Verify anything we can't verify before we're about to open the file; + * set defaults as necessary. + */ +static int +__wt_open_verify(DB *db) +{ + IDB *idb; + + idb = db->idb; + + /* Verify the page sizes. */ + WT_RET(__wt_open_verify_page_sizes(db)); + + /* Verify other configuration combinations. */ + if (db->fixed_len != 0 && (idb->huffman_key || idb->huffman_data)) { + __wt_api_db_errx(db, + "Fixed size column-store databases may not be Huffman " + "compressed"); + return (WT_ERROR); + } + + return (0); +} + +/* + * __wt_open_verify_page_sizes -- + * Verify the page sizes. + */ +static int +__wt_open_verify_page_sizes(DB *db) +{ + IDB *idb; + + idb = db->idb; + + /* + * The application can set lots of page sizes. It's complicated, so + * instead of verifying the relationships when they're set, verify + * then when the database is opened and we know we have the final + * values. (Besides, if we verify the relationships when they're set, + * the application has to set them in a specific order or we'd need + * one set function that took 10 parameters.) + * + * If the values haven't been set, set the defaults. 
+ * + * Default to a small fragment size, so overflow items don't consume + * a lot of space. + */ + if (db->allocsize == 0) + db->allocsize = WT_BTREE_ALLOCATION_SIZE; + + /* Allocation sizes must be a power-of-two, nothing else makes sense. */ + if (!__wt_ispo2(db->allocsize)) { + __wt_api_db_errx(db, + "the allocation size must be a power of two"); + return (WT_ERROR); + } + + /* + * Limit allocation units to 256MB, and page sizes to 128MB. There's + * no reason (other than testing) we can't support larger sizes (any + * sizes up to the smaller of an off_t and a size_t should work), but + * an application specifying larger allocation or page sizes is almost + * certainly making a mistake. + */ + if (db->allocsize > WT_BTREE_ALLOCATION_SIZE_MAX) { + __wt_api_db_errx(db, + "the allocation size must less than or equal to %luMB", + (u_long)(WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE)); + return (WT_ERROR); + } + + /* + * Internal pages are also usually small, we want it to fit into the + * L1 cache. We try and put at least 40 keys on each internal page + * (40 because that results in 100M keys in a level 5 Btree). But, + * if it's a small page, push anything bigger than about 50 bytes + * off-page. Here's the table: + * Pagesize Largest key retained on-page: + * 512B 50 bytes + * 1K 50 bytes + * 2K 51 bytes + * 4K 102 bytes + * 8K 204 bytes + * and so on, roughly doubling for each power-of-two. + */ + if (db->intlmin == 0) + db->intlmin = WT_BTREE_INTLMIN_DEFAULT; + if (db->intlmax == 0) + db->intlmax = WT_MAX(db->intlmin, WT_BTREE_INTLMAX_DEFAULT); + if (db->intlitemsize == 0) { + if (db->intlmin <= 1024) + db->intlitemsize = 50; + else + db->intlitemsize = db->intlmin / 40; + } + + /* + * Leaf pages are larger to amortize I/O across a large chunk of the + * data space, but still minimize the chance of a broken write. We + * only require 20 key/data pairs fit onto a leaf page. Again, if it's + * a small page, push anything bigger than about 80 bytes off-page. 
+ * Here's the table: + * Pagesize Largest key or data item retained on-page: + * 512B 80 bytes + * 1K 80 bytes + * 2K 80 bytes + * 4K 80 bytes + * 8K 204 bytes + * 16K 409 bytes + * and so on, roughly doubling for each power-of-two. + */ + if (db->leafmin == 0) + db->leafmin = WT_BTREE_LEAFMIN_DEFAULT; + if (db->leafmax == 0) + db->leafmax = WT_MAX(db->leafmin, WT_BTREE_LEAFMAX_DEFAULT); + if (db->leafitemsize == 0) { + if (db->leafmin <= 4096) + db->leafitemsize = 80; + else + db->leafitemsize = db->leafmin / 40; + } + + /* Final checks for safety. */ + if (db->intlmin % db->allocsize != 0 || + db->intlmax % db->allocsize != 0 || + db->leafmin % db->allocsize != 0 || + db->leafmax % db->allocsize != 0) { + __wt_api_db_errx(db, + "all page sizes must be a multiple of %lu bytes", + (u_long)db->allocsize); + return (WT_ERROR); + } + + if (db->intlmin > db->intlmax || db->leafmin > db->leafmax) { + __wt_api_db_errx(db, + "minimum page sizes must be less than or equal to maximum " + "page sizes"); + return (WT_ERROR); + } + + if (db->intlmin > WT_BTREE_PAGE_SIZE_MAX || + db->intlmax > WT_BTREE_PAGE_SIZE_MAX || + db->leafmin > WT_BTREE_PAGE_SIZE_MAX || + db->leafmax > WT_BTREE_PAGE_SIZE_MAX) { + __wt_api_db_errx(db, + "all page sizes must less than or equal to %luMB", + (u_long)WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE); + return (WT_ERROR); + } + + /* + * We only have 3 bytes of length for on-page items, so the maximum + * on-page item size is limited to 16MB. + */ + if (db->intlitemsize > WT_ITEM_MAX_LEN) + db->intlitemsize = WT_ITEM_MAX_LEN; + if (db->leafitemsize > WT_ITEM_MAX_LEN) + db->leafitemsize = WT_ITEM_MAX_LEN; + + /* + * By default, any duplicate set that reaches 25% of a leaf page is + * moved into its own separate tree. + */ + if (db->btree_dup_offpage == 0) + db->btree_dup_offpage = 4; + + /* + * A leaf page must hold at least 2 key/data pairs, otherwise the + * whole btree thing breaks down because we can't split. 
We have + * to include WT_DESC_SIZE in leaf page calculations, it's not + * strictly necessary in internal pages because page 0 is always + * a leaf page. The additional 10 bytes is for slop -- Berkeley DB + * took roughly a decade to get the calculation correct, and that + * way I can skip the suspense. + */ +#define WT_MINIMUM_DATA_SPACE(db, s) \ + (((s) - (WT_PAGE_DISK_SIZE + WT_PAGE_DESC_SIZE + 10)) / 4) + if (db->intlitemsize > WT_MINIMUM_DATA_SPACE(db, db->intlmin)) { + __wt_api_db_errx(db, + "The internal page size is too small for its maximum item " + "size"); + return (WT_ERROR); + } + if (db->leafitemsize > WT_MINIMUM_DATA_SPACE(db, db->leafmin)) { + __wt_api_db_errx(db, + "The leaf page size is too small for its maximum item " + "size"); + return (WT_ERROR); + } + + /* + * A fixed-size column store should be able to store at least 20 + * objects on a page, otherwise it just doesn't make sense. + */ + if (F_ISSET(idb, WT_COLUMN) && + db->fixed_len != 0 && db->leafmin / db->fixed_len < 20) { + __wt_api_db_errx(db, + "The leaf page size cannot store at least 20 fixed-length " + "objects"); + return (WT_ERROR); + } + + return (0); +} + +/* + * __wt_root_pin -- + * Read in the root page and pin it into memory. + */ +int +__wt_root_pin(WT_TOC *toc) +{ + IDB *idb; + + idb = toc->db->idb; + + /* Get the root page. */ + WT_RET(__wt_page_in(toc, NULL, &idb->root_page, &idb->root_off, 0)); + F_SET(idb->root_page.page, WT_PINNED); + __wt_hazard_clear(toc, idb->root_page.page); + + return (0); +} diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c new file mode 100644 index 00000000000..09eac77264b --- /dev/null +++ b/src/btree/bt_ovfl.c @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_ovfl_in -- + * Read an overflow item from the disk. 
+ */ +int +__wt_ovfl_in(WT_TOC *toc, WT_OVFL *ovfl, DBT *store) +{ + DB *db; + ENV *env; + WT_PAGE_DISK *dsk; + WT_STATS *stats; + uint32_t size; + + env = toc->env; + db = toc->db; + stats = env->ienv->cache->stats; + + /* + * Read an overflow page, using an overflow structure from a page for + * which we (better) have a hazard reference. + * + * Overflow reads are synchronous. That may bite me at some point, but + * WiredTiger supports large page sizes, and overflow items should be + * rare. + */ + WT_VERBOSE(env, WT_VERB_READ, (env, + "overflow read addr/size %lu/%lu", + (u_long)ovfl->addr, (u_long)ovfl->size)); + WT_STAT_INCR(stats, OVERFLOW_READ); + + /* + * The only caller that wants a copy of the overflow pages (as opposed + * to the contents of the overflow pages), is the verify code. For that + * reason, it reads its own overflow pages, it doesn't call this code. + * + * But, we still have to verify the checksum, which means we have to + * read the entire set of pages, then copy the interesting information + * to the beginning of the buffer. The copy is a shift in a single + * buffer and so should be fast, but it's still not a good thing. If + * it ever becomes a problem, then we either have to pass the fact that + * it's a "page" back to our caller and let them deal with the offset, + * or add a new field to the DBT that flags the start of the allocated + * buffer, instead of using the "data" field to indicate both the start + * of the data and the start of the allocated memory. + * + * Re-allocate memory as necessary to hold the overflow pages. + */ + size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size); + if (store->mem_size < size) + WT_RET(__wt_realloc(env, &store->mem_size, size, &store->data)); + + /* Read the page. */ + WT_RET(__wt_page_disk_read(toc, store->data, ovfl->addr, size)); + + /* Copy the actual data in the DBT down to the start of the data. 
*/ + (void)memmove(store->data, + (uint8_t *)store->data + sizeof(WT_PAGE_DISK), ovfl->size); + store->size = ovfl->size; + + return (0); +} diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c new file mode 100644 index 00000000000..915d038751b --- /dev/null +++ b/src/btree/bt_page.c @@ -0,0 +1,656 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static void __wt_page_inmem_col_fix(DB *, WT_PAGE *); +static void __wt_page_inmem_col_int(WT_PAGE *); +static void __wt_page_inmem_col_rle(DB *, WT_PAGE *); +static void __wt_page_inmem_col_var(WT_PAGE *); +static int __wt_page_inmem_dup_leaf(DB *, WT_PAGE *); +static int __wt_page_inmem_int_ref(WT_TOC *, uint32_t, WT_PAGE *); +static int __wt_page_inmem_row_int(DB *, WT_PAGE *); +static int __wt_page_inmem_row_leaf(DB *, WT_PAGE *); + +/* + * __wt_page_in -- + * Acquire a hazard reference to a page; if the page is not in-memory, + * read it from the disk and build an in-memory version. + */ +int +__wt_page_in( + WT_TOC *toc, WT_PAGE *parent, WT_REF *ref, WT_OFF *off, int dsk_verify) +{ + ENV *env; + WT_CACHE *cache; + int ret; + + env = toc->env; + cache = env->ienv->cache; + + for (;;) + switch (ref->state) { + case WT_OK: + /* + * The page is in memory: get a hazard reference, update + * the page's LRU and return. + */ + if (__wt_hazard_set(toc, ref)) { + ref->page->read_gen = ++cache->read_gen; + return (0); + } + /* FALLTHROUGH */ + case WT_EVICT: + /* + * The page is being considered for eviction, wait for + * that to resolve. + */ + __wt_yield(); + break; + case WT_EMPTY: + /* The page isn't in memory, request it be read. 
*/ + __wt_cache_read_serial( + toc, parent, ref, off, dsk_verify, ret); + if (ret != 0) + return (ret); + break; + default: + WT_ABORT(env, "WT_REF->state invalid"); + break; + } + /* NOTREACHED */ +} + +/* + * __wt_page_inmem -- + * Build in-memory page information. + */ +int +__wt_page_inmem(WT_TOC *toc, WT_PAGE *page) +{ + DB *db; + ENV *env; + WT_PAGE_DISK *dsk; + uint32_t nindx; + int ret; + + db = toc->db; + env = toc->env; + dsk = page->dsk; + ret = 0; + + WT_ASSERT(env, page->u.indx == NULL); + + /* Determine the maximum number of indexes we'll need for this page. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_LEAF: + nindx = dsk->u.entries; + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + nindx = dsk->u.entries / 2; + break; + case WT_PAGE_ROW_LEAF: + /* + * Row store leaf pages support duplicates, so the real worst + * case is one key plus some number of duplicate data items. + * The number is configurable, that is, you can configure when + * a duplicate set is big enough to be pushed off the page; + * we're conservative here. + */ + nindx = dsk->u.entries - 1; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * XXX + * We don't yet have a free-list on which to put empty pages -- for + * now, we handle them. + */ + if (nindx == 0) + return (0); + + /* Allocate an array of WT_{ROW,COL}_INDX structures for the page. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + WT_ERR((__wt_calloc(env, + nindx, sizeof(WT_COL), &page->u.icol))); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_ERR((__wt_calloc(env, + nindx, sizeof(WT_ROW), &page->u.irow))); + break; + default: + break; + } + + /* Allocate reference array for internal pages. 
*/ + switch (dsk->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_ERR(__wt_page_inmem_int_ref(toc, nindx, page)); + break; + default: + break; + } + + /* Fill in the structures. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + __wt_page_inmem_col_fix(db, page); + break; + case WT_PAGE_COL_INT: + __wt_page_inmem_col_int(page); + break; + case WT_PAGE_COL_RLE: + __wt_page_inmem_col_rle(db, page); + break; + case WT_PAGE_COL_VAR: + __wt_page_inmem_col_var(page); + break; + case WT_PAGE_DUP_LEAF: + WT_ERR(__wt_page_inmem_dup_leaf(db, page)); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_ERR(__wt_page_inmem_row_int(db, page)); + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__wt_page_inmem_row_leaf(db, page)); + break; + default: + break; + } + return (0); + +err: __wt_page_discard(toc, page); + return (ret); +} + +/* + * __wt_page_inmem_col_fix -- + * Build in-memory index for fixed-length column-store leaf pages. + */ +static void +__wt_page_inmem_col_fix(DB *db, WT_PAGE *page) +{ + WT_COL *cip; + WT_PAGE_DISK *dsk; + uint32_t i; + uint8_t *p; + + dsk = page->dsk; + cip = page->u.icol; + + /* + * Walk the page, building indices and finding the end of the page. + * The page contains fixed-length objects. + */ + WT_FIX_FOREACH(db, dsk, p, i) { + cip->data = p; + ++cip; + } + + page->indx_count = page->records = dsk->u.entries; +} + +/* + * __wt_page_inmem_col_int -- + * Build in-memory index for column-store internal pages. + */ +static void +__wt_page_inmem_col_int(WT_PAGE *page) +{ + WT_COL *cip; + WT_OFF *off; + WT_PAGE_DISK *dsk; + uint64_t records; + uint32_t i; + + dsk = page->dsk; + cip = page->u.icol; + records = 0; + + /* + * Walk the page, building indices and finding the end of the page. + * The page contains WT_OFF structures. 
+ */ + WT_OFF_FOREACH(dsk, off, i) { + cip->data = off; + ++cip; + records += WT_RECORDS(off); + } + + page->indx_count = dsk->u.entries; + page->records = records; +} + +/* + * __wt_page_inmem_col_rle -- + * Build in-memory index for fixed-length, run-length encoded, column-store + * leaf pages. + */ +static void +__wt_page_inmem_col_rle(DB *db, WT_PAGE *page) +{ + WT_COL *cip; + WT_PAGE_DISK *dsk; + uint64_t records; + uint32_t i; + uint8_t *p; + + dsk = page->dsk; + cip = page->u.icol; + records = 0; + + /* + * Walk the page, building indices and finding the end of the page. + * The page contains fixed-length objects. + */ + WT_RLE_REPEAT_FOREACH(db, dsk, p, i) { + records += WT_RLE_REPEAT_COUNT(p); + cip->data = p; + ++cip; + } + + page->indx_count = dsk->u.entries; + page->records = records; +} + +/* + * __wt_page_inmem_col_var -- + * Build in-memory index for variable-length, data-only leaf pages in + * column-store trees. + */ +static void +__wt_page_inmem_col_var(WT_PAGE *page) +{ + WT_COL *cip; + WT_ITEM *item; + WT_PAGE_DISK *dsk; + uint32_t i; + + dsk = page->dsk; + cip = page->u.icol; + + /* + * Walk the page, building indices and finding the end of the page. + * The page contains unsorted data items. The data items are on-page + * data (WT_ITEM_DATA), overflow (WT_ITEM_DATA_OVFL) or deleted + * (WT_ITEM_DEL) items. + */ + WT_ITEM_FOREACH(dsk, item, i) { + cip->data = item; + ++cip; + } + + page->indx_count = page->records = dsk->u.entries; +} + +/* + * __wt_page_inmem_dup_leaf -- + * Build in-memory index for variable-length, data-only leaf pages in + * duplicate trees. + */ +static int +__wt_page_inmem_dup_leaf(DB *db, WT_PAGE *page) +{ + WT_ROW *rip; + WT_ITEM *item; + WT_PAGE_DISK *dsk; + uint32_t i; + + dsk = page->dsk; + + /* + * Walk the page, building indices and finding the end of the page. + * The page contains sorted data items. The data items are on-page + * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DUP_OVFL) items. 
+ * + * These data values are sorted, so we want to treat them as keys, and + * we return them as on-page WT_ITEM values, so we want to tream them + * as data. Set both the WT_ROW key and data fields. + */ + rip = page->u.irow; + WT_ITEM_FOREACH(dsk, item, i) { + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA_DUP: + __wt_key_set + (rip, WT_ITEM_BYTE(item), WT_ITEM_LEN(item)); + break; + case WT_ITEM_DATA_DUP_OVFL: + __wt_key_set_process(rip, item); + break; + WT_ILLEGAL_FORMAT(db); + } + rip->data = item; + ++rip; + } + + page->indx_count = dsk->u.entries; + page->records = dsk->u.entries; + return (0); +} + +/* + * __wt_page_inmem_row_int -- + * Build in-memory index for row-store and off-page duplicate tree + * internal pages. + */ +static int +__wt_page_inmem_row_int(DB *db, WT_PAGE *page) +{ + IDB *idb; + WT_ITEM *item; + WT_OFF *off; + WT_PAGE_DISK *dsk; + WT_ROW *rip; + uint64_t records; + uint32_t i; + void *huffman; + + idb = db->idb; + dsk = page->dsk; + rip = page->u.irow; + records = 0; + + huffman = + dsk->type == WT_PAGE_DUP_INT ? idb->huffman_data : idb->huffman_key; + + /* + * Walk the page, building indices and finding the end of the page. + * + * The page contains sorted key/offpage-reference pairs. Keys are row + * store internal pages with on-page/overflow (WT_ITEM_KEY/KEY_OVFL) + * items, or row store duplicate internal pages with on-page/overflow + * (WT_ITEM_KEY_DUP/WT_ITEM_DATA_KEY_DUP_OVFL) items. In both cases, + * offpage references are WT_ITEM_OFF items. 
+ */ + WT_ITEM_FOREACH(dsk, item, i) + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_KEY: + case WT_ITEM_KEY_DUP: + if (huffman == NULL) { + __wt_key_set(rip, + WT_ITEM_BYTE(item), WT_ITEM_LEN(item)); + break; + } + /* FALLTHROUGH */ + case WT_ITEM_KEY_OVFL: + case WT_ITEM_KEY_DUP_OVFL: + __wt_key_set_process(rip, item); + break; + case WT_ITEM_OFF: + off = WT_ITEM_BYTE_OFF(item); + records += WT_RECORDS(off); + rip->data = item; + ++rip; + break; + WT_ILLEGAL_FORMAT(db); + } + + page->indx_count = dsk->u.entries / 2; + page->records = records; + return (0); +} + +/* + * __wt_page_inmem_row_leaf -- + * Build in-memory index for row-store leaf pages. + */ +static int +__wt_page_inmem_row_leaf(DB *db, WT_PAGE *page) +{ + ENV *env; + IDB *idb; + WT_ITEM *item; + WT_PAGE_DISK *dsk; + WT_REF *ref; + WT_ROW *rip; + uint32_t i, indx_count; + uint64_t records; + + env = db->env; + idb = db->idb; + dsk = page->dsk; + records = 0; + + /* + * Walk a row-store page of WT_ITEMs, building indices and finding the + * end of the page. + * + * The page contains key/data pairs. Keys are on-page (WT_ITEM_KEY) or + * overflow (WT_ITEM_KEY_OVFL) items. The data sets are either: a + * single on-page (WT_ITEM_DATA) or overflow (WT_ITEM_DATA_OVFL) item; + * a group of duplicate data items where each duplicate is an on-page + * (WT_ITEM_DATA_DUP) or overflow (WT_ITEM_DUP_OVFL) item; or an offpage + * reference (WT_ITEM_OFF). + */ + rip = NULL; + indx_count = 0; + WT_ITEM_FOREACH(dsk, item, i) + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_KEY: + case WT_ITEM_KEY_OVFL: + if (rip == NULL) + rip = page->u.irow; + else + ++rip; + if (idb->huffman_key != NULL || + WT_ITEM_TYPE(item) == WT_ITEM_KEY_OVFL) + __wt_key_set_process(rip, item); + else + __wt_key_set(rip, + WT_ITEM_BYTE(item), WT_ITEM_LEN(item)); + ++indx_count; + break; + case WT_ITEM_DATA_DUP: + case WT_ITEM_DATA_DUP_OVFL: + /* + * If the second or subsequent duplicate, move to the + * next slot and copy the previous key. 
+ */ + if (rip->data != NULL) { + __wt_key_set(rip + 1, rip->key, rip->size); + ++rip; + ++indx_count; + } + /* FALLTHROUGH */ + case WT_ITEM_DATA: + case WT_ITEM_DATA_OVFL: + rip->data = item; + ++records; + break; + case WT_ITEM_OFF: + rip->data = item; + records += WT_ROW_OFF_RECORDS(rip); + + /* + * We need a WT_REF entry for any item referencing an + * off-page duplicate tree. Create the array of WT_REF + * pointers and fill in a WT_REF structure. + */ + if (page->u3.dup == NULL) + WT_RET(__wt_calloc(env, indx_count, + sizeof(WT_REF *), &page->u3.dup)); + WT_RET(__wt_calloc(env, 1, sizeof(WT_REF), &ref)); + ref->state = WT_EMPTY; + page->u3.dup[WT_ROW_SLOT(page, rip)] = ref; + + break; + WT_ILLEGAL_FORMAT(db); + } + + page->indx_count = indx_count; + page->records = records; + + return (0); +} + +/* + * __wt_item_process -- + * Overflow and/or compressed on-page items need processing before + * we look at them. + */ +int +__wt_item_process(WT_TOC *toc, WT_ITEM *item, DBT *dbt_ret) +{ + DB *db; + DBT *tmp; + ENV *env; + IDB *idb; + uint32_t size; + int ret; + void *huffman, *p; + + db = toc->db; + tmp = NULL; + env = toc->env; + idb = db->idb; + ret = 0; + + /* + * 3 cases: compressed on-page item, or compressed or uncompressed + * overflow item. + */ + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_KEY: + huffman = idb->huffman_key; + goto onpage; + case WT_ITEM_KEY_DUP: + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + huffman = idb->huffman_data; +onpage: p = WT_ITEM_BYTE(item); + size = WT_ITEM_LEN(item); + break; + case WT_ITEM_KEY_OVFL: + huffman = idb->huffman_key; + goto offpage; + case WT_ITEM_KEY_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + huffman = idb->huffman_data; +offpage: /* + * It's an overflow item -- if it's not encoded, we can read + * it directly into the user's return DBT, otherwise we have to + * have our own buffer as temporary space, and the decode call + * will put a decoded version into the user's return DBT. 
+ */ + if (huffman == NULL) + tmp = dbt_ret; + else + WT_RET(__wt_scr_alloc(toc, 0, &tmp)); + WT_RET(__wt_ovfl_in(toc, WT_ITEM_BYTE_OVFL(item), tmp)); + p = tmp->data; + size = tmp->size; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * If the item is not compressed, and it's not an overflow item, copy + * it into the caller's DBT. If the item is not compressed, and it's + * an overflow item, it was already copied into the caller's DBT. + * + * If the item is compressed, pass it to the decode routines, they'll + * copy a decoded version into the caller's DBT. + */ + if (huffman == NULL) { + if (tmp != dbt_ret) { + if (size > dbt_ret->mem_size) + WT_ERR(__wt_realloc( + env, &dbt_ret->mem_size, + size, &dbt_ret->data)); + memcpy(dbt_ret->data, p, size); + dbt_ret->size = size; + } + } else + WT_ERR(__wt_huffman_decode(huffman, p, size, + &dbt_ret->data, &dbt_ret->mem_size, &dbt_ret->size)); + +err: if (tmp != NULL && tmp != dbt_ret) + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_page_inmem_int_ref -- + * Allocate and initialize the reference array for internal pages. + */ +static int +__wt_page_inmem_int_ref(WT_TOC *toc, uint32_t nindx, WT_PAGE *page) +{ + ENV *env; + WT_REF *cp; + uint32_t i; + + env = toc->env; + + /* + * Allocate an array of WT_REF structures for internal pages. In the + * case of an internal page, we know all of the slots are going to be + * filled in -- every slot on the page references a subtree. In the + * case of row-store leaf pages, the only slots that get filled in are + * slots that reference off-page duplicate trees. So, if it's an + * internal page, it's a simple one-time allocation; if a leaf page, + * we'll do similar work, but lazily in the routine that fills in the + * in-memory information. 
+ */ + WT_RET(__wt_calloc( + env, nindx, sizeof(WT_REF), &page->u3.ref)); + for (i = 0, cp = page->u3.ref; i < nindx; ++i, ++cp) + cp->state = WT_EMPTY; + return (0); +} + +/* + * __wt_key_set -- + * Set a key/size pair, where the key does not require further processing. + */ +inline void +__wt_key_set(WT_ROW *rip, void *key, uint32_t size) +{ + rip->key = key; + rip->size = size; +} + +/* + * __wt_key_set_process -- + * Set a key/size pair, where the key requires further processing. + */ +inline void +__wt_key_set_process(WT_ROW *rip, void *key) +{ + rip->key = key; + rip->size = 0; +} + +/* + * __wt_key_process -- + * Return if a key requires processing. + */ +inline int +__wt_key_process(WT_ROW *rip) +{ + return (rip->size == 0 ? 1 : 0); +} diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c new file mode 100644 index 00000000000..f7e594d2217 --- /dev/null +++ b/src/btree/bt_read.c @@ -0,0 +1,272 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_cache_read(WT_READ_REQ *); + +/* + * __wt_workq_read_server -- + * See if the read server thread needs to be awakened. + */ +void +__wt_workq_read_server(ENV *env, int force) +{ + WT_CACHE *cache; + uint64_t bytes_inuse, bytes_max; + + cache = env->ienv->cache; + + /* + * If we're 10% over the maximum cache, shut out reads (which include + * page allocations) until we evict to at least 5% under the maximum + * cache. The idea is that we don't want to run on the edge all the + * time -- if we're seriously out of space, get things under control + * before opening up for more reads. 
+ */ + bytes_inuse = __wt_cache_bytes_inuse(cache); + bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX); + if (cache->read_lockout) { + if (bytes_inuse <= bytes_max - (bytes_max / 20)) + cache->read_lockout = 0; + } else if (bytes_inuse > bytes_max + (bytes_max / 10)) { + WT_VERBOSE(env, WT_VERB_READ, (env, + "workQ locks out reads: bytes-inuse %llu of bytes-max %llu", + (unsigned long long)bytes_inuse, + (unsigned long long)bytes_max)); + cache->read_lockout = 1; + } + + /* If the cache read server is running, there's nothing to do. */ + if (!cache->read_sleeping) + return; + + /* + * If reads are locked out and we're not forcing the issue (that's when + * closing the environment, or if there's a priority read waiting to be + * handled), we're done. + */ + if (!force && cache->read_lockout) + return; + + cache->read_sleeping = 0; + __wt_unlock(env, cache->mtx_read); +} + +/* + * __wt_cache_read_serial_func -- + * Read/allocation serialization function called when a page-in requires + * allocation or a read. + */ +int +__wt_cache_read_serial_func(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_OFF *off; + WT_PAGE *parent; + WT_READ_REQ *rr, *rr_end; + WT_REF *ref; + int dsk_verify; + + __wt_cache_read_unpack(toc, parent, ref, off, dsk_verify); + + env = toc->env; + cache = env->ienv->cache; + + /* Find an empty slot and enter the read request. */ + rr = cache->read_request; + rr_end = rr + WT_ELEMENTS(cache->read_request); + for (; rr < rr_end; ++rr) + if (WT_READ_REQ_ISEMPTY(rr)) { + WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify); + return (0); + } + __wt_api_env_errx(env, "read server request table full"); + return (WT_RESTART); +} + +/* + * __wt_cache_read_server -- + * Thread to do database reads. 
+ */ +void * +__wt_cache_read_server(void *arg) +{ + ENV *env; + IENV *ienv; + WT_CACHE *cache; + WT_READ_REQ *rr, *rr_end; + WT_TOC *toc; + int didwork, ret; + + env = arg; + ienv = env->ienv; + cache = ienv->cache; + ret = 0; + + rr = cache->read_request; + rr_end = rr + WT_ELEMENTS(cache->read_request); + + for (;;) { + WT_VERBOSE(env, + WT_VERB_READ, (env, "cache read server sleeping")); + cache->read_sleeping = 1; + __wt_lock(env, cache->mtx_read); + WT_VERBOSE( + env, WT_VERB_READ, (env, "cache read server waking")); + + /* + * Check for environment exit; do it here, instead of the top of + * the loop because doing it here keeps us from doing a bunch of + * worked when simply awakened to quit. + */ + if (!F_ISSET(ienv, WT_SERVER_RUN)) + break; + + /* + * Walk the read-request queue, looking for reads (defined by + * a valid WT_TOC handle). If we find a read request, perform + * it, flush the result and clear the request slot, then wake + * up the requesting thread. The request slot clear doesn't + * need to be flushed, but we have to flush the read result, + * might as well include it. If we don't find any work, go to + * sleep. + */ + do { + didwork = 0; + for (rr = cache->read_request; rr < rr_end; ++rr) { + if ((toc = rr->toc) == NULL) + continue; + if (cache->read_lockout && + !F_ISSET(toc, WT_READ_PRIORITY)) + continue; + + /* + * The read server thread does both general file + * allocation and cache page instantiation. In + * a file allocation, there's no pagep field in + * in which to return a page. + */ + ret = __wt_cache_read(rr); + + WT_READ_REQ_CLR(rr); + __wt_toc_serialize_wrapup(toc, NULL, ret); + + didwork = 1; + + /* + * Any error terminates the request; a serious + * error causes the read server to exit. 
+ */ + if (ret != 0) { + if (ret != WT_RESTART) + goto err; + ret = 0; + } + } + } while (didwork); + } + + if (ret != 0) +err: __wt_api_env_err(env, ret, "cache read server error"); + + WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting")); + return (NULL); +} + +/* + * __wt_cache_read -- + * Read a page from the file. + */ +static int +__wt_cache_read(WT_READ_REQ *rr) +{ + ENV *env; + WT_CACHE *cache; + WT_OFF *off; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_REF *ref; + WT_TOC *toc; + uint32_t addr, size; + int ret; + + toc = rr->toc; + ref = rr->ref; + off = rr->off; + addr = off->addr; + size = off->size; + + env = toc->env; + cache = env->ienv->cache; + ret = 0; + + /* + * Check to see if some other thread brought the page into the cache + * while our request was in the queue. If the state is anything + * other than empty, it's not our problem. + */ + if (ref->state != WT_EMPTY) + return (0); + + /* + * The page isn't in the cache, and since we're the only path for the + * page to get into the cache, we don't have to worry further, and + * we might as well get to it. + * + * Allocate memory for the in-memory page information and for the page + * itself. They're two separate allocation calls so we (hopefully) get + * better alignment from the underlying heap memory allocator. + */ + WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page)); + WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &dsk)); + + /* Read the page. */ + WT_VERBOSE(env, WT_VERB_READ, + (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size)); + + WT_ERR(__wt_page_disk_read(toc, dsk, addr, size)); + WT_CACHE_PAGE_IN(cache, size); + + /* If the page needs to be verified, that's next. */ + if (rr->dsk_verify) + WT_ERR(__wt_verify_dsk_page(toc, dsk, addr, size)); + + /* + * Fill in the WT_PAGE addr, size. + * Reference the parent's WT_PAGE and parent's WT_OFF structures. + * Reference the underlying disk page. 
+ */ + page->addr = addr; + page->size = size; + page->parent = rr->parent; + page->parent_off = off; + page->dsk = dsk; + + /* Build the in-memory version of the page. */ + WT_ERR(__wt_page_inmem(toc, page)); + + /* + * The page is now available -- set the LRU so the page is not selected + * for eviction. + */ + page->read_gen = ++cache->read_gen; + ref->page = page; + ref->state = WT_OK; + + return (0); + +err: if (page != NULL) { + if (page->dsk != NULL) + __wt_free(env, page->dsk, size); + __wt_free(env, page, sizeof(WT_PAGE)); + } + return (ret); +} diff --git a/src/btree/bt_reconcile.c b/src/btree/bt_reconcile.c new file mode 100644 index 00000000000..7a57cfe4a97 --- /dev/null +++ b/src/btree/bt_reconcile.c @@ -0,0 +1,982 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_rle_expand_compare(const void *, const void *); +static int __wt_rec_col_fix(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_col_int(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_col_rle(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_col_var(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_page_write(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_parent_update(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_row(WT_TOC *, WT_PAGE *, WT_PAGE *); +static int __wt_rec_row_int(WT_TOC *, WT_PAGE *, WT_PAGE *); +static inline void __wt_rec_set_page_size(WT_TOC *, WT_PAGE *, uint8_t *); + +/* + * __wt_rec_set_page_size -- + * Set the page's size to the minimum number of allocation units. + */ +static inline void +__wt_rec_set_page_size(WT_TOC *toc, WT_PAGE *page, uint8_t *first_free) +{ + DB *db; + + db = toc->db; + + /* + * Set the page's size to the minimum number of allocation units needed + * (the page size can either grow or shrink). 
+ * + * Set the page size before verifying the page, the verification code + * checks for entries that extend past the end of the page, and expects + * the WT_PAGE->size field to be valid. + */ + page->size = WT_ALIGN(first_free - (uint8_t *)page->dsk, db->allocsize); +} + +/* + * __wt_page_reconcile -- + * Format an in-memory page to its on-disk format, and write it. + */ +int +__wt_page_reconcile(WT_TOC *toc, WT_PAGE *page) +{ + DB *db; + DBT *tmp; + ENV *env; + WT_PAGE *new, _new; + WT_PAGE_DISK *dsk; + uint32_t max; + int ret; + + db = toc->db; + tmp = NULL; + env = toc->env; + dsk = page->dsk; + + /* If the page isn't dirty, we should never have been called. */ + WT_ASSERT(env, WT_PAGE_IS_MODIFIED(page)); + + WT_VERBOSE(env, WT_VERB_EVICT, + (env, "reconcile addr %lu (page %p, type %s)", + (u_long)page->addr, page, __wt_page_type_string(dsk))); + + /* + * Update the disk generation before reading the page. The workQ will + * update the write generation after it makes a change, and if we have + * different disk and write generation numbers, the page may be dirty. + * We technically requires a flush (the eviction server might run on a + * different core before a flush naturally occurred). + */ + WT_PAGE_DISK_WRITE(page); + WT_MEMORY_FLUSH; + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + /* + * Fixed-width pages without run-length encoding cannot change + * size. + */ + max = page->size; + break; + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_LEAF: + /* + * Other leaf page types can grow, allocate the maximum leaf + * page size. + */ + max = db->leafmax; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + /* + * All internal page types can grow, allocate the maximum + * internal page size. + */ + max = db->intlmax; + break; + case WT_PAGE_OVFL: + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + + /* + * Initialize a WT_PAGE page on the stack and allocate a scratch buffer + * for its contents. 
We use two pieces of memory because we want the + * page contents to be aligned for direct I/O. The WT_PAGE structure + * is relatively small, the stack is fine. + */ + WT_CLEAR(_new); + new = &_new; + WT_ERR(__wt_scr_alloc(toc, max, &tmp)); + memset(tmp->data, 0, max); + new->addr = page->addr; + new->size = max; + new->dsk = tmp->data; + new->dsk->start_recno = dsk->start_recno; + new->dsk->type = dsk->type; + new->dsk->level = dsk->level; + + switch (dsk->type) { + case WT_PAGE_COL_FIX: + WT_ERR(__wt_rec_col_fix(toc, page, new)); + break; + case WT_PAGE_COL_RLE: + WT_ERR(__wt_rec_col_rle(toc, page, new)); + break; + case WT_PAGE_COL_VAR: + WT_ERR(__wt_rec_col_var(toc, page, new)); + break; + case WT_PAGE_COL_INT: + WT_ERR(__wt_rec_col_int(toc, page, new)); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_ERR(__wt_rec_row_int(toc, page, new)); + break; + case WT_PAGE_ROW_LEAF: + case WT_PAGE_DUP_LEAF: + WT_ERR(__wt_rec_row(toc, page, new)); + break; + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + + /* Write the new page to disk. */ + WT_ERR(__wt_rec_page_write(toc, page, new)); + + /* Free the original page -- update the address and size. */ + WT_ERR(__wt_file_free(toc, page->addr, page->size)); + + /* + * Update the backing address. + * + * XXX + * This is more for diagnostic information than anything else, that is, + * this will match the WT_REF->addr in the parent. + * + * The parent's WT_REF->size may be different, that is, page->size is + * the original page size at the original address and the size of the + * page's buffer in memory, NOT the size of the newly written page at + * the new address. We may NOT update the size here, otherwise we + * can no longer figure out if WT_ROW/WT_COL items reference on-page + * data vs. allocated data. + */ + page->addr = new->addr; + +err: if (tmp != NULL) + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_rec_col_int -- + * Reconcile a column store internal page. 
+ */ +static int +__wt_rec_col_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + WT_COL *cip; + WT_OFF *from; + WT_PAGE_DISK *dsk; + WT_REPL *repl; + uint32_t i, space_avail; + uint8_t *first_free; + + dsk = new->dsk; + __wt_set_ff_and_sa_from_offset( + new, WT_PAGE_BYTE(new), &first_free, &space_avail); + + WT_INDX_FOREACH(page, cip, i) { + if ((repl = WT_COL_REPL(page, cip)) != NULL) + from = WT_REPL_DATA(repl); + else + from = cip->data; + + /* + * XXX + * We don't yet handle splits: we allocated the maximum page + * size, but it still wasn't enough. We must allocate another + * page and split the parent. + */ + if (sizeof(WT_OFF) > space_avail) { + fprintf(stderr, + "__wt_rec_col_int: page %lu split\n", + (u_long)page->addr); + __wt_abort(toc->env); + } + + memcpy(first_free, from, sizeof(WT_OFF)); + first_free += sizeof(WT_OFF); + space_avail -= sizeof(WT_OFF); + ++dsk->u.entries; + } + + new->records = page->records; + __wt_rec_set_page_size(toc, new, first_free); + + return (0); +} + +/* + * __wt_rec_row_int -- + * Reconcile a row store, or off-page duplicate tree, internal page. + */ +static int +__wt_rec_row_int(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + WT_ITEM *key_item, *data_item, *next; + WT_PAGE_DISK *dsk; + WT_REPL *repl; + WT_ROW *rip; + uint32_t i, len, space_avail; + uint8_t *first_free; + + dsk = new->dsk; + __wt_set_ff_and_sa_from_offset( + new, WT_PAGE_BYTE(new), &first_free, &space_avail); + + /* + * We have to walk both the WT_ROW structures as well as the original + * page: the problem is keys that require processing. When a page is + * read into memory from a simple database, the WT_ROW key/size pair + * is set to reference an on-page group of bytes in the key's WT_ITEM + * structure. As Btree keys are immutable, that original WT_ITEM is + * usually what we want to write, and we can pretty easily find it by + * moving to immediately before the on-page key. 
+ * + * Keys that require processing are harder (for example, a Huffman + * encoded key). When we have to use a key that requires processing, + * we process the key and set the WT_ROW key/size pair to reference + * the allocated memory that holds the key. At that point we've lost + * any reference to the original WT_ITEM structure, which is what we + * want to re-write when reconciling the page. We don't want to make + * the WT_ROW structure bigger by another sizeof(void *) bytes, so we + * walk the original page at the same time we walk the WT_PAGE array + * when reconciling the page so we can find the original WT_ITEM. + */ + key_item = WT_PAGE_BYTE(page); + WT_INDX_FOREACH(page, rip, i) { + /* + * Copy the paired items off the old page into the new page; if + * the page has been replaced, update its information. + * + * XXX + * Internal pages can't grow, yet, so we could more easily just + * update the old page. We do the copy because eventually we + * will have to split the internal pages, and they'll be able to + * grow. + */ + data_item = WT_ITEM_NEXT(key_item); + if ((repl = WT_ROW_REPL(page, rip)) != NULL) + memcpy(WT_ITEM_BYTE(data_item), + WT_REPL_DATA(repl), sizeof(WT_OFF)); + next = WT_ITEM_NEXT(data_item); + len = (uint32_t)((uint8_t *)next - (uint8_t *)key_item); + + /* + * XXX + * We don't yet handle splits: we allocated the maximum page + * size, but it still wasn't enough. We must allocate another + * page and split the parent. + */ + if (len > space_avail) { + fprintf(stderr, + "__wt_rec_row_int: page %lu split\n", + (u_long)page->addr); + __wt_abort(toc->env); + } + + memcpy(first_free, key_item, len); + first_free += len; + space_avail -= len; + ++dsk->u.entries; + + key_item = next; + } + + new->records = page->records; + __wt_rec_set_page_size(toc, new, first_free); + + return (0); +} + +/* + * __wt_rec_col_fix -- + * Reconcile a fixed-width, column-store leaf page (does not handle + * run-length encoding). 
+ */ +static int +__wt_rec_col_fix(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + DB *db; + DBT *tmp; + ENV *env; + WT_COL *cip; + WT_PAGE_DISK *dsk; + WT_REPL *repl; + uint32_t i, len, space_avail; + uint8_t *data, *first_free; + int ret; + + db = toc->db; + tmp = NULL; + env = toc->env; + dsk = new->dsk; + ret = 0; + + __wt_set_ff_and_sa_from_offset( + new, WT_PAGE_BYTE(new), &first_free, &space_avail); + + /* + * We need a "deleted" data item to store on the page. Make sure the + * WT_TOC's scratch buffer is big enough. Clear the buffer's contents + * and set the delete flag. + */ + len = db->fixed_len; + WT_ERR(__wt_scr_alloc(toc, len, &tmp)); + memset(tmp->data, 0, len); + WT_FIX_DELETE_SET(tmp->data); + + WT_INDX_FOREACH(page, cip, i) { + /* + * Get a reference to the data, on- or off- page, and see if + * it's been deleted. + */ + if ((repl = WT_COL_REPL(page, cip)) != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + data = tmp->data; /* Replaced deleted */ + else /* Replaced data */ + data = WT_REPL_DATA(repl); + } else if (WT_FIX_DELETE_ISSET(cip->data)) + data = tmp->data; /* On-page deleted */ + else + data = cip->data; /* On-page data */ + + /* + * When reconciling a fixed-width page that doesn't support + * run-length encoding, the on-page information can't change + * size -- there's no reason to ever split such a page. + */ + WT_ASSERT(env, len <= space_avail); + + memcpy(first_free, data, len); + first_free += len; + space_avail -= len; + ++dsk->u.entries; + } + + new->records = page->records; + __wt_rec_set_page_size(toc, new, first_free); + +err: if (tmp != NULL) + __wt_scr_release(&tmp); + return (ret); +} + +/* + * __wt_rec_col_rle -- + * Reconcile a fixed-width, run-length encoded, column-store leaf page. 
 */
static int
__wt_rec_col_rle(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new)
{
	DB *db;
	DBT *tmp;
	ENV *env;
	WT_COL *cip;
	WT_PAGE_DISK *dsk;
	WT_RLE_EXPAND *exp, **expsort, **expp;
	WT_REPL *repl;
	uint64_t recno;
	uint32_t i, len, n_expsort, space_avail;
	uint16_t n, nrepeat, repeat_count;
	uint8_t *data, *first_free, *last_data;
	int from_repl, ret;

	db = toc->db;
	tmp = NULL;
	env = toc->env;
	expsort = NULL;
	dsk = new->dsk;
	n_expsort = 0;			/* Necessary for the sort function */
	last_data = NULL;
	ret = 0;

	/* Locate the first free byte and the bytes available on the new page. */
	__wt_set_ff_and_sa_from_offset(
	    new, WT_PAGE_BYTE(new), &first_free, &space_avail);

	/*
	 * We need a "deleted" data item to store on the page.  Make sure the
	 * WT_TOC's scratch buffer is big enough.  Clear the buffer's contents
	 * and set the delete flag.
	 */
	len = db->fixed_len + sizeof(uint16_t);
	WT_ERR(__wt_scr_alloc(toc, len, &tmp));
	memset(tmp->data, 0, len);
	WT_RLE_REPEAT_COUNT(tmp->data) = 1;
	WT_FIX_DELETE_SET(WT_RLE_REPEAT_DATA(tmp->data));

	/* Set recno to the first record on the page. */
	recno = page->dsk->start_recno;
	WT_INDX_FOREACH(page, cip, i) {
		/*
		 * Get a sorted list of any expansion entries we've created for
		 * this set of records.  The sort function returns a NULL-
		 * terminated array of references to WT_RLE_EXPAND structures,
		 * sorted by record number.
		 */
		WT_ERR(__wt_rle_expand_sort(
		    env, page, cip, &expsort, &n_expsort));

		/*
		 * Generate entries for the new page: loop through the repeat
		 * records, checking for WT_RLE_EXPAND entries that match the
		 * current record number.
		 */
		nrepeat = WT_RLE_REPEAT_COUNT(cip->data);
		for (expp = expsort, n = 1;
		    n <= nrepeat; n += repeat_count, recno += repeat_count) {
			from_repl = 0;
			if ((exp = *expp) != NULL && recno == exp->recno) {
				++expp;

				/* Use the WT_RLE_EXPAND's WT_REPL field.
 */
				repl = exp->repl;
				if (WT_REPL_DELETED_ISSET(repl))
					data = tmp->data;
				else {
					from_repl = 1;
					data = WT_REPL_DATA(repl);
				}
				repeat_count = 1;
			} else {
				if (WT_FIX_DELETE_ISSET(cip->data))
					data = tmp->data;
				else
					data = cip->data;
				/*
				 * The repeat count is the number of records
				 * up to the next WT_RLE_EXPAND record, or
				 * up to the end of this entry if we have no
				 * more WT_RLE_EXPAND records.
				 */
				if (exp == NULL)
					repeat_count = (nrepeat - n) + 1;
				else
					repeat_count =
					    (uint16_t)(exp->recno - recno);
			}

			/*
			 * In all cases, check the last entry written on the
			 * page to see if it's identical, and increment its
			 * repeat count where possible.
			 */
			if (last_data != NULL &&
			    memcmp(WT_RLE_REPEAT_DATA(last_data),
			    WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 &&
			    WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) {
				WT_RLE_REPEAT_COUNT(last_data) += repeat_count;
				continue;
			}

			/*
			 * XXX
			 * We don't yet handle splits: we allocated the maximum
			 * leaf page size, but it still wasn't enough.  We must
			 * allocate another leaf page and split the parent.
			 */
			if (len > space_avail) {
				fprintf(stderr,
				    "__wt_rec_col_rle: page %lu split\n",
				    (u_long)page->addr);
				__wt_abort(env);
			}

			/*
			 * Most of the formats already include a repeat count:
			 * specifically the deleted buffer, or any entry we're
			 * copying from the original page.  However, entries
			 * that were deleted or replaced are read from a WT_REPL
			 * structure, which has no repeat count.
			 */
			last_data = first_free;
			if (from_repl) {
				WT_RLE_REPEAT_COUNT(last_data) = repeat_count;
				memcpy(WT_RLE_REPEAT_DATA(
				    last_data), data, db->fixed_len);
			} else
				memcpy(last_data, data, len);
			first_free += len;
			space_avail -= len;
			++dsk->u.entries;
		}
	}

	/* Record counts are unchanged: every record was accounted for above. */
	new->records = page->records;
	__wt_rec_set_page_size(toc, new, first_free);

	/* Free the sort array.
*/ +err: if (expsort != NULL) + __wt_free(env, expsort, n_expsort * sizeof(WT_RLE_EXPAND *)); + + if (tmp != NULL) + __wt_scr_release(&tmp); + + return (ret); +} + +/* + * __wt_rle_expand_compare -- + * Qsort function: sort WT_RLE_EXPAND structures based on the record + * offset, in ascending order. + */ +static int +__wt_rle_expand_compare(const void *a, const void *b) +{ + WT_RLE_EXPAND *a_exp, *b_exp; + + a_exp = *(WT_RLE_EXPAND **)a; + b_exp = *(WT_RLE_EXPAND **)b; + + return (a_exp->recno > b_exp->recno ? 1 : 0); +} + +/* + * __wt_rle_expand_sort -- + * Return the current on-page index's array of WT_RLE_EXPAND structures, + * sorted by record offset. + */ +int +__wt_rle_expand_sort(ENV *env, + WT_PAGE *page, WT_COL *cip, WT_RLE_EXPAND ***expsortp, uint32_t *np) +{ + WT_RLE_EXPAND *exp; + uint16_t n; + + /* Figure out how big the array needs to be. */ + for (n = 0, + exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n) + ; + + /* + * Allocate that big an array -- always allocate at least one slot, + * our caller expects NULL-termination. + */ + if (n >= *np) { + if (*expsortp != NULL) + __wt_free( + env, *expsortp, *np * sizeof(WT_RLE_EXPAND *)); + WT_RET(__wt_calloc( + env, n + 10, sizeof(WT_RLE_EXPAND *), expsortp)); + *np = n + 10; + } + + /* Enter the WT_RLE_EXPAND structures into the array. */ + for (n = 0, + exp = WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next, ++n) + (*expsortp)[n] = exp; + + /* Sort the entries. */ + if (n != 0) + qsort(*expsortp, (size_t)n, + sizeof(WT_RLE_EXPAND *), __wt_rle_expand_compare); + + /* NULL-terminate the array. */ + (*expsortp)[n] = NULL; + + return (0); +} + +/* + * __wt_rec_col_var -- + * Reconcile a variable-width column-store leaf page. 
+ */ +static int +__wt_rec_col_var(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc; + DBT *data, data_dbt; + WT_COL *cip; + WT_ITEM data_item; + WT_OVFL data_ovfl; + WT_PAGE_DISK *dsk; + WT_REPL *repl; + uint32_t i, len, space_avail; + uint8_t *first_free; + + dsk = new->dsk; + __wt_set_ff_and_sa_from_offset( + new, WT_PAGE_BYTE(new), &first_free, &space_avail); + + WT_CLEAR(data_dbt); + WT_CLEAR(data_item); + data = &data_dbt; + + WT_INDX_FOREACH(page, cip, i) { + /* + * Get a reference to the data: it's either a replacement value + * or the original on-page item. + */ + if ((repl = WT_COL_REPL(page, cip)) != NULL) { + /* + * Check for deletion, else build the data's WT_ITEM + * chunk from the most recent replacement value. + */ + if (WT_REPL_DELETED_ISSET(repl)) { + WT_CLEAR(data_item); + WT_ITEM_SET(&data_item, WT_ITEM_DEL, 0); + len = WT_ITEM_SPACE_REQ(0); + } else { + data->data = WT_REPL_DATA(repl); + data->size = repl->size; + WT_RET(__wt_item_build_data( + toc, data, &data_item, &data_ovfl, 0)); + len = WT_ITEM_SPACE_REQ(data->size); + } + data_loc = DATA_OFF_PAGE; + } else { + data->data = cip->data; + data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(cip->data)); + len = data->size; + data_loc = DATA_ON_PAGE; + } + + /* + * XXX + * We don't yet handle splits -- we allocated the maximum leaf + * page size, but it still wasn't enough. We must allocate + * another leaf page and split the parent. 
+ */ + if (len > space_avail) { + fprintf(stderr, + "__wt_rec_col_var: page %lu split\n", + (u_long)page->addr); + __wt_abort(toc->env); + } + + switch (data_loc) { + case DATA_ON_PAGE: + memcpy(first_free, data->data, data->size); + first_free += data->size; + space_avail -= data->size; + break; + case DATA_OFF_PAGE: + memcpy(first_free, &data_item, sizeof(data_item)); + memcpy(first_free + + sizeof(data_item), data->data, data->size); + first_free += len; + space_avail -= len; + } + ++dsk->u.entries; + } + + new->records = page->records; + __wt_rec_set_page_size(toc, new, first_free); + + return (0); +} + +/* + * __wt_rec_row -- + * Reconcile a row-store leaf page. + */ +static int +__wt_rec_row(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + enum { DATA_ON_PAGE, DATA_OFF_PAGE } data_loc; + enum { KEY_ON_PAGE, KEY_NONE } key_loc; + DB *db; + DBT *key, key_dbt, *data, data_dbt; + WT_ITEM key_item, data_item, *item; + WT_OVFL data_ovfl; + WT_PAGE_DISK *dsk; + WT_ROW *rip; + WT_REPL *repl; + uint32_t i, len, space_avail, type; + uint8_t *first_free; + + db = toc->db; + dsk = new->dsk; + __wt_set_ff_and_sa_from_offset( + new, WT_PAGE_BYTE(new), &first_free, &space_avail); + + WT_CLEAR(data_dbt); + WT_CLEAR(key_dbt); + WT_CLEAR(data_item); + WT_CLEAR(key_item); + + key = &key_dbt; + data = &data_dbt; + + /* + * Walk the page, accumulating key/data groups (groups, because a key + * can reference a duplicate data set). + * + * We have to walk both the WT_ROW structures as well as the original + * page: the problem is keys that require processing. When a page is + * read into memory from a simple database, the WT_ROW key/size pair + * is set to reference an on-page group of bytes in the key's WT_ITEM + * structure. As Btree keys are immutable, that original WT_ITEM is + * usually what we want to write, and we can pretty easily find it by + * moving to immediately before the on-page key. 
+ * + * Keys that require processing are harder (for example, a Huffman + * encoded key). When we have to use a key that requires processing, + * we process the key and set the WT_ROW key/size pair to reference + * the allocated memory that holds the key. At that point we've lost + * any reference to the original WT_ITEM structure, which is what we + * want to re-write when reconciling the page. We don't want to make + * the WT_ROW structure bigger by another sizeof(void *) bytes, so we + * walk the original page at the same time we walk the WT_PAGE array + * when reconciling the page so we can find the original WT_ITEM. + */ + item = NULL; + WT_INDX_FOREACH(page, rip, i) { + /* Move to the next key on the original page. */ + if (item == NULL) + item = (WT_ITEM *)WT_PAGE_BYTE(page); + else + do { + item = WT_ITEM_NEXT(item); + } while (WT_ITEM_TYPE(item) != WT_ITEM_KEY && + WT_ITEM_TYPE(item) != WT_ITEM_KEY_OVFL); + + /* + * Get a reference to the data. We get the data first because + * it may have been deleted, in which case we ignore the pair. + */ + if ((repl = WT_ROW_REPL(page, rip)) != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + continue; + + /* + * Build the data's WT_ITEM chunk from the most recent + * replacement value. + */ + data->data = WT_REPL_DATA(repl); + data->size = repl->size; + WT_RET(__wt_item_build_data( + toc, data, &data_item, &data_ovfl, 0)); + data_loc = DATA_OFF_PAGE; + } else { + /* Copy the item off the page. */ + data->data = rip->data; + data->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(rip->data)); + data_loc = DATA_ON_PAGE; + } + + /* + * Check if the key is a duplicate (the key preceding it on the + * page references the same information). We don't store the + * key for the second and subsequent data items in duplicated + * groups. + */ + if (WT_ROW_INDX_IS_DUPLICATE(page, rip)) { + type = data_loc == DATA_ON_PAGE ? 
+ WT_ITEM_TYPE(rip->data) : WT_ITEM_TYPE(&data_item); + switch (type) { + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + type = WT_ITEM_DATA_DUP; + break; + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + type = WT_ITEM_DATA_DUP_OVFL; + break; + WT_ILLEGAL_FORMAT(db); + } + if (data_loc == DATA_ON_PAGE) + WT_ITEM_SET_TYPE(rip->data, type); + else + WT_ITEM_SET_TYPE(&data_item, type); + key_loc = KEY_NONE; + } else { + /* Take the key's WT_ITEM from the original page. */ + key->data = item; + key->size = WT_ITEM_SPACE_REQ(WT_ITEM_LEN(item)); + key_loc = KEY_ON_PAGE; + } + + len = 0; + switch (key_loc) { + case KEY_ON_PAGE: + len = key->size; + break; + case KEY_NONE: + break; + } + switch (data_loc) { + case DATA_OFF_PAGE: + len += WT_ITEM_SPACE_REQ(data->size); + break; + case DATA_ON_PAGE: + len += data->size; + break; + } + + /* + * XXX + * We don't yet handle splits -- we allocated the maximum leaf + * page size, but it still wasn't enough. We must allocate + * another leaf page and split the parent. + */ + if (len > space_avail) { + fprintf(stderr, "__wt_rec_row: page %lu split\n", + (u_long)page->addr); + __wt_abort(toc->env); + } + + switch (key_loc) { + case KEY_ON_PAGE: + memcpy(first_free, key->data, key->size); + first_free += key->size; + space_avail -= key->size; + ++dsk->u.entries; + break; + case KEY_NONE: + break; + } + switch (data_loc) { + case DATA_ON_PAGE: + memcpy(first_free, data->data, data->size); + first_free += data->size; + space_avail -= data->size; + ++dsk->u.entries; + break; + case DATA_OFF_PAGE: + memcpy(first_free, &data_item, sizeof(data_item)); + memcpy(first_free + + sizeof(WT_ITEM), data->data, data->size); + first_free += WT_ITEM_SPACE_REQ(data->size); + space_avail -= WT_ITEM_SPACE_REQ(data->size); + ++dsk->u.entries; + break; + } + } + + __wt_rec_set_page_size(toc, new, first_free); + + return (0); +} + +/* + * __wt_rec_page_write -- + * Write a newly reconciled page. 
+ */ +static int +__wt_rec_page_write(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + ENV *env; + int ret; + + env = toc->env; + + /* + * XXX + * We fail if the page gets emptied -- we'll need to do some kind of + * reverse split where the internal page disappears. That shouldn't + * be difficult, but I haven't written it yet. + */ + if (new->dsk->u.entries == 0) { + new->addr = WT_ADDR_INVALID; + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "reconcile removing empty page %lu", (u_long)page->addr)); + fprintf(stderr, "PAGE %lu EMPTIED\n", (u_long)page->addr); + __wt_abort(env); + } else { + /* + * Allocate file space for the page. + * + * The cache eviction server is the only thread allocating space + * from the file, so there's no need to do any serialization. + */ + WT_RET(__wt_file_alloc(toc, &new->addr, new->size)); + + /* + * Write the page to disk. + * + * !!! + * This is safe for now, but it's a problem when we switch to + * asynchronous I/O: the scenario is (1) schedule the write, + * (2) discard the newly-clean in-memory version, (3) another + * thread tries to read down the tree before the write finishes. + */ + WT_RET(__wt_page_write(toc, new)); + + WT_VERBOSE(env, WT_VERB_EVICT, + (env, "reconcile move %lu to %lu, resize %lu to %lu", + (u_long)page->addr, (u_long)new->addr, + (u_long)page->size, (u_long)new->size)); + } + + /* Update the page's parent. */ + if ((ret = __wt_rec_parent_update(toc, page, new)) != 0) { + (void)__wt_file_free(toc, new->addr, new->size); + return (ret); + } + + return (0); +} + +/* + * __wt_rec_parent_update -- + * Update a parent page's reference when a page is reconciled. + */ +static int +__wt_rec_parent_update(WT_TOC *toc, WT_PAGE *page, WT_PAGE *new) +{ + IDB *idb; + WT_OFF *parent_off; + + idb = toc->db->idb; + + /* + * If we're writing the root of the tree, then we have to update the + * descriptor record, there's no parent to update. 
 */
	if (page->addr == idb->root_off.addr) {
		idb->root_off.addr = new->addr;
		idb->root_off.size = new->size;
		return (__wt_desc_write(toc));
	}

	/*
	 * Update the relevant WT_OFF structure.  There are two memory locations
	 * that change (address and size), and we could race, but that's not a
	 * problem.  Only a single thread ever reconciles a page at a time, and
	 * pages cannot leave memory while they have children.
	 */
	parent_off = page->parent_off;
	WT_RECORDS(parent_off) = new->records;
	parent_off->addr = new->addr;
	parent_off->size = new->size;

	/*
	 * Mark the parent page as dirty.
	 *
	 * There's no chance we need to flush this write -- the eviction thread
	 * is the only thread that eventually cares if the page is dirty or not,
	 * and it's our update that's making it dirty.  (The workQ thread does
	 * have to flush its set-modified update, of course).
	 *
	 * We don't care if we race with the workQ; if the workQ thread races
	 * with us, the page will still be marked dirty and that's all we care
	 * about.
	 */
	WT_PAGE_SET_MODIFIED(page->parent);

	return (0);
}
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
new file mode 100644
index 00000000000..8cdf8d90ce1
--- /dev/null
+++ b/src/btree/bt_ret.c
@@ -0,0 +1,179 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2008-2011 WiredTiger, Inc.
 * All rights reserved.
 *
 * $Id$
 */

#include "wt_internal.h"

/*
 * __wt_dbt_return --
 *	Return a WT_PAGE/WT_{ROW,COL}_INDX pair to the application.
+ */ +int +__wt_dbt_return(WT_TOC *toc, DBT *key, DBT *data, int key_return) +{ + DB *db; + DBT local_key, local_data; + ENV *env; + IDB *idb; + WT_COL *cip; + WT_ITEM *item; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_ROW *rip; + WT_REPL *repl; + void *data_ret; + uint32_t size_ret; + int (*callback)(DB *, DBT *, DBT *), ret; + + db = toc->db; + env = toc->env; + idb = db->idb; + callback = data->callback; + ret = 0; + + page = toc->srch_page; + dsk = page->dsk; + cip = toc->srch_ip; + rip = toc->srch_ip; + repl = toc->srch_repl; + + /* + * Handle the key item -- the key may be unchanged, in which case we + * don't touch it, it's already correct. + * + * If the key/data items are being passed to a callback routine and + * there's nothing special about them (they aren't uninstantiated + * overflow or compressed items), then give the callback a pointer to + * the on-page data. (We use a local DBT in this case, so we don't + * touch potentially allocated application DBT memory.) Else, copy + * the items into the application's DBTs. + * + * If the key/data item are uninstantiated overflow and/or compressed + * items, they require processing before being copied into the DBTs. + * Don't allocate WT_INDX memory for key/data items here. (We never + * allocate WT_INDX memory for data items. We do allocate WT_INDX + * memory for keys, but if we are looking at a key only to return it, + * it's not that likely to be accessed again (think of a cursor moving + * through the tree). Use memory in the application's DBT instead, it + * is discarded when the WT_TOC is discarded. + * + * Key return implies a reference to a WT_ROW index (we don't return + * record number keys yet, that will probably change when I add cursor + * support). 
+ */ + if (key_return) { + if (__wt_key_process(rip)) { + WT_RET(__wt_item_process(toc, rip->key, &toc->key)); + + key->data = toc->key.data; + key->size = toc->key.size; + } else if (callback == NULL) { + if (toc->key.mem_size < rip->size) + WT_RET(__wt_realloc(env, + &toc->key.mem_size, + rip->size, &toc->key.data)); + memcpy(toc->key.data, rip->key, rip->size); + toc->key.size = rip->size; + + key->data = toc->key.data; + key->size = toc->key.size; + } else { + WT_CLEAR(local_key); + key = &local_key; + key->data = rip->key; + key->size = rip->size; + } + } + + /* + * Handle the data item. + * + * If the item was ever replaced, it's easy, take the last replacement + * data item, it's just a byte string. + */ + if (repl != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + return (WT_NOTFOUND); + data->data = WT_REPL_DATA(repl); + data->size = repl->size; + return (callback == NULL ? 0 : callback(db, key, data)); + } + + /* Otherwise, take the item from the original page. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + data_ret = cip->data; + size_ret = db->fixed_len; + break; + case WT_PAGE_COL_RLE: + data_ret = WT_RLE_REPEAT_DATA(cip->data); + size_ret = db->fixed_len; + break; + case WT_PAGE_COL_VAR: + item = cip->data; + goto item_set; + case WT_PAGE_ROW_LEAF: + case WT_PAGE_DUP_LEAF: + item = rip->data; +item_set: switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + if (idb->huffman_data == NULL) { + data_ret = WT_ITEM_BYTE(item); + size_ret = WT_ITEM_LEN(item); + } + /* FALLTHROUGH */ + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + WT_RET(__wt_item_process(toc, item, &toc->data)); + data_ret = toc->data.data; + size_ret = toc->data.size; + break; + WT_ILLEGAL_FORMAT(db); + } + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * When we get here, data_ret and size_ret are set to the byte string + * and the length we're going to return. 
That byte string has been + * decoded, we called __wt_item_process above in all cases where the + * item could be encoded. + */ + if (callback == NULL) { + /* + * We're copying the key/data pair out to the caller. If we + * haven't yet copied the data_ret/size_ret pair into the return + * DBT (potentially done by __wt_item_process), do so now. + */ + if (data_ret != toc->data.data) { + if (toc->data.mem_size < size_ret) + WT_RET(__wt_realloc(env, + &toc->data.mem_size, + size_ret, &toc->data.data)); + memcpy(toc->data.data, data_ret, size_ret); + toc->data.size = size_ret; + } + + data->data = toc->data.data; + data->size = toc->data.size; + } else { + /* + * If we're given a callback function, use the data_ret/size_ret + * fields as set. + */ + WT_CLEAR(local_data); + data = &local_data; + data->data = data_ret; + data->size = size_ret; + ret = callback(db, key, data); + } + + return (ret); +} diff --git a/src/btree/bt_rw.c b/src/btree/bt_rw.c new file mode 100644 index 00000000000..ad8f12482b1 --- /dev/null +++ b/src/btree/bt_rw.c @@ -0,0 +1,85 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_page_disk_read -- + * Read a file page. 
 */
int
__wt_page_disk_read(
    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
{
	DB *db;
	ENV *env;
	WT_FH *fh;
	WT_STATS *stats;
	off_t offset;
	uint32_t checksum;

	db = toc->db;
	env = toc->env;
	fh = db->idb->fh;
	stats = env->ienv->cache->stats;

	WT_STAT_INCR(stats, PAGE_READ);

	offset = WT_ADDR_TO_OFF(db, addr);
	WT_RET(__wt_read(env, fh, offset, size, dsk));

	/*
	 * The on-disk checksum was computed with the checksum field zeroed
	 * (see __wt_page_disk_write); save it, zero the field, and verify
	 * the rest of the page the same way.
	 */
	checksum = dsk->checksum;
	dsk->checksum = 0;
	if (checksum != __wt_cksum(dsk, size)) {
		__wt_api_env_errx(env,
		    "read checksum error: addr/size %lu/%lu at offset %llu",
		    (u_long)addr, (u_long)size, (unsigned long long)offset);
		return (WT_ERROR);
	}

	return (0);
}

/*
 * __wt_page_write --
 *	Write a file page.
 */
/*
 * NOTE(review): non-static "inline" at file scope depends on C99 inline
 * linkage semantics (an external definition is still required) -- confirm
 * the build's inline model handles this.
 */
inline int
__wt_page_write(WT_TOC *toc, WT_PAGE *page)
{
	return (__wt_page_disk_write(toc, page->dsk, page->addr, page->size));
}

/*
 * __wt_page_disk_write --
 *	Write a file page.
 */
int
__wt_page_disk_write(
    WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size)
{
	DB *db;
	ENV *env;
	WT_FH *fh;
	WT_STATS *stats;

	db = toc->db;
	env = toc->env;
	fh = db->idb->fh;
	stats = env->ienv->cache->stats;

	/* In diagnostic builds, verify the page before it goes to disk. */
	WT_ASSERT(env, __wt_verify_dsk_page(toc, dsk, addr, size) == 0);

	WT_STAT_INCR(stats, PAGE_WRITE);

	/* Compute the checksum over the page with the checksum field zeroed. */
	dsk->checksum = 0;
	dsk->checksum = __wt_cksum(dsk, size);

	return (__wt_write(env, fh, WT_ADDR_TO_OFF(db, addr), size, dsk));
}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
new file mode 100644
index 00000000000..5beb931f578
--- /dev/null
+++ b/src/btree/bt_stat.c
@@ -0,0 +1,348 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2008-2011 WiredTiger, Inc.
 * All rights reserved.
+ * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_stat_page_col_fix(WT_TOC *, WT_PAGE *); +static int __wt_stat_page_col_rle(WT_TOC *, WT_PAGE *); +static int __wt_stat_page_col_var(WT_TOC *, WT_PAGE *); +static int __wt_stat_page_dup_leaf(WT_TOC *, WT_PAGE *); +static int __wt_stat_page_row_leaf(WT_TOC *, WT_PAGE *, void *); + +/* + * __wt_page_stat -- + * Stat any Btree page. + */ +int +__wt_page_stat(WT_TOC *toc, WT_PAGE *page, void *arg) +{ + DB *db; + IDB *idb; + WT_PAGE_DISK *dsk; + WT_STATS *stats; + + db = toc->db; + idb = db->idb; + dsk = page->dsk; + stats = idb->dstats; + + /* + * All internal pages and overflow pages are trivial, all we track is + * a count of the page type. + */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + WT_STAT_INCR(stats, PAGE_COL_FIX); + WT_RET(__wt_stat_page_col_fix(toc, page)); + break; + case WT_PAGE_COL_INT: + WT_STAT_INCR(stats, PAGE_COL_INTERNAL); + break; + case WT_PAGE_COL_RLE: + WT_STAT_INCR(stats, PAGE_COL_RLE); + WT_RET(__wt_stat_page_col_rle(toc, page)); + break; + case WT_PAGE_COL_VAR: + WT_STAT_INCR(stats, PAGE_COL_VARIABLE); + WT_RET(__wt_stat_page_col_var(toc, page)); + break; + case WT_PAGE_DUP_INT: + WT_STAT_INCR(stats, PAGE_DUP_INTERNAL); + break; + case WT_PAGE_DUP_LEAF: + WT_STAT_INCR(stats, PAGE_DUP_LEAF); + WT_RET(__wt_stat_page_dup_leaf(toc, page)); + break; + case WT_PAGE_OVFL: + WT_STAT_INCR(stats, PAGE_OVERFLOW); + break; + case WT_PAGE_ROW_INT: + WT_STAT_INCR(stats, PAGE_ROW_INTERNAL); + break; + case WT_PAGE_ROW_LEAF: + WT_STAT_INCR(stats, PAGE_ROW_LEAF); + WT_RET(__wt_stat_page_row_leaf(toc, page, arg)); + break; + WT_ILLEGAL_FORMAT(db); + } + return (0); +} + +/* + * __wt_stat_page_col_fix -- + * Stat a WT_PAGE_COL_FIX page. + */ +static int +__wt_stat_page_col_fix(WT_TOC *toc, WT_PAGE *page) +{ + WT_COL *cip; + WT_REPL *repl; + WT_STATS *stats; + uint32_t i; + + stats = toc->db->idb->dstats; + + /* Walk the page, counting data items. 
*/ + WT_INDX_FOREACH(page, cip, i) { + if ((repl = WT_COL_REPL(page, cip)) == NULL) + if (WT_FIX_DELETE_ISSET(cip->data)) + WT_STAT_INCR(stats, ITEM_COL_DELETED); + else + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + else + if (WT_REPL_DELETED_ISSET(repl)) + WT_STAT_INCR(stats, ITEM_COL_DELETED); + else + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + } + return (0); +} + +/* + * __wt_stat_page_col_rle -- + * Stat a WT_PAGE_COL_RLE page. + */ +static int +__wt_stat_page_col_rle(WT_TOC *toc, WT_PAGE *page) +{ + WT_COL *cip; + WT_RLE_EXPAND *exp; + WT_REPL *repl; + WT_STATS *stats; + uint32_t i; + + stats = toc->db->idb->dstats; + + /* Walk the page, counting data items. */ + WT_INDX_FOREACH(page, cip, i) { + if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data))) + WT_STAT_INCRV(stats, + ITEM_COL_DELETED, WT_RLE_REPEAT_COUNT(cip->data)); + else + WT_STAT_INCRV(stats, + ITEM_TOTAL_DATA, WT_RLE_REPEAT_COUNT(cip->data)); + + /* + * Check for corrections. + * + * XXX + * This gets the count wrong if an application changes existing + * records, or updates a deleted record two times in a row -- + * we'll incorrectly count the records as unique, when they are + * changes to the same record. I'm not fixing it as I don't + * expect the WT_COL_RLEEXP data structure to be permanent, it's + * too likely to become a linked list in bad cases. + */ + for (exp = + WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next) { + repl = exp->repl; + if (WT_REPL_DELETED_ISSET(repl)) + WT_STAT_INCR(stats, ITEM_COL_DELETED); + else + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + } + } + return (0); +} + +/* + * __wt_stat_page_col_var -- + * Stat a WT_PAGE_COL_VAR page. + */ +static int +__wt_stat_page_col_var(WT_TOC *toc, WT_PAGE *page) +{ + DB *db; + WT_COL *cip; + WT_REPL *repl; + WT_STATS *stats; + uint32_t i; + + db = toc->db; + stats = db->idb->dstats; + + /* + * Walk the page, counting regular and overflow data items, and checking + * to be sure any replacements weren't deletions. 
If the item has been + * replaced, assume it was replaced by an item of the same size (it's + * to expensive to figure out if it will require the same space or not, + * especially if there's Huffman encoding). + */ + WT_INDX_FOREACH(page, cip, i) { + switch (WT_ITEM_TYPE(cip->data)) { + case WT_ITEM_DATA: + repl = WT_COL_REPL(page, cip); + if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + break; + case WT_ITEM_DATA_OVFL: + repl = WT_COL_REPL(page, cip); + if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) { + WT_STAT_INCR(stats, ITEM_DATA_OVFL); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + } + break; + case WT_ITEM_DEL: + WT_STAT_INCR(stats, ITEM_COL_DELETED); + break; + WT_ILLEGAL_FORMAT(db); + } + } + return (0); +} + +/* + * __wt_stat_page_dup_leaf -- + * Stat a WT_PAGE_DUP_LEAF page. + */ +static int +__wt_stat_page_dup_leaf(WT_TOC *toc, WT_PAGE *page) +{ + DB *db; + WT_REPL *repl; + WT_ROW *rip; + WT_STATS *stats; + uint32_t i; + + db = toc->db; + stats = db->idb->dstats; + + /* + * Walk the page, counting regular and overflow data items, and checking + * to be sure any replacements weren't deletions. If the item has been + * replaced, assume it was replaced by an item of the same size (it's + * to expensive to figure out if it will require the same space or not, + * especially if there's Huffman encoding). 
+ */ + WT_INDX_FOREACH(page, rip, i) { + switch (WT_ITEM_TYPE(rip->data)) { + case WT_ITEM_DATA_DUP: + repl = WT_ROW_REPL(page, rip); + if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) { + WT_STAT_INCR(stats, ITEM_DUP_DATA); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + } + break; + case WT_ITEM_DATA_DUP_OVFL: + repl = WT_ROW_REPL(page, rip); + if (repl == NULL || !WT_REPL_DELETED_ISSET(repl)) { + WT_STAT_INCR(stats, ITEM_DUP_DATA); + WT_STAT_INCR(stats, ITEM_DATA_OVFL); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + } + break; + WT_ILLEGAL_FORMAT(db); + } + } + return (0); +} + +/* + * __wt_stat_page_row_leaf -- + * Stat a WT_PAGE_ROW_LEAF page. + */ +static int +__wt_stat_page_row_leaf(WT_TOC *toc, WT_PAGE *page, void *arg) +{ + DB *db; + WT_OFF *off; + WT_REF *ref; + WT_REPL *repl; + WT_ROW *rip; + WT_STATS *stats; + uint32_t i; + int ret; + + db = toc->db; + stats = db->idb->dstats; + + /* + * Walk the page, counting regular and overflow data items, and checking + * to be sure any replacements weren't deletions. If the item has been + * replaced, assume it was replaced by an item of the same size (it's + * to expensive to figure out if it will require the same space or not, + * especially if there's Huffman encoding). 
+ */ + WT_INDX_FOREACH(page, rip, i) { + switch (WT_ITEM_TYPE(rip->data)) { + case WT_ITEM_DATA: + repl = WT_ROW_REPL(page, rip); + if (repl != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + break; + case WT_ITEM_DATA_OVFL: + repl = WT_ROW_REPL(page, rip); + if (repl != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + WT_STAT_INCR(stats, ITEM_DATA_OVFL); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + break; + case WT_ITEM_DATA_DUP: + repl = WT_ROW_REPL(page, rip); + if (repl != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + WT_STAT_INCR(stats, ITEM_DUP_DATA); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + break; + case WT_ITEM_DATA_DUP_OVFL: + repl = WT_ROW_REPL(page, rip); + if (repl != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + WT_STAT_INCR(stats, ITEM_DUP_DATA); + WT_STAT_INCR(stats, ITEM_DATA_OVFL); + WT_STAT_INCR(stats, ITEM_TOTAL_DATA); + break; + case WT_ITEM_OFF: + /* + * Recursively call the tree-walk code for any off-page + * duplicate trees. (Check for any off-page duplicate + * trees locally because we already have to walk the + * page, so it's faster than walking the page both here + * and in the tree-walk function.) + */ + ref = WT_ROW_REF(page, rip); + off = WT_ROW_OFF(rip); + WT_RET(__wt_page_in(toc, page, ref, off, 0)); + ret = __wt_tree_walk(toc, ref, 0, __wt_page_stat, arg); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + return (ret); + WT_STAT_INCR(stats, DUP_TREE); + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * If the data item wasn't deleted, count the key. + * + * If we have processed the key, we have lost the information as + * to whether or not it's an overflow key -- we can figure out + * if it's Huffman encoded by looking at the huffman key, but + * that doesn't tell us if it's an overflow key or not. To fix + * this we'd have to maintain a reference to the on-page key and + * check it, and I'm not willing to spend the additional pointer + * in the WT_ROW structure. 
+ */ + if (__wt_key_process(rip)) + switch (WT_ITEM_TYPE(rip->key)) { + case WT_ITEM_KEY_OVFL: + WT_STAT_INCR(stats, ITEM_KEY_OVFL); + /* FALLTHROUGH */ + case WT_ITEM_KEY: + WT_STAT_INCR(stats, ITEM_TOTAL_KEY); + break; + WT_ILLEGAL_FORMAT(db); + } + else + WT_STAT_INCR(stats, ITEM_TOTAL_KEY); + + } + return (0); +} diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c new file mode 100644 index 00000000000..af5a9d65258 --- /dev/null +++ b/src/btree/bt_sync.c @@ -0,0 +1,61 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_bt_tree_sync(WT_TOC *, WT_PAGE *, void *); + +/* + * __wt_bt_sync -- + * Sync the tree. + */ +int +__wt_bt_sync(WT_TOC *toc) +{ + ENV *env; + IDB *idb; + WT_CACHE *cache; + int ret; + + env = toc->env; + idb = toc->db->idb; + cache = env->ienv->cache; + + if (WT_UNOPENED_DATABASE(idb)) + return (0); + + /* + * The tree walk is depth first, that is, the worker function is not + * called on internal pages until all children have been visited; so, + * we don't have to worry about a page being dirtied after the visit. + * + * Lock out the cache eviction thread, though, we don't want it trying + * to reconcile pages we're flushing. + */ + __wt_lock(env, cache->mtx_reconcile); + ret = __wt_tree_walk(toc, NULL, + WT_WALK_CACHE | WT_WALK_OFFDUP, __wt_bt_tree_sync, NULL); + __wt_unlock(env, cache->mtx_reconcile); + return (ret); +} + +/* + * __wt_bt_tree_sync -- + * Sync a page. + */ +static int +__wt_bt_tree_sync(WT_TOC *toc, WT_PAGE *page, void *arg) +{ + WT_CC_QUIET(arg, NULL); + + /* Reconcile any dirty pages. 
*/ + if (WT_PAGE_IS_MODIFIED(page)) + WT_RET(__wt_page_reconcile(toc, page)); + return (0); +} diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c new file mode 100644 index 00000000000..19e9fccb82a --- /dev/null +++ b/src/btree/bt_vrfy.c @@ -0,0 +1,1346 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * There's a bunch of stuff we pass around during verification, group it + * together to make the code prettier. + */ +typedef struct { + uint32_t frags; /* Total frags */ + bitstr_t *fragbits; /* Frag tracking bit list */ + + FILE *stream; /* Dump file stream */ + + void (*f)(const char *, uint64_t); /* Progress callback */ + uint64_t fcnt; /* Progress counter */ + + WT_PAGE *leaf; /* Last leaf-page */ +} WT_VSTUFF; + +static int __wt_verify_addfrag(WT_TOC *, uint32_t, uint32_t, WT_VSTUFF *); +static int __wt_verify_checkfrag(DB *, WT_VSTUFF *); +static int __wt_verify_delfmt(DB *, uint32_t, uint32_t); +static int __wt_verify_dsk_col_fix(DB *, WT_PAGE_DISK *, uint32_t, uint32_t); +static int __wt_verify_dsk_col_int(DB *, WT_PAGE_DISK *, uint32_t, uint32_t); +static int __wt_verify_dsk_col_rle(DB *, WT_PAGE_DISK *, uint32_t, uint32_t); +static int __wt_verify_dsk_item(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t); +static int __wt_verify_dsk_ovfl(WT_TOC *, WT_PAGE_DISK *, uint32_t, uint32_t); +static int __wt_verify_eof(DB *, uint32_t, uint32_t); +static int __wt_verify_eop(DB *, uint32_t, uint32_t); +static int __wt_verify_key_order(WT_TOC *, WT_PAGE *); +static int __wt_verify_overflow_col(WT_TOC *, WT_PAGE *, WT_VSTUFF *); +static int __wt_verify_overflow_common( + WT_TOC *, WT_OVFL *, uint32_t, uint32_t, WT_VSTUFF *); +static int __wt_verify_overflow_row(WT_TOC *, WT_PAGE *, WT_VSTUFF *); +static int __wt_verify_pc(WT_TOC *, WT_ROW *, WT_PAGE *, int); +static int __wt_verify_tree(WT_TOC *, + WT_ROW *, uint64_t, 
uint64_t, uint32_t, WT_REF *, WT_VSTUFF *); + +/* + * __wt_db_verify -- + * Verify a Btree. + */ +int +__wt_db_verify(WT_TOC *toc, void (*f)(const char *, uint64_t)) +{ + return (__wt_verify(toc, f, NULL)); +} + +/* + * __wt_verify -- + * Verify a Btree, optionally dumping each page in debugging mode. + */ +int +__wt_verify( + WT_TOC *toc, void (*f)(const char *, uint64_t), FILE *stream) +{ + DB *db; + ENV *env; + IDB *idb; + WT_VSTUFF vstuff; + int ret; + + env = toc->env; + db = toc->db; + idb = db->idb; + ret = 0; + + memset(&vstuff, 0, sizeof(vstuff)); + vstuff.stream = stream; + vstuff.f = f; + + /* + * Allocate a bit array, where each bit represents a single allocation + * size piece of the file. This is how we track the parts of the file + * we've verified. Storing this on the heap seems reasonable: with a + * minimum allocation size of 512B, we would allocate 4MB to verify a + * 16GB file. To verify larger files than we can handle this way, we'd + * have to write parts of the bit array into a disk file. + * + * !!! + * There's one portability issue -- the bitstring package uses "ints", + * not unsigned ints, or any fixed size. If an "int" can't hold a + * big enough value, we could lose. There's a check here to make we + * don't overflow. I don't ever expect to see this error message, but + * better safe than sorry. + */ + vstuff.frags = WT_OFF_TO_ADDR(db, idb->fh->file_size); + if (vstuff.frags > INT_MAX) { + __wt_api_db_errx(db, "file is too large to verify"); + goto err; + } + WT_ERR(bit_alloc(env, vstuff.frags, &vstuff.fragbits)); + + /* + * The first sector of the file is the description record -- ignore + * it for now. + */ + bit_nset(vstuff.fragbits, 0, 0); + + /* Verify the tree, starting at the root. */ + WT_ERR(__wt_verify_tree(toc, NULL, WT_RECORDS(&idb->root_off), + (uint64_t)1, WT_NOLEVEL, &idb->root_page, &vstuff)); + + WT_ERR(__wt_verify_checkfrag(db, &vstuff)); + +err: /* Wrap up reporting and free allocated memory. 
*/ + if (vstuff.f != NULL) + vstuff.f(toc->name, vstuff.fcnt); + if (vstuff.fragbits != NULL) + __wt_free(env, vstuff.fragbits, 0); + + return (ret); +} + +/* + * __wt_verify_tree -- + * Verify a tree, recursively descending through it in depth-first fashion. + * The page argument was physically verified (so we know it's correctly formed), + * and the in-memory version built. Our job is to check logical relationships + * in the page and in the tree. + */ +static int +__wt_verify_tree( + WT_TOC *toc, /* Thread of control */ + WT_ROW *parent_rip, /* Internal key referencing this page, if any */ + uint64_t parent_records, /* Parent's count of records in this tree */ + uint64_t start_recno, /* First record on this page */ + uint32_t level, /* Page's tree level */ + WT_REF *ref, /* Already verified page reference */ + WT_VSTUFF *vs) /* The verify package */ +{ + DB *db; + WT_COL *cip; + WT_ITEM *item; + WT_OFF *off; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_REPL *repl; + WT_ROW *rip; + uint64_t records; + uint32_t i; + int is_root, ret; + + db = toc->db; + page = ref->page; + dsk = page->dsk; + ret = 0; + + /* Report progress every 10 pages. */ + if (vs->f != NULL && ++vs->fcnt % 10 == 0) + vs->f(toc->name, vs->fcnt); + + /* Update frags list. */ + WT_ERR(__wt_verify_addfrag(toc, page->addr, page->size, vs)); + +#ifdef DIAGNOSTIC + /* Optionally dump the page in debugging mode. */ + if (vs->stream != NULL) + return (__wt_debug_page(toc, page, NULL, vs->stream)); +#endif + + /* + * The page's physical structure was verified when it was read into + * memory by the read server thread, and then the in-memory version + * of the page was built. Now we make sure the page and tree are + * logically consistent. + * + * !!! 
+ * The problem: (1) the read server has to build the in-memory version + * of the page because the read server is the thread that flags when + * any thread can access the page in the tree; (2) we can't build the + * in-memory version of the page until the physical structure is known + * to be OK, so the read server has to verify at least the physical + * structure of the page; (3) doing complete page verification requires + * reading additional pages (for example, overflow keys imply reading + * overflow pages in order to test the key's order in the page); (4) + * the read server cannot read additional pages because it will hang + * waiting on itself. For this reason, we split page verification + * into a physical verification, which allows the in-memory version + * of the page to be built, and then a subsequent logical verification + * which happens here. + */ + + /* + * If passed a level of WT_NOLEVEL, that is, the only level that can't + * possibly be a valid database page level, this is the root page of + * the tree. + * + * If it's the root, use this page's level to initialize expected the + * values for the rest of the tree. + */ + is_root = level == WT_NOLEVEL ? 1 : 0; + if (is_root) + level = dsk->level; + + /* Check that tree levels and record counts match up. */ + if (dsk->level != level) { + __wt_api_db_errx(db, + "page at addr %lu has a tree level of %lu where the " + "expected level was %lu", + (u_long)page->addr, (u_long)dsk->level, (u_long)level); + goto err; + } + + /* + * Check the record counts. + * + * Confirm the number of records found on this page (by summing the + * WT_OFF structure record counts) matches the WT_OFF structure record + * count in our parent. Use the in-memory record count for internal + * pages -- we could sum the record counts as we walk the page below, + * but we did that when building the in-memory version of the page, + * there's no reason to do it again. 
+ */ + if (page->records != parent_records) { + __wt_api_db_errx(db, + "page at addr %lu has a record count of %llu where the " + "expected record count was %llu", + (u_long)page->addr, page->records, + (unsigned long long)parent_records); + goto err; + } + + /* Check the starting record number. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + if (dsk->start_recno != start_recno) { + __wt_api_db_errx(db, + "page at addr %lu has a starting record of %llu " + "where the expected starting record was %llu", + (u_long)page->addr, + (unsigned long long)dsk->start_recno, + (unsigned long long)start_recno); + goto err; + } + break; + default: + break; + } + + /* + * Check on-page overflow page references. + * + * There's a potential performance problem here: we read key overflow + * pages twice, once when checking the overflow page itself, and again + * when checking the key ordering. It's a pain to combine the two + * tests (the page types with overflow items aren't exactly the same + * as the page types with ordered keys, and the underlying functions + * that instantiate (and decompress) overflow pages don't want to know + * anything about verification), and I don't want to keep the overflow + * keys in the cache, it's likely to be wasted space. Until it's a + * problem, I'm going to assume the second read of the overflow key is + * satisfied in the operating system buffer cache, and not worry about + * it. Table verify isn't likely to be a performance path anyway. + */ + switch (dsk->type) { + case WT_PAGE_COL_VAR: + WT_RET(__wt_verify_overflow_col(toc, page, vs)); + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__wt_verify_overflow_row(toc, page, vs)); + break; + default: + break; + } + + /* Check on-page key ordering. 
*/ + switch (dsk->type) { + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__wt_verify_key_order(toc, page)); + break; + default: + break; + } + + /* Check tree connections and recursively descend the tree. */ + switch (dsk->type) { + case WT_PAGE_COL_INT: + /* For each entry in an internal page, verify the subtree. */ + start_recno = dsk->start_recno; + WT_INDX_FOREACH(page, cip, i) { + /* cip references the subtree containing the record */ + ref = WT_COL_REF(page, cip); + off = WT_COL_OFF(cip); + records = WT_COL_OFF_RECORDS(cip); + WT_ERR(__wt_page_in(toc, page, ref, off, 1)); + ret = __wt_verify_tree(toc, NULL, + records, start_recno, level - 1, ref, vs); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + goto err; + start_recno += records; + } + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + /* + * There are two row-store, logical connection checks: + * + * First, compare the internal node key leading to the current + * page against the first entry on the current page. The + * internal node key must compare less than or equal to the + * first entry on the current page. + * + * Second, compare the largest key we've seen on any leaf page + * against the next internal node key we find. This check is + * a little tricky: every time we find a leaf page, we save a + * reference in the vs->leaf field. The next time we're about + * to indirect through an entry on an internal node, we compare + * the last entry on that saved page against the internal node + * entry's key. In that comparison, the leaf page's key must + * be less than the internal node entry's key. + */ + if (parent_rip != NULL) + WT_ERR(__wt_verify_pc(toc, parent_rip, page, 1)); + + /* For each entry in an internal page, verify the subtree. 
*/ + WT_INDX_FOREACH(page, rip, i) { + /* + * At each off-page entry, we compare the current entry + * against the largest key in the subtree rooted to the + * immediate left of the current item; this key must + * compare less than or equal to the current item. The + * trick here is we need the last leaf key, not the last + * internal node key. It's returned to us in the leaf + * field of the vs structure, whenever we verify a leaf + * page. Discard the leaf node as soon as we've used it + * in a comparison. + */ + if (vs->leaf != NULL) { + WT_ERR( + __wt_verify_pc(toc, rip, vs->leaf, 0)); + __wt_hazard_clear(toc, vs->leaf); + vs->leaf = NULL; + } + /* rip references the subtree containing the record */ + ref = WT_ROW_REF(page, rip); + off = WT_ROW_OFF(rip); + records = WT_ROW_OFF_RECORDS(rip); + WT_ERR(__wt_page_in(toc, page, ref, off, 1)); + ret = __wt_verify_tree(toc, rip, + records, (uint64_t)0, level - 1, ref, vs); + + /* + * Remaining special handling of the last verified leaf + * page: if we kept a reference to that page, don't + * release the hazard reference until after comparing + * the last key on that page against the next key in the + * tree. + */ + if (vs->leaf != ref->page) + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + goto err; + } + break; + case WT_PAGE_ROW_LEAF: + /* + * For each entry in a row-store leaf page, verify any off-page + * duplicates tree. + */ + WT_INDX_FOREACH(page, rip, i) { + /* Ignore anything except off-page duplicate trees. */ + if ((repl = WT_ROW_REPL( + page, rip)) != NULL && WT_REPL_DELETED_ISSET(repl)) + continue; + item = rip->data; + if (WT_ITEM_TYPE(item) != WT_ITEM_OFF) + continue; + + /* Verify the off-page duplicate tree. 
*/ + ref = WT_ROW_DUP(page, rip); + off = WT_ROW_OFF(rip); + records = WT_ROW_OFF_RECORDS(rip); + WT_ERR(__wt_page_in(toc, page, ref, off, 1)); + ret = __wt_verify_tree(toc, NULL, + records, (uint64_t)0, WT_NOLEVEL, ref, vs); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + goto err; + } + /* FALLTHROUGH */ + case WT_PAGE_DUP_LEAF: + /* + * Retain a reference to all row-store leaf pages, we need them + * to check their last entry against the next internal key in + * the tree. + */ + vs->leaf = page; + return (0); + default: + break; + } + + /* + * The largest key on the last leaf page in the tree is never needed, + * there aren't any internal pages after it. So, we get here with + * vs->leaf needing to be released. + */ +err: if (vs->leaf != NULL) { + __wt_hazard_clear(toc, vs->leaf); + vs->leaf = NULL; + } + + return (ret); +} + +/* + * __wt_verify_pc -- + * Compare a key on a parent page to a designated entry on a child page. + */ +static int +__wt_verify_pc(WT_TOC *toc, WT_ROW *parent_rip, WT_PAGE *child, int first_entry) +{ + DB *db; + DBT *cd_ref, *pd_ref, *scratch1, *scratch2; + WT_ROW *child_rip; + int cmp, ret, (*func)(DB *, const DBT *, const DBT *); + + db = toc->db; + scratch1 = scratch2 = NULL; + ret = 0; + + /* Set the comparison function. */ + switch (child->dsk->type) { + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + func = db->btree_compare_dup; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + func = db->btree_compare; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * The two keys we're going to compare may be overflow keys -- don't + * bother instantiating the keys in the tree, there's no reason to + * believe we're going to be working in this database. + */ + child_rip = first_entry ? 
+ child->u.irow : child->u.irow + (child->indx_count - 1); + if (__wt_key_process(child_rip)) { + WT_ERR(__wt_scr_alloc(toc, 0, &scratch1)); + WT_ERR(__wt_item_process(toc, child_rip->key, scratch1)); + cd_ref = scratch1; + } else + cd_ref = (DBT *)child_rip; + if (__wt_key_process(parent_rip)) { + WT_ERR(__wt_scr_alloc(toc, 0, &scratch2)); + WT_RET(__wt_item_process(toc, parent_rip->key, scratch2)); + pd_ref = scratch2; + } else + pd_ref = (DBT *)parent_rip; + + /* Compare the parent's key against the child's key. */ + cmp = func(db, cd_ref, pd_ref); + + if (first_entry && cmp < 0) { + __wt_api_db_errx(db, + "the first key on page at addr %lu sorts before its " + "reference key on its parent's page", + (u_long)child->addr); + ret = WT_ERROR; + } + if (!first_entry && cmp >= 0) { + __wt_api_db_errx(db, + "the last key on the page at addr %lu sorts after a parent " + "page's key for the subsequent page", + (u_long)child->addr); + ret = WT_ERROR; + } + +err: if (scratch1 != NULL) + __wt_scr_release(&scratch1); + if (scratch2 != NULL) + __wt_scr_release(&scratch2); + + return (ret); +} + +/* + * __wt_verify_key_order -- + * Check on-page key ordering. + */ +static int +__wt_verify_key_order(WT_TOC *toc, WT_PAGE *page) +{ + struct { + DBT *dbt; /* DBT to compare */ + DBT *scratch; /* scratch buffer */ + } *current, *last, _a, _b; + DB *db; + WT_PAGE_DISK *dsk; + WT_ROW *rip; + uint32_t i; + int (*func)(DB *, const DBT *, const DBT *), ret; + + db = toc->db; + dsk = page->dsk; + ret = 0; + + WT_CLEAR(_a); + WT_CLEAR(_b); + current = &_a; + WT_ERR(__wt_scr_alloc(toc, 0, ¤t->scratch)); + last = &_b; + WT_ERR(__wt_scr_alloc(toc, 0, &last->scratch)); + + /* Set the comparison function. */ + switch (dsk->type) { + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + func = db->btree_compare_dup; + break; + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + func = db->btree_compare; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* Walk the page, comparing keys. 
*/ + WT_INDX_FOREACH(page, rip, i) { + /* Skip duplicates */ + if (WT_ROW_INDX_IS_DUPLICATE(page, rip)) + continue; + + /* + * The two keys we're going to compare may be overflow keys -- + * don't bother instantiating the keys in the tree, there's no + * reason to believe we're going to be working in this database. + */ + if (__wt_key_process(rip)) { + WT_RET(__wt_item_process( + toc, rip->key, current->scratch)); + current->dbt = current->scratch; + } else + current->dbt = (DBT *)rip; + + /* Compare the current key against the last key. */ + if (last->dbt != NULL && + func(db, last->dbt, current->dbt) >= 0) { + __wt_api_db_errx(db, + "the %lu and %lu keys on page at addr %lu are " + "incorrectly sorted", + (u_long)WT_ROW_SLOT(page, rip) - 1, + (u_long)WT_ROW_SLOT(page, rip), + (u_long)page->addr); + ret = WT_ERROR; + goto err; + } + } + +err: if (_a.scratch != NULL) + __wt_scr_release(&_a.scratch); + if (_b.scratch != NULL) + __wt_scr_release(&_b.scratch); + + return (ret); +} + +/* + * __wt_verify_dsk_page -- + * Verify a single Btree page as read from disk. + */ +int +__wt_verify_dsk_page( + WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + DB *db; + + db = toc->db; + + /* Check the page type. */ + switch (dsk->type) { + case WT_PAGE_FREE: + /* + * Free pages are only written in diagnostic mode, and the + * type is the only thing that can be verified about them. + */ + return (0); + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_INT: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_OVFL: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + break; + case WT_PAGE_INVALID: + default: + __wt_api_db_errx(db, + "page at addr %lu has an invalid type of %lu", + (u_long)addr, (u_long)dsk->type); + return (WT_ERROR); + } + + /* + * FUTURE: + * Check the LSN against the existing log files. 
+ */ + if (dsk->lsn[0] != 0 || dsk->lsn[1] != 0) { + __wt_api_db_errx(db, + "page at addr %lu has non-zero lsn header fields", + (u_long)addr); + return (WT_ERROR); + } + + /* Ignore the checksum -- it verified when we first read the page. */ + + /* Check the page level. */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_RLE: + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_OVFL: + case WT_PAGE_ROW_LEAF: + if (dsk->level != WT_LLEAF) + goto err_level; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + if (dsk->level <= WT_LLEAF) { +err_level: __wt_api_db_errx(db, + "%s page at addr %lu has incorrect tree level " + "of %lu", + __wt_page_type_string(dsk), + (u_long)addr, (u_long)dsk->level); + return (WT_ERROR); + } + break; + WT_ILLEGAL_FORMAT(db); + } + + if (dsk->unused[0] != '\0' || dsk->unused[1] != '\0') { + __wt_api_db_errx(db, + "page at addr %lu has non-zero unused header fields", + (u_long)addr); + return (WT_ERROR); + } + + /* Verify the items on the page. */ + switch (dsk->type) { + case WT_PAGE_COL_VAR: + case WT_PAGE_DUP_INT: + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + WT_RET(__wt_verify_dsk_item(toc, dsk, addr, size)); + break; + case WT_PAGE_COL_INT: + WT_RET(__wt_verify_dsk_col_int(db, dsk, addr, size)); + break; + case WT_PAGE_COL_FIX: + WT_RET(__wt_verify_dsk_col_fix(db, dsk, addr, size)); + break; + case WT_PAGE_COL_RLE: + WT_RET(__wt_verify_dsk_col_rle(db, dsk, addr, size)); + break; + case WT_PAGE_OVFL: + WT_RET(__wt_verify_dsk_ovfl(toc, dsk, addr, size)); + break; + WT_ILLEGAL_FORMAT(db); + } + + return (0); +} + +/* + * __wt_verify_dsk_item -- + * Walk a disk page of WT_ITEMs, and verify them. 
+ */ +static int +__wt_verify_dsk_item( + WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + enum { IS_FIRST, WAS_KEY, WAS_DATA, WAS_DUP_DATA } last_item_type; + DB *db; + WT_ITEM *item; + WT_OVFL *ovfl; + WT_OFF *off; + off_t file_size; + uint8_t *end; + uint32_t i, item_num, item_len, item_type; + + db = toc->db; + file_size = db->idb->fh->file_size; + + end = (uint8_t *)dsk + size; + + last_item_type = IS_FIRST; + item_num = 0; + WT_ITEM_FOREACH(dsk, item, i) { + ++item_num; + + /* Check if this item is entirely on the page. */ + if ((uint8_t *)item + sizeof(WT_ITEM) > end) + goto eop; + + item_type = WT_ITEM_TYPE(item); + item_len = WT_ITEM_LEN(item); + + /* Check the item's type. */ + switch (item_type) { + case WT_ITEM_KEY: + case WT_ITEM_KEY_OVFL: + if (dsk->type != WT_PAGE_ROW_INT && + dsk->type != WT_PAGE_ROW_LEAF) + goto item_vs_page; + break; + case WT_ITEM_KEY_DUP: + case WT_ITEM_KEY_DUP_OVFL: + if (dsk->type != WT_PAGE_DUP_INT) + goto item_vs_page; + break; + case WT_ITEM_DATA: + case WT_ITEM_DATA_OVFL: + if (dsk->type != WT_PAGE_COL_VAR && + dsk->type != WT_PAGE_ROW_LEAF) + goto item_vs_page; + break; + case WT_ITEM_DATA_DUP: + case WT_ITEM_DATA_DUP_OVFL: + if (dsk->type != WT_PAGE_DUP_LEAF && + dsk->type != WT_PAGE_ROW_LEAF) + goto item_vs_page; + break; + case WT_ITEM_DEL: + /* Deleted items only appear on column-store pages. 
*/ + if (dsk->type != WT_PAGE_COL_VAR) + goto item_vs_page; + break; + case WT_ITEM_OFF: + if (dsk->type != WT_PAGE_DUP_INT && + dsk->type != WT_PAGE_ROW_INT && + dsk->type != WT_PAGE_ROW_LEAF) { +item_vs_page: __wt_api_db_errx(db, + "illegal item and page type combination " + "(item %lu on page at addr %lu is a %s " + "item on a %s page)", + (u_long)item_num, (u_long)addr, + __wt_item_type_string(item), + __wt_page_type_string(dsk)); + return (WT_ERROR); + } + break; + default: + __wt_api_db_errx(db, + "item %lu on page at addr %lu has an illegal type " + "of %lu", + (u_long)item_num, (u_long)addr, (u_long)item_type); + return (WT_ERROR); + } + + /* + * Check the item type ordering. For row-stores, check for: + * two keys in a row, + * two non-dup data items in a row, + * a non-dup data item followed by a dup data item + * a data item as the first item on a page. + * + * Column-stores only have data items, and we already checked + * to see if there was anything else on the page. Skip the + * order check. 
+ */ + if (dsk->type == WT_PAGE_COL_VAR) + goto skip_order_check; + + switch (item_type) { + case WT_ITEM_KEY: + case WT_ITEM_KEY_OVFL: + case WT_ITEM_KEY_DUP: + case WT_ITEM_KEY_DUP_OVFL: + switch (last_item_type) { + case IS_FIRST: + case WAS_DATA: + case WAS_DUP_DATA: + last_item_type = WAS_KEY; + break; + case WAS_KEY: + __wt_api_db_errx(db, + "item %lu on page at addr %lu is first of " + "two adjacent keys", + (u_long)item_num - 1, (u_long)addr); + return (WT_ERROR); + } + break; + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + case WT_ITEM_DATA_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DEL: + case WT_ITEM_OFF: + if (last_item_type == IS_FIRST) { + __wt_api_db_errx(db, + "page at addr %lu begins with a data item", + (u_long)addr); + return (WT_ERROR); + } + switch (item_type) { + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + case WT_ITEM_DEL: + case WT_ITEM_OFF: + switch (last_item_type) { + case IS_FIRST: + case WAS_DATA: + case WAS_DUP_DATA: + __wt_api_db_errx(db, + "item %lu on page at addr %lu is " + "the first of two adjacent data " + "items", + (u_long)item_num - 1, (u_long)addr); + return (WT_ERROR); + case WAS_KEY: + last_item_type = WAS_DATA; + break; + } + break; + case WT_ITEM_DATA_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + switch (last_item_type) { + case WAS_DATA: + __wt_api_db_errx(db, + "item %lu on page at addr %lu is " + "a non-duplicate data item " + "followed by a duplicate data item", + (u_long)item_num - 1, (u_long)addr); + return (WT_ERROR); + case IS_FIRST: + case WAS_DUP_DATA: + case WAS_KEY: + last_item_type = WAS_DUP_DATA; + break; + } + break; + default: + break; + } + break; + default: + break; + } + +skip_order_check: + /* Check the item's length. */ + switch (item_type) { + case WT_ITEM_KEY: + case WT_ITEM_KEY_DUP: + case WT_ITEM_DATA: + case WT_ITEM_DATA_DUP: + /* The length is variable, we can't check it. 
*/ + break; + case WT_ITEM_KEY_OVFL: + case WT_ITEM_KEY_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + if (item_len != sizeof(WT_OVFL)) + goto item_len; + break; + case WT_ITEM_DEL: + if (item_len != 0) + goto item_len; + break; + case WT_ITEM_OFF: + if (item_len != sizeof(WT_OFF)) { +item_len: __wt_api_db_errx(db, + "item %lu on page at addr %lu has an " + "incorrect length", + (u_long)item_num, (u_long)addr); + return (WT_ERROR); + } + break; + default: + break; + } + + /* Check if the item is entirely on the page. */ + if ((uint8_t *)WT_ITEM_NEXT(item) > end) + goto eop; + + /* Check if the referenced item is entirely in the file. */ + switch (item_type) { + case WT_ITEM_KEY_OVFL: + case WT_ITEM_KEY_DUP_OVFL: + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + ovfl = WT_ITEM_BYTE_OVFL(item); + if (WT_ADDR_TO_OFF(db, ovfl->addr) + + WT_HDR_BYTES_TO_ALLOC(db, ovfl->size) > file_size) + goto eof; + break; + case WT_ITEM_OFF: + off = WT_ITEM_BYTE_OFF(item); + if (WT_ADDR_TO_OFF(db, off->addr) + + off->size > file_size) + goto eof; + break; + default: + break; + } + } + return (0); + +eof: return (__wt_verify_eof(db, item_num, addr)); +eop: return (__wt_verify_eop(db, item_num, addr)); +} + +/* + * __wt_verify_dsk_col_int -- + * Walk a WT_PAGE_COL_INT disk page and verify it. + */ +static int +__wt_verify_dsk_col_int(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + IDB *idb; + WT_OFF *off; + uint8_t *end; + uint32_t i, entry_num; + + idb = db->idb; + end = (uint8_t *)dsk + size; + + entry_num = 0; + WT_OFF_FOREACH(dsk, off, i) { + ++entry_num; + + /* Check if this entry is entirely on the page. */ + if ((uint8_t *)off + sizeof(WT_OFF) > end) + return (__wt_verify_eop(db, entry_num, addr)); + + /* Check if the reference is past the end-of-file. 
*/ + if (WT_ADDR_TO_OFF( + db, off->addr) + off->size > idb->fh->file_size) + return (__wt_verify_eof(db, entry_num, addr)); + } + + return (0); +} + +/* + * __wt_verify_dsk_col_fix -- + * Walk a WT_PAGE_COL_FIX disk page and verify it. + */ +static int +__wt_verify_dsk_col_fix(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + u_int len; + uint32_t i, j, entry_num; + uint8_t *data, *end, *p; + + len = db->fixed_len; + end = (uint8_t *)dsk + size; + + entry_num = 0; + WT_FIX_FOREACH(db, dsk, data, i) { + ++entry_num; + + /* Check if this entry is entirely on the page. */ + if (data + len > end) + return (__wt_verify_eop(db, entry_num, addr)); + + /* Deleted items are entirely nul bytes. */ + p = data; + if (WT_FIX_DELETE_ISSET(data)) { + if (*p != WT_FIX_DELETE_BYTE) + goto delfmt; + for (j = 1; j < db->fixed_len; ++j) + if (*++p != '\0') + goto delfmt; + } + } + + return (0); + +delfmt: return (__wt_verify_delfmt(db, entry_num, addr)); +} + +/* + * __wt_verify_dsk_col_rle -- + * Walk a WT_PAGE_COL_RLE disk page and verify it. + */ +static int +__wt_verify_dsk_col_rle(DB *db, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + u_int len; + uint32_t i, j, entry_num; + uint8_t *data, *end, *last_data, *p; + + end = (uint8_t *)dsk + size; + + last_data = NULL; + len = db->fixed_len + sizeof(uint16_t); + + entry_num = 0; + WT_RLE_REPEAT_FOREACH(db, dsk, data, i) { + ++entry_num; + + /* Check if this entry is entirely on the page. */ + if (data + len > end) + return (__wt_verify_eop(db, entry_num, addr)); + + /* Count must be non-zero. */ + if (WT_RLE_REPEAT_COUNT(data) == 0) { + __wt_api_db_errx(db, + "fixed-length entry %lu on page at addr " + "%lu has a repeat count of 0", + (u_long)entry_num, (u_long)addr); + return (WT_ERROR); + } + + /* Deleted items are entirely nul bytes. 
*/ + p = WT_RLE_REPEAT_DATA(data); + if (WT_FIX_DELETE_ISSET(p)) { + if (*p != WT_FIX_DELETE_BYTE) + goto delfmt; + for (j = 1; j < db->fixed_len; ++j) + if (*++p != '\0') + goto delfmt; + } + + /* + * If the previous data is the same as this data, we + * missed an opportunity for compression -- complain. + */ + if (last_data != NULL && + memcmp(WT_RLE_REPEAT_DATA(last_data), + WT_RLE_REPEAT_DATA(data), db->fixed_len) == 0 && + WT_RLE_REPEAT_COUNT(last_data) < UINT16_MAX) { + __wt_api_db_errx(db, + "fixed-length entries %lu and %lu on page " + "at addr %lu are identical and should have " + "been compressed", + (u_long)entry_num, + (u_long)entry_num - 1, (u_long)addr); + return (WT_ERROR); + } + last_data = data; + } + + return (0); + +delfmt: return (__wt_verify_delfmt(db, entry_num, addr)); +} + +/* + * __wt_verify_overflow_col -- + * Check on-page column-store overflow references. + */ +static int +__wt_verify_overflow_col(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs) +{ + WT_COL *cip; + WT_ITEM *item; + uint32_t i; + + /* Walk the in-memory page, verifying overflow items. */ + WT_INDX_FOREACH(page, cip, i) { + item = cip->data; + if (WT_ITEM_TYPE(item) == WT_ITEM_DATA_OVFL) + WT_RET(__wt_verify_overflow_common( + toc, WT_ITEM_BYTE_OVFL(item), + WT_COL_SLOT(page, cip) + 1, page->addr, vs)); + } + return (0); +} + +/* + * __wt_verify_overflow_row -- + * Check on-page row-store overflow references. + */ +static int +__wt_verify_overflow_row(WT_TOC *toc, WT_PAGE *page, WT_VSTUFF *vs) +{ + WT_ITEM *item; + WT_ROW *rip; + uint32_t i; + int check_data; + + /* + * Walk the in-memory page, verifying overflow items. We service 4 + * page types here: DUP_INT, DUP_LEAF, ROW_INT and ROW_LEAF. In the + * case of DUP_INT, DUP_LEAF and ROW_INT, we only check the key, as + * there is either no data item, or the data item is known to not be + * an overflow page. In the case of ROW_LEAF, we have to check both + * the key and the data item. 
+ */ + check_data = page->dsk->type == WT_PAGE_ROW_LEAF ? 1 : 0; + + /* Walk the in-memory page, verifying overflow items. */ + WT_INDX_FOREACH(page, rip, i) { + item = rip->key; + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_KEY_OVFL: + case WT_ITEM_KEY_DUP_OVFL: + WT_RET(__wt_verify_overflow_common( + toc, WT_ITEM_BYTE_OVFL(item), + WT_ROW_SLOT(page, rip) + 1, page->addr, vs)); + break; + default: + break; + } + + if (!check_data) + continue; + + item = rip->data; + switch (WT_ITEM_TYPE(item)) { + case WT_ITEM_DATA_OVFL: + case WT_ITEM_DATA_DUP_OVFL: + WT_RET(__wt_verify_overflow_common( + toc, WT_ITEM_BYTE_OVFL(item), + WT_ROW_SLOT(page, rip) + 1, page->addr, vs)); + break; + default: + break; + } + } + return (0); +} + +/* + * __wt_verify_overflow_common -- + * Common code that reads in an overflow page and checks it. + */ +static int +__wt_verify_overflow_common(WT_TOC *toc, + WT_OVFL *ovfl, uint32_t entry_num, uint32_t page_ref_addr, WT_VSTUFF *vs) +{ + DB *db; + DBT *scratch1; + WT_PAGE_DISK *dsk; + uint32_t addr, size; + int ret; + + db = toc->db; + scratch1 = NULL; + ret = 0; + + addr = ovfl->addr; + size = WT_HDR_BYTES_TO_ALLOC(db, ovfl->size); + + /* Allocate enough memory to hold the overflow pages. */ + WT_RET(__wt_scr_alloc(toc, size, &scratch1)); + + /* Read the page. */ + dsk = scratch1->data; + WT_ERR(__wt_page_disk_read(toc, dsk, addr, size)); + + /* + * Verify the disk image -- this function would normally be called + * from the asynchronous read server, but overflow pages are read + * synchronously. Regardless, we break the overflow verification code + * into two parts, on-disk format checking and internal checking, + * just so it looks like all of the other page type checking. + */ + WT_ERR(__wt_verify_dsk_ovfl(toc, dsk, addr, size)); + + /* Add the fragments. */ + WT_ERR(__wt_verify_addfrag(toc, addr, size, vs)); + + /* + * The only other thing to check is that the size we have in the page + * matches the size on the underlying overflow page. 
+ */ + if (ovfl->size != dsk->u.datalen) { + __wt_api_db_errx(db, + "overflow page reference in item %lu on page at addr %lu " + "does not match the data size on the overflow page", + (u_long)entry_num, (u_long)page_ref_addr); + ret = WT_ERROR; + } + +err: __wt_scr_release(&scratch1); + + return (ret); +} + +/* + * __wt_verify_dsk_ovfl -- + * Verify a WT_PAGE_OVFL disk page. + */ +static int +__wt_verify_dsk_ovfl( + WT_TOC *toc, WT_PAGE_DISK *dsk, uint32_t addr, uint32_t size) +{ + DB *db; + uint32_t len; + uint8_t *p; + + db = toc->db; + + if (dsk->u.datalen == 0) { + __wt_api_db_errx(db, + "overflow page at addr %lu has no data", (u_long)addr); + return (WT_ERROR); + } + + /* Any page data after the overflow record should be nul bytes. */ + p = (uint8_t *)dsk + (sizeof(WT_PAGE_DISK) + dsk->u.datalen); + len = size - (sizeof(WT_PAGE_DISK) + dsk->u.datalen); + for (; len > 0; ++p, --len) + if (*p != '\0') { + __wt_api_db_errx(db, + "overflow page at addr %lu has non-zero trailing " + "bytes", + (u_long)addr); + return (WT_ERROR); + } + + return (0); +} + +/* + * __wt_verify_eop -- + * Generic item extends past the end-of-page error. + */ +static int +__wt_verify_eop(DB *db, uint32_t entry_num, uint32_t addr) +{ + __wt_api_db_errx(db, + "item %lu on page at addr %lu extends past the end of the page", + (u_long)entry_num, (u_long)addr); + return (WT_ERROR); +} + +/* + * __wt_verify_eof -- + * Generic item references non-existent file pages error. + */ +static int +__wt_verify_eof(DB *db, uint32_t entry_num, uint32_t addr) +{ + __wt_api_db_errx(db, + "off-page item %lu on page at addr %lu references non-existent " + "file pages", + (u_long)entry_num, (u_long)addr); + return (WT_ERROR); +} + +/* + * __wt_verify_delfmt -- + * WT_PAGE_COL_FIX and WT_PAGE_COL_RLE error where a deleted item has + * non-nul bytes. 
+ */ +static int +__wt_verify_delfmt(DB *db, uint32_t entry_num, uint32_t addr) +{ + __wt_api_db_errx(db, + "deleted fixed-length entry %lu on page at addr %lu has non-nul " + "bytes", + (u_long)entry_num, (u_long)addr); + return (WT_ERROR); +} + +/* + * __wt_verify_addfrag -- + * Add the WT_PAGE's fragments to the list, and complain if we've already + * verified this chunk of the file. + */ +static int +__wt_verify_addfrag(WT_TOC *toc, uint32_t addr, uint32_t size, WT_VSTUFF *vs) +{ + DB *db; + uint32_t frags, i; + + db = toc->db; + + frags = WT_OFF_TO_ADDR(db, size); + for (i = 0; i < frags; ++i) + if (bit_test(vs->fragbits, addr + i)) { + __wt_api_db_errx(db, + "page fragment at addr %lu already verified", + (u_long)addr); + return (0); + } + bit_nset(vs->fragbits, addr, addr + (frags - 1)); + return (0); +} + +/* + * __wt_verify_checkfrag -- + * Verify we've checked all the fragments in the file. + */ +static int +__wt_verify_checkfrag(DB *db, WT_VSTUFF *vs) +{ + int ffc, ffc_start, ffc_end, frags, ret; + + frags = (int)vs->frags; /* XXX: bitstring.h wants "ints" */ + ret = 0; + + /* Check for page fragments we haven't verified. */ + for (ffc_start = ffc_end = -1;;) { + bit_ffc(vs->fragbits, frags, &ffc); + if (ffc != -1) { + bit_set(vs->fragbits, ffc); + if (ffc_start == -1) { + ffc_start = ffc_end = ffc; + continue; + } + if (ffc_end == ffc - 1) { + ffc_end = ffc; + continue; + } + } + if (ffc_start != -1) { + if (ffc_start == ffc_end) + __wt_api_db_errx(db, + "fragment %d was never verified", + ffc_start); + else + __wt_api_db_errx(db, + "fragments %d to %d were never verified", + ffc_start, ffc_end); + ret = WT_ERROR; + } + ffc_start = ffc_end = ffc; + if (ffc == -1) + break; + } + return (ret); +} diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c new file mode 100644 index 00000000000..f5ef9674f9b --- /dev/null +++ b/src/btree/bt_walk.c @@ -0,0 +1,306 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * There are two tree-walk implementations: a textbook, depth-first recursive + * tree walk in __wt_tree_walk(), and a non-recursive, depth-first tree walk + * in __wt_walk_{begin,end,next}(). + * + * The simple recursive walk is sufficient in most cases -- a hazard reference + * is obtained on each page in turn, a worker function is called on the page, + * then the hazard reference is released. + * + * The complicated tree walk routine was added because the cache eviction code + * needs: + * + to walk the tree a few pages at a time, that is, periodically wake, + * visit some pages, then go back to sleep, which requires enough state + * to restart the traversal at any point, + * + to only visit pages that currently appear in the cache, + * + to return the WT_REF structure (not the WT_PAGE structure), + * + to walk files not associated with the current WT_TOC's DB handle, + * + and finally, it doesn't require a hazard reference. + * + * My guess is we'll generalize a more complicated walk at some point, which + * means some or all of those behaviors will become configurable, and that's + * why the code lives here instead of in the eviction code. + */ + +/* + * __wt_tree_walk -- + * Depth-first recursive walk of a btree, calling a worker function on + * each page. + */ +int +__wt_tree_walk(WT_TOC *toc, WT_REF *ref, + uint32_t flags, int (*work)(WT_TOC *, WT_PAGE *, void *), void *arg) +{ + IDB *idb; + WT_COL *cip; + WT_OFF *off; + WT_PAGE *page; + WT_ROW *rip; + uint32_t i; + int ret; + + WT_ENV_FCHK( + toc->env, "__wt_tree_walk", flags, WT_APIMASK_BT_TREE_WALK); + + idb = toc->db->idb; + + /* + * A NULL WT_REF means to start at the top of the tree -- it's just + * a convenience. + */ + page = ref == NULL ? idb->root_page.page : ref->page; + + /* + * Walk any internal pages, descending through any off-page references. 
+ * + * Descending into row-store off-page duplicate trees is optional for + * two reasons. (1) it may be faster to call this function recursively + * from the worker function, which is already walking the page, and (2) + * information for off-page dup trees is split (the key is on the + * row-leaf page, and the data is obviously in the off-page dup tree): + * we need the key when we dump the data, and that would be a hard + * special case in this code. Functions where it's both possible and + * no slower to walk off-page duplicate trees in this code can request + * it be done here. + */ + switch (page->dsk->type) { + case WT_PAGE_COL_INT: + WT_INDX_FOREACH(page, cip, i) { + /* cip references the subtree containing the record */ + ref = WT_COL_REF(page, cip); + if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK) + continue; + + off = WT_COL_OFF(cip); + WT_RET(__wt_page_in(toc, page, ref, off, 0)); + ret = __wt_tree_walk(toc, ref, flags, work, arg); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + return (ret); + } + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_INDX_FOREACH(page, rip, i) { + /* rip references the subtree containing the record */ + ref = WT_ROW_REF(page, rip); + if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK) + continue; + + off = WT_ROW_OFF(rip); + WT_RET(__wt_page_in(toc, page, ref, off, 0)); + ret = __wt_tree_walk(toc, ref, flags, work, arg); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + return (ret); + } + break; + case WT_PAGE_ROW_LEAF: + if (!LF_ISSET(WT_WALK_OFFDUP)) + break; + WT_INDX_FOREACH(page, rip, i) { + if (WT_ITEM_TYPE(rip->data) != WT_ITEM_OFF) + break; + + /* + * Recursively call the tree-walk function for the + * off-page duplicate tree. 
+ */ + ref = WT_ROW_REF(page, rip); + if (LF_ISSET(WT_WALK_CACHE) && ref->state != WT_OK) + continue; + + off = WT_ROW_OFF(rip); + WT_RET(__wt_page_in(toc, page, ref, off, 0)); + ret = __wt_tree_walk(toc, ref, flags, work, arg); + __wt_hazard_clear(toc, ref->page); + if (ret != 0) + return (ret); + } + break; + default: + break; + } + + /* + * Don't call the worker function for any page until all of its children + * have been visited. This allows the walker function to be used for + * the sync method, where reconciling a modified child page modifies the + * parent. + */ + WT_RET(work(toc, page, arg)); + + return (0); +} + +/* + * __wt_walk_begin -- + * Start a tree walk. + */ +int +__wt_walk_begin(WT_TOC *toc, WT_REF *ref, WT_WALK *walk) +{ + ENV *env; + + env = toc->env; + + /* + * The caller may be restarting a walk, so the structure may already + * be allocated. Allocate 20 slots: it's always going to be enough. + */ + if (walk->tree_len == 0) + WT_RET(__wt_realloc(env, &walk->tree_len, + 20 * sizeof(WT_WALK_ENTRY), &walk->tree)); + walk->tree_slot = 0; + + walk->tree[0].ref = ref; + walk->tree[0].indx = 0; + walk->tree[0].visited = 0; + + return (0); +} + +/* + * __wt_walk_end -- + * End a tree walk. + */ +void +__wt_walk_end(ENV *env, WT_WALK *walk) +{ + __wt_free(env, walk->tree, walk->tree_len); +} + +/* + * __wt_walk_next -- + * Return the next WT_REF/WT_PAGE in the tree, in a non-recursive way. + */ +int +__wt_walk_next(WT_TOC *toc, WT_WALK *walk, WT_REF **refp) +{ + DB *db; + ENV *env; + WT_PAGE *page, *child; + WT_REF *ref; + WT_WALK_ENTRY *e; + uint elem; + + env = toc->env; + db = toc->db; + + e = &walk->tree[walk->tree_slot]; + page = e->ref->page; + + /* + * Coming into this function we have either a tree internal page (and + * we're walking the array of children), or a row-leaf page (and we're + * walking the array of off-page duplicate trees). + * + * If we've reached the end of this page, and haven't yet returned it, + * do that now. 
If the page has been returned, traversal is finished: + * pop the stack and call ourselves recursively, unless the entire tree + * has been traversed, in which case we return NULL. + */ + if (e->visited) { + if (walk->tree_slot == 0) { + *refp = NULL; + return (0); + } else { + --walk->tree_slot; + return (__wt_walk_next(toc, walk, refp)); + } + } else + if (e->indx == page->indx_count) { +eop: e->visited = 1; + *refp = e->ref; + return (0); + } + + /* Find the next WT_REF/WT_PAGE pair present in the cache. */ + for (;;) { + switch (page->dsk->type) { + case WT_PAGE_ROW_LEAF: + ref = page->u3.dup[e->indx]; + break; + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + ref = &page->u3.ref[e->indx]; + break; + WT_ILLEGAL_FORMAT(db); + } + + /* + * The row-leaf page off-page duplicates tree array has empty + * slots (unlike col/row internal pages), so check for a NULL + * ref. + * + * We only care about pages in the cache. + */ + if (ref != NULL && ref->state == WT_OK) + break; + + /* + * If we don't find another WT_REF/WT_OFF pair, do the + * post-order visit. + */ + if (++e->indx == page->indx_count) + goto eop; + } + + /* + * Check to see if the page has sub-trees associated with it, in which + * case we traverse those pages. + */ + child = ref->page; + switch (child->dsk->type) { + case WT_PAGE_ROW_LEAF: + /* + * Check for off-page duplicates -- if there are any, push them + * onto the stack and recursively call ourselves to descend the + * tree. + */ + if (!WT_PAGE_DUP_TREES(child)) + break; + /* FALLTHROUGH */ + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + /* + * The page has children. + * + * First, move past this child, then push the child onto our + * stack, and recursively descend the tree. + */ + ++e->indx; + + /* Check to see if we grew past the end of our stack. 
*/ + elem = walk->tree_len / sizeof(WT_WALK_ENTRY); + if (walk->tree_slot >= elem) + WT_RET(__wt_realloc(env, &walk->tree_len, + (elem + 20) * sizeof(WT_WALK_ENTRY), &walk->tree)); + + e = &walk->tree[++walk->tree_slot]; + e->ref = ref; + e->indx = 0; + e->visited = 0; + return (__wt_walk_next(toc, walk, refp)); + default: + break; + } + + /* Return the child page, it's not interesting for further traversal. */ + ++e->indx; + *refp = ref; + return (0); +} diff --git a/src/btree/c_drain.c b/src/btree/c_drain.c new file mode 100644 index 00000000000..c213f652e75 --- /dev/null +++ b/src/btree/c_drain.c @@ -0,0 +1,940 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2010 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_evict(WT_TOC *); +static int __wt_evict_compare_lru(const void *a, const void *b); +static int __wt_evict_compare_page(const void *a, const void *b); +static void __wt_evict_hazard_check(WT_TOC *); +static int __wt_evict_hazard_compare(const void *a, const void *b); +static void __wt_evict_page(WT_TOC *, int); +static int __wt_evict_page_subtrees(WT_PAGE *); +static void __wt_evict_set(WT_TOC *); +static void __wt_evict_state_check(WT_TOC *); +static int __wt_evict_walk(WT_TOC *); +static int __wt_evict_walk_single(WT_TOC *, IDB *, uint); +static void __wt_evict_write(WT_TOC *); + +#ifdef HAVE_DIAGNOSTIC +static void __wt_evict_hazard_validate(ENV *, WT_PAGE *); +#endif + +/* + * Tuning constants -- I hesitate to call this tuning, but we should review some + * number of pages from each file's in-memory tree for each page we evict, and + * we should amortize the comparison of the hazard references across some number + * of eviction candidates. 
+ */ +#define WT_EVICT_GROUP 10 /* Evict N pages at a time */ +#define WT_EVICT_WALK_PER_TABLE 5 /* Pages to visit per file */ +#define WT_EVICT_WALK_BASE 25 /* Pages tracked across file visits */ + +/* + * WT_EVICT_FOREACH -- + * Walk a list of eviction candidates. + */ +#define WT_EVICT_FOREACH(cache, p, i) \ + for ((i) = 0, (p) = (cache)->evict; (i) < WT_EVICT_GROUP; ++(i), ++(p)) + +/* + * WT_EVICT_REF_CLR -- + * Clear an eviction list entry. + */ +#define WT_EVICT_CLR(p) do { \ + (p)->ref = NULL; \ + (p)->idb = WT_DEBUG_POINT; \ +} while (0) + +/* + * __wt_workq_evict_server -- + * See if the eviction server thread needs to be awakened. + */ +void +__wt_workq_evict_server(ENV *env, int force) +{ + WT_CACHE *cache; + uint64_t bytes_inuse, bytes_max; + + cache = env->ienv->cache; + + /* If the eviction server is running, there's nothing to do. */ + if (!cache->evict_sleeping) + return; + + /* + * If we're locking out reads, or over our cache limit, or forcing the + * issue (when closing the environment), run the eviction server. + */ + bytes_inuse = __wt_cache_bytes_inuse(cache); + bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX); + if (!force && !cache->read_lockout && bytes_inuse < bytes_max) + return; + + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "waking eviction server: force %sset, read lockout %sset, " + "bytes inuse %s max (%lluMB %s %lluMB), ", + force ? "" : "not ", cache->read_lockout ? "" : "not ", + bytes_inuse <= bytes_max ? "<=" : ">", + (unsigned long long)(bytes_inuse / WT_MEGABYTE), + bytes_inuse <= bytes_max ? "<=" : ">", + (unsigned long long)(bytes_max / WT_MEGABYTE))); + + cache->evict_sleeping = 0; + __wt_unlock(env, cache->mtx_evict); +} + +/* + * __wt_cache_evict_server -- + * Thread to evict pages from the cache. 
+ */ +void * +__wt_cache_evict_server(void *arg) +{ + ENV *env; + IENV *ienv; + WT_CACHE *cache; + WT_TOC *toc; + uint64_t bytes_inuse, bytes_max; + int ret; + + env = arg; + ienv = env->ienv; + cache = ienv->cache; + ret = 0; + + /* We need a thread of control because we're reading/writing pages. */ + toc = NULL; + WT_ERR(__wt_toc_api_set(env, "CacheReconciliation", NULL, &toc)); + + /* + * Allocate memory for a copy of the hazard references -- it's a fixed + * size so doesn't need run-time adjustments. + */ + cache->hazard_elem = env->toc_size * env->hazard_size; + WT_ERR(__wt_calloc( + env, cache->hazard_elem, sizeof(WT_PAGE *), &cache->hazard)); + cache->hazard_len = cache->hazard_elem * sizeof(WT_PAGE *); + + for (;;) { + WT_VERBOSE(env, + WT_VERB_EVICT, (env, "eviction server sleeping")); + cache->evict_sleeping = 1; + __wt_lock(env, cache->mtx_evict); + WT_VERBOSE(env, + WT_VERB_EVICT, (env, "eviction server waking")); + + /* + * Check for environment exit; do it here, instead of the top of + * the loop because doing it here keeps us from doing a bunch of + * worked when simply awakened to quit. + */ + if (!F_ISSET(ienv, WT_SERVER_RUN)) + break; + + for (;;) { + /* + * The cache eviction server is a long-running thread; + * its TOC must "enter" and "leave" the library + * periodically in order to be a good thread citizen. + */ + WT_TOC_GEN_SET(toc); + + /* Single-thread reconciliation. */ + __wt_lock(env, cache->mtx_reconcile); + ret = __wt_evict(toc); + __wt_unlock(env, cache->mtx_reconcile); + if (ret != 0) + goto err; + + WT_TOC_GEN_CLR(toc); + + /* + * If we've locked out reads, keep evicting until we + * get to at least 5% under the maximum cache. Else, + * quit evicting as soon as we get under the maximum + * cache. 
+ */ + bytes_inuse = __wt_cache_bytes_inuse(cache); + bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX); + if (cache->read_lockout) { + if (bytes_inuse <= bytes_max - (bytes_max / 20)) + break; + } else if (bytes_inuse < bytes_max) + break; + } + } + +err: if (cache->evict != NULL) + __wt_free(env, cache->evict, cache->evict_len); + if (cache->hazard != NULL) + __wt_free(env, cache->hazard, cache->hazard_len); + if (toc != NULL) + WT_TRET(toc->close(toc, 0)); + + if (ret != 0) + __wt_api_env_err(env, ret, "cache eviction server error"); + + WT_VERBOSE( + env, WT_VERB_EVICT, (env, "cache eviction server exiting")); + + return (NULL); +} + +/* + * __wt_evict -- + * Evict pages from the cache. + */ +static int +__wt_evict(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + uint elem, i, j; + + env = toc->env; + cache = env->ienv->cache; + + /* Get some more pages to consider for eviction. */ + WT_RET(__wt_evict_walk(toc)); + + /* + * We have an array of page eviction references that may contain NULLs, + * as well as duplicate entries. + * + * First, sort the array by WT_REF address, then delete any duplicates. + * The reason is because we might evict the page but leave a duplicate + * entry in the "saved" area of the array, and that would be a NULL + * dereference on the next run. (If someone ever tries to remove this + * duplicate cleanup for better performance, you can't fix it just by + * checking the WT_REF state -- that only works if you are discarding + * a page from a single level of the tree; if you are discarding a + * page and its parent, the duplicate of the page's WT_REF might have + * been free'd before a subsequent review of the eviction array.) 
+ */ + evict = cache->evict; + elem = cache->evict_elem; + qsort(evict, + (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_page); + for (i = 0; i < elem; i = j) + for (j = i + 1; j < elem; ++j) { + /* + * If the leading pointer hits a NULL, we're done, the + * NULLs all sorted to the top of the array. + */ + if (evict[j].ref == NULL) + goto done_duplicates; + + /* Delete the second and any subsequent duplicates. */ + if (evict[i].ref == evict[j].ref) + WT_EVICT_CLR(&evict[j]); + else + break; + } +done_duplicates: + + /* Second, sort the array by LRU. */ + qsort(evict, + (size_t)elem, sizeof(WT_EVICT_LIST), __wt_evict_compare_lru); + + /* + * Discarding pages is done in 5 steps: + * Set the WT_EVICT state + * Check for any hazard references + * Discard clean pages + * Reconcile dirty pages (making them clean) + * Discard clean pages + * + * The reason we release clean pages, then reconcile dirty pages, then + * release clean pages again is because reconciling a dirty page is a + * slow operation, and this releases space sooner. (Arguably, we are + * going to discard all of the pages anyway, so what does it matter if + * we make clean pages wait for the dirty page writes? On the other + * hand, it's a small change and benefits any thread waiting to read a + * clean page we picked for discarding, unlikely though that may be.) + */ + __wt_evict_set(toc); + __wt_evict_hazard_check(toc); + __wt_evict_state_check(toc); + __wt_evict_page(toc, 0); + __wt_evict_write(toc); + __wt_evict_page(toc, 1); + + return (0); +} + +/* + * __wt_evict_walk -- + * Fill in the array by walk the next set of pages. + */ +static int +__wt_evict_walk(WT_TOC *toc) +{ + ENV *env; + IDB *idb; + IENV *ienv; + WT_CACHE *cache; + uint elem, i; + int ret; + + env = toc->env; + ienv = env->ienv; + cache = ienv->cache; + + /* + * Resize the array in which we're tracking pages, as necessary, then + * get some pages from each underlying file. 
We hold a mutex for the + * entire time -- it's slow, but (1) how often do new files get added + * or removed to/from the system, and (2) it's all in-memory stuff, so + * it's not that slow. + */ + ret = 0; + __wt_lock(env, ienv->mtx); + elem = WT_EVICT_WALK_BASE + (ienv->dbqcnt * WT_EVICT_WALK_PER_TABLE); + if (elem <= cache->evict_elem || (ret = __wt_realloc(env, + &cache->evict_len, + elem * sizeof(WT_EVICT_LIST), &cache->evict)) == 0) { + cache->evict_elem = elem; + + i = WT_EVICT_WALK_BASE; + TAILQ_FOREACH(idb, &ienv->dbqh, q) { + if ((ret = __wt_evict_walk_single(toc, idb, i)) != 0) + break; + i += WT_EVICT_WALK_PER_TABLE; + } + } + __wt_unlock(env, ienv->mtx); + return (ret); +} + +/* + * __wt_evict_walk_single -- + * Get a few page eviction candidates from a single underlying file. + */ +static int +__wt_evict_walk_single(WT_TOC *toc, IDB *idb, uint slot) +{ + WT_CACHE *cache; + WT_EVICT_LIST *evict; + int i, restarted_once; + + cache = toc->env->ienv->cache; + + /* + * Tricky little loop that restarts the walk as necessary, without + * resetting the count of pages retrieved. + */ + i = restarted_once = 0; + + /* If we haven't yet opened a tree-walk structure, do so. */ + if (idb->evict_walk.tree == NULL) +restart: WT_RET( + __wt_bt_walk_begin(toc, &idb->root_page, &idb->evict_walk)); + + /* Get the next WT_EVICT_WALK_PER_TABLE entries. */ + do { + evict = &cache->evict[slot]; + WT_RET(__wt_bt_walk_next(toc, &idb->evict_walk, &evict->ref)); + + /* + * Restart the walk as necessary, but only once (after one + * restart we've already acquired all of the pages, and we + * could loop infinitely on a tree with a single, pinned, page). + */ + if (evict->ref == NULL) { + if (restarted_once++) + break; + goto restart; + } + + evict->idb = idb; + ++slot; + } while (++i < WT_EVICT_WALK_PER_TABLE); + + return (0); +} + +/* + * __wt_evict_db_clear -- + * Remove any entries for a file from the eviction list. 
+ */ +void +__wt_evict_db_clear(WT_TOC *toc) +{ + ENV *env; + IDB *idb; + IENV *ienv; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + uint i; + + env = toc->env; + idb = toc->db->idb; + ienv = env->ienv; + cache = ienv->cache; + + /* + * Discard any entries in the eviction list to a file we're closing + * (the caller better have locked out the eviction thread). + */ + if (cache->evict == NULL) + return; + WT_EVICT_FOREACH(cache, evict, i) + if (evict->ref != NULL && evict->idb == idb) + WT_EVICT_CLR(evict); +} + +/* + * __wt_evict_set -- + * Set the WT_EVICT flag on a set of pages. + */ +static void +__wt_evict_set(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + /* + * Set the entry state so readers don't try and use the pages. Once + * that's done, any thread searching for a page will either see our + * state value, or will have already set a hazard reference to the page. + * We don't evict a page with a hazard reference set, so we can't race. + * + * No memory flush needed, the state field is declared volatile. + */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + ref->state = WT_EVICT; + } +} + +/* + * __wt_evict_hazard_check -- + * Compare the list of hazard references to the list of pages to be + * discarded. + */ +static void +__wt_evict_hazard_check(WT_TOC *toc) +{ + ENV *env; + IENV *ienv; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE **hazard, **end_hazard, *page; + WT_REF *ref; + WT_STATS *stats; + uint i; + + env = toc->env; + ienv = env->ienv; + cache = ienv->cache; + stats = cache->stats; + + /* Sort the eviction candidates by WT_PAGE address. */ + qsort(cache->evict, (size_t)WT_EVICT_GROUP, + sizeof(WT_EVICT_LIST), __wt_evict_compare_page); + + /* Copy the hazard reference array and sort it by WT_PAGE address. 
*/ + hazard = cache->hazard; + end_hazard = hazard + cache->hazard_elem; + memcpy(hazard, ienv->hazard, cache->hazard_elem * sizeof(WT_PAGE *)); + qsort(hazard, (size_t)cache->hazard_elem, + sizeof(WT_PAGE *), __wt_evict_hazard_compare); + + /* Walk the lists in parallel and look for matches. */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + + /* + * Look for the page in the hazard list until we reach the end + * of the list or find a hazard pointer larger than the page. + */ + for (page = ref->page; + hazard < end_hazard && *hazard < page; ++hazard) + ; + if (hazard == end_hazard) + break; + + /* + * If we find a matching hazard reference, the page is in use: + * remove it from the eviction list. + * + * No memory flush needed, the state field is declared volatile. + */ + if (*hazard == page) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (hazard reference)", + page->addr)); + WT_STAT_INCR(stats, CACHE_EVICT_HAZARD); + + /* + * A page with a low LRU and a hazard reference? + * + * Set the page's LRU so we don't select it again. + * Return the page to service. + * Discard our reference. + */ + ref->page->read_gen = ++cache->read_gen; + ref->state = WT_OK; + WT_EVICT_CLR(evict); + } + } +} + +/* + * __wt_evict_state_check -- + * Confirm these are pages we want to evict. + */ +static void +__wt_evict_state_check(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + /* + * We "own" the pages (we've flagged them for eviction, and there were + * no hazard references). Now do checks to see if these are pages we + * can evict -- we have to wait until after we own the page because the + * page might be updated and race with us. + */ + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* Ignore pinned pages. 
*/ + if (F_ISSET(page, WT_PINNED)) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (pinned)", + page->addr)); + goto skip; + } + + /* Ignore pages with in-memory subtrees. */ + switch (page->hdr->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + if (__wt_evict_page_subtrees(page)) { + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "eviction skipped page addr %lu (subtrees)", + page->addr)); + goto skip; + } + break; + default: + break; + } + + continue; + +skip: /* + * Set the page's LRU so we don't select it again. + * Return the page to service. + * Discard our reference. + */ + page->read_gen = ++cache->read_gen; + ref->state = WT_OK; + WT_EVICT_CLR(evict); + } +} + +/* + * __wt_evict_write -- + * Write any modified pages. + */ +static void +__wt_evict_write(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + uint i; + + env = toc->env; + cache = env->ienv->cache; + + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* Ignore dirty pages. */ + if (!WT_PAGE_IS_MODIFIED(page)) + continue; + + /* + * We're using our WT_TOC handle, it needs to reference the + * correct DB handle. + * + * XXX + * This is pretty sleazy, but I'm hesitant to try and drive + * a separate DB/IDB handle down through the reconciliation + * code. + */ + toc->db = evict->idb->db; + (void)__wt_bt_rec_page(toc, page); + } +} + +/* + * __wt_evict_page -- + * Evict cache pages. 
+ */ +static void +__wt_evict_page(WT_TOC *toc, int was_dirty) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_PAGE *page; + WT_REF *ref; + WT_STATS *stats; + uint i; + + env = toc->env; + cache = env->ienv->cache; + stats = cache->stats; + + WT_EVICT_FOREACH(cache, evict, i) { + if ((ref = evict->ref) == NULL) + continue; + page = ref->page; + + /* + * The first time we're called, we get rid of the clean pages; + * the second time we're called, we get rid of the pages that + * were dirty but have since been cleaned. Ignore dirty pages + * in all cases, it's simpler. + */ + if (WT_PAGE_IS_MODIFIED(page)) + continue; + + if (was_dirty) + WT_STAT_INCR(stats, CACHE_EVICT_MODIFIED); + else + WT_STAT_INCR(stats, CACHE_EVICT_UNMODIFIED); + +#ifdef HAVE_DIAGNOSTIC + __wt_evict_hazard_validate(env, page); +#endif + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "cache evicting page addr %lu", page->addr)); + + /* + * Copy a page reference, then make the cache entry available + * for re-use. + * + * No memory flush needed, the state field is declared volatile. + */ + ref->page = NULL; + ref->state = WT_EMPTY; + + /* Remove the entry from the eviction list. */ + WT_EVICT_CLR(evict); + + /* We've got more space. */ + WT_CACHE_PAGE_OUT(cache, page->size); + + /* The page can no longer be found, free the memory. */ + __wt_bt_page_discard(toc, page); + } +} + +/* + * __wt_evict_page_subtrees -- + * Return if a page has an in-memory subtree. + */ +static int +__wt_evict_page_subtrees(WT_PAGE *page) +{ + WT_REF *ref, **dupp; + uint32_t i; + + /* + * Return if a page has an in-memory subtree -- this array search could + * be replaced by a reference count in the page, but (1) the eviction + * thread isn't where I expect performance problems, (2) I hate to lose + * more bytes on every page, (3) how often will an internal page be + * evicted anyway? 
+ */ + switch (page->hdr->type) { + case WT_PAGE_COL_INT: + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + WT_REF_FOREACH(page, ref, i) + if (ref->state != WT_EMPTY) + return (1); + break; + case WT_PAGE_ROW_LEAF: + if (WT_PAGE_DUP_TREES(page)) + WT_DUP_FOREACH(page, dupp, i) + if (*dupp != NULL && (*dupp)->state != WT_EMPTY) + return (1); + break; + } + + return (0); +} + +/* + * __wt_evict_compare_page -- + * Qsort function: sort WT_EVICT_LIST array based on the page's address. + */ +static int +__wt_evict_compare_page(const void *a, const void *b) +{ + WT_REF *a_ref, *b_ref; + WT_PAGE *a_page, *b_page; + + /* + * There may be NULL references in the array; sort them as greater than + * anything else so they migrate to the end of the array. + */ + a_ref = ((WT_EVICT_LIST *)a)->ref; + b_ref = ((WT_EVICT_LIST *)b)->ref; + if (a_ref == NULL) + return (b_ref == NULL ? 0 : 1); + if (b_ref == NULL) + return (-1); + + /* Sort the page address in ascending order. */ + a_page = a_ref->page; + b_page = b_ref->page; + return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0)); +} + +/* + * __wt_evict_compare_lru -- + * Qsort function: sort WT_EVICT_LIST array based on the page's read + * generation. + */ +static int +__wt_evict_compare_lru(const void *a, const void *b) +{ + WT_REF *a_ref, *b_ref; + uint32_t a_lru, b_lru; + + /* + * There may be NULL references in the array; sort them as greater than + * anything else so they migrate to the end of the array. + */ + a_ref = ((WT_EVICT_LIST *)a)->ref; + b_ref = ((WT_EVICT_LIST *)b)->ref; + if (a_ref == NULL) + return (b_ref == NULL ? 0 : 1); + if (b_ref == NULL) + return (-1); + + /* Sort the LRU in ascending order. */ + a_lru = a_ref->page->read_gen; + b_lru = b_ref->page->read_gen; + return (a_lru > b_lru ? 1 : (a_lru < b_lru ? -1 : 0)); +} + +/* + * __wt_evict_hazard_compare -- + * Qsort function: sort hazard list based on the page's address. 
+ */ +static int +__wt_evict_hazard_compare(const void *a, const void *b) +{ + WT_PAGE *a_page, *b_page; + + a_page = *(WT_PAGE **)a; + b_page = *(WT_PAGE **)b; + + return (a_page > b_page ? 1 : (a_page < b_page ? -1 : 0)); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_evict_hazard_validate -- + * Return if a page is or isn't on the hazard list. + */ +static void +__wt_evict_hazard_validate(ENV *env, WT_PAGE *page) +{ + IENV *ienv; + WT_PAGE **hp; + WT_TOC **tp, *toc; + + ienv = env->ienv; + + for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) + for (hp = toc->hazard; + hp < toc->hazard + toc->env->hazard_size; ++hp) + if (*hp == page) { + __wt_api_env_errx(env, + "hazard eviction check for page %lu " + "failed", + (u_long)page->addr); + __wt_abort(env); + } +} + +/* + * __wt_evict_dump -- + * Display the eviction list. + */ +void +__wt_evict_dump(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_EVICT_LIST *evict; + WT_MBUF mb; + uint n; + int sep; + + env = toc->env; + cache = env->ienv->cache; + + __wt_mb_init(env, &mb); + __wt_mb_add(&mb, "eviction list"); + + for (sep = ':', n = 0; n < cache->evict_elem; ++n) { + evict = &cache->evict[n]; + if (evict->ref == NULL) + continue; + __wt_mb_add(&mb, "%c %lu", sep, (u_long)evict->ref->page->addr); + sep = ','; + } + __wt_mb_discard(&mb); +} + +/* + * __wt_evict_cache_dump + * Dump the in-memory cache. + */ +int +__wt_evict_cache_dump(WT_TOC *toc) +{ + IDB *idb; + IENV *ienv; + + ienv = toc->env->ienv; + + TAILQ_FOREACH(idb, &ienv->dbqh, q) + WT_RET(__wt_evict_tree_dump(toc, idb)); + return (0); +} + +/* + * __wt_evict_tree_dump + * Dump an in-memory tree. 
+ */ +int +__wt_evict_tree_dump(WT_TOC *toc, IDB *idb) +{ + ENV *env; + WT_CACHE *cache; + WT_REF *ref; + WT_WALK walk; + WT_MBUF mb; + int sep; + + env = toc->env; + cache = env->ienv->cache; + + WT_VERBOSE(env, WT_VERB_EVICT, (env, + "%s: pages inuse %llu, bytes inuse (%llu), max (%llu)", + idb->name, + __wt_cache_pages_inuse(cache), + __wt_cache_bytes_inuse(cache), + WT_STAT(cache->stats, CACHE_BYTES_MAX))); + + __wt_mb_init(env, &mb); + __wt_mb_add(&mb, "in-memory page list"); + + WT_CLEAR(walk); + WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk)); + for (sep = ':';;) { + WT_RET(__wt_bt_walk_next(toc, &walk, &ref)); + if (ref == NULL) + break; + __wt_mb_add(&mb, "%c %lu", sep, (u_long)ref->page->addr); + sep = ','; + } + __wt_bt_walk_end(env, &walk); + __wt_mb_discard(&mb); + + return (0); +} + +/* + * __wt_evict_cache_count + * Return the count of nodes in the cache. + */ +int +__wt_evict_cache_count(WT_TOC *toc, uint64_t *nodesp) +{ + IDB *idb; + IENV *ienv; + uint64_t nodes; + + ienv = toc->env->ienv; + + *nodesp = 0; + TAILQ_FOREACH(idb, &ienv->dbqh, q) { + WT_RET(__wt_evict_tree_count(toc, idb, &nodes)); + *nodesp += nodes; + } + return (0); +} + +/* + * __wt_evict_tree_count + * Return a count of nodes in the tree. + */ +int +__wt_evict_tree_count(WT_TOC *toc, IDB *idb, uint64_t *nodesp) +{ + ENV *env; + WT_CACHE *cache; + WT_REF *ref; + WT_WALK walk; + uint64_t nodes; + + env = toc->env; + cache = env->ienv->cache; + + WT_CLEAR(walk); + WT_RET(__wt_bt_walk_begin(toc, &idb->root_page, &walk)); + for (nodes = 0;;) { + WT_RET(__wt_bt_walk_next(toc, &walk, &ref)); + if (ref == NULL) + break; + ++nodes; + } + *nodesp = nodes; + __wt_bt_walk_end(env, &walk); + + return (0); +} +#endif diff --git a/src/btree/c_init.c b/src/btree/c_init.c new file mode 100644 index 00000000000..641f90d9a56 --- /dev/null +++ b/src/btree/c_init.c @@ -0,0 +1,133 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2010 WiredTiger, Inc. 
+ * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_cache_create -- + * Create the underlying cache. + */ +int +__wt_cache_create(ENV *env) +{ + IENV *ienv; + WT_CACHE *cache; + int ret; + + ienv = env->ienv; + ret = 0; + + WT_RET(__wt_calloc(env, 1, sizeof(WT_CACHE), &ienv->cache)); + cache = ienv->cache; + + WT_ERR( + __wt_mtx_alloc(env, "cache eviction server", 1, &cache->mtx_evict)); + WT_ERR(__wt_mtx_alloc(env, "cache read server", 1, &cache->mtx_read)); + WT_ERR(__wt_mtx_alloc(env, "reconciliation", 0, &cache->mtx_reconcile)); + + WT_ERR(__wt_stat_alloc_cache_stats(env, &cache->stats)); + + WT_STAT_SET( + cache->stats, CACHE_BYTES_MAX, env->cache_size * WT_MEGABYTE); + + return (0); + +err: (void)__wt_cache_destroy(env); + return (ret); +} + +/* + * __wt_cache_pages_inuse -- + * Return the number of pages in use. + */ +inline uint64_t +__wt_cache_pages_inuse(WT_CACHE *cache) +{ + uint64_t pages_in, pages_out; + + /* + * Reading 64-bit fields, potentially on 32-bit machines, and other + * threads of control may be modifying them. Check them for sanity + * (although "interesting" corruption is vanishingly unlikely, these + * values just increment over time). + */ + pages_in = cache->stat_pages_in; + pages_out = cache->stat_pages_out; + return (pages_in > pages_out ? pages_in - pages_out : 0); +} + +/* + * __wt_cache_bytes_inuse -- + * Return the number of bytes in use. + */ +inline uint64_t +__wt_cache_bytes_inuse(WT_CACHE *cache) +{ + uint64_t bytes_in, bytes_out; + + /* + * Reading 64-bit fields, potentially on 32-bit machines, and other + * threads of control may be modifying them. Check them for sanity + * (although "interesting" corruption is vanishingly unlikely, these + * values just increment over time). + */ + bytes_in = cache->stat_bytes_in; + bytes_out = cache->stat_bytes_out; + return (bytes_in > bytes_out ? 
bytes_in - bytes_out : 0); +} + +/* + * __wt_cache_stats -- + * Update the cache statistics for return to the application. + */ +void +__wt_cache_stats(ENV *env) +{ + WT_CACHE *cache; + WT_STATS *stats; + + cache = env->ienv->cache; + stats = cache->stats; + + WT_STAT_SET(stats, CACHE_BYTES_INUSE, __wt_cache_bytes_inuse(cache)); + WT_STAT_SET(stats, CACHE_PAGES_INUSE, __wt_cache_pages_inuse(cache)); +} + +/* + * __wt_cache_destroy -- + * Discard the underlying cache. + */ +int +__wt_cache_destroy(ENV *env) +{ + IENV *ienv; + WT_CACHE *cache; + int ret; + + ienv = env->ienv; + cache = ienv->cache; + ret = 0; + + if (cache == NULL) + return (0); + + /* Discard mutexes. */ + if (cache->mtx_evict != NULL) + (void)__wt_mtx_destroy(env, cache->mtx_evict); + if (cache->mtx_read != NULL) + __wt_mtx_destroy(env, cache->mtx_read); + if (cache->mtx_reconcile != NULL) + __wt_mtx_destroy(env, cache->mtx_reconcile); + + /* Discard allocated memory, and clear. */ + __wt_free(env, cache->stats, 0); + __wt_free(env, ienv->cache, sizeof(WT_CACHE)); + + return (ret); +} diff --git a/src/btree/c_page.c b/src/btree/c_page.c new file mode 100644 index 00000000000..cd71c0b4ebf --- /dev/null +++ b/src/btree/c_page.c @@ -0,0 +1,69 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2010 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_page_read -- + * Read a database page (same as read, but verify the checksum). 
+ */ +int +__wt_page_read(DB *db, WT_PAGE *page) +{ + ENV *env; + WT_FH *fh; + WT_PAGE_HDR *hdr; + off_t offset; + uint32_t checksum; + + env = db->env; + fh = db->idb->fh; + hdr = page->hdr; + + offset = WT_ADDR_TO_OFF(db, page->addr); + WT_RET(__wt_read(env, fh, offset, page->size, hdr)); + + checksum = hdr->checksum; + hdr->checksum = 0; + if (checksum != __wt_cksum(hdr, page->size)) { + __wt_api_env_errx(env, + "read checksum error: addr/size %lu/%lu at offset %llu", + (u_long)page->addr, + (u_long)page->size, (unsigned long long)offset); + return (WT_ERROR); + } + + return (0); +} + +/* + * __wt_page_write -- + * Write a database page. + */ +int +__wt_page_write(WT_TOC *toc, WT_PAGE *page) +{ + DB *db; + ENV *env; + WT_FH *fh; + WT_PAGE_HDR *hdr; + + db = toc->db; + env = toc->env; + fh = db->idb->fh; + + WT_ASSERT(env, __wt_bt_verify_dsk_page(toc, page) == 0); + + hdr = page->hdr; + hdr->checksum = 0; + hdr->checksum = __wt_cksum(hdr, page->size); + + return (__wt_write( + env, fh, WT_ADDR_TO_OFF(db, page->addr), page->size, hdr)); +} diff --git a/src/btree/c_read.c b/src/btree/c_read.c new file mode 100644 index 00000000000..1578b5ee642 --- /dev/null +++ b/src/btree/c_read.c @@ -0,0 +1,273 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2010 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_cache_read(WT_READ_REQ *); + +/* + * __wt_workq_read_server -- + * See if the read server thread needs to be awakened. + */ +void +__wt_workq_read_server(ENV *env, int force) +{ + WT_CACHE *cache; + uint64_t bytes_inuse, bytes_max; + + cache = env->ienv->cache; + + /* + * If we're 10% over the maximum cache, shut out reads (which include + * page allocations) until we evict to at least 5% under the maximum + * cache. 
The idea is that we don't want to run on the edge all the + * time -- if we're seriously out of space, get things under control + * before opening up for more reads. + */ + bytes_inuse = __wt_cache_bytes_inuse(cache); + bytes_max = WT_STAT(cache->stats, CACHE_BYTES_MAX); + if (cache->read_lockout) { + if (bytes_inuse <= bytes_max - (bytes_max / 20)) + cache->read_lockout = 0; + } else if (bytes_inuse > bytes_max + (bytes_max / 10)) { + WT_VERBOSE(env, WT_VERB_READ, (env, + "workQ locks out reads: bytes-inuse %llu of bytes-max %llu", + (unsigned long long)bytes_inuse, + (unsigned long long)bytes_max)); + cache->read_lockout = 1; + } + + /* If the cache read server is running, there's nothing to do. */ + if (!cache->read_sleeping) + return; + + /* + * If reads are locked out and we're not forcing the issue (that's when + * closing the environment, or if there's a priority read waiting to be + * handled), we're done. + */ + if (!force && cache->read_lockout) + return; + + cache->read_sleeping = 0; + __wt_unlock(env, cache->mtx_read); +} + +/* + * __wt_cache_read_serial_func -- + * Read/allocation serialization function called when a page-in requires + * allocation or a read. + */ +int +__wt_cache_read_serial_func(WT_TOC *toc) +{ + ENV *env; + WT_CACHE *cache; + WT_OFF *off; + WT_PAGE *parent; + WT_READ_REQ *rr, *rr_end; + WT_REF *ref; + int dsk_verify; + + __wt_cache_read_unpack(toc, parent, ref, off, dsk_verify); + + env = toc->env; + cache = env->ienv->cache; + + /* Find an empty slot and enter the read request. */ + rr = cache->read_request; + rr_end = rr + WT_ELEMENTS(cache->read_request); + for (; rr < rr_end; ++rr) + if (WT_READ_REQ_ISEMPTY(rr)) { + WT_READ_REQ_SET(rr, toc, parent, ref, off, dsk_verify); + return (0); + } + __wt_api_env_errx(env, "read server request table full"); + return (WT_RESTART); +} + +/* + * __wt_cache_read_server -- + * Thread to do database reads. 
+ */ +void * +__wt_cache_read_server(void *arg) +{ + ENV *env; + IENV *ienv; + WT_CACHE *cache; + WT_READ_REQ *rr, *rr_end; + WT_TOC *toc; + int didwork, ret; + + env = arg; + ienv = env->ienv; + cache = ienv->cache; + + rr = cache->read_request; + rr_end = rr + WT_ELEMENTS(cache->read_request); + + for (;;) { + WT_VERBOSE(env, + WT_VERB_READ, (env, "cache read server sleeping")); + cache->read_sleeping = 1; + __wt_lock(env, cache->mtx_read); + WT_VERBOSE( + env, WT_VERB_READ, (env, "cache read server waking")); + + /* + * Check for environment exit; do it here, instead of the top of + * the loop because doing it here keeps us from doing a bunch of + * worked when simply awakened to quit. + */ + if (!F_ISSET(ienv, WT_SERVER_RUN)) + break; + + /* + * Walk the read-request queue, looking for reads (defined by + * a valid WT_TOC handle). If we find a read request, perform + * it, flush the result and clear the request slot, then wake + * up the requesting thread. The request slot clear doesn't + * need to be flushed, but we have to flush the read result, + * might as well include it. If we don't find any work, go to + * sleep. + */ + do { + didwork = 0; + for (rr = cache->read_request; rr < rr_end; ++rr) { + if ((toc = rr->toc) == NULL) + continue; + if (cache->read_lockout && + !F_ISSET(toc, WT_READ_PRIORITY)) + continue; + + /* + * The read server thread does both general file + * allocation and cache page instantiation. In + * a file allocation, there's no pagep field in + * in which to return a page. + */ + ret = __wt_cache_read(rr); + + WT_READ_REQ_CLR(rr); + __wt_toc_serialize_wrapup(toc, NULL, ret); + + didwork = 1; + + /* + * Any error terminates the request; a serious + * error causes the read server to exit. 
+ */ + if (ret != 0) { + if (ret != WT_RESTART) + goto err; + ret = 0; + } + } + } while (didwork); + } + + if (ret != 0) +err: __wt_api_env_err(env, ret, "cache read server error"); + + WT_VERBOSE(env, WT_VERB_READ, (env, "cache read server exiting")); + return (NULL); +} + +/* + * __wt_cache_read -- + * Read a page from the file. + */ +static int +__wt_cache_read(WT_READ_REQ *rr) +{ + DB *db; + ENV *env; + WT_CACHE *cache; + WT_FH *fh; + WT_OFF *off; + WT_PAGE *page; + WT_REF *ref; + WT_TOC *toc; + uint32_t addr, size; + int ret; + + toc = rr->toc; + ref = rr->ref; + off = rr->off; + addr = off->addr; + size = off->size; + + db = toc->db; + env = toc->env; + cache = env->ienv->cache; + fh = db->idb->fh; + ret = 0; + + /* + * Check to see if some other thread brought the page into the cache + * while our request was in the queue. If the state is anything + * other than empty, it's not our problem. + */ + if (ref->state != WT_EMPTY) + return (0); + + /* + * The page isn't in the cache, and since we're the only path for the + * page to get into the cache, we don't have to worry further, and + * we might as well get to it. + * + * Allocate memory for the in-memory page information and for the page + * itself. They're two separate allocation calls so we (hopefully) get + * better alignment from the underlying heap memory allocator. + */ + WT_RET(__wt_calloc(env, 1, sizeof(WT_PAGE), &page)); + WT_ERR(__wt_calloc(env, (size_t)size, sizeof(uint8_t), &page->hdr)); + + /* Read the page. */ + WT_VERBOSE(env, WT_VERB_READ, + (env, "cache read addr/size %lu/%lu", (u_long)addr, (u_long)size)); + WT_STAT_INCR(cache->stats, PAGE_READ); + + page->addr = addr; + page->size = size; + WT_ERR(__wt_page_read(db, page)); + WT_CACHE_PAGE_IN(cache, size); + + /* If the page needs to be verified, that's next. */ + if (rr->dsk_verify) + WT_ERR(__wt_bt_verify_dsk_page(toc, page)); + + /* Build the in-memory version of the page. 
*/ + WT_ERR(__wt_bt_page_inmem(toc, page)); + + /* + * Reference the parent's WT_PAGE and parent's WT_OFF structure that + * read the page. + */ + page->parent = rr->parent; + page->parent_off = off; + + /* + * The page is now available -- set the LRU so the page is not selected + * for eviction. + */ + page->read_gen = ++cache->read_gen; + ref->page = page; + ref->state = WT_OK; + + return (0); + +err: if (page != NULL) { + if (page->hdr != NULL) + __wt_free(env, page->hdr, size); + __wt_free(env, page, sizeof(WT_PAGE)); + } + return (ret); +} diff --git a/src/btree/col_get.c b/src/btree/col_get.c new file mode 100644 index 00000000000..7ab2f242a35 --- /dev/null +++ b/src/btree/col_get.c @@ -0,0 +1,40 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_db_col_get -- + * Db.col_get method. + */ +int +__wt_db_col_get(WT_TOC *toc, uint64_t recno, DBT *data) +{ + DB *db; + IDB *idb; + int ret; + + db = toc->db; + idb = db->idb; + + /* Search the column store for the key. */ + if (!F_ISSET(idb, WT_COLUMN)) { + __wt_api_db_errx(db, + "row database records cannot be retrieved by record " + "number"); + return (WT_ERROR); + } + + WT_ERR(__wt_col_search(toc, recno, WT_NOLEVEL, 0)); + ret = __wt_dbt_return(toc, NULL, data, 0); + +err: if (toc->srch_page != idb->root_page.page) + __wt_hazard_clear(toc, toc->srch_page); + return (ret); +} diff --git a/src/btree/col_put.c b/src/btree/col_put.c new file mode 100644 index 00000000000..e7e76778fe3 --- /dev/null +++ b/src/btree/col_put.c @@ -0,0 +1,229 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_col_update(WT_TOC *, uint64_t, DBT *, int); + +/* + * __wt_db_col_del -- + * Db.col_del method. 
+ */ +inline int +__wt_db_col_del(WT_TOC *toc, uint64_t recno) +{ + return (__wt_col_update(toc, recno, NULL, 0)); +} + +/* + * __wt_db_col_put -- + * Db.put method. + */ +inline int +__wt_db_col_put(WT_TOC *toc, uint64_t recno, DBT *data) +{ + DB *db; + + db = toc->db; + + if (db->fixed_len != 0 && data->size != db->fixed_len) + WT_RET(__wt_database_wrong_fixed_size(toc, data->size)); + + return (__wt_col_update(toc, recno, data, 1)); +} + +/* + * __wt_col_update -- + * Column store delete and update. + */ +static int +__wt_col_update(WT_TOC *toc, uint64_t recno, DBT *data, int data_overwrite) +{ + DB *db; + ENV *env; + WT_PAGE *page; + WT_RLE_EXPAND *exp, **new_rleexp; + WT_REPL **new_repl, *repl; + int ret; + + env = toc->env; + db = toc->db; + + page = NULL; + exp = NULL; + new_rleexp = NULL; + new_repl = NULL; + repl = NULL; + + /* Search the btree for the key. */ + WT_RET(__wt_col_search( + toc, recno, WT_NOLEVEL, data_overwrite ? WT_DATA_OVERWRITE : 0)); + page = toc->srch_page; + + /* + * Run-length encoded (RLE) column store operations are hard because + * each original on-disk index for an RLE can represent large numbers + * of records, and we're only deleting a single one of those records, + * which means working in the WT_RLE_EXPAND array. All other column + * store deletes are simple changes where a new WT_REPL entry is added + * to the page's modification array. There are three code paths: + * + * 1: column store deletes other than RLE column stores: delete an entry + * from the on-disk page by creating a new WT_REPL entry, and linking it + * into the WT_REPL array. + * + * 2: an RLE column store delete of an already modified record: create + * a new WT_REPL entry, and link it to the WT_RLE_EXPAND entry's WT_REPL + * list. + * + * 3: an RLE column store delete of a record not yet modified: create + * a new WT_RLE_EXPAND/WT_REPL pair, and link it into the WT_RLE_EXPAND + * array. 
+ */ + switch (page->dsk->type) { + case WT_PAGE_COL_FIX: /* #1 */ + case WT_PAGE_COL_VAR: + /* Allocate a page replacement array if necessary. */ + if (page->u2.repl == NULL) + WT_ERR(__wt_calloc(env, + page->indx_count, sizeof(WT_REPL *), &new_repl)); + + /* Allocate a WT_REPL structure and fill it in. */ + WT_ERR(__wt_repl_alloc(toc, &repl, data)); + + /* workQ: schedule insert of the WT_REPL structure. */ + __wt_item_update_serial(toc, page, toc->srch_write_gen, + WT_COL_SLOT(page, toc->srch_ip), new_repl, repl, ret); + break; + case WT_PAGE_COL_RLE: + if (toc->srch_repl != NULL) { /* #2 */ + /* Allocate a WT_REPL structure and fill it in. */ + WT_ERR(__wt_repl_alloc(toc, &repl, data)); + + /* workQ: schedule insert of the WT_REPL structure. */ + __wt_rle_expand_repl_serial(toc, page, + toc->srch_write_gen, toc->srch_exp, repl, ret); + break; + } + /* #3 */ + /* Allocate a page expansion array as necessary. */ + if (page->u2.rleexp == NULL) + WT_ERR(__wt_calloc(env, page->indx_count, + sizeof(WT_RLE_EXPAND *), &new_rleexp)); + + /* Allocate a WT_REPL structure and fill it in. */ + WT_ERR(__wt_repl_alloc(toc, &repl, data)); + + /* Allocate a WT_RLE_EXPAND structure and fill it in. */ + WT_ERR(__wt_calloc(env, 1, sizeof(WT_RLE_EXPAND), &exp)); + exp->recno = recno; + exp->repl = repl; + + /* Schedule the workQ to link in the WT_RLE_EXPAND structure. */ + __wt_rle_expand_serial(toc, page, toc->srch_write_gen, + WT_COL_SLOT(page, toc->srch_ip), new_rleexp, exp, ret); + break; + WT_ILLEGAL_FORMAT_ERR(db, ret); + } + + if (ret != 0) { +err: if (exp != NULL) + __wt_free(env, exp, sizeof(WT_RLE_EXPAND)); + if (repl != NULL) + __wt_repl_free(toc, repl); + } + + /* Free any allocated page expansion array unless the workQ used it. */ + if (new_rleexp != NULL && new_rleexp != page->u2.rleexp) + __wt_free(env, + new_rleexp, page->indx_count * sizeof(WT_RLE_EXPAND *)); + + /* Free any page replacement array unless the workQ used it. 
*/ + if (new_repl != NULL && new_repl != page->u2.repl) + __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *)); + + WT_PAGE_OUT(toc, page); + + return (0); +} + +/* + * __wt_rle_expand_serial_func -- + * Server function to expand a run-length encoded column store during a + * delete. + */ +int +__wt_rle_expand_serial_func(WT_TOC *toc) +{ + WT_PAGE *page; + WT_RLE_EXPAND **new_rleexp, *exp; + uint32_t slot, write_gen; + int ret; + + ret = 0; + + __wt_rle_expand_unpack(toc, page, write_gen, slot, new_rleexp, exp); + + /* Check the page's write-generation. */ + WT_ERR(__wt_page_write_gen_check(page, write_gen)); + + /* + * If the page does not yet have an expansion array, our caller passed + * us one of the correct size. (It's the caller's responsibility to + * detect & free the passed-in expansion array if we don't use it.) + */ + if (page->u2.rleexp == NULL) + page->u2.rleexp = new_rleexp; + + /* + * Insert the new WT_RLE_EXPAND as the first item in the forward-linked + * list of expansion structures. Flush memory to ensure the list is + * never broken. + */ + exp->next = page->u2.rleexp[slot]; + WT_MEMORY_FLUSH; + page->u2.rleexp[slot] = exp; + +err: __wt_toc_serialize_wrapup(toc, page, ret); + return (0); +} + +/* + * __wt_rle_expand_repl_serial_func -- + * Server function to update a WT_REPL entry in an already expanded + * run-length encoded column store during a delete. + */ +int +__wt_rle_expand_repl_serial_func(WT_TOC *toc) +{ + WT_PAGE *page; + WT_RLE_EXPAND *exp; + WT_REPL *repl; + uint32_t write_gen; + int ret; + + ret = 0; + + __wt_rle_expand_repl_unpack(toc, page, write_gen, exp, repl); + + /* Check the page's write-generation. */ + WT_ERR(__wt_page_write_gen_check(page, write_gen)); + + /* + * Insert the new WT_REPL as the first item in the forward-linked list + * of replacement structures from the WT_RLE_EXPAND structure. Flush + * memory to ensure the list is never broken. 
+ */ + repl->next = exp->repl; + WT_MEMORY_FLUSH; + exp->repl = repl; + +err: __wt_toc_serialize_wrapup(toc, page, ret); + return (0); +} diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c new file mode 100644 index 00000000000..81c24e3d54f --- /dev/null +++ b/src/btree/col_srch.c @@ -0,0 +1,211 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_col_search -- + * Search a column-store tree for a specific record-based key. + */ +int +__wt_col_search(WT_TOC *toc, uint64_t recno, uint32_t level, uint32_t flags) +{ + DB *db; + IDB *idb; + WT_COL *cip; + WT_OFF *off; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_RLE_EXPAND *exp; + WT_REF *ref; + WT_REPL *repl; + uint64_t record_cnt; + uint32_t i, write_gen; + int ret; + + toc->srch_page = NULL; /* Return values. */ + toc->srch_ip = NULL; + toc->srch_repl = NULL; + toc->srch_exp = NULL; + toc->srch_write_gen = 0; + + db = toc->db; + idb = db->idb; + + WT_DB_FCHK(db, "__wt_col_search", flags, WT_APIMASK_BT_SEARCH_COL); + + /* Check for a record past the end of the database. */ + page = idb->root_page.page; + if (page->records < recno) + return (WT_NOTFOUND); + + /* Search the tree. */ + for (;;) { + /* Save the write generation value before the read. */ + write_gen = page->write_gen; + + /* Walk the page looking for the record. */ + dsk = page->dsk; + switch (dsk->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + cip = page->u.icol + (recno - dsk->start_recno); + goto done; + case WT_PAGE_COL_RLE: + /* + * Walk the page, counting records -- do the record + * count calculation in a funny way to avoid overflow. 
+ */ + record_cnt = recno - dsk->start_recno; + WT_INDX_FOREACH(page, cip, i) { + if (record_cnt < WT_RLE_REPEAT_COUNT(cip->data)) + break; + record_cnt -= WT_RLE_REPEAT_COUNT(cip->data); + } + goto done; + case WT_PAGE_COL_INT: + default: + /* + * Walk the page, counting records -- do the record + * count calculation in a funny way to avoid overflow. + */ + record_cnt = recno - dsk->start_recno; + WT_INDX_FOREACH(page, cip, i) { + if (record_cnt < WT_COL_OFF_RECORDS(cip)) + break; + record_cnt -= WT_COL_OFF_RECORDS(cip); + } + break; + } + + /* If a level was set, see if we found the asked-for page. */ + if (level == dsk->level) + goto done; + + /* cip references the subtree containing the record. */ + ref = WT_COL_REF(page, cip); + off = WT_COL_OFF(cip); + WT_ERR(__wt_page_in(toc, page, ref, off, 0)); + + /* Swap the parent page for the child page. */ + if (page != idb->root_page.page) + __wt_hazard_clear(toc, page); + page = ref->page; + } + +done: /* + * We've found the right on-page WT_COL structure, but that's only the + * first step; the record may have been updated since reading the page + * into the cache. + */ + switch (dsk->type) { + case WT_PAGE_COL_FIX: + /* Find the item's WT_REPL slot if it exists. */ + repl = WT_COL_REPL(page, cip); + + /* + * If overwriting an existing data item, we don't care if the + * item was previously deleted, return the gathered information. + */ + if (LF_ISSET(WT_DATA_OVERWRITE)) { + toc->srch_repl = repl; + break; + } + + /* + * Otherwise, check for deletion, in either the WT_REPL slot + * or in the original data. + */ + if (repl != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + goto notfound; + toc->srch_repl = repl; + } else + if (WT_FIX_DELETE_ISSET(cip->data)) + goto notfound; + break; + case WT_PAGE_COL_RLE: + /* Find the item's WT_COL_EXP slot if it exists. 
*/ + for (exp = + WT_COL_RLEEXP(page, cip); exp != NULL; exp = exp->next) + if (exp->recno == recno) + break; + + /* + * If overwriting an existing data item, we don't care if the + * item was previously deleted, return the gathered information. + */ + if (LF_ISSET(WT_DATA_OVERWRITE)) { + if (exp != NULL) { + toc->srch_exp = exp; + toc->srch_repl = exp->repl; + } + break; + } + + /* + * Otherwise, check for deletion, in either the WT_REPL slot + * (referenced by the WT_COL_EXP slot), or in the original data. + */ + if (exp != NULL) { + if (WT_REPL_DELETED_ISSET(exp->repl)) + goto notfound; + toc->srch_exp = exp; + toc->srch_repl = exp->repl; + } else + if (WT_FIX_DELETE_ISSET(WT_RLE_REPEAT_DATA(cip->data))) + goto notfound; + break; + case WT_PAGE_COL_VAR: + /* Find the item's WT_REPL slot if it exists. */ + repl = WT_COL_REPL(page, cip); + + /* + * If overwriting an existing data item, we don't care if the + * item was previously deleted, return the gathered information. + */ + if (LF_ISSET(WT_DATA_OVERWRITE)) { + toc->srch_repl = repl; + break; + } + + /* + * Otherwise, check for deletion, in either the WT_REPL slot + * or in the original data. + */ + if (repl != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + goto notfound; + toc->srch_repl = repl; + break; + } else + if (WT_ITEM_TYPE(cip->data) == WT_ITEM_DEL) + goto notfound; + break; + case WT_PAGE_COL_INT: + /* + * When returning internal pages, set the item's WT_REPL slot + * if it exists, otherwise we're done. + */ + toc->srch_repl = WT_COL_REPL(page, cip); + break; + WT_ILLEGAL_FORMAT(db); + } + + toc->srch_page = page; + toc->srch_ip = cip; + toc->srch_write_gen = write_gen; + return (0); + +notfound: + ret = WT_NOTFOUND; + +err: WT_PAGE_OUT(toc, page); + return (ret); +} diff --git a/src/btree/row_get.c b/src/btree/row_get.c new file mode 100644 index 00000000000..03f2cce44bc --- /dev/null +++ b/src/btree/row_get.c @@ -0,0 +1,61 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_db_row_get -- + * Db.row_get method. + */ +int +__wt_db_row_get(WT_TOC *toc, DBT *key, DBT *data) +{ + DB *db; + IDB *idb; + WT_PAGE *page; + WT_ROW *rip; + uint32_t type; + int ret; + + db = toc->db; + idb = db->idb; + page = NULL; + + /* Search the btree for the key. */ + WT_ERR(__wt_row_search(toc, key, WT_NOLEVEL, 0)); + page = toc->srch_page; + rip = toc->srch_ip; + + /* + * The Db.get method can only return single key/data pairs. + * If that's not what we found, we're done. + * + * XXX + * Checking if page_data is NULL isn't the right thing to do + * here. Re-visit this when we figure out how we handle + * dup inserts into the tree. Maybe pass NO-DUP flag into the + * search function? + */ + if (rip->data != NULL) { + type = WT_ITEM_TYPE(rip->data); + if (type != WT_ITEM_DATA && type != WT_ITEM_DATA_OVFL) { + __wt_api_db_errx(db, + "the Db.get method cannot return keys with " + "duplicate data items; use the Db.cursor method " + "instead"); + ret = WT_ERROR; + goto err; + } + } + ret = __wt_dbt_return(toc, key, data, 0); + +err: if (page != idb->root_page.page) + __wt_hazard_clear(toc, page); + return (ret); +} diff --git a/src/btree/row_put.c b/src/btree/row_put.c new file mode 100644 index 00000000000..3ac4304ccec --- /dev/null +++ b/src/btree/row_put.c @@ -0,0 +1,288 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_row_update(WT_TOC *, DBT *, DBT *, int); + +/* + * __wt_db_row_del -- + * Db.row_del method. + */ +inline int +__wt_db_row_del(WT_TOC *toc, DBT *key) +{ + return (__wt_row_update(toc, key, NULL, 0)); +} + +/* + * __wt_db_row_put -- + * Db.row_put method. 
+ */ +inline int +__wt_db_row_put(WT_TOC *toc, DBT *key, DBT *data) +{ + return (__wt_row_update(toc, key, data, 1)); +} + +/* + * __wt_row_update -- + * Row store delete and update. + */ +static int +__wt_row_update(WT_TOC *toc, DBT *key, DBT *data, int insert) +{ + ENV *env; + WT_PAGE *page; + WT_REPL **new_repl, *repl; + int ret; + + env = toc->env; + new_repl = NULL; + repl = NULL; + + /* Search the btree for the key. */ + WT_RET(__wt_row_search(toc, key, WT_NOLEVEL, insert ? WT_INSERT : 0)); + page = toc->srch_page; + + /* Allocate a page replacement array as necessary. */ + if (page->u2.repl == NULL) + WT_ERR(__wt_calloc( + env, page->indx_count, sizeof(WT_REPL *), &new_repl)); + + /* Allocate room for the new data item from per-thread memory. */ + WT_ERR(__wt_repl_alloc(toc, &repl, data)); + + /* Schedule the workQ to insert the WT_REPL structure. */ + __wt_item_update_serial(toc, page, toc->srch_write_gen, + WT_ROW_SLOT(page, toc->srch_ip), new_repl, repl, ret); + + if (ret != 0) { +err: if (repl != NULL) + __wt_repl_free(toc, repl); + } + + /* Free any replacement array unless the workQ used it. */ + if (new_repl != NULL && new_repl != page->u2.repl) + __wt_free(env, new_repl, page->indx_count * sizeof(WT_REPL *)); + + WT_PAGE_OUT(toc, page); + + return (0); +} + +/* + * __wt_item_update_serial_func -- + * Server function to update a WT_REPL entry in the modification array. + */ +int +__wt_item_update_serial_func(WT_TOC *toc) +{ + WT_PAGE *page; + WT_REPL **new_repl, *repl; + uint32_t slot, write_gen; + int ret; + + __wt_item_update_unpack(toc, page, write_gen, slot, new_repl, repl); + + ret = 0; + + /* Check the page's write-generation. */ + WT_ERR(__wt_page_write_gen_check(page, write_gen)); + + /* + * If the page does not yet have a replacement array, our caller passed + * us one of the correct size. (It's the caller's responsibility to + * detect & free the passed-in expansion array if we don't use it.) 
+ */ + if (page->u2.repl == NULL) + page->u2.repl = new_repl; + + /* + * Insert the new WT_REPL as the first item in the forward-linked list + * of replacement structures. Flush memory to ensure the list is never + * broken. + */ + repl->next = page->u2.repl[slot]; + WT_MEMORY_FLUSH; + page->u2.repl[slot] = repl; + +err: __wt_toc_serialize_wrapup(toc, page, ret); + return (0); +} + +/* + * __wt_repl_alloc -- + * Allocate a WT_REPL structure and associated data from the TOC's update + * memory, and fill it in. + */ +int +__wt_repl_alloc(WT_TOC *toc, WT_REPL **replp, DBT *data) +{ + DB *db; + ENV *env; + WT_REPL *repl; + WT_TOC_UPDATE *update; + uint32_t align_size, alloc_size, size; + int single_use; + + env = toc->env; + db = toc->db; + + /* + * Allocate memory for a data insert or change; there's a buffer in the + * WT_TOC structure for allocation of chunks of memory to hold changed + * or inserted data items. + * + * We align each allocation because we directly access WT_REPL structure + * fields in the memory (the x86 handles unaligned accesses, but I don't + * want to have to find and fix this code for a port to a system that + * doesn't handle unaligned accesses). It wastes space, but this memory + * is never written to disk and there are fewer concerns about memory + * than with on-disk structures. Any other code allocating memory from + * this buffer needs to align its allocations as well. + * + * The first thing in each chunk of memory is WT_TOC_UPDATE structure + * (which we check is a multiple of 4B during initialization); then + * there are one or more WT_REPL structure plus data chunk pairs. + * + * XXX + * Figure out how much space we need: this code limits the maximum size + * of a data item stored in the database. In summary, for a big item we + * have to store a WT_TOC_UPDATE structure, the WT_REPL structure and + * the data, all in an allocated buffer. 
We only pass a 32-bit value + * to our allocation routine, so we can't store an item bigger than the + * maximum 32-bit value minus the sizes of those two structures, where + * the WT_REPL structure and data item are aligned to a 32-bit boundary. + * We could fix this, but it's unclear it's worth the effort -- document + * you can store a (4GB - 20B) item max, and you're done, because it's + * insane to store a 4GB item in the database anyway. + * + * Check first we won't overflow when calculating an aligned size, then + * check the total required space for this item. + */ + size = data == NULL ? 0 : data->size; + if (UINT32_MAX - size < sizeof(WT_REPL) + sizeof(uint32_t)) + return (__wt_database_item_too_big(db)); + align_size = WT_ALIGN(size + sizeof(WT_REPL), sizeof(uint32_t)); + if (UINT32_MAX - align_size < sizeof(WT_TOC_UPDATE)) + return (__wt_database_item_too_big(db)); + + /* + * If we already have a buffer and the data fits, just copy the WT_REPL + * structure and data into place, we're done. + */ + update = toc->update; + if (update != NULL && align_size <= update->space_avail) + goto no_allocation; + + /* + * Decide how much memory to allocate: if it's a one-off (that is, the + * data is bigger than anything we'll aggregate into these buffers, it's + * a one-off. Otherwise, allocate the next power-of-two larger than 4 + * times the requested size, and at least the default buffer size. + * + * XXX + * I have no reason for the 4x the request size, I just hate to allocate + * a buffer for every change to the database. A better approach would + * be to grow the allocation buffer as the thread makes more changes; if + * a thread is doing lots of work, give it lots of memory, otherwise + * only allocate as it's necessary. 
+ */ + if (align_size > env->data_update_max) { + alloc_size = sizeof(WT_TOC_UPDATE) + align_size; + single_use = 1; + } else { + alloc_size = __wt_nlpo2( + WT_MAX(align_size * 4, env->data_update_initial)); + single_use = 0; + } + WT_RET(__wt_calloc(env, 1, alloc_size, &update)); + + update->len = alloc_size; + update->space_avail = alloc_size - sizeof(WT_TOC_UPDATE); + update->first_free = (uint8_t *)update + sizeof(WT_TOC_UPDATE); + + /* + * If it's a single use allocation, ignore any current update buffer. + * Else, release the old update buffer and replace it with the new one. + */ + if (!single_use) { + /* + * The "in" reference count is artificially incremented by 1 as + * long as an update buffer is referenced by the WT_TOC thread; + * we don't want them freed because a page was evicted and the + * count went to 0. Decrement the reference count on the buffer + * as part of releasing it. There's a similar reference count + * decrement when the WT_TOC structure is discarded. + * + * XXX + * There's a race here: if this code, or the WT_TOC structure + * close code, and the page discard code race, it's possible + * neither will realize the buffer is no longer needed and free + * it. The fix is to involve the eviction or workQ threads: + * they may need a linked list of buffers they review to ensure + * it never happens. I'm living with this now: it's unlikely + * and it's a memory leak if it ever happens. + */ + if (toc->update != NULL) + --toc->update->in; + toc->update = update; + + update->in = 1; + } + +no_allocation: + /* Copy the WT_REPL structure into place. 
*/ + repl = (WT_REPL *)update->first_free; + repl->update = update; + if (data == NULL) + WT_REPL_DELETED_SET(repl); + else { + repl->size = data->size; + memcpy(WT_REPL_DATA(repl), data->data, data->size); + } + + update->first_free += align_size; + update->space_avail -= align_size; + ++update->in; + + *replp = repl; + return (0); +} + +/* + * __wt_repl_free -- + * Free a WT_REPL structure and associated data from the TOC's update + * memory. + */ +void +__wt_repl_free(WT_TOC *toc, WT_REPL *repl) +{ + ENV *env; + + env = toc->env; + + /* + * It's possible we allocated a WT_REPL structure and associated item + * memory from the WT_TOC update buffer, but then an error occurred. + * Don't try and clean up the update buffer, it's simpler to decrement + * the use count and let the page discard code deal with it during the + * page reconciliation process. (Note we're still in the allocation + * path, so we decrement the "in" field, not the "out" field.) + */ + --repl->update->in; + + /* + * One other thing: if the update buffer was a one-off, we have to free + * it here, it's not linked to any WT_PAGE in the system. + */ + if (repl->update->in == 0) + __wt_free(env, repl->update, repl->update->len); +} diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c new file mode 100644 index 00000000000..a8ff78dc380 --- /dev/null +++ b/src/btree/row_srch.c @@ -0,0 +1,196 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_key_build(WT_TOC *, WT_PAGE *, WT_ROW *); + +/* + * __wt_row_search -- + * Search a row-store tree for a specific key. 
+ */ +int +__wt_row_search(WT_TOC *toc, DBT *key, uint32_t level, uint32_t flags) +{ + DB *db; + IDB *idb; + WT_OFF *off; + WT_PAGE *page; + WT_PAGE_DISK *dsk; + WT_REF *ref; + WT_ROW *rip; + WT_REPL *repl; + uint32_t base, indx, limit, write_gen; + int cmp, isleaf, ret; + + toc->srch_page = NULL; /* Return values. */ + toc->srch_ip = NULL; + toc->srch_repl = NULL; + toc->srch_exp = NULL; + toc->srch_write_gen = 0; + + db = toc->db; + idb = db->idb; + + WT_DB_FCHK(db, "__wt_row_search", flags, WT_APIMASK_BT_SEARCH_KEY_ROW); + + /* Search the tree. */ + for (page = idb->root_page.page;;) { + /* Copy the write generation value before the read. */ + write_gen = page->write_gen; + + dsk = page->dsk; + isleaf = + dsk->type == WT_PAGE_DUP_LEAF || + dsk->type == WT_PAGE_ROW_LEAF; + for (base = 0, + limit = page->indx_count; limit != 0; limit >>= 1) { + indx = base + (limit >> 1); + + /* + * If the key is compressed or an overflow, it may not + * have been instantiated yet. + */ + rip = page->u.irow + indx; + if (__wt_key_process(rip)) + WT_ERR(__wt_key_build(toc, page, rip)); + + /* + * If we're about to compare an application key with the + * 0th index on an internal page, pretend the 0th index + * sorts less than any application key. This test is so + * we don't have to update internal pages if the + * application stores a new, "smallest" key in the tree. + * + * For the record, we still maintain the key at the 0th + * location because it means tree verification and other + * code that processes a level of the tree doesn't need + * to know about this hack. + */ + if (indx != 0 || isleaf) { + cmp = db->btree_compare(db, key, (DBT *)rip); + if (cmp == 0) + break; + if (cmp < 0) + continue; + } + base = indx + 1; + --limit; + } + + /* + * Reference the slot used for next step down the tree. 
We do + * this on leaf pages too, because it's simpler to code, and we + * only care if there's an exact match on leaf pages; setting + * rip doesn't matter for leaf pages because we always return + * WT_NOTFOUND if there's no match. + * + * Base is the smallest index greater than key and may be the + * 0th index or the (last + 1) indx. If base is not the 0th + * index (remember, the 0th index always sorts less than any + * application key), decrement it to the smallest index less + * than or equal to key. + */ + if (cmp != 0) + rip = page->u.irow + (base == 0 ? 0 : base - 1); + + /* + * If we've reached the leaf page, or we've reached the level + * requested by our caller, we're done. + */ + if (isleaf || level == dsk->level) + break; + + /* rip references the subtree containing the record. */ + ref = WT_ROW_REF(page, rip); + off = WT_ROW_OFF(rip); + WT_ERR(__wt_page_in(toc, page, ref, off, 0)); + + /* Swap the parent page for the child page. */ + if (page != idb->root_page.page) + __wt_hazard_clear(toc, page); + page = ref->page; + } + + /* + * We've got the right on-page WT_ROW structure (an exact match in the + * case of a lookup, or the smallest key on the page less than or equal + * to the specified key in the case of an insert). If it's an insert, + * we're done, return the information. Otherwise, check to see if the + * item was modified/deleted. + */ + switch (dsk->type) { + case WT_PAGE_DUP_LEAF: + case WT_PAGE_ROW_LEAF: + if (LF_ISSET(WT_INSERT)) + break; + if (cmp != 0) /* No match */ + goto notfound; + /* Deleted match. */ + if ((repl = WT_ROW_REPL(page, rip)) != NULL) { + if (WT_REPL_DELETED_ISSET(repl)) + goto notfound; + toc->srch_repl = repl; + } + break; + case WT_PAGE_DUP_INT: + case WT_PAGE_ROW_INT: + /* + * When returning internal pages, set the item's WT_REPL slot + * if it exists, otherwise we're done. 
+ */ + toc->srch_repl = WT_ROW_REPL(page, rip); + break; + WT_ILLEGAL_FORMAT(db); + } + + toc->srch_page = page; + toc->srch_ip = rip; + toc->srch_write_gen = write_gen; + return (0); + +notfound: + ret = WT_NOTFOUND; + +err: WT_PAGE_OUT(toc, page); + return (ret); +} + +/* + * __wt_key_build -- + * Instantiate an overflow or compressed key into a WT_ROW structure. + */ +static int +__wt_key_build(WT_TOC *toc, WT_PAGE *page, WT_ROW *rip_arg) +{ + DBT *dbt, _dbt; + WT_ROW *rip; + WT_ITEM *item; + uint32_t i; + + WT_CLEAR(_dbt); + dbt = &_dbt; + + item = rip_arg->key; + WT_RET(__wt_item_process(toc, item, dbt)); + + /* + * Update the WT_ROW reference with the processed key. If there are + * any duplicates of this item, update them as well. + */ + __wt_key_set(rip_arg, dbt->data, dbt->size); + if (WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP || + WT_ITEM_TYPE(rip_arg->data) == WT_ITEM_DATA_DUP_OVFL) { + WT_INDX_FOREACH(page, rip, i) + if (rip->key == item) + __wt_key_set(rip, dbt->data, dbt->size); + } + + return (0); +} diff --git a/src/db/db_err.c b/src/db/db_err.c new file mode 100644 index 00000000000..1ba46e06a69 --- /dev/null +++ b/src/db/db_err.c @@ -0,0 +1,64 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +#define WT_DB_ERR(db, error, fmt) { \ + va_list __ap; \ + \ + /* Application-specified callback function. */ \ + va_start(__ap, fmt); \ + if ((db)->errcall != NULL) \ + __wt_msg_call((void *)((db)->errcall), \ + (void *)(db), (db)->errpfx, \ + (db)->idb == NULL ? NULL : (db)->idb->name, \ + error, fmt, __ap); \ + va_end(__ap); \ + \ + /* \ + * If the application set an error callback function but not an \ + * error stream, we're done. Otherwise, write an error stream. 
\ + */ \ + if ((db)->errcall != NULL && (db)->errfile == NULL) \ + return; \ + \ + va_start(__ap, fmt); \ + __wt_msg_stream((db)->errfile, (db)->errpfx, \ + (db)->idb == NULL ? NULL : (db)->idb->name, \ + error, fmt, __ap); \ + va_end(__ap); \ +} + +/* + * __wt_api_db_err -- + * Db.err method. + */ +void +__wt_api_db_err(DB *db, int error, const char *fmt, ...) +{ + /* + * This function may be called at before/after the statistics memory + * has been allocated/freed; don't increment method statistics here. + */ + WT_DB_ERR(db, error, fmt); +} + +/* + * __wt_api_db_errx -- + * Db.errx method. + */ +void +__wt_api_db_errx(DB *db, const char *fmt, ...) +{ + /* + * This function may be called at before/after the statistics memory + * has been allocated/freed; don't increment method statistics here. + */ + WT_DB_ERR(db, 0, fmt); +} diff --git a/src/db/db_getset.c b/src/db/db_getset.c new file mode 100644 index 00000000000..6c133a0a3fb --- /dev/null +++ b/src/db/db_getset.c @@ -0,0 +1,85 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_db_btree_compare_int_set_verify -- + * Verify arguments to the Db.btree_compare_int_set method. + */ +int +__wt_db_btree_compare_int_set_verify(DB *db, int btree_compare_int) +{ + if (btree_compare_int >= 0 && btree_compare_int <= 8) + return (0); + + __wt_api_db_errx(db, + "The number of bytes must be an integral value between 1 and 8"); + return (WT_ERROR); +} + +/* + * __wt_db_btree_dup_offpage_set_verify -- + * Verify arguments to the Db.btree_dup_offpage_set method. + */ +int +__wt_db_btree_dup_offpage_set_verify(DB *db, uint32_t dup_offpage) +{ + /* + * Limiting this value to something between 10 and 50 is a sanity test, + * not a hard constraint (although a value of 100 might fail hard). 
+ * + * If the value is too large, pages can end up being empty because it + * isn't possible for duplicate sets to span pages. So, if you set + * the value to 50%, and you have two sequential, large duplicate sets, + * you end up with two, half-empty pages. + */ + if (dup_offpage > 10 && dup_offpage <= 50) + return (0); + + __wt_api_db_errx(db, + "The percent of the page taken up by duplicate entries before " + "being moved off-page must must be between 10 and 50"); + return (WT_ERROR); +} + +/* + * __wt_db_column_set_verify -- + * Verify arguments to the Db.column_set method. + */ +int +__wt_db_column_set_verify( + DB *db, uint32_t fixed_len, const char *dictionary, uint32_t flags) +{ + ENV *env; + IDB *idb; + + env = db->env; + idb = db->idb; + + /* + * The fixed-length number of bytes is stored in a single byte, which + * limits the size to 255 bytes. + */ + WT_RET(__wt_api_arg_max( + env, "DB.column_set", "fixed_len", fixed_len, 255)); + + /* Run-length encoding is incompatible with variable length records. */ + if (fixed_len == 0 && LF_ISSET(WT_RLE)) { + __wt_api_db_errx(db, + "Run-length encoding is incompatible with variable length " + "column-store records"); + return (WT_ERROR); + } + + if (LF_ISSET(WT_RLE)) + F_SET(idb, WT_RLE); + F_SET(idb, WT_COLUMN); + return (0); +} diff --git a/src/db/db_handle.c b/src/db/db_handle.c new file mode 100644 index 00000000000..b9e244d5ea9 --- /dev/null +++ b/src/db/db_handle.c @@ -0,0 +1,184 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_db_config(DB *); +static int __wt_idb_config(DB *); +static int __wt_idb_destroy(DB *); + +/* + * __wt_env_db -- + * DB constructor. + */ +int +__wt_env_db(ENV *env, DB **dbp) +{ + DB *db; + IDB *idb; + int ret; + + db = NULL; + idb = NULL; + + /* Create the DB and IDB structures. 
*/ + WT_ERR(__wt_calloc(env, 1, sizeof(DB), &db)); + WT_ERR(__wt_calloc(env, 1, sizeof(IDB), &idb)); + + /* Connect everything together. */ + db->idb = idb; + idb->db = db; + db->env = env; + + /* Configure the DB and the IDB. */ + WT_ERR(__wt_db_config(db)); + WT_ERR(__wt_idb_config(db)); + + *dbp = db; + return (0); + +err: (void)__wt_db_destroy(db); + return (ret); +} + +/* + * __wt_db_config -- + * Set configuration for a just-created DB handle. + */ +static int +__wt_db_config(DB *db) +{ + __wt_methods_db_config_default(db); + __wt_methods_db_lockout(db); + __wt_methods_db_init_transition(db); + + return (0); +} + +/* + * __wt_idb_config -- + * Set configuration for a just-created IDB handle. + */ +static int +__wt_idb_config(DB *db) +{ + ENV *env; + IDB *idb; + IENV *ienv; + + env = db->env; + idb = db->idb; + ienv = env->ienv; + + idb->db = db; + idb->root_off.addr = idb->free_addr = WT_ADDR_INVALID; + + __wt_lock(env, ienv->mtx); /* Add to the ENV's list */ + TAILQ_INSERT_TAIL(&ienv->dbqh, idb, q); + ++ienv->dbqcnt; + __wt_unlock(env, ienv->mtx); + + WT_RET(__wt_stat_alloc_db_stats(env, &idb->stats)); + WT_RET(__wt_stat_alloc_database_stats(env, &idb->dstats)); + + return (0); +} + +/* + * __wt_db_destroy -- + * DB handle destructor. + */ +int +__wt_db_destroy(DB *db) +{ + ENV *env; + int ret; + + env = db->env; + + /* Discard the underlying IDB object. */ + ret = __wt_idb_destroy(db); + + /* Discard the DB object. */ + __wt_free(env, db, sizeof(DB)); + + return (ret); +} + +/* + * __wt_idb_destroy -- + * IDB handle destructor. + */ +static int +__wt_idb_destroy(DB *db) +{ + ENV *env; + IDB *idb; + IENV *ienv; + int ret; + + env = db->env; + idb = db->idb; + ienv = env->ienv; + ret = 0; + + /* Check that there's something to close. */ + if (idb == NULL) + return (0); + + /* Diagnostic check: check flags against approved list. 
*/ + WT_ENV_FCHK_RET(env, "Db.close", idb->flags, WT_APIMASK_IDB, ret); + + __wt_free(env, idb->name, 0); + + if (idb->huffman_key != NULL) { + /* Key and data may use the same table, only close it once. */ + if (idb->huffman_data == idb->huffman_key) + idb->huffman_data = NULL; + __wt_huffman_close(env, idb->huffman_key); + idb->huffman_key = NULL; + } + if (idb->huffman_data != NULL) { + __wt_huffman_close(env, idb->huffman_data); + idb->huffman_data = NULL; + } + + __wt_walk_end(env, &idb->evict_walk); + + __wt_free(env, idb->stats, 0); + __wt_free(env, idb->dstats, 0); + + __wt_lock(env, ienv->mtx); /* Delete from the ENV's list */ + TAILQ_REMOVE(&ienv->dbqh, idb, q); + --ienv->dbqcnt; + __wt_unlock(env, ienv->mtx); + + __wt_free(env, idb, sizeof(IDB)); + db->idb = NULL; + return (ret); +} + +int +__wt_db_lockout_err(DB *db) +{ + __wt_api_db_errx(db, + "This Db handle has failed for some reason, and can no longer " + "be used; the only method permitted on it is Db.close which " + "discards the handle permanently"); + return (WT_ERROR); +} + +int +__wt_db_lockout_open(DB *db) +{ + __wt_api_db_errx(db, + "This method may not be called until after the Db.open method has " + "been called"); + return (WT_ERROR); +} diff --git a/src/db/db_huffman.c b/src/db/db_huffman.c new file mode 100644 index 00000000000..ae9fe7fccde --- /dev/null +++ b/src/db/db_huffman.c @@ -0,0 +1,233 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * 7-bit ASCII, with English language frequencies. + * + * Based on "Case-sensitive letter and bigram frequency counts from large-scale + * English corpora" + * Michael N. Jones and D.J.K. 
Mewhort + * Queen's University, Kingston, Ontario, Canada + * Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396 + * + * Additionally supports space and tab characters; space is the most common + * character in text where it occurs, and tab appears about as frequently as + * 'a' and 'n' in text where it occurs. + */ +static uint8_t const __wt_huffman_ascii_english[256] = { + 1, /* 000 nul */ + 1, /* 001 soh */ + 1, /* 002 stx */ + 1, /* 003 etx */ + 1, /* 004 eot */ + 1, /* 005 enq */ + 1, /* 006 ack */ + 1, /* 007 bel */ + 1, /* 010 bs */ + 251, /* 011 ht */ + 1, /* 012 nl */ + 1, /* 013 vt */ + 1, /* 014 np */ + 1, /* 015 cr */ + 1, /* 016 so */ + 1, /* 017 si */ + 1, /* 020 dle */ + 1, /* 021 dc1 */ + 1, /* 022 dc2 */ + 1, /* 023 dc3 */ + 1, /* 024 dc4 */ + 1, /* 025 nak */ + 1, /* 026 syn */ + 1, /* 027 etb */ + 1, /* 030 can */ + 1, /* 031 em */ + 1, /* 032 sub */ + 1, /* 033 esc */ + 1, /* 034 fs */ + 1, /* 035 gs */ + 1, /* 036 rs */ + 1, /* 037 us */ + 255, /* 040 sp */ + 177, /* 041 ! */ + 223, /* 042 " */ + 171, /* 043 # */ + 188, /* 044 $ */ + 176, /* 045 % */ + 179, /* 046 & */ + 215, /* 047 ' */ + 189, /* 050 ( */ + 190, /* 051 ) */ + 184, /* 052 * */ + 175, /* 053 + */ + 234, /* 054 , */ + 219, /* 055 - */ + 233, /* 056 . */ + 181, /* 057 / */ + 230, /* 060 0 */ + 229, /* 061 1 */ + 226, /* 062 2 */ + 213, /* 063 3 */ + 214, /* 064 4 */ + 227, /* 065 5 */ + 210, /* 066 6 */ + 203, /* 067 7 */ + 212, /* 070 8 */ + 222, /* 071 9 */ + 191, /* 072 : */ + 186, /* 073 ; */ + 173, /* 074 < */ + 172, /* 075 = */ + 174, /* 076 > */ + 183, /* 077 ? 
*/ + 170, /* 100 @ */ + 221, /* 101 A */ + 211, /* 102 B */ + 218, /* 103 C */ + 206, /* 104 D */ + 207, /* 105 E */ + 199, /* 106 F */ + 197, /* 107 G */ + 205, /* 110 H */ + 217, /* 111 I */ + 196, /* 112 J */ + 187, /* 113 K */ + 201, /* 114 L */ + 220, /* 115 M */ + 216, /* 116 N */ + 200, /* 117 O */ + 208, /* 120 P */ + 182, /* 121 Q */ + 209, /* 122 R */ + 224, /* 123 S */ + 225, /* 124 T */ + 193, /* 125 U */ + 185, /* 126 V */ + 202, /* 127 W */ + 180, /* 130 X */ + 198, /* 131 Y */ + 178, /* 132 Z */ + 1, /* 133 [ */ + 1, /* 134 \ */ + 1, /* 135 ] */ + 1, /* 136 ^ */ + 1, /* 137 _ */ + 1, /* 140 ` */ + 252, /* 141 a */ + 232, /* 142 b */ + 242, /* 143 c */ + 243, /* 144 d */ + 254, /* 145 e */ + 239, /* 146 f */ + 237, /* 147 g */ + 245, /* 150 h */ + 248, /* 151 i */ + 194, /* 152 j */ + 228, /* 153 k */ + 244, /* 154 l */ + 240, /* 155 m */ + 249, /* 156 n */ + 250, /* 157 o */ + 238, /* 160 p */ + 192, /* 161 q */ + 246, /* 162 r */ + 247, /* 163 s */ + 253, /* 164 t */ + 241, /* 165 u */ + 231, /* 166 v */ + 235, /* 167 w */ + 204, /* 170 x */ + 236, /* 171 y */ + 195, /* 172 z */ + 1, /* 173 { */ + 1, /* 174 | */ + 1, /* 175 } */ + 1, /* 176 ~ */ + 1, /* 177 del */ +}; + +/* + * __wt_db_huffman_set -- + * DB huffman configuration setter. 
+ */ +int +__wt_db_huffman_set(DB *db, + uint8_t const *huffman_table, u_int huffman_table_size, uint32_t flags) +{ + ENV *env; + IDB *idb; + uint8_t phone[256]; + + env = db->env; + idb = db->idb; + + switch (LF_ISSET(WT_ASCII_ENGLISH | WT_TELEPHONE)) { + case WT_ASCII_ENGLISH: + if (huffman_table != NULL) + goto err; + huffman_table = __wt_huffman_ascii_english; + huffman_table_size = sizeof(__wt_huffman_ascii_english); + break; + case WT_TELEPHONE: + if (huffman_table != NULL) + goto err; + memset(phone, 0, sizeof(phone)); + phone['('] = 2; + phone[')'] = 2; + phone['+'] = 1; + phone['-'] = 3; + phone['0'] = 1; + phone['1'] = 1; + phone['2'] = 1; + phone['3'] = 1; + phone['4'] = 1; + phone['5'] = 1; + phone['6'] = 1; + phone['7'] = 1; + phone['8'] = 1; + phone['9'] = 1; + huffman_table = phone; + huffman_table_size = sizeof(phone); + break; + default: +err: return (__wt_api_args(env, "Db.huffman_set")); + } + + /* + * If we're using an already-specified table, close it. It's probably + * an application error to set the Huffman table twice, but hey, I just + * work here. + */ + if (LF_ISSET(WT_HUFFMAN_KEY) && idb->huffman_key != NULL) { + /* Key and data may use the same table, only close it once. */ + if (idb->huffman_data == idb->huffman_key) + idb->huffman_data = NULL; + __wt_huffman_close(env, idb->huffman_key); + idb->huffman_key = NULL; + } + if (LF_ISSET(WT_HUFFMAN_DATA) && idb->huffman_data != NULL) { + __wt_huffman_close(env, idb->huffman_data); + idb->huffman_data = NULL; + } + if (LF_ISSET(WT_HUFFMAN_KEY)) { + WT_RET(__wt_huffman_open(env, + huffman_table, huffman_table_size, &idb->huffman_key)); + /* Key and data may use the same table. 
*/ + if (LF_ISSET(WT_HUFFMAN_DATA)) { + idb->huffman_data = idb->huffman_key; + LF_CLR(WT_HUFFMAN_DATA); + } + } + if (LF_ISSET(WT_HUFFMAN_DATA)) + WT_RET(__wt_huffman_open(env, + huffman_table, huffman_table_size, &idb->huffman_data)); + + return (0); +} diff --git a/src/db/db_open.c b/src/db/db_open.c new file mode 100644 index 00000000000..1cdf04c1288 --- /dev/null +++ b/src/db/db_open.c @@ -0,0 +1,104 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_db_idb_open(DB *, const char *, mode_t, uint32_t); + +/* + * __wt_db_open -- + * Open a DB handle. + */ +int +__wt_db_open(WT_TOC *toc, const char *name, mode_t mode, uint32_t flags) +{ + DB *db; + ENV *env; + + env = toc->env; + db = toc->db; + + WT_STAT_INCR(env->ienv->stats, DATABASE_OPEN); + + /* Initialize the IDB structure. */ + WT_RET(__wt_db_idb_open(db, name, mode, flags)); + + /* Open the underlying Btree. */ + WT_RET(__wt_bt_open(toc, LF_ISSET(WT_CREATE) ? 1 : 0)); + + /* Turn on the methods that require open. */ + __wt_methods_db_open_transition(db); + + return (0); +} + +/* + * __wt_db_idb_open -- + * Routine to intialize any IDB values based on a DB value during open. + */ +static int +__wt_db_idb_open(DB *db, const char *name, mode_t mode, uint32_t flags) +{ + ENV *env; + IENV *ienv; + IDB *idb; + + env = db->env; + ienv = env->ienv; + idb = db->idb; + + WT_RET(__wt_strdup(env, name, &idb->name)); + idb->mode = mode; + + __wt_lock(env, ienv->mtx); + idb->file_id = ++ienv->next_file_id; + __wt_unlock(env, ienv->mtx); + + /* + * XXX + * Initialize the root WT_REF/WT_OFF pair to point to the start of + * the file. This is all wrong, and we'll get the information from + * somewhere else, eventually. 
+ */ + WT_CLEAR(idb->root_page); + idb->root_page.state = WT_EMPTY; + WT_CLEAR(idb->root_off); + idb->root_off.addr = 0; + idb->root_off.size = 0; + + if (LF_ISSET(WT_RDONLY)) + F_SET(idb, WT_RDONLY); + + return (0); +} + +/* + * __wt_db_close -- + * Db.close method (DB close & handle destructor). + */ +int +__wt_db_close(WT_TOC *toc, uint32_t flags) +{ + DB *db; + int ret; + + db = toc->db; + ret = 0; + + /* Flush the underlying Btree. */ + if (!LF_ISSET(WT_NOWRITE)) + WT_TRET(__wt_bt_sync(toc)); + + /* Close the underlying Btree. */ + ret = __wt_bt_close(toc); + + WT_TRET(__wt_db_destroy(db)); + + return (ret); +} diff --git a/src/db/db_stat.c b/src/db/db_stat.c new file mode 100644 index 00000000000..84ac9960860 --- /dev/null +++ b/src/db/db_stat.c @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_db_stat_print -- + * Print DB handle statistics to a stream. + */ +int +__wt_db_stat_print(WT_TOC *toc, FILE *stream) +{ + DB *db; + ENV *env; + IDB *idb; + + db = toc->db; + env = toc->env; + idb = db->idb; + + fprintf(stream, "Database handle statistics: %s\n", idb->name); + __wt_stat_print(env, idb->stats, stream); + + /* Clear the database stats, then call Btree stat to fill them in. */ + __wt_stat_clear_database_stats(idb->dstats); + WT_STAT_SET(idb->dstats, TREE_LEVEL, idb->root_page.page->dsk->level); + WT_RET(__wt_desc_stat(toc)); + + /* + * Note we do not have a hazard reference for the root page, and that's + * safe -- root pages are pinned into memory when a database is opened, + * and never re-written until the database is closed. + */ + WT_RET(__wt_tree_walk(toc, NULL, 0, __wt_page_stat, NULL)); + + fprintf(stream, "Database statistics: %s\n", idb->name); + __wt_stat_print(env, idb->dstats, stream); + + /* Underlying file handle statistics. 
*/ + if (idb->fh != NULL) { + fprintf(stream, + "Underlying file I/O statistics: %s\n", idb->name); + __wt_stat_print(env, idb->fh->stats, stream); + } + + return (0); +} + +/* + * __wt_db_stat_clear -- + * Clear DB handle statistics. + */ +int +__wt_db_stat_clear(DB *db) +{ + IDB *idb; + + idb = db->idb; + + __wt_stat_clear_db_stats(idb->stats); + __wt_stat_clear_database_stats(idb->dstats); + if (idb->fh != NULL) + __wt_stat_clear_fh_stats(idb->fh->stats); + + return (0); +} diff --git a/src/db/db_sync.c b/src/db/db_sync.c new file mode 100644 index 00000000000..eec5026f0c2 --- /dev/null +++ b/src/db/db_sync.c @@ -0,0 +1,20 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_db_sync -- + * Flush a database to the backing file. + */ +int +__wt_db_sync(WT_TOC *toc, void (*f)(const char *, uint64_t), uint32_t flags) +{ + return (__wt_bt_sync(toc)); +} diff --git a/src/env/env_err.c b/src/env/env_err.c new file mode 100644 index 00000000000..b5bc0ca5966 --- /dev/null +++ b/src/env/env_err.c @@ -0,0 +1,83 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +void +wiredtiger_err_stream(FILE *stream) +{ + extern FILE *__wt_err_stream; + + __wt_err_stream = stream; +} + +#define WT_ENV_ERR(env, error, fmt) { \ + extern FILE *__wt_err_stream; \ + va_list __ap; \ + /* \ + * Support error messages even when we don't yet have an ENV \ + * handle. \ + */ \ + if ((env) == NULL) { \ + va_start(__ap, fmt); \ + __wt_msg_stream( \ + __wt_err_stream, NULL, NULL, error, fmt, __ap); \ + va_end(__ap); \ + return; \ + } \ + \ + /* Application-specified callback function. 
*/ \ + if ((env)->errcall != NULL) { \ + va_start(__ap, fmt); \ + __wt_msg_call((void *)((env)->errcall), \ + (void *)(env), env->errpfx, \ + NULL, error, fmt, __ap); \ + va_end(__ap); \ + } \ + \ + /* \ + * If the application set an error callback function but not an \ + * error stream, we're done. Otherwise, write the stream. \ + */ \ + if ((env)->errcall != NULL && (env)->errfile == NULL) \ + return; \ + \ + va_start(__ap, fmt); \ + __wt_msg_stream((env)->errfile, \ + (env)->errpfx, NULL, error, fmt, __ap); \ + va_end(__ap); \ +} + +/* + * __wt_api_env_err -- + * Env.err method. + */ +void +__wt_api_env_err(ENV *env, int error, const char *fmt, ...) +{ + /* + * This function may be called at before/after the statistics memory + * has been allocated/freed; don't increment method statistics here. + */ + WT_ENV_ERR(env, error, fmt); +} + +/* + * __wt_api_env_errx -- + * Env.errx method. + */ +void +__wt_api_env_errx(ENV *env, const char *fmt, ...) +{ + /* + * This function may be called at before/after the statistics memory + * has been allocated/freed; don't increment method statistics here. + */ + WT_ENV_ERR(env, 0, fmt); +} diff --git a/src/env/env_getset.c b/src/env/env_getset.c new file mode 100644 index 00000000000..6786c87b41d --- /dev/null +++ b/src/env/env_getset.c @@ -0,0 +1,70 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_env_cache_cache_size_set_verify -- + * Verify an argument to the Env.cache_size_set method. + */ +int +__wt_env_cache_size_set_verify(ENV *env, uint32_t cache_size) +{ + return (__wt_api_arg_min(env, + "Env.cache_size_set", "cache size", cache_size, 1)); +} + +/* + * __wt_env_cache_hash_size_set_verify -- + * Verify an argument to the Env.hash_size_set method. 
+ */ +int +__wt_env_cache_hash_size_set_verify(ENV *env, uint32_t hash_size) +{ + return (__wt_api_arg_min(env, + "Env.hash_size_set", "hash size", hash_size, 1)); +} + +/* + * __wt_env_cache_hazard_size_set_verify -- + * Verify an argument to the Env.hazard_size_set method. + */ +int +__wt_env_hazard_size_set_verify(ENV *env, uint32_t hazard_size) +{ + return (__wt_api_arg_min(env, + "Env.hazard_size_set", "hazard size", hazard_size, 1)); +} + +/* + * __wt_env_toc_size_set_verify -- + * Verify an argument to the Env.toc_size_set method. + */ +int +__wt_env_toc_size_set_verify(ENV *env, uint32_t toc_size) +{ + return (__wt_api_arg_min(env, + "Env.toc_size_set", "toc size", toc_size, 1)); +} + +/* + * __wt_env_verbose_set_verify -- + * Verify an argument to the Env.verbose_set method. + */ +int +__wt_env_verbose_set_verify(ENV *env, uint32_t verbose) +{ +#ifdef HAVE_VERBOSE + WT_ENV_FCHK(env, + "Env.verbose_set", verbose, WT_APIMASK_ENV_VERBOSE_SET); + return (0); +#else + return (__wt_api_config(env, "Env.verbose_set", "--enable-verbose")); +#endif +} diff --git a/src/env/env_global.c b/src/env/env_global.c new file mode 100644 index 00000000000..e41a7bccfad --- /dev/null +++ b/src/env/env_global.c @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +void *__wt_addr; /* Memory flush address. */ +FILE *__wt_err_stream; /* Error stream from init. */ + +/* + * __wt_library_init -- + * Some things to do, before we do anything else. + */ +int +__wt_library_init(void) +{ + /* + * We need an address for memory flushing -- it doesn't matter which + * one we choose. + */ + __wt_addr = &__wt_addr; + + /* + * We want to be able to redirect error messages from the very first + * instruction. + */ + __wt_err_stream = stderr; + + /* + * Check the build & compiler itself before going further. 
+ */ + WT_RET(__wt_bt_build_verify()); + +#ifdef HAVE_DIAGNOSTIC + /* Load debug code the compiler might optimize out. */ + WT_RET(__wt_breakpoint()); +#endif + + return (0); +} + +/* + * __wt_breakpoint -- + * A simple place to put a breakpoint, if you need one. + */ +int +__wt_breakpoint(void) +{ + return (0); +} + +int __wt_debugger_attach; + +/* + * __wt_attach -- + * A routine to wait for the debugging to attach. + */ +void +__wt_attach(ENV *env) +{ +#ifdef HAVE_ATTACH + __wt_api_env_errx(env, + "process ID %lld: waiting for debugger...", (long long)getpid()); + while (__wt_debugger_attach == 0) + __wt_sleep(10, 0); +#endif +} diff --git a/src/env/env_handle.c b/src/env/env_handle.c new file mode 100644 index 00000000000..1c02675041f --- /dev/null +++ b/src/env/env_handle.c @@ -0,0 +1,137 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +static int __wt_env_config(ENV *); +static int __wt_ienv_config(ENV *); + +/* + * __wt_env_create -- + * ENV constructor. + */ +int +__wt_env_create(uint32_t flags, ENV **envp) +{ + ENV *env; + IENV *ienv; + int ret; + + /* + * !!! + * We don't yet have valid ENV/IENV structures to use to call other + * functions. The only functions that can handle NULL ENV handles + * are the memory allocation and free functions, no other functions + * may be called. + */ + WT_RET(__wt_calloc(NULL, 1, sizeof(ENV), &env)); + WT_ERR(__wt_calloc(NULL, 1, sizeof(IENV), &ienv)); + + /* Connect everything together. */ + env->ienv = ienv; + + /* Set flags. */ + if (LF_ISSET(WT_MEMORY_CHECK)) + F_SET(env, WT_MEMORY_CHECK); + + /* Configure the ENV and the IENV. */ + WT_ERR(__wt_env_config(env)); + WT_ERR(__wt_ienv_config(env)); + + *envp = env; + return (0); + +err: (void)__wt_env_close(env); + return (ret); +} + +/* + * __wt_env_config -- + * Set configuration for a just-created ENV handle. 
+ */ +static int +__wt_env_config(ENV *env) +{ + __wt_methods_env_config_default(env); + __wt_methods_env_lockout(env); + __wt_methods_env_init_transition(env); + return (0); +} + +/* + * __wt_ienv_config -- + * Set configuration for a just-created IENV handle. + */ +static int +__wt_ienv_config(ENV *env) +{ + IENV *ienv; + + ienv = env->ienv; + +#ifdef HAVE_DIAGNOSTIC + /* If we're tracking memory, initialize those structures first. */ + if (F_ISSET(env, WT_MEMORY_CHECK)) + WT_RET(__wt_mtrack_alloc(env)); +#endif + /* Global mutex */ + WT_RET(__wt_mtx_alloc(env, "IENV", 0, &ienv->mtx)); + + TAILQ_INIT(&ienv->dbqh); /* DB list */ + TAILQ_INIT(&ienv->fhqh); /* File list */ + + /* Statistics. */ + WT_RET(__wt_stat_alloc_env_stats(env, &ienv->stats)); + WT_RET(__wt_stat_alloc_method_stats(env, &ienv->method_stats)); + + /* Diagnostic output separator. */ + ienv->sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="; + + return (0); +} + +/* + * __wt_ienv_destroy -- + * Destroy the ENV's underlying IENV structure. + */ +int +__wt_ienv_destroy(ENV *env) +{ + IENV *ienv; + int ret; + + ienv = env->ienv; + ret = 0; + + /* Check there's something to destroy. */ + if (ienv == NULL) + return (0); + + /* Diagnostic check: check flags against approved list. */ + WT_ENV_FCHK_RET(env, "Env.close", ienv->flags, WT_APIMASK_IENV, ret); + + (void)__wt_mtx_destroy(env, ienv->mtx); + + /* Free allocated memory. */ + __wt_free(env, ienv->toc, 0); + __wt_free(env, ienv->toc_array, 0); + __wt_free(env, ienv->hazard, 0); + __wt_free(env, ienv->stats, 0); + __wt_free(env, ienv->method_stats, 0); + +#ifdef HAVE_DIAGNOSTIC + /* If we're tracking memory, check to see if everything was free'd. 
*/ + __wt_mtrack_dump(env); + __wt_mtrack_free(env); +#endif + + __wt_free(NULL, ienv, sizeof(IENV)); + env->ienv = NULL; + return (ret); +} diff --git a/src/env/env_init.c b/src/env/env_init.c new file mode 100644 index 00000000000..26c7062d63f --- /dev/null +++ b/src/env/env_init.c @@ -0,0 +1,41 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * wiredtiger_env_init -- + * Initialize the library, creating an ENV handle. + */ +int +wiredtiger_env_init(ENV **envp, uint32_t flags) +{ + static int library_init = 0; + ENV *env; + + *envp = NULL; + + /* + * We end up here before we do any real work. Check the build itself, + * and do some global stuff. + */ + if (library_init == 0) { + WT_RET(__wt_library_init()); + library_init = 1; + } + + WT_ENV_FCHK(NULL, + "wiredtiger_env_init", flags, WT_APIMASK_WIREDTIGER_ENV_INIT); + + /* Create the ENV handle. */ + WT_RET(__wt_env_create(flags, &env)); + + *envp = env; + return (0); +} diff --git a/src/env/env_msg.c b/src/env/env_msg.c new file mode 100644 index 00000000000..9dcfdec9514 --- /dev/null +++ b/src/env/env_msg.c @@ -0,0 +1,138 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +#define WT_MSG(env, fmt) { \ + extern FILE *__wt_err_stream; \ + va_list __ap; \ + /* \ + * Support messages even when we don't yet have an ENV handle, \ + * using the error stream. + */ \ + if ((env) == NULL) { \ + va_start(__ap, fmt); \ + __wt_msg_stream( \ + __wt_err_stream, NULL, NULL, 0, fmt, __ap); \ + va_end(__ap); \ + return; \ + } \ + \ + /* Application-specified callback function. 
*/ \ + if ((env)->msgcall != NULL) { \ + va_start(__ap, fmt); \ + __wt_msg_call((void *)((env)->msgcall), \ + (void *)env, NULL, NULL, 0, fmt, __ap); \ + va_end(__ap); \ + } \ + \ + /* \ + * If the application set an message callback function but not a\ + * message stream, we're done. Otherwise, write the stream. \ + */ \ + if ((env)->msgcall != NULL && (env)->msgfile == NULL) \ + return; \ + \ + va_start(__ap, fmt); \ + __wt_msg_stream((env)->msgfile, NULL, NULL, 0, fmt, __ap); \ + va_end(__ap); \ +} + +/* + * __wt_msg -- + * Write a message. + */ +void +__wt_msg(ENV *env, const char *fmt, ...) +{ + WT_MSG(env, fmt); +} + +/* + * __wt_mb_init -- + * Initialize a WT_MBUF structure for message aggregation. + */ +void +__wt_mb_init(ENV *env, WT_MBUF *mbp) +{ + mbp->env = env; + mbp->first = mbp->next = NULL; + mbp->len = 0; +} + +/* + * __wt_mb_discard -- + * Discard a WT_MBUF structure. + */ +void +__wt_mb_discard(WT_MBUF *mbp) +{ + if (mbp->first == NULL) + return; + + /* Write any remaining message. */ + if (mbp->next != mbp->first) + __wt_mb_write(mbp); + + __wt_free(mbp->env, mbp->first, mbp->len); +} + +/* + * __wt_mb_add -- + * Append log messages into a WT_MBUF structure. + */ +void +__wt_mb_add(WT_MBUF *mbp, const char *fmt, ...) +{ + va_list ap; + size_t current, len, remain; + + va_start(ap, fmt); + + current = (size_t)(mbp->next - mbp->first); + remain = mbp->len - current; + len = 64; + for (;;) { + /* + * If we don't have at least "len" bytes allocate 2x len bytes + * more memory. + */ + if (remain <= len) { + if (__wt_realloc(mbp->env, + &mbp->len, mbp->len + len * 2, &mbp->first)) + return; + mbp->next = mbp->first + current; + remain = mbp->len - current; + } + /* + * Format the user's information. If it doesn't fit into the + * buffer we have, re-allocate enough memory and try again. 
+ */ + len = (size_t)vsnprintf(mbp->next, remain, fmt, ap); + if (len < remain) { + mbp->next += len; + break; + } + } +} + +/* + * __wt_mb_write -- + * Write the messages from a WT_MBUF structure. + */ +void +__wt_mb_write(WT_MBUF *mbp) +{ + if (mbp->first == NULL || mbp->next == mbp->first) + return; + + __wt_msg(mbp->env, "%s", mbp->first); + + mbp->next = mbp->first; +} diff --git a/src/env/env_open.c b/src/env/env_open.c new file mode 100644 index 00000000000..a6f95838ede --- /dev/null +++ b/src/env/env_open.c @@ -0,0 +1,132 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_env_open -- + * Open a Env handle. + */ +int +__wt_env_open(ENV *env, const char *home, mode_t mode) +{ + IENV *ienv; + int ret; + + WT_CC_QUIET(home, NULL); + WT_CC_QUIET(mode, 0); + + ienv = env->ienv; + ret = 0; + + /* WT_TOC and hazard arrays. */ + WT_RET(__wt_calloc(env, env->toc_size, sizeof(WT_TOC *), &ienv->toc)); + WT_RET( + __wt_calloc(env, env->toc_size, sizeof(WT_TOC), &ienv->toc_array)); + WT_RET(__wt_calloc(env, + env->toc_size * env->hazard_size, sizeof(WT_PAGE *), &ienv->hazard)); + + /* Create the cache. */ + WT_RET(__wt_cache_create(env)); + + /* Transition to the open state. */ + __wt_methods_env_open_transition(env); + + /* Start worker threads. */ + F_SET(ienv, WT_WORKQ_RUN | WT_SERVER_RUN); + WT_MEMORY_FLUSH; + + WT_ERR(__wt_thread_create( + &ienv->cache_evict_tid, __wt_cache_evict_server, env)); + WT_ERR(__wt_thread_create( + &ienv->cache_read_tid, __wt_cache_read_server, env)); + WT_ERR(__wt_thread_create(&ienv->workq_tid, __wt_workq_srvr, env)); + + return (0); + +err: (void)__wt_env_close(env); + return (ret); +} + +/* + * __wt_env_close -- + * Close an Env handle. 
+ */ +int +__wt_env_close(ENV *env) +{ + IDB *idb; + IENV *ienv; + WT_FH *fh; + int ret, secondary_err; + + WT_ENV_FCHK_RET(env, "Env.close", env->flags, WT_APIMASK_ENV, ret); + + ienv = env->ienv; + ret = secondary_err = 0; + + /* Complain if DB handles weren't closed. */ + if (TAILQ_FIRST(&ienv->dbqh) != NULL) { + TAILQ_FOREACH(idb, &ienv->dbqh, q) { + __wt_api_env_errx(env, + "Env handle has open Db handles: %s", + idb->name); + WT_TRET(idb->db->close(idb->db, 0)); + } + secondary_err = WT_ERROR; + } + + /* Complain if files weren't closed. */ + if (TAILQ_FIRST(&ienv->fhqh) != NULL) { + TAILQ_FOREACH(fh, &ienv->fhqh, q) { + __wt_api_env_errx(env, + "Env handle has open file handles: %s", + fh->name); + WT_TRET(__wt_close(env, fh)); + } + secondary_err = WT_ERROR; + } + + /* Shut down the server threads. */ + F_CLR(ienv, WT_SERVER_RUN); + WT_MEMORY_FLUSH; + + /* + * Force the cache server threads to run and wait for them to exit. + * Wait for the cache eviction server first, it potentially schedules + * work for the read thread. + */ + __wt_workq_evict_server(env, 1); + __wt_thread_join(ienv->cache_evict_tid); + __wt_workq_read_server(env, 1); + __wt_thread_join(ienv->cache_read_tid); + + /* + * Close down and wait for the workQ thread; this only happens after + * all other server threads have exited, as they may be waiting on a + * request from the workQ, or vice-versa. + */ + F_CLR(ienv, WT_WORKQ_RUN); + WT_MEMORY_FLUSH; + __wt_thread_join(ienv->workq_tid); + + /* Discard the cache. */ + WT_TRET(__wt_cache_destroy(env)); + + /* Re-cycle the underlying ENV/IENV structures. */ + WT_TRET(__wt_ienv_destroy(env)); + + /* Free the Env structure. */ + __wt_free(NULL, env, sizeof(ENV)); + + if (ret == 0) + ret = secondary_err; + + return (ret == 0 ? 
secondary_err : ret); +} diff --git a/src/env/env_stat.c b/src/env/env_stat.c new file mode 100644 index 00000000000..997d9080f31 --- /dev/null +++ b/src/env/env_stat.c @@ -0,0 +1,86 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_env_stat_print -- + * Print ENV handle statistics to a stream. + */ +int +__wt_env_stat_print(ENV *env, FILE *stream) +{ + IDB *idb; + IENV *ienv; + + ienv = env->ienv; + + fprintf(stream, "Environment handle statistics:\n"); + __wt_stat_print(env, ienv->stats, stream); + + fprintf(stream, "Environment cache statistics:\n"); + __wt_cache_stats(env); + __wt_stat_print(env, ienv->cache->stats, stream); + fprintf(stream, "Environment method statistics:\n"); + __wt_stat_print(env, ienv->method_stats, stream); + + TAILQ_FOREACH(idb, &ienv->dbqh, q) + WT_RET(idb->db->stat_print(idb->db, stream, 0)); + return (0); +} + +/* + * __wt_env_stat_clear -- + * Clear ENV handle statistics. + */ +int +__wt_env_stat_clear(ENV *env) +{ + IDB *idb; + IENV *ienv; + int ret; + + ienv = env->ienv; + ret = 0; + + TAILQ_FOREACH(idb, &ienv->dbqh, q) + WT_TRET(__wt_db_stat_clear(idb->db)); + + __wt_stat_clear_env_stats(ienv->stats); + __wt_stat_clear_cache_stats(ienv->cache->stats); + __wt_stat_clear_method_stats(ienv->method_stats); + + return (ret); +} + +/* + * __wt_stat_print -- + * Print out a statistics table. 
+ */ +void +__wt_stat_print(ENV *env, WT_STATS *s, FILE *stream) +{ + IENV *ienv; + + ienv = env->ienv; + + for (; s->desc != NULL; ++s) + if (s->v >= WT_BILLION) + fprintf(stream, "%lluB\t%s (%llu bytes)\n", + (unsigned long long)s->v / WT_BILLION, + s->desc, (unsigned long long)s->v); + else if (s->v >= WT_MILLION) + fprintf(stream, "%lluM\t%s (%llu bytes)\n", + (unsigned long long)s->v / WT_MILLION, + s->desc, (unsigned long long)s->v); + else + fprintf(stream, + "%llu\t%s\n", (unsigned long long)s->v, s->desc); + fprintf(stream, "%s\n", ienv->sep); +} diff --git a/src/env/env_sync.c b/src/env/env_sync.c new file mode 100644 index 00000000000..4c40b52ad1c --- /dev/null +++ b/src/env/env_sync.c @@ -0,0 +1,30 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_env_sync -- + * Flush the environment's cache. + */ +int +__wt_env_sync(ENV *env, void (*f)(const char *, uint64_t)) +{ + IDB *idb; + IENV *ienv; + int ret; + + ienv = env->ienv; + ret = 0; + + TAILQ_FOREACH(idb, &ienv->dbqh, q) + WT_TRET(idb->db->sync(idb->db, f, 0)); + + return (ret); +} diff --git a/src/env/env_toc.c b/src/env/env_toc.c new file mode 100644 index 00000000000..46d132707b5 --- /dev/null +++ b/src/env/env_toc.c @@ -0,0 +1,238 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_env_toc -- + * ENV.toc method. + */ +int +__wt_env_toc(ENV *env, WT_TOC **tocp) +{ + IENV *ienv; + WT_TOC *toc; + uint32_t slot; + + ienv = env->ienv; + *tocp = NULL; + + /* Check to see if there's an available WT_TOC slot. 
*/ + if (ienv->toc_cnt == env->toc_size - 1) { + __wt_api_env_errx(env, + "WiredTiger only configured to support %d thread contexts", + env->toc_size); + return (WT_ERROR); + } + + /* + * The WT_TOC reference list is compact, the WT_TOC array is not. Find + * the first empty WT_TOC slot. + */ + for (slot = 0, toc = ienv->toc_array; toc->env != NULL; ++toc, ++slot) + ; + + /* Clear previous contents of the WT_TOC entry, they get re-used. */ + memset(toc, 0, sizeof(WT_TOC)); + + toc->env = env; + toc->hazard = ienv->hazard + slot * env->hazard_size; + + WT_RET(__wt_mtx_alloc(env, "toc", 1, &toc->mtx)); + + __wt_methods_wt_toc_lockout(toc); + __wt_methods_wt_toc_init_transition(toc); + + /* Make the entry visible to the workQ. */ + ienv->toc[ienv->toc_cnt++] = toc; + WT_MEMORY_FLUSH; + + *tocp = toc; + return (0); +} + +/* + * __wt_wt_toc_close -- + * WT_TOC.close method. + */ +int +__wt_wt_toc_close(WT_TOC *toc) +{ + ENV *env; + IENV *ienv; + WT_TOC **tp; + WT_TOC_UPDATE *update; + int ret; + + env = toc->env; + ienv = env->ienv; + ret = 0; + + WT_ENV_FCHK_RET( + env, "WT_TOC.close", toc->flags, WT_APIMASK_WT_TOC, ret); + + /* + * The "in" reference count is artificially incremented by 1 as + * long as an update buffer is referenced by the WT_TOC thread; + * we don't want them freed because a page was evicted and their + * count went to 0. Decrement the reference count on the buffer + * as part of releasing it. There's a similar reference count + * decrement when the WT_TOC structure is discarded. + * + * XXX + * There's a race here: if this code, or the WT_TOC structure + * close code, and the page discard code race, it's possible + * neither will realize the buffer is no longer needed and free + * it. The fix is to involve the eviction or workQ threads: + * they may need a linked list of buffers they review to ensure + * it never happens. I'm living with this now: it's unlikely + * and it's a memory leak if it ever happens. 
+ */ + update = toc->update; + if (update != NULL && --update->in == update->out) + __wt_free(env, update, update->len); + + /* Discard DBT memory. */ + __wt_free(env, toc->key.data, toc->key.mem_size); + __wt_free(env, toc->data.data, toc->data.mem_size); + __wt_scr_free(toc); + + /* Unlock and destroy the thread's mutex. */ + if (toc->mtx != NULL) { + __wt_unlock(env, toc->mtx); + (void)__wt_mtx_destroy(env, toc->mtx); + } + + /* + * Replace the WT_TOC reference we're closing with the last entry in + * the table, then clear the last entry. As far as the walk of the + * workQ is concerned, it's OK if the WT_TOC appears twice, or if it + * doesn't appear at all, so these lines can race all they want. + */ + for (tp = ienv->toc; *tp != toc; ++tp) + ; + --ienv->toc_cnt; + *tp = ienv->toc[ienv->toc_cnt]; + ienv->toc[ienv->toc_cnt] = NULL; + + /* Make the WT_TOC array entry available for re-use. */ + toc->env = NULL; + WT_MEMORY_FLUSH; + + return (ret); +} + +/* + * __wt_toc_api_set -- + * Pair WT_TOC and DB handle, allocating the WT_TOC as necessary. + */ +int +__wt_toc_api_set(ENV *env, const char *name, DB *db, WT_TOC **tocp) +{ + WT_TOC *toc; + + /* + * We pass around WT_TOCs internally in the Btree, (rather than a DB), + * because the DB's are free-threaded, and the WT_TOCs are per-thread. + * Lots of the API calls don't require the application to allocate and + * manage the WT_TOC, which means we have to do it for them. + * + * WT_TOCs always reference a DB handle, and we do that here, as well. + */ + if ((toc = *tocp) == NULL) { + WT_RET(env->toc(env, 0, tocp)); + toc = *tocp; + } + toc->db = db; + toc->name = name; + return (0); +} + +/* + * __wt_toc_api_clr -- + * Clear the WT_TOC, freeing it if it was allocated by the library. + */ +int +__wt_toc_api_clr(WT_TOC *toc, const char *name, int islocal) +{ + /* + * The WT_TOC should hold no more hazard references; this is a + * diagnostic check, but it's cheap so we do it all the time. 
+ */ + __wt_hazard_empty(toc, name); + + if (islocal) + return (toc->close(toc, 0)); + + toc->db = NULL; + toc->name = NULL; + return (0); +} + +#ifdef HAVE_DIAGNOSTIC +static const char *__wt_toc_print_state(WT_TOC *); + +int +__wt_toc_dump(ENV *env) +{ + IENV *ienv; + WT_MBUF mb; + WT_TOC *toc, **tp; + WT_PAGE **hp; + + ienv = env->ienv; + __wt_mb_init(env, &mb); + + __wt_mb_add(&mb, "%s\n", ienv->sep); + for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) { + __wt_mb_add(&mb, + "toc: %p {\n\tworkq func: ", toc); + if (toc->wq_func == NULL) + __wt_mb_add(&mb, "none"); + else + __wt_mb_add(&mb, "%p", toc->wq_func); + + __wt_mb_add(&mb, " state: %s", __wt_toc_print_state(toc)); + + __wt_mb_add(&mb, "\n\thazard: "); + for (hp = toc->hazard; + hp < toc->hazard + env->hazard_size; ++hp) + __wt_mb_add(&mb, "%p ", *hp); + + __wt_mb_add(&mb, "\n}"); + if (toc->name != NULL) + __wt_mb_add(&mb, " %s", toc->name); + __wt_mb_write(&mb); + } + + __wt_mb_discard(&mb); + return (0); +} + +/* + * __wt_toc_print_state -- + * Return the WT_TOC state as a string. + */ +static const char * +__wt_toc_print_state(WT_TOC *toc) +{ + switch (toc->wq_state) { + case WT_WORKQ_READ: + return ("read"); + case WT_WORKQ_READ_SCHED: + return ("read scheduled"); + case WT_WORKQ_FUNC: + return ("function"); + case WT_WORKQ_NONE: + return ("none"); + } + return ("unknown"); + /* NOTREACHED */ +} +#endif diff --git a/src/env/env_workq.c b/src/env/env_workq.c new file mode 100644 index 00000000000..76a00b0dce5 --- /dev/null +++ b/src/env/env_workq.c @@ -0,0 +1,94 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_workq_srvr -- + * Routine to process the WT_TOC work queue. 
+ */ +void * +__wt_workq_srvr(void *arg) +{ + ENV *env; + IENV *ienv; + WT_TOC **tp, *toc; + int chk_read, read_force, request; + + env = (ENV *)arg; + ienv = env->ienv; + + /* Walk the WT_TOC list and execute requests. */ + while (F_ISSET(ienv, WT_WORKQ_RUN)) { + ++ienv->api_gen; + WT_STAT_INCR(ienv->stats, WORKQ_PASSES); + + chk_read = read_force = request = 0; + for (tp = ienv->toc; (toc = *tp) != NULL; ++tp) { + switch (toc->wq_state) { + case WT_WORKQ_NONE: + break; + case WT_WORKQ_FUNC: + request = 1; + (void)toc->wq_func(toc); + break; + case WT_WORKQ_READ: + request = 1; + + /* + * Call a function which makes a request of the + * read server. There are two read states: READ + * (the initial request), and READ_SCHED (the + * function has been called and we're waiting on + * the read to complete). There are two states + * because we can race with the server: if the + * called function adds itself to the queue just + * as the server is going to sleep, the server + * might not see the request. So, READ_SCHED + * means we don't have to call the function, but + * we do have check if the server is running. + * + * The read state is eventually reset by the + * read server, so we set it before we call the + * function that will contact the server, so we + * can't race on that update. + */ + toc->wq_state = WT_WORKQ_READ_SCHED; + + /* + * Call the function (which contacts the read + * server). If that call fails, we're done. + */ + if (toc->wq_func(toc) != 0) + break; + + /* FALLTHROUGH */ + case WT_WORKQ_READ_SCHED: + chk_read = 1; + if (F_ISSET(toc, WT_READ_PRIORITY)) + read_force = 1; + break; + } + } + + /* If a read is scheduled, check on the read server. */ + if (chk_read) + __wt_workq_read_server(env, read_force); + + /* Check on the cache eviction server. */ + __wt_workq_evict_server(env, 0); + + /* If we didn't find work, yield the processor. 
*/ + if (!request) { + WT_STAT_INCR(ienv->stats, WORKQ_YIELD); + __wt_yield(); + } + } + return (NULL); +} diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c new file mode 100644 index 00000000000..68106636831 --- /dev/null +++ b/src/os_posix/os_abort.c @@ -0,0 +1,25 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_abort -- + * Abort the process, dropping core. + */ +void +__wt_abort(ENV *env) +{ + __wt_msg(env, "aborting WiredTiger library"); + + __wt_attach(env); + + abort(); + /* NOTREACHED */ +} diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c new file mode 100644 index 00000000000..dbbb915822a --- /dev/null +++ b/src/os_posix/os_alloc.c @@ -0,0 +1,359 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +#ifdef HAVE_DIAGNOSTIC +static void __wt_mtrack( + ENV *env, const void *, const void *, const char *, int); +#endif + +/* + * There's no malloc interface, WiredTiger never calls malloc. The problem is + * an application might: allocate memory, write secret stuff into it, free the + * memory, we allocate the memory, and then use it for a database page or log + * record and write it to disk. That would result in the secret stuff being + * protected by the WiredTiger permission mechanisms, potentially inappropriate + * for the secret stuff. + */ + +/* + * __wt_calloc_func -- + * ANSI calloc function. + */ +int +__wt_calloc_func(ENV *env, uint32_t number, uint32_t size, void *retp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + void *p; + + /* + * !!! + * This function MUST handle a NULL ENV structure reference. 
+ */ + WT_ASSERT(env, number != 0 && size != 0); + + if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL) + WT_STAT_INCR(env->ienv->stats, MEMALLOC); + + if ((p = calloc(number, (size_t)size)) == NULL) { + __wt_api_env_err(env, errno, "memory allocation"); + return (WT_ERROR); + } + *(void **)retp = p; + +#ifdef HAVE_DIAGNOSTIC + __wt_mtrack(env, NULL, p, file, line); +#endif + return (0); +} + +/* + * __wt_realloc_func -- + * ANSI realloc function. + */ +int +__wt_realloc_func(ENV *env, + uint32_t *bytes_allocated_ret, uint32_t bytes_to_allocate, void *retp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + void *p; + uint32_t bytes_allocated; + + /* + * !!! + * This function MUST handle a NULL ENV structure reference. + */ + WT_ASSERT(env, bytes_to_allocate != 0); + + if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL) + WT_STAT_INCR(env->ienv->stats, MEMALLOC); + + p = *(void **)retp; + + /* + * Sometimes we're allocating memory and we don't care about the + * final length -- bytes_allocated_ret may be NULL. + */ + bytes_allocated = + bytes_allocated_ret == NULL ? 0 : *bytes_allocated_ret; + WT_ASSERT(env, bytes_allocated < bytes_to_allocate); + + if ((p = realloc(p, (size_t)bytes_to_allocate)) == NULL) { + __wt_api_env_err(env, errno, "memory allocation"); + return (WT_ERROR); + } + + /* + * Clear the allocated memory -- an application might: allocate memory, + * write secret stuff into it, free the memory, we re-allocate the + * memory, then use it for a database page or log record and write it + * to disk. That would result in the secret stuff being protected by + * the WiredTiger permission mechanisms, potentially inappropriate for + * the secret stuff. + */ + memset((uint8_t *) + p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); + + /* Update caller's bytes allocated value. 
*/ + if (bytes_allocated_ret != NULL) + *bytes_allocated_ret = bytes_to_allocate; + +#ifdef HAVE_DIAGNOSTIC + __wt_mtrack(env, *(void **)retp, p, file, line); +#endif + + *(void **)retp = p; + return (0); +} + +/* + * __wt_strdup_func -- + * ANSI strdup function. + */ +int +__wt_strdup_func(ENV *env, const char *str, void *retp +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + size_t len; + void *p; + + /* + * !!! + * This function MUST handle a NULL ENV structure reference. + */ + if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL) + WT_STAT_INCR(env->ienv->stats, MEMALLOC); + + len = strlen(str) + 1; +#ifdef HAVE_DIAGNOSTIC + WT_RET(__wt_calloc_func(env, len, 1, &p, file, line)); +#else + WT_RET(__wt_calloc_func(env, len, 1, &p)); +#endif + + memcpy(p, str, len); + + *(void **)retp = p; + return (0); +} + +/* + * __wt_free_func -- + * ANSI free function. + */ +void +__wt_free_func(ENV *env, void *p_arg +#ifdef HAVE_DIAGNOSTIC + , uint32_t len +#endif + ) +{ + void *p; + + /* + * !!! + * This function MUST handle a NULL ENV structure reference. + */ + if (env != NULL && env->ienv != NULL && env->ienv->stats != NULL) + WT_STAT_INCR(env->ienv->stats, MEMFREE); + + /* + * If there's a serialization bug we might race with another thread. + * We can't avoid the race (and we aren't willing to flush memory), + * but we minimize the window by clearing the free address atomically, + * hoping a racing thread will see, and won't free, a NULL pointer. + */ + p = *(void **)p_arg; + *(void **)p_arg = NULL; + + if (p == NULL) /* ANSI C free semantics */ + return; + +#ifdef HAVE_DIAGNOSTIC + /* + * If we know how long the object is, overwrite it with an easily + * recognizable value for debugging. + */ + if (len != 0) + memset(p, WT_DEBUG_BYTE, len); + + __wt_mtrack(env, p, NULL, NULL, 0); +#endif + + free(p); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_mtrack_alloc -- + * Allocate memory tracking structures. 
+ */ +int +__wt_mtrack_alloc(ENV *env) +{ + IENV *ienv; + WT_MTRACK *p; + + ienv = env->ienv; + + /* + * Use a temporary variable -- assigning memory to ienv->mtrack turns + * on memory object tracking, and we need to set up the rest of the + * structure first. + */ + WT_RET(__wt_calloc(env, 1, sizeof(WT_MTRACK), &p)); + WT_RET(__wt_calloc(env, 1000, sizeof(WT_MEM), &p->list)); + p->next = p->list; + p->slots = 1000; + ienv->mtrack = p; + return (0); +} + +/* + * __wt_mtrack_free -- + * Free memory tracking structures. + */ +void +__wt_mtrack_free(ENV *env) +{ + IENV *ienv; + WT_MTRACK *p; + + ienv = env->ienv; + + /* + * Clear ienv->mtrack (to turn off memory object tracking) before the + * free. + */ + if ((p = ienv->mtrack) == NULL) + return; + ienv->mtrack = NULL; + + __wt_free(env, p->list, 0); + __wt_free(env, p, 0); +} + +/* + * __wt_mtrack_free -- + * Track memory allocations and frees. + */ +static void +__wt_mtrack(ENV *env, const void *f, const void *a, const char *file, int line) +{ + WT_MEM *mp, *t, *mp_end; + WT_MTRACK *mtrack; + int slot_check; + + if (env == NULL || + env->ienv == NULL || (mtrack = env->ienv->mtrack) == NULL) + return; + + /* + * Remove freed memory from the list. If it's a free/alloc pair (that + * is, if __wt_realloc was called), re-use the slot. + */ + if (f != NULL) { + if ((mp = mtrack->next) > mtrack->list) + do { + if ((--mp)->addr == f) + goto enter; + } while (mp > mtrack->list); + + __wt_api_env_errx(env, "mtrack: %p: not found", f); + __wt_attach(env); + } + + if (a == NULL) + return; + + /* + * Add allocated memory to the list. + * + * First, see if there's a slot close by we can re-use (the assumption + * is that when memory is allocated and quickly freed we re-use the + * slots instead of leaving lots of free spots in the array. 
+ */ + if ((mp = mtrack->next) > mtrack->list) + for (slot_check = 0; slot_check < 10; ++slot_check) { + if ((--mp)->addr == NULL) + goto enter; + if (mp == mtrack->list) + break; + } + + mp_end = mtrack->list + mtrack->slots; + + /* If there's an empty slot, use it. */ + if (mtrack->next < mp_end) + goto next; + + /* Try to compress the array. */ + for (mp = mtrack->list, t = NULL;; ++mp, ++t) { + while (mp < mp_end && mp->addr != NULL) + ++mp; + if (mp == mp_end) + break; + if (t == NULL) + t = mp + 1; + while (t < mp_end && t->addr == NULL) + ++t; + if (t == mp_end) + break; + *mp++ = *t; + t->addr = NULL; + } + mtrack->next = mp; + + /* If there's an empty slot, use it. */ + if (mtrack->next < mp_end) + goto next; + + /* Re-allocate the array and use the next empty slot. */ + if ((mtrack->list = realloc(mtrack->list, + mtrack->slots * 2 * sizeof(WT_MEM))) == NULL) + return; + mtrack->next = mtrack->list + mtrack->slots; + mtrack->slots *= 2; + +next: mp = mtrack->next++; +enter: mp->addr = a; + mp->file = file; + mp->line = line; +} + +/* + * __wt_mtrack_dump -- + * Complain about any memory allocated but never freed. + */ +void +__wt_mtrack_dump(ENV *env) +{ + WT_MTRACK *mtrack; + WT_MEM *mp; + + if ((mtrack = env->ienv->mtrack) == NULL) + return; + + for (mp = mtrack->list; mp < mtrack->next; ++mp) + if (mp->addr != NULL) + __wt_api_env_errx(env, + "mtrack: %p {%s/%d}: never freed", + mp->addr, mp->file, mp->line); +} +#endif diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c new file mode 100644 index 00000000000..604d963f8e6 --- /dev/null +++ b/src/os_posix/os_filesize.c @@ -0,0 +1,27 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. 
+ * + * $Id$ + */ + +#include "wt_internal.h" + +int +__wt_filesize(ENV *env, WT_FH *fh, off_t *sizep) +{ + struct stat sb; + + WT_VERBOSE(env, + WT_VERB_FILEOPS, (env, "fileops: %s: fstat", fh->name)); + + if (fstat(fh->fd, &sb) == -1) { + __wt_api_env_err(env, errno, "%s: fstat", fh->name); + return (WT_ERROR); + } + + *sizep = sb.st_size; /* Return size in bytes. */ + return (0); +} diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c new file mode 100644 index 00000000000..e6ecfd95a21 --- /dev/null +++ b/src/os_posix/os_fsync.c @@ -0,0 +1,29 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_fsync -- + * Flush a file handle. + */ +int +__wt_fsync(ENV *env, WT_FH *fh) +{ + + WT_STAT_INCR(fh->stats, FSYNC); + + WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: fsync", fh->name)); + + if (fsync(fh->fd) == 0) + return (0); + + __wt_api_env_err(env, errno, "%s fsync error", fh->name); + return (WT_ERROR); +} diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c new file mode 100644 index 00000000000..fb58784ec2d --- /dev/null +++ b/src/os_posix/os_mtx.c @@ -0,0 +1,148 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_mtx_alloc -- + * Allocate and initialize a pthread mutex. + */ +int +__wt_mtx_alloc(ENV *env, const char *name, int is_locked, WT_MTX **mtxp) +{ + WT_MTX *mtx; + pthread_condattr_t condattr; + pthread_mutexattr_t mutexattr; + + WT_RET(__wt_calloc(env, 1, sizeof(WT_MTX), &mtx)); + + /* + * !!! + * This function MUST handle a NULL ENV structure reference. + * + * Initialize the mutex. + * Mutexes are shared between processes. 
+ */ + if (pthread_mutexattr_init(&mutexattr) != 0) + goto err; +#if 0 + if (pthread_mutexattr_setpshared( + &mutexattr, PTHREAD_PROCESS_SHARED) != 0) + goto err; +#endif + if (pthread_mutex_init(&mtx->mtx, &mutexattr) != 0) + goto err; + (void)pthread_mutexattr_destroy(&mutexattr); + + /* Initialize the condition variable (mutexes are self-blocking). */ + if (pthread_condattr_init(&condattr) != 0) + goto err; +#if 0 + if (pthread_condattr_setpshared( + &condattr, PTHREAD_PROCESS_SHARED) != 0) + goto err; +#endif + if (pthread_cond_init(&mtx->cond, &condattr) != 0) + goto err; + (void)pthread_condattr_destroy(&condattr); + + mtx->name = name; + + /* If the normal state of the mutex is locked, lock it immediately. */ + if (is_locked) + __wt_lock(env, mtx); + + *mtxp = mtx; + return (0); + +err: __wt_free(env, mtx, sizeof(WT_MTX)); + return (WT_ERROR); +} + +/* + * __wt_lock + * Lock a mutex. + */ +void +__wt_lock(ENV *env, WT_MTX *mtx) +{ + int ret; + + WT_VERBOSE(env, + WT_VERB_MUTEX, (env, "lock %s mutex (%p)", mtx->name, mtx)); + + WT_ERR(pthread_mutex_lock(&mtx->mtx)); + + /* + * Check pthread_cond_wait() return for EINTR, ETIME and ETIMEDOUT, + * it's known to return these errors on some systems. + */ + while (mtx->locked) { + ret = pthread_cond_wait(&mtx->cond, &mtx->mtx); + if (ret != 0 && + ret != EINTR && +#ifdef ETIME + ret != ETIME && +#endif + ret != ETIMEDOUT) { + (void)pthread_mutex_unlock(&mtx->mtx); + goto err; + } + } + + mtx->locked = 1; + WT_STAT_INCR(env->ienv->stats, MTX_LOCK); + + WT_ERR(pthread_mutex_unlock(&mtx->mtx)); + return; + +err: __wt_api_env_err(env, ret, "mutex lock failed"); + __wt_abort(env); +} + +/* + * __wt_unlock -- + * Release a mutex. 
+ */ +void +__wt_unlock(ENV *env, WT_MTX *mtx) +{ + int ret; + + WT_VERBOSE(env, + WT_VERB_MUTEX, (env, "unlock %s mutex (%p)", mtx->name, mtx)); + + ret = 0; + WT_ERR(pthread_mutex_lock(&mtx->mtx)); + mtx->locked = 0; + WT_ERR(pthread_cond_signal(&mtx->cond)); + + WT_ERR(pthread_mutex_unlock(&mtx->mtx)); + return; + +err: __wt_api_env_err(env, ret, "mutex unlock failed"); + __wt_abort(NULL); +} + +/* + * __wt_mtx_destroy -- + * Destroy a mutex. + */ +int +__wt_mtx_destroy(ENV *env, WT_MTX *mtx) +{ + int ret; + + ret = pthread_cond_destroy(&mtx->cond); + WT_TRET(pthread_mutex_destroy(&mtx->mtx)); + + __wt_free(env, mtx, sizeof(WT_MTX)); + + return (ret == 0 ? 0 : WT_ERROR); +} diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c new file mode 100644 index 00000000000..971fe47f11b --- /dev/null +++ b/src/os_posix/os_open.c @@ -0,0 +1,128 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_open -- + * Open a file handle. + */ +int +__wt_open(ENV *env, const char *name, mode_t mode, int ok_create, WT_FH **fhp) +{ + IDB *idb; + IENV *ienv; + WT_FH *fh; + int f, fd, ret; + + fh = NULL; + ienv = env->ienv; + + WT_VERBOSE(env, WT_VERB_FILEOPS, (env, "fileops: %s: open", name)); + + /* Increment the reference count if we already have the file open. */ + __wt_lock(env, ienv->mtx); + TAILQ_FOREACH(idb, &ienv->dbqh, q) { + if ((fh = idb->fh) == NULL) + continue; + if (strcmp(name, idb->name) == 0) { + ++fh->refcnt; + *fhp = fh; + break; + } + } + __wt_unlock(env, ienv->mtx); + if (fh != NULL) + return (0); + + f = O_RDWR; +#ifdef O_BINARY + /* Windows clones: we always want to treat the file as a binary. 
*/ + f |= O_BINARY; +#endif + if (ok_create) + f |= O_CREAT; + + if ((fd = open(name, f, mode)) == -1) { + __wt_api_env_err(env, errno, "%s", name); + return (WT_ERROR); + } + + WT_RET(__wt_calloc(env, 1, sizeof(WT_FH), &fh)); + WT_ERR(__wt_stat_alloc_fh_stats(env, &fh->stats)); + WT_ERR(__wt_strdup(env, name, &fh->name)); + +#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) + /* + * Security: + * The application may spawn a new process, and we don't want another + * process to have access to our file handles. There's an obvious + * race here... + */ + if ((f = fcntl(fd, F_GETFD)) == -1 || + fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) { + __wt_api_env_err(env, errno, "%s: fcntl", name); + goto err; + } +#endif + + fh->fd = fd; + fh->refcnt = 1; + *fhp = fh; + + /* Set the file's size. */ + WT_ERR(__wt_filesize(env, fh, &fh->file_size)); + + /* Link onto the environment's list of files. */ + __wt_lock(env, ienv->mtx); + TAILQ_INSERT_TAIL(&ienv->fhqh, fh, q); + __wt_unlock(env, ienv->mtx); + + return (0); + +err: if (fh != NULL) { + if (fh->name != NULL) + __wt_free(env, fh->name, 0); + __wt_free(env, fh, sizeof(WT_FH)); + } + (void)close(fd); + return (ret); +} + +/* + * __wt_close -- + * Close a file handle. + */ +int +__wt_close(ENV *env, WT_FH *fh) +{ + IENV *ienv; + int ret; + + ienv = env->ienv; + ret = 0; + + if (fh == NULL || fh->refcnt == 0 || --fh->refcnt > 0) + return (0); + + /* Remove from the list and discard the memory. 
+ */
+	__wt_lock(env, ienv->mtx);
+	TAILQ_REMOVE(&ienv->fhqh, fh, q);
+	__wt_unlock(env, ienv->mtx);
+
+	/* Report a close failure, but still free the handle below. */
+	if (close(fh->fd) != 0) {
+		__wt_api_env_err(env, errno, "%s", fh->name);
+		ret = WT_ERROR;
+	}
+
+	__wt_free(env, fh->name, 0);
+	__wt_free(env, fh->stats, 0);
+	__wt_free(env, fh, sizeof(WT_FH));
+	return (ret);
+}
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..1ce48f3ec56
--- /dev/null
+++ b/src/os_posix/os_rw.c
@@ -0,0 +1,56 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ *	Read a chunk: an exact-length pread() at the given offset.
+ */
+int
+__wt_read(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+	WT_STAT_INCR(fh->stats, READ_IO);
+	WT_STAT_INCR(env->ienv->stats, TOTAL_READ_IO);
+
+	WT_VERBOSE(env, WT_VERB_FILEOPS,
+	    (env, "fileops: %s: read %lu bytes at offset %lu",
+	    fh->name, (u_long)bytes, (u_long)offset));
+
+	/*
+	 * NOTE(review): a short (partial) read also falls through to the
+	 * error path, where errno may be stale -- consider distinguishing
+	 * short transfers from hard I/O errors.
+	 */
+	if (pread(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+		return (0);
+
+	__wt_api_env_err(env, errno,
+	    "%s read error: attempt to read %lu bytes at offset %lu",
+	    fh->name, (u_long)bytes, (u_long)offset);
+	return (WT_ERROR);
+}
+
+/*
+ * __wt_write --
+ *	Write a chunk: an exact-length pwrite() at the given offset.
+ */
+int
+__wt_write(ENV *env, WT_FH *fh, off_t offset, uint32_t bytes, void *buf)
+{
+	WT_STAT_INCR(fh->stats, WRITE_IO);
+	WT_STAT_INCR(env->ienv->stats, TOTAL_WRITE_IO);
+
+	WT_VERBOSE(env, WT_VERB_FILEOPS,
+	    (env, "fileops: %s: write %lu bytes at offset %lu",
+	    fh->name, (u_long)bytes, (u_long)offset));
+
+	/*
+	 * NOTE(review): as with __wt_read, a short pwrite reports an errno
+	 * value that may be stale.
+	 */
+	if (pwrite(fh->fd, buf, (size_t)bytes, offset) == (ssize_t)bytes)
+		return (0);
+
+	__wt_api_env_err(env, errno,
+	    "%s write error: attempt to write %lu bytes at offset %lu",
+	    fh->name, (u_long)bytes, (u_long)offset);
+	return (WT_ERROR);
+}
diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..74b86a30d42
--- /dev/null
+++ b/src/os_posix/os_sleep.c
@@ -0,0 +1,25 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ *	Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+	struct timeval t;
+
+	/* Normalize any overflow of microseconds into the seconds field. */
+	t.tv_sec = (long)seconds + micro_seconds / 1000000;
+	t.tv_usec = (long)micro_seconds % 1000000;
+
+	/* select() with no descriptors is a portable sub-second sleep. */
+	(void)select(0, NULL, NULL, NULL, &t);
+}
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..3fb62a482d8
--- /dev/null
+++ b/src/os_posix/os_thread.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ *	Create a new thread of control.
+ */
+int
+__wt_thread_create(pthread_t *tidret, void *(*func)(void *), void *arg)
+{
+	/* Spawn a new thread of control; map pthread failure to WT_ERROR. */
+	return (pthread_create(tidret, NULL, func, arg) == 0 ? 0 : WT_ERROR);
+}
+
+/*
+ * __wt_thread_join --
+ *	Wait for a thread of control to exit.
+ */ +void +__wt_thread_join(pthread_t tid) +{ + (void)pthread_join(tid, NULL); +} diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c new file mode 100644 index 00000000000..a13b407150d --- /dev/null +++ b/src/os_posix/os_yield.c @@ -0,0 +1,24 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_yield -- + * Yield the thread of control. + */ +void +__wt_yield(void) +{ +#ifdef HAVE_PTHREAD_YIELD + pthread_yield(); +#else + sched_yield(); +#endif +} diff --git a/src/support/api.c b/src/support/api.c new file mode 100644 index 00000000000..cb7b48a7d69 --- /dev/null +++ b/src/support/api.c @@ -0,0 +1,1597 @@ +/* DO NOT EDIT: automatically built by dist/api.py. */ + +#include "wt_internal.h" + +static int __wt_api_db_btree_compare_dup_get( + DB *db, + int (**btree_compare_dup)(DB *, const DBT *, const DBT *)); +static int __wt_api_db_btree_compare_dup_get( + DB *db, + int (**btree_compare_dup)(DB *, const DBT *, const DBT *)) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_GET); + *btree_compare_dup = db->btree_compare_dup; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_compare_dup_set( + DB *db, + int (*btree_compare_dup)(DB *, const DBT *, const DBT *)); +static int __wt_api_db_btree_compare_dup_set( + DB *db, + int (*btree_compare_dup)(DB *, const DBT *, const DBT *)) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_DUP_SET); + db->btree_compare_dup = btree_compare_dup; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_compare_get( + DB *db, + int (**btree_compare)(DB *, const DBT *, const DBT *)); +static int __wt_api_db_btree_compare_get( + DB *db, + int (**btree_compare)(DB 
*, const DBT *, const DBT *)) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_GET); + *btree_compare = db->btree_compare; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_compare_int_get( + DB *db, + int *btree_compare_int); +static int __wt_api_db_btree_compare_int_get( + DB *db, + int *btree_compare_int) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_GET); + *btree_compare_int = db->btree_compare_int; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_compare_int_set( + DB *db, + int btree_compare_int); +static int __wt_api_db_btree_compare_int_set( + DB *db, + int btree_compare_int) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + WT_RET((__wt_db_btree_compare_int_set_verify( + db, btree_compare_int))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_INT_SET); + db->btree_compare_int = btree_compare_int; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_compare_set( + DB *db, + int (*btree_compare)(DB *, const DBT *, const DBT *)); +static int __wt_api_db_btree_compare_set( + DB *db, + int (*btree_compare)(DB *, const DBT *, const DBT *)) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_COMPARE_SET); + db->btree_compare = btree_compare; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_dup_offpage_get( + DB *db, + uint32_t *btree_dup_offpage); +static int __wt_api_db_btree_dup_offpage_get( + DB *db, + uint32_t *btree_dup_offpage) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_GET); + *btree_dup_offpage = db->btree_dup_offpage; + __wt_unlock(env, ienv->mtx); + return 
(0); +} + +static int __wt_api_db_btree_dup_offpage_set( + DB *db, + uint32_t btree_dup_offpage); +static int __wt_api_db_btree_dup_offpage_set( + DB *db, + uint32_t btree_dup_offpage) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + WT_RET((__wt_db_btree_dup_offpage_set_verify( + db, btree_dup_offpage))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_DUP_OFFPAGE_SET); + db->btree_dup_offpage = btree_dup_offpage; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_itemsize_get( + DB *db, + uint32_t *intlitemsize, + uint32_t *leafitemsize); +static int __wt_api_db_btree_itemsize_get( + DB *db, + uint32_t *intlitemsize, + uint32_t *leafitemsize) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_GET); + *intlitemsize = db->intlitemsize; + *leafitemsize = db->leafitemsize; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_itemsize_set( + DB *db, + uint32_t intlitemsize, + uint32_t leafitemsize); +static int __wt_api_db_btree_itemsize_set( + DB *db, + uint32_t intlitemsize, + uint32_t leafitemsize) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_ITEMSIZE_SET); + db->intlitemsize = intlitemsize; + db->leafitemsize = leafitemsize; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_pagesize_get( + DB *db, + uint32_t *allocsize, + uint32_t *intlmin, + uint32_t *intlmax, + uint32_t *leafmin, + uint32_t *leafmax); +static int __wt_api_db_btree_pagesize_get( + DB *db, + uint32_t *allocsize, + uint32_t *intlmin, + uint32_t *intlmax, + uint32_t *leafmin, + uint32_t *leafmax) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_GET); + *allocsize = db->allocsize; + *intlmin = db->intlmin; + *intlmax = db->intlmax; + 
*leafmin = db->leafmin; + *leafmax = db->leafmax; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_btree_pagesize_set( + DB *db, + uint32_t allocsize, + uint32_t intlmin, + uint32_t intlmax, + uint32_t leafmin, + uint32_t leafmax); +static int __wt_api_db_btree_pagesize_set( + DB *db, + uint32_t allocsize, + uint32_t intlmin, + uint32_t intlmax, + uint32_t leafmin, + uint32_t leafmax) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_BTREE_PAGESIZE_SET); + db->allocsize = allocsize; + db->intlmin = intlmin; + db->intlmax = intlmax; + db->leafmin = leafmin; + db->leafmax = leafmax; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_bulk_load( + DB *db, + uint32_t flags, + void (*progress)(const char *, uint64_t), + int (*cb)(DB *, DBT **, DBT **)); +static int __wt_api_db_bulk_load( + DB *db, + uint32_t flags, + void (*progress)(const char *, uint64_t), + int (*cb)(DB *, DBT **, DBT **)) +{ + const char *method_name = "DB.bulk_load"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_BULK_LOAD); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_BULK_LOAD); + ret = __wt_db_bulk_load(toc, flags, progress, cb); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_db_close( + DB *db, + uint32_t flags); +static int __wt_api_db_close( + DB *db, + uint32_t flags) +{ + const char *method_name = "DB.close"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_CLOSE); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_CLOSE); + ret = __wt_db_close(toc, flags); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static 
int __wt_api_db_col_del( + DB *db, + WT_TOC *toc, + uint64_t recno, + uint32_t flags); +static int __wt_api_db_col_del( + DB *db, + WT_TOC *toc, + uint64_t recno, + uint32_t flags) +{ + const char *method_name = "DB.col_del"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_COL_ONLY(db, method_name); + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_DEL); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_COL_DEL); + while ((ret = __wt_db_col_del(toc, recno)) == WT_RESTART) + WT_STAT_INCR(ienv->method_stats, DB_COL_DEL_RESTART); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_col_get( + DB *db, + WT_TOC *toc, + uint64_t recno, + DBT *data, + uint32_t flags); +static int __wt_api_db_col_get( + DB *db, + WT_TOC *toc, + uint64_t recno, + DBT *data, + uint32_t flags) +{ + const char *method_name = "DB.col_get"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_COL_ONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_GET); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_COL_GET); + ret = __wt_db_col_get(toc, recno, data); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_col_put( + DB *db, + WT_TOC *toc, + uint64_t recno, + DBT *data, + uint32_t flags); +static int __wt_api_db_col_put( + DB *db, + WT_TOC *toc, + uint64_t recno, + DBT *data, + uint32_t flags) +{ + const char *method_name = "DB.col_put"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_COL_ONLY(db, method_name); + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_COL_PUT); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_COL_PUT); + while ((ret = __wt_db_col_put(toc, recno, data)) == WT_RESTART) + 
WT_STAT_INCR(ienv->method_stats, DB_COL_PUT_RESTART); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_column_set( + DB *db, + uint32_t fixed_len, + const char *dictionary, + uint32_t flags); +static int __wt_api_db_column_set( + DB *db, + uint32_t fixed_len, + const char *dictionary, + uint32_t flags) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + WT_ENV_FCHK(env, "DB.column_set", + flags, WT_APIMASK_DB_COLUMN_SET); + + WT_RET((__wt_db_column_set_verify( + db, fixed_len, dictionary, flags))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_COLUMN_SET); + db->fixed_len = fixed_len; + db->dictionary = dictionary; + db->flags = flags; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_dump( + DB *db, + FILE *stream, + void (*progress)(const char *, uint64_t), + uint32_t flags); +static int __wt_api_db_dump( + DB *db, + FILE *stream, + void (*progress)(const char *, uint64_t), + uint32_t flags) +{ + const char *method_name = "DB.dump"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_DUMP); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_DUMP); + ret = __wt_db_dump(toc, stream, progress, flags); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_db_errcall_get( + DB *db, + void (**errcall)(const DB *, const char *)); +static int __wt_api_db_errcall_get( + DB *db, + void (**errcall)(const DB *, const char *)) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_GET); + *errcall = db->errcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_errcall_set( + DB *db, + void (*errcall)(const DB *, const char *)); +static int __wt_api_db_errcall_set( + DB *db, + void (*errcall)(const DB *, const char *)) +{ + 
ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRCALL_SET); + db->errcall = errcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_errfile_get( + DB *db, + FILE **errfile); +static int __wt_api_db_errfile_get( + DB *db, + FILE **errfile) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_GET); + *errfile = db->errfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_errfile_set( + DB *db, + FILE *errfile); +static int __wt_api_db_errfile_set( + DB *db, + FILE *errfile) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRFILE_SET); + db->errfile = errfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_errpfx_get( + DB *db, + const char **errpfx); +static int __wt_api_db_errpfx_get( + DB *db, + const char **errpfx) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_GET); + *errpfx = db->errpfx; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_errpfx_set( + DB *db, + const char *errpfx); +static int __wt_api_db_errpfx_set( + DB *db, + const char *errpfx) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_ERRPFX_SET); + db->errpfx = errpfx; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_db_huffman_set( + DB *db, + uint8_t const *huffman_table, + u_int huffman_table_size, + uint32_t huffman_flags); +static int __wt_api_db_huffman_set( + DB *db, + uint8_t const *huffman_table, + u_int huffman_table_size, + uint32_t huffman_flags) +{ + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, "DB.huffman_set", + huffman_flags, WT_APIMASK_DB_HUFFMAN_SET); + + 
__wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, DB_HUFFMAN_SET); + ret = __wt_db_huffman_set( + db, huffman_table, huffman_table_size, huffman_flags); + __wt_unlock(env, ienv->mtx); + return (ret); +} + +static int __wt_api_db_open( + DB *db, + const char *name, + mode_t mode, + uint32_t flags); +static int __wt_api_db_open( + DB *db, + const char *name, + mode_t mode, + uint32_t flags) +{ + const char *method_name = "DB.open"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_OPEN); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_OPEN); + ret = __wt_db_open(toc, name, mode, flags); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_db_row_del( + DB *db, + WT_TOC *toc, + DBT *key, + uint32_t flags); +static int __wt_api_db_row_del( + DB *db, + WT_TOC *toc, + DBT *key, + uint32_t flags) +{ + const char *method_name = "DB.row_del"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_ROW_ONLY(db, method_name); + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_DEL); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL); + while ((ret = __wt_db_row_del(toc, key)) == WT_RESTART) + WT_STAT_INCR(ienv->method_stats, DB_ROW_DEL_RESTART); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_row_get( + DB *db, + WT_TOC *toc, + DBT *key, + DBT *data, + uint32_t flags); +static int __wt_api_db_row_get( + DB *db, + WT_TOC *toc, + DBT *key, + DBT *data, + uint32_t flags) +{ + const char *method_name = "DB.row_get"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_ROW_ONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_GET); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + 
WT_STAT_INCR(ienv->method_stats, DB_ROW_GET); + ret = __wt_db_row_get(toc, key, data); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_row_put( + DB *db, + WT_TOC *toc, + DBT *key, + DBT *data, + uint32_t flags); +static int __wt_api_db_row_put( + DB *db, + WT_TOC *toc, + DBT *key, + DBT *data, + uint32_t flags) +{ + const char *method_name = "DB.row_put"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_DB_ROW_ONLY(db, method_name); + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_ROW_PUT); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT); + while ((ret = __wt_db_row_put(toc, key, data)) == WT_RESTART) + WT_STAT_INCR(ienv->method_stats, DB_ROW_PUT_RESTART); + WT_TRET(__wt_toc_api_clr(toc, method_name, 0)); + return (ret); +} + +static int __wt_api_db_stat_clear( + DB *db, + uint32_t flags); +static int __wt_api_db_stat_clear( + DB *db, + uint32_t flags) +{ + const char *method_name = "DB.stat_clear"; + ENV *env = db->env; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_CLEAR); + WT_STAT_INCR(ienv->method_stats, DB_STAT_CLEAR); + ret = __wt_db_stat_clear(db); + return (ret); +} + +static int __wt_api_db_stat_print( + DB *db, + FILE *stream, + uint32_t flags); +static int __wt_api_db_stat_print( + DB *db, + FILE *stream, + uint32_t flags) +{ + const char *method_name = "DB.stat_print"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_STAT_PRINT); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_STAT_PRINT); + ret = __wt_db_stat_print(toc, stream); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_db_sync( + DB *db, + void (*progress)(const char *, uint64_t), + uint32_t flags); +static int 
__wt_api_db_sync( + DB *db, + void (*progress)(const char *, uint64_t), + uint32_t flags) +{ + const char *method_name = "DB.sync"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_DB_RDONLY(db, method_name); + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_SYNC); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_SYNC); + ret = __wt_db_sync(toc, progress, flags); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_db_verify( + DB *db, + void (*progress)(const char *, uint64_t), + uint32_t flags); +static int __wt_api_db_verify( + DB *db, + void (*progress)(const char *, uint64_t), + uint32_t flags) +{ + const char *method_name = "DB.verify"; + ENV *env = db->env; + IENV *ienv = env->ienv; + WT_TOC *toc = NULL; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_DB_VERIFY); + WT_RET(__wt_toc_api_set(env, method_name, db, &toc)); + WT_STAT_INCR(ienv->method_stats, DB_VERIFY); + ret = __wt_db_verify(toc, progress); + WT_TRET(__wt_toc_api_clr(toc, method_name, 1)); + return (ret); +} + +static int __wt_api_env_cache_size_get( + ENV *env, + uint32_t *cache_size); +static int __wt_api_env_cache_size_get( + ENV *env, + uint32_t *cache_size) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_GET); + *cache_size = env->cache_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_cache_size_set( + ENV *env, + uint32_t cache_size); +static int __wt_api_env_cache_size_set( + ENV *env, + uint32_t cache_size) +{ + IENV *ienv = env->ienv; + WT_RET((__wt_env_cache_size_set_verify( + env, cache_size))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_CACHE_SIZE_SET); + env->cache_size = cache_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_close( + ENV *env, + uint32_t flags); +static int __wt_api_env_close( + 
ENV *env, + uint32_t flags) +{ + const char *method_name = "ENV.close"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_CLOSE); + WT_STAT_INCR(ienv->method_stats, ENV_CLOSE); + ret = __wt_env_close(env); + return (ret); +} + +static int __wt_api_env_data_update_initial_get( + ENV *env, + uint32_t *data_update_initial); +static int __wt_api_env_data_update_initial_get( + ENV *env, + uint32_t *data_update_initial) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_GET); + *data_update_initial = env->data_update_initial; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_data_update_initial_set( + ENV *env, + uint32_t data_update_initial); +static int __wt_api_env_data_update_initial_set( + ENV *env, + uint32_t data_update_initial) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_INITIAL_SET); + env->data_update_initial = data_update_initial; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_data_update_max_get( + ENV *env, + uint32_t *data_update_max); +static int __wt_api_env_data_update_max_get( + ENV *env, + uint32_t *data_update_max) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_GET); + *data_update_max = env->data_update_max; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_data_update_max_set( + ENV *env, + uint32_t data_update_max); +static int __wt_api_env_data_update_max_set( + ENV *env, + uint32_t data_update_max) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_DATA_UPDATE_MAX_SET); + env->data_update_max = data_update_max; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_db( + ENV *env, + uint32_t flags, + DB **dbp); +static int __wt_api_env_db( + ENV *env, + uint32_t flags, 
+ DB **dbp) +{ + const char *method_name = "ENV.db"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_DB); + WT_STAT_INCR(ienv->method_stats, ENV_DB); + ret = __wt_env_db(env, dbp); + return (ret); +} + +static int __wt_api_env_errcall_get( + ENV *env, + void (**errcall)(const ENV *, const char *)); +static int __wt_api_env_errcall_get( + ENV *env, + void (**errcall)(const ENV *, const char *)) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_GET); + *errcall = env->errcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_errcall_set( + ENV *env, + void (*errcall)(const ENV *, const char *)); +static int __wt_api_env_errcall_set( + ENV *env, + void (*errcall)(const ENV *, const char *)) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRCALL_SET); + env->errcall = errcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_errfile_get( + ENV *env, + FILE **errfile); +static int __wt_api_env_errfile_get( + ENV *env, + FILE **errfile) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_GET); + *errfile = env->errfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_errfile_set( + ENV *env, + FILE *errfile); +static int __wt_api_env_errfile_set( + ENV *env, + FILE *errfile) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRFILE_SET); + env->errfile = errfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_errpfx_get( + ENV *env, + const char **errpfx); +static int __wt_api_env_errpfx_get( + ENV *env, + const char **errpfx) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_GET); + *errpfx = env->errpfx; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int 
__wt_api_env_errpfx_set( + ENV *env, + const char *errpfx); +static int __wt_api_env_errpfx_set( + ENV *env, + const char *errpfx) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_ERRPFX_SET); + env->errpfx = errpfx; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_hazard_size_get( + ENV *env, + uint32_t *hazard_size); +static int __wt_api_env_hazard_size_get( + ENV *env, + uint32_t *hazard_size) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_GET); + *hazard_size = env->hazard_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_hazard_size_set( + ENV *env, + uint32_t hazard_size); +static int __wt_api_env_hazard_size_set( + ENV *env, + uint32_t hazard_size) +{ + IENV *ienv = env->ienv; + WT_RET((__wt_env_hazard_size_set_verify( + env, hazard_size))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_HAZARD_SIZE_SET); + env->hazard_size = hazard_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_msgcall_get( + ENV *env, + void (**msgcall)(const ENV *, const char *)); +static int __wt_api_env_msgcall_get( + ENV *env, + void (**msgcall)(const ENV *, const char *)) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_GET); + *msgcall = env->msgcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_msgcall_set( + ENV *env, + void (*msgcall)(const ENV *, const char *)); +static int __wt_api_env_msgcall_set( + ENV *env, + void (*msgcall)(const ENV *, const char *)) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_MSGCALL_SET); + env->msgcall = msgcall; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_msgfile_get( + ENV *env, + FILE **msgfile); +static int __wt_api_env_msgfile_get( + ENV *env, + FILE **msgfile) +{ 
+ IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_GET); + *msgfile = env->msgfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_msgfile_set( + ENV *env, + FILE *msgfile); +static int __wt_api_env_msgfile_set( + ENV *env, + FILE *msgfile) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_MSGFILE_SET); + env->msgfile = msgfile; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_open( + ENV *env, + const char *home, + mode_t mode, + uint32_t flags); +static int __wt_api_env_open( + ENV *env, + const char *home, + mode_t mode, + uint32_t flags) +{ + const char *method_name = "ENV.open"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_OPEN); + WT_STAT_INCR(ienv->method_stats, ENV_OPEN); + ret = __wt_env_open(env, home, mode); + return (ret); +} + +static int __wt_api_env_stat_clear( + ENV *env, + uint32_t flags); +static int __wt_api_env_stat_clear( + ENV *env, + uint32_t flags) +{ + const char *method_name = "ENV.stat_clear"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_CLEAR); + WT_STAT_INCR(ienv->method_stats, ENV_STAT_CLEAR); + ret = __wt_env_stat_clear(env); + return (ret); +} + +static int __wt_api_env_stat_print( + ENV *env, + FILE *stream, + uint32_t flags); +static int __wt_api_env_stat_print( + ENV *env, + FILE *stream, + uint32_t flags) +{ + const char *method_name = "ENV.stat_print"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_STAT_PRINT); + WT_STAT_INCR(ienv->method_stats, ENV_STAT_PRINT); + ret = __wt_env_stat_print(env, stream); + return (ret); +} + +static int __wt_api_env_sync( + ENV *env, + void (*progress)(const char *, uint64_t), + uint32_t flags); +static int __wt_api_env_sync( + ENV *env, + void (*progress)(const char *, uint64_t), + uint32_t flags) +{ + 
const char *method_name = "ENV.sync"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_SYNC); + WT_STAT_INCR(ienv->method_stats, ENV_SYNC); + ret = __wt_env_sync(env, progress); + return (ret); +} + +static int __wt_api_env_toc( + ENV *env, + uint32_t flags, + WT_TOC **tocp); +static int __wt_api_env_toc( + ENV *env, + uint32_t flags, + WT_TOC **tocp) +{ + const char *method_name = "ENV.toc"; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_ENV_TOC); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_TOC); + ret = __wt_env_toc(env, tocp); + __wt_unlock(env, ienv->mtx); + return (ret); +} + +static int __wt_api_env_toc_size_get( + ENV *env, + uint32_t *toc_size); +static int __wt_api_env_toc_size_get( + ENV *env, + uint32_t *toc_size) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_GET); + *toc_size = env->toc_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_toc_size_set( + ENV *env, + uint32_t toc_size); +static int __wt_api_env_toc_size_set( + ENV *env, + uint32_t toc_size) +{ + IENV *ienv = env->ienv; + WT_RET((__wt_env_toc_size_set_verify( + env, toc_size))); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_TOC_SIZE_SET); + env->toc_size = toc_size; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_verbose_get( + ENV *env, + uint32_t *verbose); +static int __wt_api_env_verbose_get( + ENV *env, + uint32_t *verbose) +{ + IENV *ienv = env->ienv; + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_GET); + *verbose = env->verbose; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_env_verbose_set( + ENV *env, + uint32_t verbose); +static int __wt_api_env_verbose_set( + ENV *env, + uint32_t verbose) +{ + IENV *ienv = env->ienv; + WT_RET((__wt_env_verbose_set_verify( + env, verbose))); + 
__wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, ENV_VERBOSE_SET); + env->verbose = verbose; + __wt_unlock(env, ienv->mtx); + return (0); +} + +static int __wt_api_wt_toc_close( + WT_TOC *wt_toc, + uint32_t flags); +static int __wt_api_wt_toc_close( + WT_TOC *wt_toc, + uint32_t flags) +{ + const char *method_name = "WT_TOC.close"; + ENV *env = wt_toc->env; + IENV *ienv = env->ienv; + int ret; + + WT_ENV_FCHK(env, method_name, flags, WT_APIMASK_WT_TOC_CLOSE); + __wt_lock(env, ienv->mtx); + WT_STAT_INCR(ienv->method_stats, WT_TOC_CLOSE); + ret = __wt_wt_toc_close(wt_toc); + __wt_unlock(env, ienv->mtx); + return (ret); +} + +void +__wt_methods_db_config_default(DB *db) +{ + db->btree_compare_dup = __wt_bt_lex_compare; + db->btree_compare = __wt_bt_lex_compare; +} + +void +__wt_methods_db_lockout(DB *db) +{ + db->btree_compare_dup_get = (int (*) + (DB *, int (**)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_compare_dup_set = (int (*) + (DB *, int (*)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_compare_get = (int (*) + (DB *, int (**)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_compare_int_get = (int (*) + (DB *, int *)) + __wt_db_lockout; + db->btree_compare_int_set = (int (*) + (DB *, int )) + __wt_db_lockout; + db->btree_compare_set = (int (*) + (DB *, int (*)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_dup_offpage_get = (int (*) + (DB *, uint32_t *)) + __wt_db_lockout; + db->btree_dup_offpage_set = (int (*) + (DB *, uint32_t )) + __wt_db_lockout; + db->btree_itemsize_get = (int (*) + (DB *, uint32_t *, uint32_t *)) + __wt_db_lockout; + db->btree_itemsize_set = (int (*) + (DB *, uint32_t , uint32_t )) + __wt_db_lockout; + db->btree_pagesize_get = (int (*) + (DB *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *)) + __wt_db_lockout; + db->btree_pagesize_set = (int (*) + (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t )) + __wt_db_lockout; + 
db->bulk_load = (int (*) + (DB *, uint32_t , void (*)(const char *, uint64_t), int (*)(DB *, DBT **, DBT **))) + __wt_db_lockout; + db->col_del = (int (*) + (DB *, WT_TOC *, uint64_t , uint32_t )) + __wt_db_lockout; + db->col_get = (int (*) + (DB *, WT_TOC *, uint64_t , DBT *, uint32_t )) + __wt_db_lockout; + db->col_put = (int (*) + (DB *, WT_TOC *, uint64_t , DBT *, uint32_t )) + __wt_db_lockout; + db->column_set = (int (*) + (DB *, uint32_t , const char *, uint32_t )) + __wt_db_lockout; + db->dump = (int (*) + (DB *, FILE *, void (*)(const char *, uint64_t), uint32_t )) + __wt_db_lockout; + db->err = (void (*) + (DB *, int , const char *, ...)) + __wt_db_lockout; + db->errcall_get = (int (*) + (DB *, void (**)(const DB *, const char *))) + __wt_db_lockout; + db->errcall_set = (int (*) + (DB *, void (*)(const DB *, const char *))) + __wt_db_lockout; + db->errfile_get = (int (*) + (DB *, FILE **)) + __wt_db_lockout; + db->errfile_set = (int (*) + (DB *, FILE *)) + __wt_db_lockout; + db->errpfx_get = (int (*) + (DB *, const char **)) + __wt_db_lockout; + db->errpfx_set = (int (*) + (DB *, const char *)) + __wt_db_lockout; + db->errx = (void (*) + (DB *, const char *, ...)) + __wt_db_lockout; + db->huffman_set = (int (*) + (DB *, uint8_t const *, u_int , uint32_t )) + __wt_db_lockout; + db->open = (int (*) + (DB *, const char *, mode_t , uint32_t )) + __wt_db_lockout; + db->row_del = (int (*) + (DB *, WT_TOC *, DBT *, uint32_t )) + __wt_db_lockout; + db->row_get = (int (*) + (DB *, WT_TOC *, DBT *, DBT *, uint32_t )) + __wt_db_lockout; + db->row_put = (int (*) + (DB *, WT_TOC *, DBT *, DBT *, uint32_t )) + __wt_db_lockout; + db->stat_clear = (int (*) + (DB *, uint32_t )) + __wt_db_lockout; + db->stat_print = (int (*) + (DB *, FILE *, uint32_t )) + __wt_db_lockout; + db->sync = (int (*) + (DB *, void (*)(const char *, uint64_t), uint32_t )) + __wt_db_lockout; + db->verify = (int (*) + (DB *, void (*)(const char *, uint64_t), uint32_t )) + __wt_db_lockout; +} + +void 
+__wt_methods_db_init_transition(DB *db) +{ + db->btree_compare_dup_get = __wt_api_db_btree_compare_dup_get; + db->btree_compare_dup_set = __wt_api_db_btree_compare_dup_set; + db->btree_compare_get = __wt_api_db_btree_compare_get; + db->btree_compare_int_get = __wt_api_db_btree_compare_int_get; + db->btree_compare_int_set = __wt_api_db_btree_compare_int_set; + db->btree_compare_set = __wt_api_db_btree_compare_set; + db->btree_dup_offpage_get = __wt_api_db_btree_dup_offpage_get; + db->btree_dup_offpage_set = __wt_api_db_btree_dup_offpage_set; + db->btree_itemsize_get = __wt_api_db_btree_itemsize_get; + db->btree_itemsize_set = __wt_api_db_btree_itemsize_set; + db->btree_pagesize_get = __wt_api_db_btree_pagesize_get; + db->btree_pagesize_set = __wt_api_db_btree_pagesize_set; + db->close = __wt_api_db_close; + db->column_set = __wt_api_db_column_set; + db->err = __wt_api_db_err; + db->errcall_get = __wt_api_db_errcall_get; + db->errcall_set = __wt_api_db_errcall_set; + db->errfile_get = __wt_api_db_errfile_get; + db->errfile_set = __wt_api_db_errfile_set; + db->errpfx_get = __wt_api_db_errpfx_get; + db->errpfx_set = __wt_api_db_errpfx_set; + db->errx = __wt_api_db_errx; + db->huffman_set = __wt_api_db_huffman_set; + db->open = __wt_api_db_open; +} + +void +__wt_methods_db_open_transition(DB *db) +{ + db->btree_compare_dup_set = (int (*) + (DB *, int (*)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_compare_int_set = (int (*) + (DB *, int )) + __wt_db_lockout; + db->btree_compare_set = (int (*) + (DB *, int (*)(DB *, const DBT *, const DBT *))) + __wt_db_lockout; + db->btree_dup_offpage_set = (int (*) + (DB *, uint32_t )) + __wt_db_lockout; + db->btree_itemsize_set = (int (*) + (DB *, uint32_t , uint32_t )) + __wt_db_lockout; + db->btree_pagesize_set = (int (*) + (DB *, uint32_t , uint32_t , uint32_t , uint32_t , uint32_t )) + __wt_db_lockout; + db->column_set = (int (*) + (DB *, uint32_t , const char *, uint32_t )) + __wt_db_lockout; + 
db->huffman_set = (int (*) + (DB *, uint8_t const *, u_int , uint32_t )) + __wt_db_lockout; + db->bulk_load = __wt_api_db_bulk_load; + db->col_del = __wt_api_db_col_del; + db->col_get = __wt_api_db_col_get; + db->col_put = __wt_api_db_col_put; + db->dump = __wt_api_db_dump; + db->row_del = __wt_api_db_row_del; + db->row_get = __wt_api_db_row_get; + db->row_put = __wt_api_db_row_put; + db->stat_clear = __wt_api_db_stat_clear; + db->stat_print = __wt_api_db_stat_print; + db->sync = __wt_api_db_sync; + db->verify = __wt_api_db_verify; +} + +void +__wt_methods_env_config_default(ENV *env) +{ + env->cache_size = 20; + env->data_update_initial = 8 * 1024; + env->data_update_max = 32 * 1024; + env->hazard_size = 15; + env->toc_size = 50; +} + +void +__wt_methods_env_lockout(ENV *env) +{ + env->cache_size_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->cache_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->data_update_initial_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->data_update_initial_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->data_update_max_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->data_update_max_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->db = (int (*) + (ENV *, uint32_t , DB **)) + __wt_env_lockout; + env->err = (void (*) + (ENV *, int , const char *, ...)) + __wt_env_lockout; + env->errcall_get = (int (*) + (ENV *, void (**)(const ENV *, const char *))) + __wt_env_lockout; + env->errcall_set = (int (*) + (ENV *, void (*)(const ENV *, const char *))) + __wt_env_lockout; + env->errfile_get = (int (*) + (ENV *, FILE **)) + __wt_env_lockout; + env->errfile_set = (int (*) + (ENV *, FILE *)) + __wt_env_lockout; + env->errpfx_get = (int (*) + (ENV *, const char **)) + __wt_env_lockout; + env->errpfx_set = (int (*) + (ENV *, const char *)) + __wt_env_lockout; + env->errx = (void (*) + (ENV *, const char *, ...)) + __wt_env_lockout; + 
env->hazard_size_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->hazard_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->msgcall_get = (int (*) + (ENV *, void (**)(const ENV *, const char *))) + __wt_env_lockout; + env->msgcall_set = (int (*) + (ENV *, void (*)(const ENV *, const char *))) + __wt_env_lockout; + env->msgfile_get = (int (*) + (ENV *, FILE **)) + __wt_env_lockout; + env->msgfile_set = (int (*) + (ENV *, FILE *)) + __wt_env_lockout; + env->open = (int (*) + (ENV *, const char *, mode_t , uint32_t )) + __wt_env_lockout; + env->stat_clear = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->stat_print = (int (*) + (ENV *, FILE *, uint32_t )) + __wt_env_lockout; + env->sync = (int (*) + (ENV *, void (*)(const char *, uint64_t), uint32_t )) + __wt_env_lockout; + env->toc = (int (*) + (ENV *, uint32_t , WT_TOC **)) + __wt_env_lockout; + env->toc_size_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->toc_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->verbose_get = (int (*) + (ENV *, uint32_t *)) + __wt_env_lockout; + env->verbose_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; +} + +void +__wt_methods_env_init_transition(ENV *env) +{ + env->cache_size_get = __wt_api_env_cache_size_get; + env->cache_size_set = __wt_api_env_cache_size_set; + env->close = __wt_api_env_close; + env->data_update_initial_get = __wt_api_env_data_update_initial_get; + env->data_update_initial_set = __wt_api_env_data_update_initial_set; + env->data_update_max_get = __wt_api_env_data_update_max_get; + env->data_update_max_set = __wt_api_env_data_update_max_set; + env->err = __wt_api_env_err; + env->errcall_get = __wt_api_env_errcall_get; + env->errcall_set = __wt_api_env_errcall_set; + env->errfile_get = __wt_api_env_errfile_get; + env->errfile_set = __wt_api_env_errfile_set; + env->errpfx_get = __wt_api_env_errpfx_get; + env->errpfx_set = __wt_api_env_errpfx_set; + env->errx = __wt_api_env_errx; 
+ env->hazard_size_get = __wt_api_env_hazard_size_get; + env->hazard_size_set = __wt_api_env_hazard_size_set; + env->msgcall_get = __wt_api_env_msgcall_get; + env->msgcall_set = __wt_api_env_msgcall_set; + env->msgfile_get = __wt_api_env_msgfile_get; + env->msgfile_set = __wt_api_env_msgfile_set; + env->open = __wt_api_env_open; + env->stat_clear = __wt_api_env_stat_clear; + env->stat_print = __wt_api_env_stat_print; + env->toc_size_get = __wt_api_env_toc_size_get; + env->toc_size_set = __wt_api_env_toc_size_set; + env->verbose_get = __wt_api_env_verbose_get; + env->verbose_set = __wt_api_env_verbose_set; +} + +void +__wt_methods_env_open_transition(ENV *env) +{ + env->cache_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->hazard_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->open = (int (*) + (ENV *, const char *, mode_t , uint32_t )) + __wt_env_lockout; + env->toc_size_set = (int (*) + (ENV *, uint32_t )) + __wt_env_lockout; + env->db = __wt_api_env_db; + env->sync = __wt_api_env_sync; + env->toc = __wt_api_env_toc; +} + +void +__wt_methods_wt_toc_lockout(WT_TOC *wt_toc) +{ + WT_CC_QUIET(wt_toc, NULL); +} + +void +__wt_methods_wt_toc_init_transition(WT_TOC *wt_toc) +{ + wt_toc->close = __wt_api_wt_toc_close; +} + diff --git a/src/support/cksum.c b/src/support/cksum.c new file mode 100644 index 00000000000..06b0e625b0d --- /dev/null +++ b/src/support/cksum.c @@ -0,0 +1,134 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_cksum -- + * Return a checksum for a chunk of memory. + * + * Algorithm 3 from Richard Black's discussion of CRC32. + * http://www.cl.cam.ac.uk/research/srg/ + * bluebook/21/crc/node6.html#SECTION00060000000000000000 + */ +uint32_t +__wt_cksum(void *chunk, uint32_t bytes) +{ + #if 0 + /* + * Code to generate the crctab. 
+ */ + #define __QUOTIENT 0x04c11db7 + int + main() + { + int i, j; + unsigned int crc, crctab[256]; + + for (i = 0; i < 256; i++) { + crc = i << 24; + for (j = 0; j < 8; j++) { + if (crc & 0x80000000) + crc = (crc << 1) ^ __QUOTIENT; + else + crc = crc << 1; + } + crctab[i] = crc; + } + for (i = 0; i < 256;) { + printf("0x%08lx, ", (unsigned long)crctab[i]); + if (++i % 4 == 0) + printf("\n"); + } + return (0); + } + #endif + + static const uint32_t crctab[256] = { + 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, + 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, + 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, + 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, + 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, + 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, + 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, + 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, + 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, + 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, + 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, + 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, + 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, + 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, + 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, + 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, + 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, + 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, + 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, + 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, + 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, + 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, + 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, + 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, + 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, + 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, + 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, + 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, + 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 
0xcf3ecb31, 0xcbffd686, + 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, + 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, + 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, + 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, + 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, + 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, + 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, + 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, + 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, + 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, + 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, + 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, + 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, + 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, + 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, + 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, + 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, + 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, + 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, + 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, + 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654, + 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, + 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, + 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, + 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, + 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, + 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, + 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 + }; + uint32_t i, result; + uint8_t *data; + + data = chunk; + result = *data++ << 24; + result |= *data++ << 16; + result |= *data++ << 8; + result |= *data++; + result = ~result; + bytes -= 4; + + for (i = 0; i < bytes; ++i) + result = (result << 8 | *data++) ^ crctab[result >> 24]; + + return (~result); +} diff --git 
a/src/support/err.c b/src/support/err.c new file mode 100644 index 00000000000..dc8eac01189 --- /dev/null +++ b/src/support/err.c @@ -0,0 +1,247 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_msg_call -- + * Pass a message to a callback function. + */ +void +__wt_msg_call(void *cb, void *handle, + const char *pfx1, const char *pfx2, + int error, const char *fmt, va_list ap) +{ + size_t len; + int separator; + + /* + * !!! + * SECURITY: + * Buffer placed at the end of the stack in case snprintf overflows. + */ + char s[2048]; + + len = 0; + separator = 0; + s[0] = '\0'; + if (pfx1 != NULL) { + len += (size_t)snprintf(s + len, sizeof(s) - len, "%s", pfx1); + separator = 1; + } + if (pfx2 != NULL && len < sizeof(s) - 1) { + len += (size_t)snprintf(s + len, sizeof(s) - len, + "%s%s", separator ? ": " : "", pfx2); + separator = 1; + } + if (separator && len < sizeof(s) - 1) + len += (size_t)snprintf(s + len, sizeof(s) - len, ": "); + if (len < sizeof(s) - 1) + len += (size_t)vsnprintf(s + len, sizeof(s) - len, fmt, ap); + if (error != 0 && len < sizeof(s) - 1) + (void)snprintf(s + len, + sizeof(s) - len, ": %s", wiredtiger_strerror(error)); + + ((void (*)(void *, const char *))cb)(handle, s); +} + +/* + * __wt_msg_stream -- + * Write a message to a FILE stream. + */ +void +__wt_msg_stream(FILE *fp, + const char *pfx1, const char *pfx2, int error, const char *fmt, va_list ap) +{ + if (fp == NULL) + fp = stderr; + + if (pfx1 != NULL) + (void)fprintf(fp, "%s: ", pfx1); + if (pfx2 != NULL) + (void)fprintf(fp, "%s: ", pfx2); + (void)vfprintf(fp, fmt, ap); + if (error != 0) + (void)fprintf(fp, ": %s", wiredtiger_strerror(error)); + (void)fprintf(fp, "\n"); + (void)fflush(fp); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_assert -- + * Internal version of assert function. 
+ */ +void +__wt_assert(ENV *env, const char *check, const char *file_name, int line_number) +{ + __wt_api_env_errx(env, + "assertion failure: %s/%d: \"%s\"", file_name, line_number, check); + + __wt_abort(env); + /* NOTREACHED */ +} +#endif + +/* + * __wt_api_args -- + * Print a standard error message when an API function is passed illegal + * arguments. + */ +int +__wt_api_args(ENV *env, const char *name) +{ + __wt_api_env_errx(env, + "%s: illegal API arguments or flag values specified", name); + return (WT_ERROR); +} + +/* + * __wt_api_arg_min -- + * Print a standard error message when an API function is passed a + * too-small argument. + */ +int +__wt_api_arg_min(ENV *env, + const char *name, const char *arg_name, uint32_t v, uint32_t min) +{ + if (v >= min) + return (0); + + __wt_api_env_errx(env, + "%s: %s argument less than minimum value of %lu", + name, arg_name, (u_long)min); + return (WT_ERROR); +} + +/* + * __wt_api_arg_max -- + * Print a standard error message when an API function is passed a + * too-large argument. + */ +int +__wt_api_arg_max(ENV *env, + const char *name, const char *arg_name, uint32_t v, uint32_t max) +{ + if (v <= max) + return (0); + + __wt_api_env_errx(env, + "%s: %s argument larger than maximum value of %lu", + name, arg_name, (u_long)max); + return (WT_ERROR); +} + +/* + * __wt_database_method_type -- + * Print a standard error message on attempts to call methods inappropriate + * for a database type. + */ +int +__wt_database_method_type(DB *db, const char *name, int column_err) +{ + __wt_api_db_errx(db, + "%s: this method is not supported for a %s database", + name, column_err ? "column store" : "row store"); + return (WT_ERROR); +} + +/* + * __wt_database_wrong_fixed_size -- + * Print a standard error message on attempts to put the wrong size element + * into a fixed-size database. 
+ */ +int +__wt_database_wrong_fixed_size(WT_TOC *toc, uint32_t len) +{ + DB *db; + + db = toc->db; + + __wt_api_db_errx(db, + "%s: length of %lu does not match fixed-length database " + "configuration of %lu", + toc->name, (u_long)len, (u_long)db->fixed_len); + return (WT_ERROR); +} + +/* + * __wt_database_readonly -- + * Print a standard error message on attempts to modify a read-only + * database. + */ +int +__wt_database_readonly(DB *db, const char *name) +{ + __wt_api_db_errx(db, + "%s: the database was opened read-only and may not be modified", + name); + return (WT_READONLY); +} + +/* + * __wt_database_format -- + * Print a standard error message when a database format error is + * suddenly discovered. + */ +int +__wt_database_format(DB *db) +{ + __wt_api_db_errx(db, "the database is corrupted; use the Db.salvage" + " method or the db_salvage utility to repair the database"); + return (WT_ERROR); +} + +/* + * __wt_database_item_too_big -- + * Print a standard error message when an element is too large to store. + */ +int +__wt_database_item_too_big(DB *db) +{ + __wt_api_db_errx(db, "the item is too large for the database to store"); + return (WT_ERROR); +} + +/* + * __wt_wt_toc_lockout -- + * Standard WT_TOC handle lockout error message. + */ +int +__wt_wt_toc_lockout(WT_TOC *toc) +{ + return (__wt_env_lockout(toc->env)); +} + +/* + * __wt_db_lockout -- + * Standard DB handle lockout error message. + */ +int +__wt_db_lockout(DB *db) +{ + return (__wt_env_lockout(db->env)); +} + +/* + * __wt_env_lockout -- + * Standard ENV handle lockout error message. 
+ */ +int +__wt_env_lockout(ENV *env) +{ + __wt_api_env_errx(env, + "An unavailable handle method was called; the handle method is " + "not available for some reason, for example, handle methods are " + "restricted after an error, or configuration methods may be " + "restricted after the database or environment have been opened, " + "or operational methods may be restricted until the database or " + "environment has been opened."); + return (WT_ERROR); +} diff --git a/src/support/hazard.c b/src/support/hazard.c new file mode 100644 index 00000000000..5bef0731aa5 --- /dev/null +++ b/src/support/hazard.c @@ -0,0 +1,133 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_hazard_set -- + * Set a hazard reference. + */ +int +__wt_hazard_set(WT_TOC *toc, WT_REF *ref) +{ + ENV *env; + WT_PAGE **hp; + + env = toc->env; + + /* + * Do the dance: + * + * The memory location making a page "real" is the WT_REF's state which + * can be reset from WT_OK to WT_EVICT at any time by the page eviction + * server. + * + * Add the WT_REF reference to the WT_TOC's hazard list and flush the + * write, then see if the state field is still WT_OK. If it's still + * WT_OK, we know we can use the page because the page eviction server + * will see our hazard reference before it discards the buffer (the + * eviction server sets the WT_REF state to WT_EVICT, flushes memory, + * and then checks the hazard references). + */ + for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp) { + if (*hp != NULL) + continue; + + /* + * Memory flush needed; the hazard array isn't declared volatile + * and an explicit memory flush is necessary. + */ + *hp = ref->page; + WT_MEMORY_FLUSH; + + /* + * If the cache entry is set, check to see if it's still valid. 
+ * Valid means the state is WT_OK, or the state is WT_EVICT and + * this thread is allowed to see pages flagged for eviction. + */ + if (ref->state == WT_OK || + (ref->state == WT_EVICT && F_ISSET(toc, WT_READ_EVICT))) { + WT_VERBOSE(env, WT_VERB_HAZARD, + (env, "toc %p hazard %p: set", toc, ref->page)); + return (1); + } + + /* The cache eviction server owns the page, we can't have it. */ + *hp = NULL; + return (0); + } + + __wt_api_env_errx(env, "WT_TOC has no more hazard reference slots"); + WT_ASSERT(env, hp < toc->hazard + env->hazard_size); + return (0); +} + +/* + * __wt_hazard_clear -- + * Clear a hazard reference. + */ +void +__wt_hazard_clear(WT_TOC *toc, WT_PAGE *page) +{ + ENV *env; + WT_PAGE **hp; + + env = toc->env; + + WT_VERBOSE(env, + WT_VERB_HAZARD, (env, "toc %p hazard %p: clr", toc, page)); + + /* Clear the caller's hazard pointer. */ + for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp) + if (*hp == page) { + *hp = NULL; + /* + * We don't have to flush memory here for correctness; + * it would give the page server thread faster access + * to the block were the block selected to be evicted, + * but the generation number was just set which makes + * it unlikely to be selected for eviction. + */ + return; + } + __wt_api_env_errx(env, "WT_TOC hazard reference not found"); + WT_ASSERT(env, hp < toc->hazard + env->hazard_size); +} + +/* + * __wt_hazard_empty -- + * Verify that no hazard references are set. + */ +void +__wt_hazard_empty(WT_TOC *toc, const char *name) +{ + ENV *env; + WT_PAGE **hp; + + env = toc->env; + + /* + * Check for a set hazard reference and complain if we find one. Clear + * any we find because it's not a correctness problem (any hazard ref + * we find can't be real because the WT_TOC is being closed when we're + * called). We do this work because it's not expensive, and we don't + * want to let a hazard reference lie around, keeping a page from being + * flushed. 
The flush isn't necessary for correctness, but gives the + * cache eviction thread immediate access to any page our reference + * blocks. + */ + for (hp = toc->hazard; hp < toc->hazard + env->hazard_size; ++hp) + if (*hp != NULL) { + __wt_api_env_errx(env, + "%s: returned with a hazard reference set (%p)", + name, *hp); + *hp = NULL; + WT_MEMORY_FLUSH; + } +} diff --git a/src/support/huffman.c b/src/support/huffman.c new file mode 100644 index 00000000000..2a0fcfde218 --- /dev/null +++ b/src/support/huffman.c @@ -0,0 +1,692 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + * + * Huffman Encoder/Decoder v1.0 + * Author Brian Pollack <brian@brians.com> + */ + +#include "wt_internal.h" + +typedef struct __wt_freqtree_node { + /* + * Data structure representing a node of the huffman tree. It holds a + * 32-bit weight and pointers to the left and right child nodes. + * The node either has two child nodes or none. + */ + uint16_t symbol; /* only used in leaf nodes */ + uint32_t weight; + uint16_t codeword_length; + struct __wt_freqtree_node *left; /* bit 0 */ + struct __wt_freqtree_node *right; /* bit 1 */ +} WT_FREQTREE_NODE; + +typedef struct __wt_static_huffman_node { + /* + * This data structure is used to represent the huffman tree in a + * static array, after it has been created (using a dynamic tree + * representation with WT_FREQTREE_NODE nodes). + * + * In the binary tree's array representation if a node's index is i, + * then its left child node is 2i+1 and its right child node is 2i+2. + */ + uint8_t valid; + uint16_t symbol; + uint16_t codeword_length; +} WT_STATIC_HUFFMAN_NODE; + +typedef struct __wt_huffman_obj { + ENV *env; /* Enclosing environment */ + /* + * Data structure here defines specific instance of the encoder/decoder. + * This contains the frequency table (tree) used to produce optimal + * results. 
This version of the encoder supports 1- and 2-byte symbols. + */ + uint32_t numSymbols; + uint8_t numBytes; /* 1 or 2 */ + /* The tree in static array reprentation */ + WT_STATIC_HUFFMAN_NODE *nodes; + uint16_t max_depth; +} WT_HUFFMAN_OBJ; + +/* + * Queue element data structure. + * + * Consists of a pointer to a huffman tree node, and a pointer to the next + * element in the queue. + */ +typedef struct node_queue_elem { + WT_FREQTREE_NODE *node; + struct node_queue_elem *next; +} NODE_QUEUE_ELEM; + +/* + * Queue of huffman tree nodes. + * + * Contains a pointer to the beginning and the end of the queue, which is + * implemented as a linked list. + */ +typedef struct node_queue { + NODE_QUEUE_ELEM *first; + NODE_QUEUE_ELEM *last; +} NODE_QUEUE; + +#define node_queue_is_empty(queue) \ + (((queue) == NULL || (queue)->first == NULL) ? 1 : 0) + +static void node_queue_close(ENV *, NODE_QUEUE *); +static void node_queue_dequeue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE **); +static int node_queue_enqueue(ENV *, NODE_QUEUE *, WT_FREQTREE_NODE *); +static void recursive_free_node(ENV *env, WT_FREQTREE_NODE *node); + +/* + * The following macros are used by the encoder to write the buffer with bit + * addressing. + */ +#undef SET_BIT +#define SET_BIT(ptr, pos) \ + *((ptr) + ((pos) / 8)) |= 1 << (7 - ((pos) % 8)) +#undef CLEAR_BIT +#define CLEAR_BIT(ptr, pos) \ + *((ptr) + ((pos) / 8)) &= ~(uint8_t)(1 << (7 - ((pos) % 8))) +#undef MODIFY_BIT +#define MODIFY_BIT(ptr, pos, bit) \ + if (bit) \ + SET_BIT(ptr, pos); \ + else \ + CLEAR_BIT(ptr, pos); + +/* + * Internal data structure used to preserve the symbol when rearranging the + * frequency array. + */ +typedef struct __indexed_byte { + uint8_t frequency; + uint16_t symbol; +} INDEXED_BYTE; + +/* + * Comparator function used by QuickSort to order the frequency table by + * frequency (most frequent symbols will be at the end of the array). 
+ */ +static int +indexed_byte_comparator(const void *elem1, const void *elem2) +{ + return (((INDEXED_BYTE *) + elem1)->frequency) - (((INDEXED_BYTE *)elem2)->frequency); +} + +/* + * traverse_tree -- + * Recursive function with dual functionality: + * - It sets the codeword_length field of each leaf node to the + * appropriate value. + * - It finds the maximum depth of the tree. + */ +static void +traverse_tree( + WT_FREQTREE_NODE *node, uint16_t current_length, uint16_t *max_depth) +{ + /* Recursively traverse the tree */ + if (node->left != NULL) + traverse_tree(node->left, current_length + 1, max_depth); + if (node->right != NULL) + traverse_tree(node->right, current_length + 1, max_depth); + + /* If this is a leaf: */ + if (node->left == NULL && node->right == NULL) { + /* + * Setting the leaf's codeword length (for inner nodes, it + * is always 0!) + */ + node->codeword_length = current_length; + + /* Store the new maximal depth. */ + if (*max_depth < current_length + 1) + *max_depth = current_length + 1; + } +} + +/* + * fill_static_representation -- + * Recursive function that converts the huffman tree from its dynamic + * representation to static tree representation, to a preallocated array. + * + * To know the required size of the array the traverse_tree function can be + * used, determining the maximum depth N. Then the required array size is 2^N. + */ +static void +fill_static_representation( + WT_STATIC_HUFFMAN_NODE *target, WT_FREQTREE_NODE *node, int idx) +{ + WT_STATIC_HUFFMAN_NODE *current_target; + + current_target = &target[idx]; + current_target->symbol = node->symbol; + current_target->codeword_length = node->codeword_length; + current_target->valid = 1; + + if (node->left != NULL) + fill_static_representation(target, node->left, idx * 2 + 1); + if (node->right != NULL) + fill_static_representation(target, node->right, idx * 2 + 2); +} + +/* + * recursive_free_node -- + * Recursively free the huffman frequency tree's nodes. 
+ */ +static void +recursive_free_node(ENV *env, WT_FREQTREE_NODE *node) +{ + if (node != NULL) { + recursive_free_node(env, node->left); + recursive_free_node(env, node->right); + __wt_free(env, node, sizeof(WT_FREQTREE_NODE)); + } +} + +/* + * __wt_huffman_open -- + * Take a frequency table and return a pointer to a descriptor object. + * + * The frequency table must be the full range of valid values. For 1 byte + * tables there are 256 values in 8 bits. The highest rank is 255, and the + * lowest rank is 1 (0 means the byte never appears in the input), so 1 byte + * is needed to hold the rank and the input table must be 1 byte x 256 values. + * + * For UTF-16 (nbytes == 2) the range is 0 - 65535 and the max rank is 65535. + * The table should be 2 bytes x 65536 values. + */ +int +__wt_huffman_open(ENV *env, + uint8_t const *byte_frequency_array, u_int nbytes, void *retp) +{ + INDEXED_BYTE *indexed_freqs; + NODE_QUEUE *combined_nodes, *leaves; + WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode; + WT_HUFFMAN_OBJ *huffman; + uint32_t w1, w2; + uint16_t i; + int ret; + + indexed_freqs = NULL; + combined_nodes = leaves = NULL; + node = node2 = tempnode = NULL; + ret = 0; + + WT_RET(__wt_calloc(env, 1, sizeof(WT_HUFFMAN_OBJ), &huffman)); + WT_ERR(__wt_calloc(env, nbytes, sizeof(INDEXED_BYTE), &indexed_freqs)); + huffman->env = env; + + /* + * The frequency array must be sorted to be able to use linear time + * construction algorithm. + */ + for (i = 0; i < nbytes; ++i) { + indexed_freqs[i].frequency = byte_frequency_array[i]; + indexed_freqs[i].symbol = i; + } + + qsort(indexed_freqs, + nbytes, sizeof(INDEXED_BYTE), indexed_byte_comparator); + + /* We need two node queues to build the tree. */ + WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &leaves)); + WT_ERR(__wt_calloc(env, 1, sizeof(NODE_QUEUE), &combined_nodes)); + + /* Adding the leaves to the queue */ + for (i = 0; i < nbytes; ++i) { + /* + * We are leaving out symbols with a frequency of 0. 
This + * assumes these symbols will NEVER occur in the source stream, + * and the purpose is to reduce the huffman tree's size. + * + * NOTE: Even if this behavior is not desired, the frequencies + * should have a range between 1 - 255, otherwise the algorithm + * cannot produce well balanced tree; so this can be treated as + * an optional feature. + */ + if (indexed_freqs[i].frequency > 0) { + WT_ERR(__wt_calloc( + env, 1, sizeof(WT_FREQTREE_NODE), &tempnode)); + tempnode->symbol = indexed_freqs[i].symbol; + tempnode->weight = indexed_freqs[i].frequency; + WT_ERR(node_queue_enqueue(env, leaves, tempnode)); + tempnode = NULL; + } + } + + while (!node_queue_is_empty(leaves) || + !node_queue_is_empty(combined_nodes)) { + /* + * We have to get the node with the smaller weight, examining + * both queues first element. We are collecting pairs of these + * items, by alternating between node and node2: + */ + refnode = !node ? &node : &node2; + + /* + * To decide which queue must be used, we get the weights of + * the first items from both: + */ + w1 = node_queue_is_empty(leaves) ? + UINT32_MAX : leaves->first->node->weight; + w2 = node_queue_is_empty(combined_nodes) ? + UINT32_MAX : combined_nodes->first->node->weight; + + /* + * Based on the two weights we finally can dequeue the smaller + * element and place it to the alternating target node pointer: + */ + if (w1 < w2) + node_queue_dequeue(env, leaves, refnode); + else + node_queue_dequeue(env, combined_nodes, refnode); + + /* + * In every second run, we have both node and node2 initialized. + */ + if (node != NULL && node2 != NULL) { + WT_ERR(__wt_calloc( + env, 1, sizeof(WT_FREQTREE_NODE), &tempnode)); + + /* The new weight is the sum of the two weights. 
*/ + tempnode->weight = node->weight + node2->weight; + tempnode->left = node; + tempnode->right = node2; + + /* Enqueue it to the combined nodes queue */ + WT_ERR( + node_queue_enqueue(env, combined_nodes, tempnode)); + tempnode = NULL; + + /* Reset the state pointers */ + node = node2 = NULL; + } + } + + /* + * The remaining node is in the node variable, this is the root of the + * tree. Calculate the number of bytes it takes to hold nbytes bits. + */ + huffman->numSymbols = nbytes; + huffman->numBytes = nbytes > 256 ? 2 : 1; + + /* Traverse the tree and set the code word length for each node. */ + traverse_tree(node, 0, &huffman->max_depth); + + /* Converting the tree to a static array representation. */ + WT_ERR(__wt_calloc(env, 1 << huffman->max_depth, + sizeof(WT_STATIC_HUFFMAN_NODE), &huffman->nodes)); + fill_static_representation(huffman->nodes, node, 0); + + *(void **)retp = huffman; + +err: if (leaves != NULL) + node_queue_close(env, leaves); + if (combined_nodes != NULL) + node_queue_close(env, combined_nodes); + if (indexed_freqs != NULL) + __wt_free(env, indexed_freqs, 0); + if (node != NULL) + recursive_free_node(env, node); + if (node2 != NULL) + recursive_free_node(env, node2); + if (tempnode != NULL) + __wt_free(env, tempnode, sizeof(WT_FREQTREE_NODE)); + if (ret != 0) { + if (huffman->nodes != NULL) + __wt_free(env, huffman->nodes, 0); + __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ)); + } + return (ret); +} + +/* + * __wt_huffman_close -- + * Discard a Huffman descriptor object. + */ +void +__wt_huffman_close(ENV *env, void *huffman_arg) +{ + WT_HUFFMAN_OBJ *huffman; + + huffman = huffman_arg; + + __wt_free(env, huffman->nodes, 0); + __wt_free(env, huffman, sizeof(WT_HUFFMAN_OBJ)); +} + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_print_huffman_code -- + * Prints a symbol's huffman code. Can be used for debugging purposes. 
+ */ +int +__wt_print_huffman_code(ENV *env, void *huffman_arg, uint16_t symbol) +{ + WT_HUFFMAN_OBJ *huffman; + WT_STATIC_HUFFMAN_NODE *node; + u_int i, n; + int p; + char *buffer; + + huffman = huffman_arg; + + /* Check if the symbol is in valid range */ + if (symbol < huffman->numSymbols) { + WT_RET(__wt_calloc(env, huffman->max_depth, 1, &buffer)); + + node = NULL; + for (i = 0, n = 1 << huffman->max_depth; i < n; ++i) { + node = &huffman->nodes[i]; + if (node->valid && + node->symbol == symbol && node->codeword_length > 0) + break; + } + + if (node != NULL) { + /* + * We've got the leaf node, at index 'i'. Now we fill + * the output buffer in back order. + */ + for (p = node->codeword_length - 1; p >= 0; --p) { + buffer[p] = (i % 2) == 1 ? '0' : '1'; + i = (i - 1) / 2; + } + + (void)printf("%s\n", buffer); + } else { + (void)printf( + "Symbol is not in the huffman tree: %x\n", symbol); + return (WT_ERROR); + } + + __wt_free(env, buffer, 0); + } else + (void)printf("Symbol out of range: %lu >= %lu\n", + (u_long)symbol, (u_long)huffman->numSymbols); + return (0); +} +#endif + +/* + * __wt_huffman_encode -- + * Take a byte string, encode it into the target. + */ +int +__wt_huffman_encode(void *huffman_arg, + uint8_t *from, uint32_t from_len, + void *top, uint32_t *to_len, uint32_t *out_bytes_used) +{ + ENV *env; + WT_HUFFMAN_OBJ *huffman; + WT_STATIC_HUFFMAN_NODE *node; + uint32_t bitpos, i, n, j; + uint16_t symbol; + uint8_t padding_info, *to; + int p; + + huffman = huffman_arg; + env = huffman->env; + + /* + * We need N+1 bytes to encode N bytes, re-allocate as necessary. + * + * If the initial target pointer, or the initial target buffer length, + * aren't set, it's an allocation. Clear the initial target pointer, + * our caller may have only set the initial target buffer length, not + * the initial pointer value. 
+ */ + if (to_len == NULL || *to_len < from_len + 1) { + if (to_len == NULL) + *(void **)top = NULL; + WT_RET(__wt_realloc(env, to_len, from_len + 1, top)); + } + + to = *(uint8_t **)top; + memset(to, 0, from_len + 1); + + /* + * Leave the first 3 bits of the encoded value empty, it holds the + * number of bits actually used in the last byte of the encoded value. + */ + bitpos = 3; + n = 1 << huffman->max_depth; + for (i = 0; i < from_len; i += huffman->numBytes) { + /* Getting the next symbol, either 1 or 2 bytes */ + if (huffman->numBytes == 1) + symbol = *from++; + else { + symbol = ((uint16_t)(*from++)) << 8; + symbol |= *from++; + } + + /* Getting the symbol's huffman code from the table */ + node = NULL; + for (j = 0; j < n; ++j) { + node = &huffman->nodes[j]; + if (node->valid && + node->symbol == symbol && node->codeword_length > 0) + break; + } + + if (node != NULL) { + /* + * We've got the leaf node, at index 'j'. Now we fill + * the output buffer in back order. + */ + for (p = node->codeword_length - 1; p >= 0; --p) { + MODIFY_BIT(to, bitpos + (u_int)p, (j % 2) ^ 1); + j = (j - 1) / 2; + } + + bitpos += node->codeword_length; + } else { + __wt_api_env_errx(NULL, + "Huffman compression: there was a symbol in the " + "source originally declared with zero frequency; " + "undefined source symbol: %lu", (u_long)symbol); + return (WT_ERROR); + } + } + + /* + * At this point, bitpos is the total number of used bits (including + * the 3 bits at the beginning of the buffer, which we'll set now to + * the number of bits used in the last byte). Note if the number of + * bits used in the last byte is 8, we set the 3 bits to 0, in other + * words, the first 3 bits of the encoded value are the number of bits + * used in the last byte, unless they're 0, in which case there are 8 + * bits used in the last byte. + */ + padding_info = (bitpos % 8) << 5; + *to |= padding_info; + + *out_bytes_used = bitpos / 8 + ((bitpos % 8) ? 
1 : 0); + + return (0); +} + +/* + * __wt_huffman_decode -- + * Take a byte string, decode it into the target. + */ +int +__wt_huffman_decode(void *huffman_arg, + uint8_t *from, uint32_t from_len, + void *top, uint32_t *to_len, uint32_t *out_bytes_used) +{ + ENV *env; + WT_HUFFMAN_OBJ *huffman; + WT_STATIC_HUFFMAN_NODE* node; + uint32_t bytes, i, from_len_bits, node_idx; + uint8_t bitpos, mask, bit, padding_info, *to; + + huffman = huffman_arg; + env = huffman->env; + + /* + * We need 2N+1 bytes to decode N bytes, re-allocate as necessary. + * + * If the initial target pointer, or the initial target buffer length, + * aren't set, it's an allocation. Clear the initial target pointer, + * our caller may have only set the initial target buffer length, not + * the initial pointer value. + */ + if (to_len == NULL || *to_len < 2 * from_len + 1) { + if (to_len == NULL) + *(void **)top = NULL; + WT_RET(__wt_realloc(env, to_len, 2 * from_len + 1, top)); + } + + to = *(uint8_t **)top; + + bitpos = 4; /* Skipping the first 3 bits. */ + bytes = 0; + node_idx = 0; + + /* + * The first 3 bits are the number of used bits in the last byte, unless + * they're 0, in which case there are 8 bits used in the last byte. + */ + padding_info = (*from & 0xE0) >> 5; + from_len_bits = from_len * 8; + if (padding_info != 0) + from_len_bits -= 8 - padding_info; + + /* + * The loop will go through each bit of the source stream, its length + * is given in BITS! + */ + for (i = 3; i < from_len_bits; i++) { + /* Extracting the current bit */ + mask = (uint8_t)(1 << bitpos); + bit = (*from & mask); + + /* + * As we go through the bits, we also make steps in the huffman + * tree, originated from the root, toward the leaves. + */ + if (bit) + node_idx = (node_idx * 2) + 2; + else + node_idx = (node_idx * 2) + 1; + + node = &huffman->nodes[node_idx]; + + /* If this is a leaf, we've found a complete symbol. 
*/ + if (node->valid && node->codeword_length > 0) { + if (huffman->numBytes == 1) + *to++ = (uint8_t)node->symbol; + else { + *to++ = (node->symbol & 0xFF00) >> 8; + *to++ = node->symbol & 0xFF; + } + + bytes += huffman->numBytes; + node_idx = 0; + } + + /* Moving forward one bit in the source stream. */ + if (bitpos > 0) + bitpos--; + else { + bitpos = 7; + from++; + } + } + + /* Return the number of bytes used. */ + *out_bytes_used = bytes; + + return (0); +} + +/* + * node_queue_close -- + * Delete a queue from memory. + * + * It does not delete the pointed huffman tree nodes! + */ +static void +node_queue_close(ENV *env, NODE_QUEUE *queue) +{ + NODE_QUEUE_ELEM *elem, *next_elem; + + /* Freeing each element of the queue's linked list. */ + for (elem = queue->first; elem != NULL; elem = next_elem) { + next_elem = elem->next; + __wt_free(env, elem, sizeof(NODE_QUEUE_ELEM)); + } + + /* Freeing the queue record itself. */ + __wt_free(env, queue, sizeof(NODE_QUEUE)); +} + +/* + * node_queue_enqueue -- + * Push a tree node to the end of the queue. + */ +static int +node_queue_enqueue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE *node) +{ + NODE_QUEUE_ELEM *elem; + + /* Allocating a new linked list element */ + WT_RET(__wt_calloc(env, 1, sizeof(NODE_QUEUE_ELEM), &elem)); + + /* It holds the tree node, and has no next element yet */ + elem->node = node; + elem->next = NULL; + + /* If the queue is empty, the first element will be the new one. */ + if (queue->first == NULL) + queue->first = elem; + + /* + * If the queue is not empty, the last element's next pointer must be + * updated. + */ + if (queue->last != NULL) + queue->last->next = elem; + + /* The last element is the new one */ + queue->last = elem; + + return (0); +} + +/* + * node_queue_dequeue -- + * Removes a node from the beginning of the queue and copies the node's + * pointer to the location referred by the retp parameter. 
+ */ +static void +node_queue_dequeue(ENV *env, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp) +{ + NODE_QUEUE_ELEM *first_elem; + + /* + * Getting the first element of the queue and updating it to point to + * the next element as first. + */ + first_elem = queue->first; + *retp = first_elem->node; + queue->first = first_elem->next; + + /* + * If the last element was the dequeued element, we have to update it + * to NULL. + */ + if (queue->last == first_elem) + queue->last = NULL; + + /* Freeing the linked list element that has been dequeued */ + __wt_free(env, first_elem, sizeof(NODE_QUEUE_ELEM)); +} diff --git a/src/support/pow.c b/src/support/pow.c new file mode 100644 index 00000000000..3a6b6b1d686 --- /dev/null +++ b/src/support/pow.c @@ -0,0 +1,56 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_nlpo2 -- + * Return the next-largest power-of-two for a 32-bit unsigned value. + * + * In 12 operations, this code computes the next highest power of 2 for a 32-bit + * integer. The result may be expressed by the formula 1U << (lg(v - 1) + 1). + * Note that in the edge case where v is 0, it returns 0, which isn't a power of + * 2; you might append the expression v += (v == 0) to remedy this if it + * matters. It would be faster by 2 operations to use the formula and the + * log base 2 method that uses a lookup table, but in some situations, lookup + * tables are not suitable, so the above code may be best. (On a Athlon XP 2100+ + * I've found the above shift-left and then OR code is as fast as using a single + * BSR assembly language instruction, which scans in reverse to find the highest + * set bit.) It works by copying the highest set bit to all of the lower bits, + * and then adding one, which results in carries that set all of the lower bits + * to 0 and one bit beyond the highest set bit to 1. 
If the original number was + * a power of 2, then the decrement will reduce it to one less, so that we round + * up to the same original value. Devised by Sean Anderson, September 14, 2001. + * Pete Hart pointed me to a couple newsgroup posts by him and William Lewis in + * February of 1997, where they arrive at the same algorithm. + * http://graphics.stanford.edu/~seander/bithacks.html + * Sean Eron Anderson, seander@cs.stanford.edu + */ +uint32_t +__wt_nlpo2(uint32_t v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return (v); +} + +/* + * __wt_ispo2 -- + * Return if a number is a power-of-two. + */ +int +__wt_ispo2(uint32_t v) +{ + return ((v & (v - 1)) == 0 ? 1 : 0); +} diff --git a/src/support/prime.c b/src/support/prime.c new file mode 100644 index 00000000000..8abe43158b2 --- /dev/null +++ b/src/support/prime.c @@ -0,0 +1,75 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_prime -- + * Return a prime number relatively close to a value. + */ +uint32_t +__wt_prime(uint32_t n) +{ + /* + * Ref: the hash functions section of "Algorithms in C", by Sedgewick. + * + * The table is the same as the one in Berkeley DB -- check at each + * power-of-two up to 2^18, then mid-points between each power-of-two + * to a maximum of 2^30. 
+ */ + static const struct { + uint32_t value; + uint32_t prime; + } t[] = { + { 32, 37 }, /* 2^5 */ + { 64, 67 }, /* 2^6 */ + { 128, 131 }, /* 2^7 */ + { 256, 257 }, /* 2^8 */ + { 512, 521 }, /* 2^9 */ + { 1024, 1031 }, /* 2^10 */ + { 2048, 2053 }, /* 2^11 */ + { 4096, 4099 }, /* 2^12 */ + { 8192, 8191 }, /* 2^13 */ + { 16384, 16381 }, /* 2^14 */ + { 32768, 32771 }, /* 2^15 */ + { 65536, 65537 }, /* 2^16 */ + { 131072, 131071 }, /* 2^17 */ + { 262144, 262147 }, /* 2^18 */ + { 393216, 393209 }, /* 2^18 + 2^18/2 */ + { 524288, 524287 }, /* 2^19 */ + { 786432, 786431 }, /* 2^19 + 2^19/2 */ + { 1048576, 1048573 }, /* 2^20 */ + { 1572864, 1572869 }, /* 2^20 + 2^20/2 */ + { 2097152, 2097169 }, /* 2^21 */ + { 3145728, 3145721 }, /* 2^21 + 2^21/2 */ + { 4194304, 4194301 }, /* 2^22 */ + { 6291456, 6291449 }, /* 2^22 + 2^22/2 */ + { 8388608, 8388617 }, /* 2^23 */ + { 12582912, 12582917 }, /* 2^23 + 2^23/2 */ + { 16777216, 16777213 }, /* 2^24 */ + { 25165824, 25165813 }, /* 2^24 + 2^24/2 */ + { 33554432, 33554393 }, /* 2^25 */ + { 50331648, 50331653 }, /* 2^25 + 2^25/2 */ + { 67108864, 67108859 }, /* 2^26 */ + { 100663296, 100663291 }, /* 2^26 + 2^26/2 */ + { 134217728, 134217757 }, /* 2^27 */ + { 201326592, 201326611 }, /* 2^27 + 2^27/2 */ + { 268435456, 268435459 }, /* 2^28 */ + { 402653184, 402653189 }, /* 2^28 + 2^28/2 */ + { 536870912, 536870909 }, /* 2^29 */ + { 805306368, 805306357 }, /* 2^29 + 2^29/2 */ + { 1073741824, 1073741827 }, /* 2^30 */ + }; + u_int i; + + for (i = 0; i < WT_ELEMENTS(t); ++i) + if (t[i].value > n) + return (t[i].prime); + return (t[WT_ELEMENTS(t) - 1].prime); +} diff --git a/src/support/progress.c b/src/support/progress.c new file mode 100644 index 00000000000..480699cbdd1 --- /dev/null +++ b/src/support/progress.c @@ -0,0 +1,17 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. 
+ * + * $Id$ + */ + +#include "wt_internal.h" + +void +__wt_progress(const char *s, uint64_t v) +{ + (void)printf("\r\t%s: %llu", s, (unsigned long long)v); + (void)fflush(stdout); +} diff --git a/src/support/scratch.c b/src/support/scratch.c new file mode 100644 index 00000000000..9b20ea963f3 --- /dev/null +++ b/src/support/scratch.c @@ -0,0 +1,98 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * __wt_scr_alloc -- + * Scratch buffer allocation function. + */ +int +__wt_scr_alloc(WT_TOC *toc, uint32_t size, DBT **dbtp) +{ + DBT *scratch; + ENV *env; + uint32_t allocated; + u_int i; + int ret; + + env = toc->env; + + *dbtp = NULL; /* Don't risk the caller not catching the error. */ + + /* + * There's an array of scratch buffers in each WT_TOC that can be used + * by any function. We use DBTs for scratch buffers because we already + * have to have functions that do variable-length allocation on DBTs. + * Scratch buffers are allocated only by a single thread of control, so + * no locking is necessary. + */ + for (i = 0, + scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch) + if (!F_ISSET(scratch, WT_SCRATCH_INUSE)) { + *dbtp = scratch; + F_SET(scratch, WT_SCRATCH_INUSE); + + /* + * If the caller has a minimum size, grow the scratch + * buffer as necessary. + */ + if (size != 0 && scratch->mem_size < size) + WT_RET(__wt_realloc(env, + &scratch->mem_size, size, &scratch->data)); + return (0); + } + + /* Resize the array, we need more scratch buffers. 
*/ + allocated = toc->scratch_alloc * sizeof(DBT); + WT_ERR(__wt_realloc(env, &allocated, + (toc->scratch_alloc + 10) * sizeof(DBT), &toc->scratch)); + toc->scratch_alloc += 10; + return (__wt_scr_alloc(toc, size, dbtp)); + +err: __wt_api_env_errx(env, + "WT_TOC unable to allocate more scratch buffers"); + return (ret); +} + +/* + * __wt_scr_release -- + * Release a scratch buffer. + */ +void +__wt_scr_release(DBT **dbt) +{ + DBT *scratch; + + scratch = *dbt; + *dbt = NULL; + + F_CLR(scratch, WT_SCRATCH_INUSE); +} + +/* + * __wt_scr_free -- + * Free all memory associated with the scratch buffers. + */ +void +__wt_scr_free(WT_TOC *toc) +{ + DBT *scratch; + ENV *env; + u_int i; + + env = toc->env; + + for (i = 0, + scratch = toc->scratch; i < toc->scratch_alloc; ++i, ++scratch) + if (scratch->data != NULL) + __wt_free(env, scratch->data, scratch->mem_size); + + __wt_free(env, toc->scratch, toc->scratch_alloc * sizeof(DBT)); +} diff --git a/src/support/serial.c b/src/support/serial.c new file mode 100644 index 00000000000..9974f1f6b38 --- /dev/null +++ b/src/support/serial.c @@ -0,0 +1,123 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * Serialization: + * + * Serialization support allows scheduling operations that require serialized + * access to a piece of data, where the data (1) is accessed only by serialized + * code, or where the data, when accessed by non-serialized code, can either + * (2) be read atomically, or (3) it doesn't matter if it's read incorrectly. + * In other words, the readers are key, and they are known to be indifferent + * to the serialization code modifying the data. + * + * An example of #1 is updating the size of a database file. The size is only + * changed in serialized code, and never read by anything else. 
An example of + * #2 is updating a 32-bit value, because readers by definition get consistent + * views of 32-bit memory locations. An example of #3 is updating a 64-bit + * value (such as the bytes allocated in the cache). While there is a small + * possibility a reader will see a corrupted value, the value is only used for + * advisory actions, such as waking the cache thread to see if there's work to + * do. + */ + +/* + * __wt_toc_serialize_func -- + * Schedule a serialization request, and block or spin until it completes. + */ +int +__wt_toc_serialize_func( + WT_TOC *toc, wq_state_t op, int spin, int (*func)(WT_TOC *), void *args) +{ + int done; + + /* + * Threads serializing access to data using a function: + * set a function/argument pair in the WT_TOC handle, + * flush memory, + * update the WT_TOC workq state, and + * spins or blocks. + * + * The workQ thread notices the state change and calls the serialization + * function. + * + * The first memory flush ensures all supporting information is written + * before the wq_state field (which makes the entry visible to the workQ + * thread). No second memory flush is required, the wq_state field is + * declared volatile. + */ + toc->wq_args = args; + toc->wq_func = func; + toc->wq_sleeping = spin ? 0 : 1; + WT_MEMORY_FLUSH; + toc->wq_state = op; + + /* + * Callers can spin on the WT_TOC state (implying the call is quickly + * satisfied), or block until its mutex is unlocked by another thread + * when the operation has completed. + */ + if (spin) { + /* + * !!! + * Don't do arithmetic comparisons (even equality) on enum's, + * it makes some compilers/lint tools angry. + */ + for (done = 0; !done;) { + switch (toc->wq_state) { + case WT_WORKQ_NONE: + done = 1; + break; + case WT_WORKQ_FUNC: + case WT_WORKQ_READ: + case WT_WORKQ_READ_SCHED: + __wt_yield(); + break; + } + } + } else + __wt_lock(toc->env, toc->mtx); + + return (toc->wq_ret); +} + +/* + * __wt_toc_serialize_wrapup -- + * Server function cleanup. 
+ */ +void +__wt_toc_serialize_wrapup(WT_TOC *toc, WT_PAGE *page, int ret) +{ + ENV *env; + + env = toc->env; + + /* + * If passed a page and the return value is good, we modified the page; + * no need for a memory flush, we'll use the one below. + */ + if (page != NULL && ret == 0) + WT_PAGE_SET_MODIFIED(page); + + /* + * Set the return value and reset the state -- the workQ no longer needs + * to worry about us. + * + * The return value isn't volatile, so requires an explicit flush. + */ + toc->wq_ret = ret; + toc->wq_state = WT_WORKQ_NONE; + WT_MEMORY_FLUSH; + + /* If the calling thread is sleeping, wake it up. */ + if (toc->wq_sleeping) + __wt_unlock(env, toc->mtx); +} diff --git a/src/support/simple_setup.c b/src/support/simple_setup.c new file mode 100644 index 00000000000..a4464fead69 --- /dev/null +++ b/src/support/simple_setup.c @@ -0,0 +1,94 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include <stdlib.h> + +#include "wiredtiger.h" + +extern const char *progname; + +static ENV *__env; + +/* + * wiredtiger_simple_setup -- + * Standard setup for simple applications. 
+ */ +int +wiredtiger_simple_setup( + const char *progname, DB **dbp, u_int32_t cache_size, u_int32_t flags) +{ + DB *db; + ENV *env; + int ret; + + db = *dbp = NULL; + + if ((ret = wiredtiger_env_init(&env, flags)) != 0) { + fprintf(stderr, + "%s: wiredtiger_env_init: %s\n", + progname, wiredtiger_strerror(ret)); + return (ret); + } + __env = env; + + if (cache_size != 0 && + (ret = env->cache_size_set(env, cache_size)) != 0) { + env->err(env, ret, "Env.cache_size_set"); + goto err; + } + + if ((ret = env->open(env, NULL, 0, 0)) != 0) { + env->err(env, ret, "%s: Env.open", progname); + goto err; + } + if ((ret = env->db(env, 0, &db)) != 0) { + env->err(env, ret, "%s: Env.db", progname); + goto err; + } + if ((ret = db->errpfx_set(db, progname)) != 0) { + db->err(db, ret, "%s: Db.errpfx_set", progname); + goto err; + } + + *dbp = db; + return (EXIT_SUCCESS); + +err: wiredtiger_simple_teardown(progname, db); + return (ret); +} + +/* + * wiredtiger_simple_teardown -- + * Standard teardown for simple applications. + */ +int +wiredtiger_simple_teardown(const char *progname, DB *db) +{ + int ret, tret; + + ret = 0; + if (db != NULL && (tret = db->close(db, 0)) != 0) { + fprintf(stderr, + "%s: Db.close: %s\n", progname, wiredtiger_strerror(ret)); + if (ret == 0) + ret = tret; + } + + if (__env != NULL) { + if ((tret = __env->close(__env, 0)) != 0) { + fprintf(stderr, "%s: Env.close: %s\n", + progname, wiredtiger_strerror(ret)); + if (ret == 0) + ret = tret; + } + __env = NULL; + } + + return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/src/support/stat.c b/src/support/stat.c new file mode 100644 index 00000000000..bf08a95b12f --- /dev/null +++ b/src/support/stat.c @@ -0,0 +1,370 @@ +/* DO NOT EDIT: automatically built by dist/stat.py. 
*/ + +#include "wt_internal.h" + +int +__wt_stat_alloc_cache_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 10, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_CACHE_BYTES_INUSE].desc = "bytes in the cache"; + stats[WT_STAT_CACHE_BYTES_MAX].desc = + "maximum bytes configured for the cache"; + stats[WT_STAT_CACHE_EVICT_HAZARD].desc = + "pages selected for eviction not evicted because of a hazard reference"; + stats[WT_STAT_CACHE_EVICT_MODIFIED].desc = + "modified pages selected for eviction"; + stats[WT_STAT_CACHE_EVICT_UNMODIFIED].desc = + "unmodified pages selected for eviction"; + stats[WT_STAT_CACHE_PAGES_INUSE].desc = "pages in the cache"; + stats[WT_STAT_OVERFLOW_READ].desc = + "overflow pages read from the file"; + stats[WT_STAT_PAGE_READ].desc = "pages read from a file"; + stats[WT_STAT_PAGE_WRITE].desc = "pages written to a file"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_cache_stats(WT_STATS *stats) +{ + stats[WT_STAT_CACHE_EVICT_HAZARD].v = 0; + stats[WT_STAT_CACHE_EVICT_MODIFIED].v = 0; + stats[WT_STAT_CACHE_EVICT_UNMODIFIED].v = 0; + stats[WT_STAT_OVERFLOW_READ].v = 0; + stats[WT_STAT_PAGE_READ].v = 0; + stats[WT_STAT_PAGE_WRITE].v = 0; +} + +int +__wt_stat_alloc_database_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 27, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_BASE_RECNO].desc = "base record number"; + stats[WT_STAT_DUP_TREE].desc = "duplicate data off-page trees"; + stats[WT_STAT_FIXED_LEN].desc = "database fixed-record size"; + stats[WT_STAT_INTLMAX].desc = "maximum internal page size"; + stats[WT_STAT_INTLMIN].desc = "minimum internal page size"; + stats[WT_STAT_ITEM_COL_DELETED].desc = + "column store deleted data items"; + stats[WT_STAT_ITEM_DATA_OVFL].desc = "total overflow data items"; + stats[WT_STAT_ITEM_DUP_DATA].desc = "total duplicate data items"; + stats[WT_STAT_ITEM_KEY_OVFL].desc = "total overflow keys"; + 
stats[WT_STAT_ITEM_TOTAL_DATA].desc = "total data items"; + stats[WT_STAT_ITEM_TOTAL_KEY].desc = "total keys"; + stats[WT_STAT_LEAFMAX].desc = "maximum leaf page size"; + stats[WT_STAT_LEAFMIN].desc = "minimum leaf page size"; + stats[WT_STAT_MAGIC].desc = "magic number"; + stats[WT_STAT_MAJOR].desc = "major version number"; + stats[WT_STAT_MINOR].desc = "minor version number"; + stats[WT_STAT_PAGE_COL_FIX].desc = + "column-store fixed-size leaf pages"; + stats[WT_STAT_PAGE_COL_INTERNAL].desc = "column-store internal pages"; + stats[WT_STAT_PAGE_COL_RLE].desc = + "column-store repeat-count compressed fixed-size leaf pages"; + stats[WT_STAT_PAGE_COL_VARIABLE].desc = + "column-store variable-size leaf pages"; + stats[WT_STAT_PAGE_DUP_INTERNAL].desc = "duplicate internal pages"; + stats[WT_STAT_PAGE_DUP_LEAF].desc = "duplicate leaf pages"; + stats[WT_STAT_PAGE_OVERFLOW].desc = "overflow pages"; + stats[WT_STAT_PAGE_ROW_INTERNAL].desc = "row-store internal pages"; + stats[WT_STAT_PAGE_ROW_LEAF].desc = "row-store leaf pages"; + stats[WT_STAT_TREE_LEVEL].desc = "number of levels in the btree"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_database_stats(WT_STATS *stats) +{ + stats[WT_STAT_BASE_RECNO].v = 0; + stats[WT_STAT_DUP_TREE].v = 0; + stats[WT_STAT_FIXED_LEN].v = 0; + stats[WT_STAT_INTLMAX].v = 0; + stats[WT_STAT_INTLMIN].v = 0; + stats[WT_STAT_ITEM_COL_DELETED].v = 0; + stats[WT_STAT_ITEM_DATA_OVFL].v = 0; + stats[WT_STAT_ITEM_DUP_DATA].v = 0; + stats[WT_STAT_ITEM_KEY_OVFL].v = 0; + stats[WT_STAT_ITEM_TOTAL_DATA].v = 0; + stats[WT_STAT_ITEM_TOTAL_KEY].v = 0; + stats[WT_STAT_LEAFMAX].v = 0; + stats[WT_STAT_LEAFMIN].v = 0; + stats[WT_STAT_MAGIC].v = 0; + stats[WT_STAT_MAJOR].v = 0; + stats[WT_STAT_MINOR].v = 0; + stats[WT_STAT_PAGE_COL_FIX].v = 0; + stats[WT_STAT_PAGE_COL_INTERNAL].v = 0; + stats[WT_STAT_PAGE_COL_RLE].v = 0; + stats[WT_STAT_PAGE_COL_VARIABLE].v = 0; + stats[WT_STAT_PAGE_DUP_INTERNAL].v = 0; + stats[WT_STAT_PAGE_DUP_LEAF].v = 0; + 
stats[WT_STAT_PAGE_OVERFLOW].v = 0; + stats[WT_STAT_PAGE_ROW_INTERNAL].v = 0; + stats[WT_STAT_PAGE_ROW_LEAF].v = 0; + stats[WT_STAT_TREE_LEVEL].v = 0; +} + +int +__wt_stat_alloc_db_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 11, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_DB_ALLOC].desc = "database allocations"; + stats[WT_STAT_DB_ALLOC_FILE].desc = "database extensions"; + stats[WT_STAT_DB_FREE].desc = "database frees"; + stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].desc = + "duplicate key/data pairs inserted"; + stats[WT_STAT_HUFFMAN_DATA].desc = "huffman data compression in bytes"; + stats[WT_STAT_HUFFMAN_KEY].desc = "huffman key compression in bytes"; + stats[WT_STAT_ITEMS_INSERTED].desc = "key/data pairs inserted"; + stats[WT_STAT_OVERFLOW_DATA].desc = "overflow data items inserted"; + stats[WT_STAT_OVERFLOW_KEY].desc = "overflow key items inserted"; + stats[WT_STAT_REPEAT_COUNT].desc = "repeat value compression count"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_db_stats(WT_STATS *stats) +{ + stats[WT_STAT_DB_ALLOC].v = 0; + stats[WT_STAT_DB_ALLOC_FILE].v = 0; + stats[WT_STAT_DB_FREE].v = 0; + stats[WT_STAT_DUPLICATE_ITEMS_INSERTED].v = 0; + stats[WT_STAT_HUFFMAN_DATA].v = 0; + stats[WT_STAT_HUFFMAN_KEY].v = 0; + stats[WT_STAT_ITEMS_INSERTED].v = 0; + stats[WT_STAT_OVERFLOW_DATA].v = 0; + stats[WT_STAT_OVERFLOW_KEY].v = 0; + stats[WT_STAT_REPEAT_COUNT].v = 0; +} + +int +__wt_stat_alloc_env_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 9, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_DATABASE_OPEN].desc = "database open"; + stats[WT_STAT_MEMALLOC].desc = "memory allocations"; + stats[WT_STAT_MEMFREE].desc = "memory frees"; + stats[WT_STAT_MTX_LOCK].desc = "mutex lock calls"; + stats[WT_STAT_TOTAL_READ_IO].desc = "total read I/Os"; + stats[WT_STAT_TOTAL_WRITE_IO].desc = "total write I/Os"; + stats[WT_STAT_WORKQ_PASSES].desc = "workQ queue passes"; + 
stats[WT_STAT_WORKQ_YIELD].desc = "workQ yields"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_env_stats(WT_STATS *stats) +{ + stats[WT_STAT_DATABASE_OPEN].v = 0; + stats[WT_STAT_MEMALLOC].v = 0; + stats[WT_STAT_MEMFREE].v = 0; + stats[WT_STAT_MTX_LOCK].v = 0; + stats[WT_STAT_TOTAL_READ_IO].v = 0; + stats[WT_STAT_TOTAL_WRITE_IO].v = 0; + stats[WT_STAT_WORKQ_PASSES].v = 0; + stats[WT_STAT_WORKQ_YIELD].v = 0; +} + +int +__wt_stat_alloc_fh_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 4, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_FSYNC].desc = "fsyncs"; + stats[WT_STAT_READ_IO].desc = "read I/Os"; + stats[WT_STAT_WRITE_IO].desc = "write I/Os"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_fh_stats(WT_STATS *stats) +{ + stats[WT_STAT_FSYNC].v = 0; + stats[WT_STAT_READ_IO].v = 0; + stats[WT_STAT_WRITE_IO].v = 0; +} + +int +__wt_stat_alloc_method_stats(ENV *env, WT_STATS **statsp) +{ + WT_STATS *stats; + + WT_RET(__wt_calloc(env, 69, sizeof(WT_STATS), &stats)); + + stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].desc = + "db.btree_compare_dup_get"; + stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].desc = + "db.btree_compare_dup_set"; + stats[WT_STAT_DB_BTREE_COMPARE_GET].desc = "db.btree_compare_get"; + stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].desc = + "db.btree_compare_int_get"; + stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].desc = + "db.btree_compare_int_set"; + stats[WT_STAT_DB_BTREE_COMPARE_SET].desc = "db.btree_compare_set"; + stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].desc = + "db.btree_dup_offpage_get"; + stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].desc = + "db.btree_dup_offpage_set"; + stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].desc = "db.btree_itemsize_get"; + stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].desc = "db.btree_itemsize_set"; + stats[WT_STAT_DB_BTREE_PAGESIZE_GET].desc = "db.btree_pagesize_get"; + stats[WT_STAT_DB_BTREE_PAGESIZE_SET].desc = "db.btree_pagesize_set"; + stats[WT_STAT_DB_BULK_LOAD].desc = "db.bulk_load"; + 
stats[WT_STAT_DB_CLOSE].desc = "db.close"; + stats[WT_STAT_DB_COLUMN_SET].desc = "db.column_set"; + stats[WT_STAT_DB_COL_DEL].desc = "db.col_del"; + stats[WT_STAT_DB_COL_DEL_RESTART].desc = "db.col_del method restarts"; + stats[WT_STAT_DB_COL_GET].desc = "db.col_get"; + stats[WT_STAT_DB_COL_PUT].desc = "db.col_put"; + stats[WT_STAT_DB_COL_PUT_RESTART].desc = "db.col_put method restarts"; + stats[WT_STAT_DB_DUMP].desc = "db.dump"; + stats[WT_STAT_DB_ERRCALL_GET].desc = "db.errcall_get"; + stats[WT_STAT_DB_ERRCALL_SET].desc = "db.errcall_set"; + stats[WT_STAT_DB_ERRFILE_GET].desc = "db.errfile_get"; + stats[WT_STAT_DB_ERRFILE_SET].desc = "db.errfile_set"; + stats[WT_STAT_DB_ERRPFX_GET].desc = "db.errpfx_get"; + stats[WT_STAT_DB_ERRPFX_SET].desc = "db.errpfx_set"; + stats[WT_STAT_DB_HUFFMAN_SET].desc = "db.huffman_set"; + stats[WT_STAT_DB_OPEN].desc = "db.open"; + stats[WT_STAT_DB_ROW_DEL].desc = "db.row_del"; + stats[WT_STAT_DB_ROW_DEL_RESTART].desc = "db.row_del method restarts"; + stats[WT_STAT_DB_ROW_GET].desc = "db.row_get"; + stats[WT_STAT_DB_ROW_PUT].desc = "db.row_put"; + stats[WT_STAT_DB_ROW_PUT_RESTART].desc = "db.row_put method restarts"; + stats[WT_STAT_DB_STAT_CLEAR].desc = "db.stat_clear"; + stats[WT_STAT_DB_STAT_PRINT].desc = "db.stat_print"; + stats[WT_STAT_DB_SYNC].desc = "db.sync"; + stats[WT_STAT_DB_VERIFY].desc = "db.verify"; + stats[WT_STAT_ENV_CACHE_SIZE_GET].desc = "env.cache_size_get"; + stats[WT_STAT_ENV_CACHE_SIZE_SET].desc = "env.cache_size_set"; + stats[WT_STAT_ENV_CLOSE].desc = "env.close"; + stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].desc = + "env.data_update_initial_get"; + stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].desc = + "env.data_update_initial_set"; + stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].desc = + "env.data_update_max_get"; + stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].desc = + "env.data_update_max_set"; + stats[WT_STAT_ENV_DB].desc = "env.db"; + stats[WT_STAT_ENV_ERRCALL_GET].desc = "env.errcall_get"; + 
stats[WT_STAT_ENV_ERRCALL_SET].desc = "env.errcall_set"; + stats[WT_STAT_ENV_ERRFILE_GET].desc = "env.errfile_get"; + stats[WT_STAT_ENV_ERRFILE_SET].desc = "env.errfile_set"; + stats[WT_STAT_ENV_ERRPFX_GET].desc = "env.errpfx_get"; + stats[WT_STAT_ENV_ERRPFX_SET].desc = "env.errpfx_set"; + stats[WT_STAT_ENV_HAZARD_SIZE_GET].desc = "env.hazard_size_get"; + stats[WT_STAT_ENV_HAZARD_SIZE_SET].desc = "env.hazard_size_set"; + stats[WT_STAT_ENV_MSGCALL_GET].desc = "env.msgcall_get"; + stats[WT_STAT_ENV_MSGCALL_SET].desc = "env.msgcall_set"; + stats[WT_STAT_ENV_MSGFILE_GET].desc = "env.msgfile_get"; + stats[WT_STAT_ENV_MSGFILE_SET].desc = "env.msgfile_set"; + stats[WT_STAT_ENV_OPEN].desc = "env.open"; + stats[WT_STAT_ENV_STAT_CLEAR].desc = "env.stat_clear"; + stats[WT_STAT_ENV_STAT_PRINT].desc = "env.stat_print"; + stats[WT_STAT_ENV_SYNC].desc = "env.sync"; + stats[WT_STAT_ENV_TOC].desc = "env.toc"; + stats[WT_STAT_ENV_TOC_SIZE_GET].desc = "env.toc_size_get"; + stats[WT_STAT_ENV_TOC_SIZE_SET].desc = "env.toc_size_set"; + stats[WT_STAT_ENV_VERBOSE_GET].desc = "env.verbose_get"; + stats[WT_STAT_ENV_VERBOSE_SET].desc = "env.verbose_set"; + stats[WT_STAT_WT_TOC_CLOSE].desc = "wt_toc.close"; + + *statsp = stats; + return (0); +} + +void +__wt_stat_clear_method_stats(WT_STATS *stats) +{ + stats[WT_STAT_DB_BTREE_COMPARE_DUP_GET].v = 0; + stats[WT_STAT_DB_BTREE_COMPARE_DUP_SET].v = 0; + stats[WT_STAT_DB_BTREE_COMPARE_GET].v = 0; + stats[WT_STAT_DB_BTREE_COMPARE_INT_GET].v = 0; + stats[WT_STAT_DB_BTREE_COMPARE_INT_SET].v = 0; + stats[WT_STAT_DB_BTREE_COMPARE_SET].v = 0; + stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_GET].v = 0; + stats[WT_STAT_DB_BTREE_DUP_OFFPAGE_SET].v = 0; + stats[WT_STAT_DB_BTREE_ITEMSIZE_GET].v = 0; + stats[WT_STAT_DB_BTREE_ITEMSIZE_SET].v = 0; + stats[WT_STAT_DB_BTREE_PAGESIZE_GET].v = 0; + stats[WT_STAT_DB_BTREE_PAGESIZE_SET].v = 0; + stats[WT_STAT_DB_BULK_LOAD].v = 0; + stats[WT_STAT_DB_CLOSE].v = 0; + stats[WT_STAT_DB_COLUMN_SET].v = 0; + 
stats[WT_STAT_DB_COL_DEL].v = 0; + stats[WT_STAT_DB_COL_DEL_RESTART].v = 0; + stats[WT_STAT_DB_COL_GET].v = 0; + stats[WT_STAT_DB_COL_PUT].v = 0; + stats[WT_STAT_DB_COL_PUT_RESTART].v = 0; + stats[WT_STAT_DB_DUMP].v = 0; + stats[WT_STAT_DB_ERRCALL_GET].v = 0; + stats[WT_STAT_DB_ERRCALL_SET].v = 0; + stats[WT_STAT_DB_ERRFILE_GET].v = 0; + stats[WT_STAT_DB_ERRFILE_SET].v = 0; + stats[WT_STAT_DB_ERRPFX_GET].v = 0; + stats[WT_STAT_DB_ERRPFX_SET].v = 0; + stats[WT_STAT_DB_HUFFMAN_SET].v = 0; + stats[WT_STAT_DB_OPEN].v = 0; + stats[WT_STAT_DB_ROW_DEL].v = 0; + stats[WT_STAT_DB_ROW_DEL_RESTART].v = 0; + stats[WT_STAT_DB_ROW_GET].v = 0; + stats[WT_STAT_DB_ROW_PUT].v = 0; + stats[WT_STAT_DB_ROW_PUT_RESTART].v = 0; + stats[WT_STAT_DB_STAT_CLEAR].v = 0; + stats[WT_STAT_DB_STAT_PRINT].v = 0; + stats[WT_STAT_DB_SYNC].v = 0; + stats[WT_STAT_DB_VERIFY].v = 0; + stats[WT_STAT_ENV_CACHE_SIZE_GET].v = 0; + stats[WT_STAT_ENV_CACHE_SIZE_SET].v = 0; + stats[WT_STAT_ENV_CLOSE].v = 0; + stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_GET].v = 0; + stats[WT_STAT_ENV_DATA_UPDATE_INITIAL_SET].v = 0; + stats[WT_STAT_ENV_DATA_UPDATE_MAX_GET].v = 0; + stats[WT_STAT_ENV_DATA_UPDATE_MAX_SET].v = 0; + stats[WT_STAT_ENV_DB].v = 0; + stats[WT_STAT_ENV_ERRCALL_GET].v = 0; + stats[WT_STAT_ENV_ERRCALL_SET].v = 0; + stats[WT_STAT_ENV_ERRFILE_GET].v = 0; + stats[WT_STAT_ENV_ERRFILE_SET].v = 0; + stats[WT_STAT_ENV_ERRPFX_GET].v = 0; + stats[WT_STAT_ENV_ERRPFX_SET].v = 0; + stats[WT_STAT_ENV_HAZARD_SIZE_GET].v = 0; + stats[WT_STAT_ENV_HAZARD_SIZE_SET].v = 0; + stats[WT_STAT_ENV_MSGCALL_GET].v = 0; + stats[WT_STAT_ENV_MSGCALL_SET].v = 0; + stats[WT_STAT_ENV_MSGFILE_GET].v = 0; + stats[WT_STAT_ENV_MSGFILE_SET].v = 0; + stats[WT_STAT_ENV_OPEN].v = 0; + stats[WT_STAT_ENV_STAT_CLEAR].v = 0; + stats[WT_STAT_ENV_STAT_PRINT].v = 0; + stats[WT_STAT_ENV_SYNC].v = 0; + stats[WT_STAT_ENV_TOC].v = 0; + stats[WT_STAT_ENV_TOC_SIZE_GET].v = 0; + stats[WT_STAT_ENV_TOC_SIZE_SET].v = 0; + stats[WT_STAT_ENV_VERBOSE_GET].v = 0; + 
stats[WT_STAT_ENV_VERBOSE_SET].v = 0; + stats[WT_STAT_WT_TOC_CLOSE].v = 0; +} diff --git a/src/support/strerror.c b/src/support/strerror.c new file mode 100644 index 00000000000..17a4653438a --- /dev/null +++ b/src/support/strerror.c @@ -0,0 +1,41 @@ +/* DO NOT EDIT: automatically built by dist/api_err.py. */ + +#include "wt_internal.h" + +/* + * wiredtiger_strerror -- + * Return a string for any error value. + */ +char * +wiredtiger_strerror(int error) +{ + static char errbuf[64]; + char *p; + + if (error == 0) + return ("Successful return: 0"); + + switch (error) { + case WT_ERROR: + return ("WT_ERROR: non-specific WiredTiger error"); + case WT_NOTFOUND: + return ("WT_NOTFOUND: database item not found"); + case WT_READONLY: + return ("WT_READONLY: modification attempted of a read-only database"); + case WT_RESTART: + return ("WT_RESTART: restart the operation (internal)"); + case WT_TOOSMALL: + return ("WT_TOOSMALL: buffer too small"); + default: + if (error > 0 && (p = strerror(error)) != NULL) + return (p); + break; + } + + /* + * !!! + * Not thread-safe, but this is never supposed to happen. + */ + (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error); + return (errbuf); +} diff --git a/src/support/version.c b/src/support/version.c new file mode 100644 index 00000000000..dbd60162c16 --- /dev/null +++ b/src/support/version.c @@ -0,0 +1,26 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2008-2011 WiredTiger, Inc. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" + +/* + * wiredtiger_version -- + * Return library version information. 
+ */ +char * +wiredtiger_version(int *majorp, int *minorp, int *patchp) +{ + if (majorp != NULL) + *majorp = WIREDTIGER_VERSION_MAJOR; + if (minorp != NULL) + *minorp = WIREDTIGER_VERSION_MINOR; + if (patchp != NULL) + *patchp = WIREDTIGER_VERSION_PATCH; + return ((char *)WIREDTIGER_VERSION_STRING); +} diff --git a/src/utilities/db_dump/util_dump.c b/src/utilities/db_dump/util_dump.c new file mode 100644 index 00000000000..68cc6d69061 --- /dev/null +++ b/src/utilities/db_dump/util_dump.c @@ -0,0 +1,83 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2009 WiredTiger Software. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" +#include "util.h" + +const char *progname; + +int usage(void); + +int +main(int argc, char *argv[]) +{ + extern char *optarg; + extern int optind; + DB *db; + u_int32_t flags; + int ch, ret, tret; + + WT_UTILITY_INTRO(progname, argv); + + flags = 0; + while ((ch = getopt(argc, argv, "df:p")) != EOF) + switch (ch) { + case 'd': + flags = WT_DEBUG; + break; + case 'f': /* output file */ + if (freopen(optarg, "w", stdout) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", + progname, optarg, strerror(errno)); + return (EXIT_FAILURE); + } + break; + case 'p': + flags = WT_PRINTABLES; + break; + case 'V': /* version */ + printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); + return (EXIT_SUCCESS); + case '?': + default: + return (usage()); + } + argc -= optind; + argv += optind; + + /* The remaining argument is the database name. */ + if (argc != 1) + return (usage()); + + if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) { + if ((ret = db->open(db, *argv, 0, 0)) != 0) { + db->err(db, ret, "Db.open: %s", *argv); + goto err; + } + if ((ret = db->dump(db, stdout, NULL, flags)) != 0) { + db->err(db, ret, "Db.dump"); + goto err; + } + } + + if (0) { +err: ret = 1; + } + if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0) + ret = tret; + return (ret == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); +} + +int +usage() +{ + (void)fprintf(stderr, + "usage: %s [-dpV] [-f output-file] database\n", progname); + return (EXIT_FAILURE); +} diff --git a/src/utilities/db_load/util_load.c b/src/utilities/db_load/util_load.c new file mode 100644 index 00000000000..6ededed7c28 --- /dev/null +++ b/src/utilities/db_load/util_load.c @@ -0,0 +1,292 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2009 WiredTiger Software. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" +#include "util.h" + +const char *progname; + +int bulk_callback(DB *, DBT **, DBT **); +int bulk_read(DBT *dbt, int); +int config_read(char **); +int config_read_single(char *); +int config_set(DB *); +int usage(void); + +struct { + int pagesize_set; + u_long allocsize, intlmin, intlmax, leafmin, leafmax; +} config; + +int +main(int argc, char *argv[]) +{ + extern char *optarg; + extern int optind; + DB *db; + int ch, ret, text_input, tret, verbose; + char **config_list, **config_next; + + WT_UTILITY_INTRO(progname, argv); + + /* + * We can't handle configuration-line information until we've opened + * the DB handle, so we need a place to store it for now. 
+ */ + if ((config_next = + config_list = calloc(argc + 1, sizeof(char *))) == NULL) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + return (EXIT_FAILURE); + } + + text_input = verbose = 0; + while ((ch = getopt(argc, argv, "c:f:TVv")) != EOF) + switch (ch) { + case 'c': /* command-line option */ + *config_next++ = optarg; + break; + case 'f': /* input file */ + if (freopen(optarg, "r", stdin) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", + progname, optarg, strerror(errno)); + return (EXIT_FAILURE); + } + break; + case 'T': + text_input = 1; + break; + case 'V': /* version */ + printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); + return (EXIT_SUCCESS); + case 'v': + verbose = 1; + break; + case '?': + default: + return (usage()); + } + argc -= optind; + argv += optind; + + /* The remaining argument is the database name. */ + if (argc != 1) + return (usage()); + + /* + * Read through the command-line configuration options and convert + * to the config structure. + */ + if (config_read(config_list) != 0) + goto err; + + /* + * Right now, we only support text input -- require the T option to + * match Berkeley DB's API. + */ + if (text_input == 0) { + fprintf(stderr, + "%s: the -T option is currently required\n", progname); + return (EXIT_FAILURE); + } + + if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) { + if (config_set(db) != 0) + goto err; + + (void)remove(*argv); + + if ((ret = db->open(db, *argv, 0600, WT_CREATE)) != 0) { + db->err(db, ret, "Db.open: %s", *argv); + goto err; + } + + if ((ret = db->bulk_load(db, WT_DUPLICATES, + verbose ? __wt_progress : NULL, bulk_callback)) != 0) { + db->err(db, ret, "Db.bulk_load"); + goto err; + } + if (verbose) + printf("\n"); + } + + if (0) { +err: ret = 1; + } + if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0) + ret = tret; + return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} + +/* + * config_read -- + * Convert command-line options into the config structure. 
+ */ +int +config_read(char **list) +{ + int ret; + + for (; *list != NULL; ++list) + if ((ret = config_read_single(*list)) != 0) + return (ret); + return (0); +} + +/* + * config_read_single -- + * Process a single command-line configuration option, converting it into + * the config structure. + */ +int +config_read_single(char *opt) +{ + u_long v; + char *p, *ep; + + /* Get pointers to the two parts of an X=Y format string. */ + if ((p = strchr(opt, '=')) == NULL || p[1] == '\0') + goto format; + *p++ = '\0'; + v = strtoul(p, &ep, 10); + if (v == ULONG_MAX && errno == ERANGE) { +format: fprintf(stderr, + "%s: -c option %s is not correctly formatted\n", + progname, opt); + return (1); + } + if (strcmp(opt, "allocsize") == 0) { + config.allocsize = v; + config.pagesize_set = 1; + return (0); + } + if (strcmp(opt, "intlmin") == 0) { + config.intlmin = v; + config.pagesize_set = 1; + return (0); + } + if (strcmp(opt, "intlmax") == 0) { + config.intlmax = v; + config.pagesize_set = 1; + return (0); + } + if (strcmp(opt, "leafmin") == 0) { + config.leafmin = v; + config.pagesize_set = 1; + return (0); + } + if (strcmp(opt, "leafmax") == 0) { + config.leafmax = v; + config.pagesize_set = 1; + return (0); + } + + fprintf(stderr, + "%s: -c option %s has an unknown keyword\n", progname, opt); + return (1); +} + +/* + * config_set -- + * Set the command-line configuration options on the database handle. 
+ */ +int +config_set(DB *db) +{ + u_int32_t allocsize, intlmin, intlmax, leafmin, leafmax; + int ret; + + if (config.pagesize_set) { + if ((ret = db->btree_pagesize_get(db, + &allocsize, &intlmin, &intlmax, &leafmin, &leafmax)) != 0) { + db->err(db, ret, "Db.btree_pagesize_get"); + return (1); + } + if (config.allocsize != 0) + allocsize = config.allocsize; + if (config.intlmin != 0) + intlmin = config.intlmin; + if (config.intlmax != 0) + intlmax = config.intlmax; + if (config.leafmin != 0) + leafmin = config.leafmin; + if (config.leafmax != 0) + leafmax = config.leafmax; + if ((ret = db->btree_pagesize_set(db, + allocsize, intlmin, intlmax, leafmin, leafmax)) != 0) { + db->err(db, ret, "Db.btree_pagesize_set"); + return (1); + } + } + + return (0); +} + +/* + * bulk_read -- + * Read a line from stdin into a DBT. + */ +int +bulk_read(DBT *dbt, int iskey) +{ + static u_int64_t line = 0; + size_t len; + int ch; + + ++line; + for (len = 0;; ++len) { + if ((ch = getchar()) == EOF) { + if (iskey && len == 0) + return (1); + fprintf(stderr, "%s: corrupted input at line %llu\n", + progname, line); + return (WT_ERROR); + } + if (ch == '\n') + break; + if (len >= dbt->mem_size) { + if ((dbt->data = realloc(dbt->data, len + 128)) == NULL) + return (errno); + dbt->mem_size = len + 128; + } + ((u_int8_t *)(dbt->data))[len] = ch; + } + dbt->size = len; + return (0); +} + +/* + * bulk_callback -- + * Bulk-load callback function. 
+ */ +int +bulk_callback(DB *db, DBT **keyp, DBT **datap) +{ + static DBT key, data; + int ret; + + WT_CC_QUIET(db, NULL); + + if ((ret = bulk_read(&key, 1)) != 0) + return (ret); + if ((ret = bulk_read(&data, 0)) != 0) + return (ret); + + *keyp = &key; + *datap = &data; + return (0); +} + +int +usage() +{ + (void)fprintf(stderr, + "usage: %s [-TVv] [-c configuration] [-f input-file] database\n", + progname); + return (EXIT_FAILURE); +} diff --git a/src/utilities/db_stat/util_stat.c b/src/utilities/db_stat/util_stat.c new file mode 100644 index 00000000000..afb2f94cba8 --- /dev/null +++ b/src/utilities/db_stat/util_stat.c @@ -0,0 +1,67 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2009 WiredTiger Software. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" +#include "util.h" + +const char *progname; + +int usage(void); + +int +main(int argc, char *argv[]) +{ + extern char *optarg; + extern int optind; + DB *db; + int ch, ret, tret; + + WT_UTILITY_INTRO(progname, argv); + + while ((ch = getopt(argc, argv, "V")) != EOF) + switch (ch) { + case 'V': /* version */ + printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); + return (EXIT_SUCCESS); + case '?': + default: + return (usage()); + } + argc -= optind; + argv += optind; + + /* The remaining argument is the database name. */ + if (argc != 1) + return (usage()); + + if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) { + if ((ret = db->open(db, *argv, 0, 0)) != 0) { + db->err(db, ret, "Db.open: %s", *argv); + goto err; + } + if ((ret = db->stat_print(db, stdout, 0)) != 0) { + db->err(db, ret, "Db.stat: %s", *argv); + goto err; + } + } + + if (0) { +err: ret = 1; + } + if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0) + ret = tret; + return (ret == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); +} + +int +usage() +{ + (void)fprintf(stderr, "usage: %s [-V] database\n", progname); + return (EXIT_FAILURE); +} diff --git a/src/utilities/db_verify/util_verify.c b/src/utilities/db_verify/util_verify.c new file mode 100644 index 00000000000..5c5bd02407f --- /dev/null +++ b/src/utilities/db_verify/util_verify.c @@ -0,0 +1,74 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2009 WiredTiger Software. + * All rights reserved. + * + * $Id$ + */ + +#include "wt_internal.h" +#include "util.h" + +const char *progname; + +int usage(void); + +int +main(int argc, char *argv[]) +{ + extern char *optarg; + extern int optind; + DB *db; + int ch, ret, tret, verbose; + + WT_UTILITY_INTRO(progname, argv); + + verbose = 0; + while ((ch = getopt(argc, argv, "Vv")) != EOF) + switch (ch) { + case 'v': /* verbose */ + verbose = 1; + break; + case 'V': /* version */ + printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); + return (EXIT_SUCCESS); + case '?': + default: + return (usage()); + } + argc -= optind; + argv += optind; + + /* The remaining argument is the database name. */ + if (argc != 1) + return (usage()); + + if ((ret = wiredtiger_simple_setup(progname, &db, 0, 0)) == 0) { + if ((ret = db->open(db, *argv, 0, 0)) != 0) { + db->err(db, ret, "Db.open: %s", *argv); + goto err; + } + if ((ret = + db->verify(db, verbose ? __wt_progress : NULL, 0)) != 0) { + db->err(db, ret, "Db.verify: %s", *argv); + goto err; + } + if (verbose) + printf("\n"); + } + + if (0) { +err: ret = 1; + } + if ((tret = wiredtiger_simple_teardown(progname, db)) != 0 && ret == 0) + ret = tret; + return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} + +int +usage() +{ + (void)fprintf(stderr, "usage: %s [-Vv] database\n", progname); + return (EXIT_FAILURE); +} |