summaryrefslogtreecommitdiff
path: root/src/btree/bt_open.c
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2011-02-01 09:24:17 +1100
commit7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63 (patch)
tree0e0fd0f6b190dbcd283ca3c4040b5dcd89a94014 /src/btree/bt_open.c
parent6f87637341366fb90f890a5ef860e90c57b36d1f (diff)
downloadmongo-7ebbbf1d52c1ed989cfe5f4fde3b98e983db2e63.tar.gz
Restructure the tree to ease merge.
refs #27 --HG-- branch : keith rename : lint/fl => dist/lint/fl rename : lint/lint.current => dist/lint/lint.current rename : inc_posix/bitstring.h => include/bitstring.h rename : inc_posix/btree.h => include/btree.h rename : inc_posix/cache.h => include/cache.h rename : inc_posix/debug.h => include/debug.h rename : inc_posix/extern.h => include/extern.h rename : inc_posix/fh.h => include/fh.h rename : inc_posix/mem.h => include/mem.h rename : inc_posix/misc.h => include/misc.h rename : inc_posix/mutex.h => include/mutex.h rename : inc_posix/queue.h => include/queue.h rename : inc_posix/serial.h => include/serial.h rename : inc_posix/stat.h => include/stat.h rename : inc_posix/util.h => include/util.h rename : inc_posix/walk.h => include/walk.h rename : inc_posix/wiredtiger.in => include/wiredtiger.in rename : inc_posix/wt_internal.in => include/wt_internal.in rename : btree/bt_alloc.c => src/btree/bt_alloc.c rename : btree/bt_bulk.c => src/btree/bt_bulk.c rename : btree/bt_cache.c => src/btree/bt_cache.c rename : btree/bt_close.c => src/btree/bt_close.c rename : btree/bt_cmp.c => src/btree/bt_cmp.c rename : btree/bt_debug.c => src/btree/bt_debug.c rename : btree/bt_desc.c => src/btree/bt_desc.c rename : btree/bt_discard.c => src/btree/bt_discard.c rename : btree/bt_dump.c => src/btree/bt_dump.c rename : btree/bt_evict.c => src/btree/bt_evict.c rename : btree/bt_misc.c => src/btree/bt_misc.c rename : btree/bt_open.c => src/btree/bt_open.c rename : btree/bt_ovfl.c => src/btree/bt_ovfl.c rename : btree/bt_page.c => src/btree/bt_page.c rename : btree/bt_read.c => src/btree/bt_read.c rename : btree/bt_reconcile.c => src/btree/bt_reconcile.c rename : btree/bt_ret.c => src/btree/bt_ret.c rename : btree/bt_rw.c => src/btree/bt_rw.c rename : btree/bt_stat.c => src/btree/bt_stat.c rename : btree/bt_sync.c => src/btree/bt_sync.c rename : btree/bt_vrfy.c => src/btree/bt_vrfy.c rename : btree/bt_walk.c => src/btree/bt_walk.c rename : btree/c_drain.c => src/btree/c_drain.c 
rename : btree/c_init.c => src/btree/c_init.c rename : btree/c_page.c => src/btree/c_page.c rename : btree/c_read.c => src/btree/c_read.c rename : btree/col_get.c => src/btree/col_get.c rename : btree/col_put.c => src/btree/col_put.c rename : btree/col_srch.c => src/btree/col_srch.c rename : btree/row_get.c => src/btree/row_get.c rename : btree/row_put.c => src/btree/row_put.c rename : btree/row_srch.c => src/btree/row_srch.c rename : db/db_err.c => src/db/db_err.c rename : db/db_getset.c => src/db/db_getset.c rename : db/db_handle.c => src/db/db_handle.c rename : db/db_huffman.c => src/db/db_huffman.c rename : db/db_open.c => src/db/db_open.c rename : db/db_stat.c => src/db/db_stat.c rename : db/db_sync.c => src/db/db_sync.c rename : env/env_err.c => src/env/env_err.c rename : env/env_getset.c => src/env/env_getset.c rename : env/env_global.c => src/env/env_global.c rename : env/env_handle.c => src/env/env_handle.c rename : env/env_init.c => src/env/env_init.c rename : env/env_msg.c => src/env/env_msg.c rename : env/env_open.c => src/env/env_open.c rename : env/env_stat.c => src/env/env_stat.c rename : env/env_sync.c => src/env/env_sync.c rename : env/env_toc.c => src/env/env_toc.c rename : env/env_workq.c => src/env/env_workq.c rename : os_posix/os_abort.c => src/os_posix/os_abort.c rename : os_posix/os_alloc.c => src/os_posix/os_alloc.c rename : os_posix/os_filesize.c => src/os_posix/os_filesize.c rename : os_posix/os_fsync.c => src/os_posix/os_fsync.c rename : os_posix/os_mtx.c => src/os_posix/os_mtx.c rename : os_posix/os_open.c => src/os_posix/os_open.c rename : os_posix/os_rw.c => src/os_posix/os_rw.c rename : os_posix/os_sleep.c => src/os_posix/os_sleep.c rename : os_posix/os_thread.c => src/os_posix/os_thread.c rename : os_posix/os_yield.c => src/os_posix/os_yield.c rename : support/api.c => src/support/api.c rename : support/cksum.c => src/support/cksum.c rename : support/err.c => src/support/err.c rename : support/hazard.c => src/support/hazard.c rename 
: support/huffman.c => src/support/huffman.c rename : support/pow.c => src/support/pow.c rename : support/prime.c => src/support/prime.c rename : support/progress.c => src/support/progress.c rename : support/scratch.c => src/support/scratch.c rename : support/serial.c => src/support/serial.c rename : support/simple_setup.c => src/support/simple_setup.c rename : support/stat.c => src/support/stat.c rename : support/strerror.c => src/support/strerror.c rename : support/version.c => src/support/version.c rename : utilities/db_dump/util_dump.c => src/utilities/db_dump/util_dump.c rename : utilities/db_load/util_load.c => src/utilities/db_load/util_load.c rename : utilities/db_stat/util_stat.c => src/utilities/db_stat/util_stat.c rename : utilities/db_verify/util_verify.c => src/utilities/db_verify/util_verify.c
Diffstat (limited to 'src/btree/bt_open.c')
-rw-r--r--src/btree/bt_open.c279
1 files changed, 279 insertions, 0 deletions
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
new file mode 100644
index 00000000000..c746782221e
--- /dev/null
+++ b/src/btree/bt_open.c
@@ -0,0 +1,279 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2008-2011 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "wt_internal.h"
+
+/* Forward declarations for the file-local open-time configuration checks. */
+static int __wt_open_verify(DB *);
+static int __wt_open_verify_page_sizes(DB *);
+
+/*
+ * __wt_bt_open --
+ *	Open the Btree file for the DB handle attached to the WT_TOC.  An
+ *	empty file gets a description page written; a non-empty file has its
+ *	description page read and, if it has a root page, the root pinned.
+ *
+ *	Returns 0 on success, otherwise the first non-zero value returned by
+ *	one of the called functions.
+ */
+int
+__wt_bt_open(WT_TOC *toc, int ok_create)
+{
+	DB *db;
+	ENV *env;
+	IDB *idb;
+
+	env = toc->env;
+	db = toc->db;
+	idb = db->idb;
+
+	/* Validate the configuration (and fill in defaults) before opening. */
+	WT_RET(__wt_open_verify(db));
+
+	/* Open the file. */
+	WT_RET(__wt_open(env, idb->name, idb->mode, ok_create, &idb->fh));
+
+	/*
+	 * An empty file has no description page yet: write one.  The root
+	 * page is only looked for in files that already have content.
+	 */
+	if (idb->fh->file_size == 0) {
+		WT_RET(__wt_desc_write(toc));
+		return (0);
+	}
+
+	/*
+	 * The file isn't empty, so there must be a description page: update
+	 * the DB handle from the on-disk copy.
+	 */
+	WT_RET(__wt_desc_read(toc));
+
+	/* No root page, nothing to pin. */
+	if (idb->root_off.addr == WT_ADDR_INVALID)
+		return (0);
+
+	WT_RET(__wt_root_pin(toc));
+	return (0);
+}
+
+/*
+ * __wt_open_verify --
+ *	Run the configuration checks that have to wait until the file is
+ *	about to be opened; unset values are given defaults along the way.
+ *
+ *	Returns 0 if the configuration is acceptable, WT_ERROR (after
+ *	reporting a message) or the page-size check's error otherwise.
+ */
+static int
+__wt_open_verify(DB *db)
+{
+	IDB *idb;
+
+	/* Page sizes come first. */
+	WT_RET(__wt_open_verify_page_sizes(db));
+
+	/*
+	 * Other configuration combinations: only fixed-length databases
+	 * have a further restriction, and only if Huffman encoding was
+	 * configured for keys or data.
+	 */
+	if (db->fixed_len == 0)
+		return (0);
+
+	idb = db->idb;
+	if (!idb->huffman_key && !idb->huffman_data)
+		return (0);
+
+	__wt_api_db_errx(db,
+	    "Fixed size column-store databases may not be Huffman "
+	    "compressed");
+	return (WT_ERROR);
+}
+
+/*
+ * __wt_open_verify_page_sizes --
+ *	Verify the allocation, page and item sizes configured on the DB
+ *	handle, setting any value that is still 0 to its default.
+ *
+ *	Returns 0 if the sizes are consistent, otherwise reports a message
+ *	through __wt_api_db_errx and returns WT_ERROR.
+ */
+static int
+__wt_open_verify_page_sizes(DB *db)
+{
+	IDB *idb;
+
+	idb = db->idb;
+
+	/*
+	 * The application can set lots of page sizes.  It's complicated, so
+	 * instead of verifying the relationships when they're set, verify
+	 * them when the database is opened and we know we have the final
+	 * values.  (Besides, if we verify the relationships when they're set,
+	 * the application has to set them in a specific order or we'd need
+	 * one set function that took 10 parameters.)
+	 *
+	 * If the values haven't been set, set the defaults.
+	 *
+	 * Default to a small fragment size, so overflow items don't consume
+	 * a lot of space.
+	 */
+	if (db->allocsize == 0)
+		db->allocsize = WT_BTREE_ALLOCATION_SIZE;
+
+	/* Allocation sizes must be a power-of-two, nothing else makes sense. */
+	if (!__wt_ispo2(db->allocsize)) {
+		__wt_api_db_errx(db,
+		    "the allocation size must be a power of two");
+		return (WT_ERROR);
+	}
+
+	/*
+	 * Limit allocation units to 256MB, and page sizes to 128MB.  There's
+	 * no reason (other than testing) we can't support larger sizes (any
+	 * sizes up to the smaller of an off_t and a size_t should work), but
+	 * an application specifying larger allocation or page sizes is almost
+	 * certainly making a mistake.
+	 *
+	 * The message reports the limit that is actually being enforced,
+	 * that is, the allocation-size maximum, not the page-size maximum.
+	 */
+	if (db->allocsize > WT_BTREE_ALLOCATION_SIZE_MAX) {
+		__wt_api_db_errx(db,
+		    "the allocation size must be less than or equal to %luMB",
+		    (u_long)(WT_BTREE_ALLOCATION_SIZE_MAX / WT_MEGABYTE));
+		return (WT_ERROR);
+	}
+
+	/*
+	 * Internal pages are also usually small, we want it to fit into the
+	 * L1 cache.  We try and put at least 40 keys on each internal page
+	 * (40 because that results in 100M keys in a level 5 Btree).  But,
+	 * if it's a small page, push anything bigger than about 50 bytes
+	 * off-page.  Here's the table:
+	 *	Pagesize	Largest key retained on-page:
+	 *	512B		 50 bytes
+	 *	1K		 50 bytes
+	 *	2K		 51 bytes
+	 *	4K		102 bytes
+	 *	8K		204 bytes
+	 * and so on, roughly doubling for each power-of-two.
+	 */
+	if (db->intlmin == 0)
+		db->intlmin = WT_BTREE_INTLMIN_DEFAULT;
+	if (db->intlmax == 0)
+		db->intlmax = WT_MAX(db->intlmin, WT_BTREE_INTLMAX_DEFAULT);
+	if (db->intlitemsize == 0) {
+		if (db->intlmin <= 1024)
+			db->intlitemsize = 50;
+		else
+			db->intlitemsize = db->intlmin / 40;
+	}
+
+	/*
+	 * Leaf pages are larger to amortize I/O across a large chunk of the
+	 * data space, but still minimize the chance of a broken write.  We
+	 * only require 20 key/data pairs fit onto a leaf page (that's 40
+	 * items, hence the divisor below).  Again, if it's a small page,
+	 * push anything bigger than about 80 bytes off-page.
+	 * Here's the table:
+	 *	Pagesize	Largest key or data item retained on-page:
+	 *	512B		 80 bytes
+	 *	 1K		 80 bytes
+	 *	 2K		 80 bytes
+	 *	 4K		 80 bytes
+	 *	 8K		204 bytes
+	 *	16K		409 bytes
+	 * and so on, roughly doubling for each power-of-two.
+	 */
+	if (db->leafmin == 0)
+		db->leafmin = WT_BTREE_LEAFMIN_DEFAULT;
+	if (db->leafmax == 0)
+		db->leafmax = WT_MAX(db->leafmin, WT_BTREE_LEAFMAX_DEFAULT);
+	if (db->leafitemsize == 0) {
+		if (db->leafmin <= 4096)
+			db->leafitemsize = 80;
+		else
+			db->leafitemsize = db->leafmin / 40;
+	}
+
+	/* Final checks for safety. */
+	if (db->intlmin % db->allocsize != 0 ||
+	    db->intlmax % db->allocsize != 0 ||
+	    db->leafmin % db->allocsize != 0 ||
+	    db->leafmax % db->allocsize != 0) {
+		__wt_api_db_errx(db,
+		    "all page sizes must be a multiple of %lu bytes",
+		    (u_long)db->allocsize);
+		return (WT_ERROR);
+	}
+
+	if (db->intlmin > db->intlmax || db->leafmin > db->leafmax) {
+		__wt_api_db_errx(db,
+		    "minimum page sizes must be less than or equal to maximum "
+		    "page sizes");
+		return (WT_ERROR);
+	}
+
+	/*
+	 * The cast covers the whole quotient so the argument is a u_long
+	 * for %lu whatever the type of the constants.
+	 */
+	if (db->intlmin > WT_BTREE_PAGE_SIZE_MAX ||
+	    db->intlmax > WT_BTREE_PAGE_SIZE_MAX ||
+	    db->leafmin > WT_BTREE_PAGE_SIZE_MAX ||
+	    db->leafmax > WT_BTREE_PAGE_SIZE_MAX) {
+		__wt_api_db_errx(db,
+		    "all page sizes must be less than or equal to %luMB",
+		    (u_long)(WT_BTREE_PAGE_SIZE_MAX / WT_MEGABYTE));
+		return (WT_ERROR);
+	}
+
+	/*
+	 * We only have 3 bytes of length for on-page items, so the maximum
+	 * on-page item size is limited to 16MB.
+	 */
+	if (db->intlitemsize > WT_ITEM_MAX_LEN)
+		db->intlitemsize = WT_ITEM_MAX_LEN;
+	if (db->leafitemsize > WT_ITEM_MAX_LEN)
+		db->leafitemsize = WT_ITEM_MAX_LEN;
+
+	/*
+	 * By default, any duplicate set that reaches 25% of a leaf page is
+	 * moved into its own separate tree.
+	 */
+	if (db->btree_dup_offpage == 0)
+		db->btree_dup_offpage = 4;
+
+	/*
+	 * A leaf page must hold at least 2 key/data pairs, otherwise the
+	 * whole btree thing breaks down because we can't split.  We have
+	 * to include WT_PAGE_DESC_SIZE in leaf page calculations, it's not
+	 * strictly necessary in internal pages because page 0 is always
+	 * a leaf page.  The additional 10 bytes is for slop -- Berkeley DB
+	 * took roughly a decade to get the calculation correct, and that
+	 * way I can skip the suspense.
+	 *
+	 * A page no larger than the fixed overhead has no data space at all:
+	 * report 0 rather than subtracting, because the subtraction would
+	 * wrap to a huge value if done in unsigned arithmetic and let a
+	 * too-small page through the checks below.
+	 */
+#define	WT_MINIMUM_DATA_SPACE(db, s)					\
+	((s) <= (WT_PAGE_DISK_SIZE + WT_PAGE_DESC_SIZE + 10) ? 0 :	\
+	    ((s) - (WT_PAGE_DISK_SIZE + WT_PAGE_DESC_SIZE + 10)) / 4)
+	if (db->intlitemsize > WT_MINIMUM_DATA_SPACE(db, db->intlmin)) {
+		__wt_api_db_errx(db,
+		    "The internal page size is too small for its maximum item "
+		    "size");
+		return (WT_ERROR);
+	}
+	if (db->leafitemsize > WT_MINIMUM_DATA_SPACE(db, db->leafmin)) {
+		__wt_api_db_errx(db,
+		    "The leaf page size is too small for its maximum item "
+		    "size");
+		return (WT_ERROR);
+	}
+
+	/*
+	 * A fixed-size column store should be able to store at least 20
+	 * objects on a page, otherwise it just doesn't make sense.  (The
+	 * fixed_len test also keeps the division from dividing by zero.)
+	 */
+	if (F_ISSET(idb, WT_COLUMN) &&
+	    db->fixed_len != 0 && db->leafmin / db->fixed_len < 20) {
+		__wt_api_db_errx(db,
+		    "The leaf page size cannot store at least 20 fixed-length "
+		    "objects");
+		return (WT_ERROR);
+	}
+
+	return (0);
+}
+
+/*
+ * __wt_root_pin --
+ *	Bring the root page into memory via __wt_page_in, flag it WT_PINNED,
+ *	then clear the hazard reference on it.
+ *
+ *	Returns 0 on success, or the non-zero value from __wt_page_in.
+ */
+int
+__wt_root_pin(WT_TOC *toc)
+{
+	DB *db;
+	IDB *handle;
+
+	db = toc->db;
+	handle = db->idb;
+
+	/* Read the root page using the root address stored in the handle. */
+	WT_RET(__wt_page_in(
+	    toc, NULL, &handle->root_page, &handle->root_off, 0));
+
+	/* Set the pinned flag before clearing the hazard reference. */
+	F_SET(handle->root_page.page, WT_PINNED);
+	__wt_hazard_clear(toc, handle->root_page.page);
+
+	return (0);
+}