diff options
author | Keith Bostic <keith@wiredtiger.com> | 2014-12-15 09:35:54 -0500 |
---|---|---|
committer | Keith Bostic <keith@wiredtiger.com> | 2014-12-15 09:35:54 -0500 |
commit | 980165614f114dbcf02344ba7209ae77369bcb80 (patch) | |
tree | 952a89a49aa758ec177ed9ce491524d0c1c79c1f | |
parent | 4c26d2324bae1d7030b0142d50dbd2ccf11ddeb6 (diff) | |
parent | 5cf21acf8fd66876e71334cc09deac0a09e8ea91 (diff) | |
download | mongo-980165614f114dbcf02344ba7209ae77369bcb80.tar.gz |
Merge branch 'develop' into cursor-reconfigure
Conflicts:
src/cursor/cur_metadata.c
81 files changed, 927 insertions, 745 deletions
diff --git a/bench/wtperf/runners/small-lsm.wtperf b/bench/wtperf/runners/small-lsm.wtperf index 1b00d18d76b..8c7f65bb8b0 100644 --- a/bench/wtperf/runners/small-lsm.wtperf +++ b/bench/wtperf/runners/small-lsm.wtperf @@ -1,6 +1,6 @@ # wtperf options file: small lsm configuration conn_config="cache_size=500MB" -table_config="lsm=(chunk_size=5MB),type=lsm,os_cache_dirty_max=16MB" +table_config="lsm=(chunk_size=10MB),type=lsm" icount=500000 report_interval=5 run_time=120 diff --git a/dist/api_data.py b/dist/api_data.py index 2f7757dce6b..bf1346c187c 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -207,17 +207,26 @@ file_config = format_meta + [ block compression is done''', min='512B', max='512MB'), Config('internal_item_max', '0', r''' - the largest key stored within an internal node, in bytes. If - non-zero, any key larger than the specified size will be - stored as an overflow item (which may require additional I/O - to access). If zero, a default size is chosen that permits at - least 8 keys per internal page''', - min=0), + historic term for internal_key_max''', + min=0, undoc=True), + Config('internal_key_max', '0', r''' + the largest key stored in an internal node, in bytes. If set, keys + larger than the specified size are stored as overflow items (which + may require additional I/O to access). The default and the maximum + allowed value are both one-tenth the size of a newly split internal + page''', + min='0'), Config('key_gap', '10', r''' the maximum gap between instantiated keys in a Btree leaf page, constraining the number of keys processed to instantiate a random Btree leaf page key''', min='0', undoc=True), + Config('leaf_key_max', '0', r''' + the largest key stored in a leaf node, in bytes. If set, keys + larger than the specified size are stored as overflow items (which + may require additional I/O to access). 
The default value is + one-tenth the size of a newly split leaf page''', + min='0'), Config('leaf_page_max', '32KB', r''' the maximum page size for leaf nodes, in bytes; the size must be a multiple of the allocation size, and is significant for @@ -226,13 +235,17 @@ file_config = format_meta + [ data, that is, the limit is applied before any block compression is done''', min='512B', max='512MB'), + Config('leaf_value_max', '0', r''' + the largest value stored in a leaf node, in bytes. If set, values + larger than the specified size are stored as overflow items (which + may require additional I/O to access). If the size is larger than + the maximum leaf page size, the page size is temporarily ignored + when large values are written. The default is one-half the size of + a newly split leaf page''', + min='0'), Config('leaf_item_max', '0', r''' - the largest key or value stored within a leaf node, in bytes. - If non-zero, any key or value larger than the specified size - will be stored as an overflow item (which may require additional - I/O to access). If zero, a default size is chosen that permits - at least 4 key and value pairs per leaf page''', - min=0), + historic term for leaf_key_max and leaf_value_max''', + min=0, undoc=True), Config('memory_page_max', '5MB', r''' the maximum size a page can grow to in memory before being reconciled to disk. The specified size will be adjusted to a lower diff --git a/dist/api_err.py b/dist/api_err.py index 0c61a41ff28..cb2c8cc588e 100644 --- a/dist/api_err.py +++ b/dist/api_err.py @@ -42,7 +42,9 @@ errors = [ Error('WT_PANIC', -31804, 'WiredTiger library panic', ''' This error indicates an underlying problem that requires the - application exit and restart.'''), + application exit and restart. 
The application can exit + immediately when \c WT_PANIC is returned from a WiredTiger + interface, no further WiredTiger calls are required.'''), Error('WT_RESTART', -31805, 'restart the operation (internal)', undoc=True), ] diff --git a/dist/stat_data.py b/dist/stat_data.py index bd628e7418a..d1d3dd7e5ea 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -348,14 +348,16 @@ dsrc_stats = [ BtreeStat('btree_fixed_len', 'fixed-record size', 'no_aggregate,no_scale'), BtreeStat('btree_maximum_depth', 'maximum tree depth', 'max_aggregate,no_scale'), - BtreeStat('btree_maxintlitem', - 'maximum internal page item size', 'no_aggregate,no_scale'), + BtreeStat('btree_maxintlkey', + 'maximum internal page key size', 'no_aggregate,no_scale'), BtreeStat('btree_maxintlpage', 'maximum internal page size', 'no_aggregate,no_scale'), - BtreeStat('btree_maxleafitem', - 'maximum leaf page item size', 'no_aggregate,no_scale'), + BtreeStat('btree_maxleafkey', + 'maximum leaf page key size', 'no_aggregate,no_scale'), BtreeStat('btree_maxleafpage', 'maximum leaf page size', 'no_aggregate,no_scale'), + BtreeStat('btree_maxleafvalue', + 'maximum leaf page value size', 'no_aggregate,no_scale'), BtreeStat('btree_overflow', 'overflow pages', 'no_scale'), BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale'), BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale'), diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am index 17beba4a470..382c5912fef 100644 --- a/examples/c/Makefile.am +++ b/examples/c/Makefile.am @@ -13,7 +13,6 @@ noinst_PROGRAMS = \ ex_data_source \ ex_extending \ ex_extractor \ - ex_file \ ex_hello \ ex_log \ ex_pack \ diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index db418deed9d..cf5fb363c2f 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -524,6 +524,20 @@ session_ops(WT_SESSION *session) /*! [Create a table with columns] */ ret = session->drop(session, "table:mytable", NULL); + /*! 
[Create a table and configure the page size] */ + ret = session->create(session, + "table:mytable", "key_format=S,value_format=S" + "internal_page_max=16KB,leaf_page_max=1MB,leaf_value_max=64KB"); + /*! [Create a table and configure the page size] */ + ret = session->drop(session, "table:mytable", NULL); + + /*! [Create a table and configure a large leaf value max] */ + ret = session->create(session, + "table:mytable", "key_format=S,value_format=S" + "leaf_page_max=16KB,leaf_value_max=256KB"); + /*! [Create a table and configure a large leaf value max] */ + ret = session->drop(session, "table:mytable", NULL); + /* * This example code gets run, and the compression libraries might not * be loaded, causing the create to fail. The documentation requires diff --git a/examples/c/ex_backup.c b/examples/c/ex_backup.c index fb5c5b9d299..ea572c8810b 100644 --- a/examples/c/ex_backup.c +++ b/examples/c/ex_backup.c @@ -125,7 +125,7 @@ compare_backups(int i) * That way we can compare the full and incremental each time through. */ static int -setup_directories() +setup_directories(void) { int i, ret; char buf[1024]; diff --git a/examples/c/ex_file.c b/examples/c/ex_file.c deleted file mode 100644 index 4170d1b099d..00000000000 --- a/examples/c/ex_file.c +++ /dev/null @@ -1,72 +0,0 @@ -/*- - * Public Domain 2008-2014 WiredTiger, Inc. - * - * This is free and unencumbered software released into the public domain. - * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. 
We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * ex_file.c - * This is an example demonstrating how to configure an individual file. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <wiredtiger.h> - -static const char *home; - -int -main(void) -{ - WT_CONNECTION *conn; - WT_SESSION *session; - int ret; - - /* - * Create a clean test directory for this run of the test program if the - * environment variable isn't already set (as is done by make check). - */ - if (getenv("WIREDTIGER_HOME") == NULL) { - home = "WT_HOME"; - ret = system("rm -rf WT_HOME && mkdir WT_HOME"); - } else - home = NULL; - - if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0 || - (ret = conn->open_session(conn, NULL, NULL, &session)) != 0) { - fprintf(stderr, "Error connecting to %s: %s\n", - home, wiredtiger_strerror(ret)); - return (ret); - } - /* Note: further error checking omitted for clarity. */ - - /*! [file create] */ - ret = session->create(session, "file:example", - "key_format=u," - "internal_page_max=32KB,internal_item_max=1KB," - "leaf_page_max=1MB,leaf_item_max=32KB"); - /*! [file create] */ - - return (conn->close(conn, NULL) == 0 ? 
ret : EXIT_FAILURE); -} diff --git a/src/async/async_api.c b/src/async/async_api.c index 3cb78e80b09..6aeb404bccd 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -54,7 +54,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri, WT_RET( __wt_open_internal_session(conn, "async-cursor", 1, 1, &session)); __wt_spin_lock(session, &async->ops_lock); - WT_ERR(__wt_calloc_def(session, 1, &af)); + WT_ERR(__wt_calloc_one(session, &af)); WT_ERR(__wt_strdup(session, uri, &af->uri)); WT_ERR(__wt_strdup(session, config, &af->config)); af->uri_hash = uri_hash; @@ -232,7 +232,7 @@ __async_start(WT_SESSION_IMPL *session) /* * Async is on, allocate the WT_ASYNC structure and initialize the ops. */ - WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async)); + WT_RET(__wt_calloc_one(session, &conn->async)); async = conn->async; STAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 7a88ac9dd6e..ecf052fc3bf 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -150,7 +150,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, * We didn't find one in our cache. Open one and cache it. * Insert it at the head expecting LRU usage. 
*/ - WT_RET(__wt_calloc_def(session, 1, &ac)); + WT_RET(__wt_calloc_one(session, &ac)); WT_ERR(wt_session->open_cursor( wt_session, op->format->uri, NULL, op->format->config, &c)); ac->cfg_hash = op->format->cfg_hash; diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 4f7f2898de5..a9b3b07904d 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -419,7 +419,7 @@ __wt_block_manager_open(WT_SESSION_IMPL *session, *bmp = NULL; - WT_RET(__wt_calloc_def(session, 1, &bm)); + WT_RET(__wt_calloc_one(session, &bm)); __bm_method_set(bm, 0); WT_ERR(__wt_block_open(session, filename, cfg, diff --git a/src/block/block_open.c b/src/block/block_open.c index 7b68c59c766..0abe9cffc5f 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -128,7 +128,7 @@ __wt_block_open(WT_SESSION_IMPL *session, } /* Basic structure allocation, initialization. */ - WT_ERR(__wt_calloc_def(session, 1, &block)); + WT_ERR(__wt_calloc_one(session, &block)); block->ref = 1; TAILQ_INSERT_HEAD(&conn->blockqh, block, q); diff --git a/src/block/block_session.c b/src/block/block_session.c index fa56b72f49b..90fe0af562a 100644 --- a/src/block/block_session.c +++ b/src/block/block_session.c @@ -152,7 +152,7 @@ __block_ext_discard(WT_SESSION_IMPL *session, u_int max) static int __block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp) { - return (__wt_calloc(session, 1, sizeof(WT_SIZE), szp)); + return (__wt_calloc_one(session, szp)); } /* diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index b8fecfe0efd..5f7a8f47c21 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -28,7 +28,7 @@ __bloom_init(WT_SESSION_IMPL *session, *bloomp = NULL; - WT_RET(__wt_calloc_def(session, 1, &bloom)); + WT_RET(__wt_calloc_one(session, &bloom)); WT_ERR(__wt_strdup(session, uri, &bloom->uri)); len = strlen(WT_BLOOM_TABLE_CONFIG) + 2; diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 2fc1b0d5460..a58ed5d66e9 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c 
@@ -117,7 +117,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) * Record the change in the transaction structure and set the change's * transaction ID. */ - WT_ERR(__wt_calloc_def(session, 1, &ref->page_del)); + WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); @@ -306,7 +306,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * deleted items. */ for (i = 0; i < page->pg_row_entries; ++i) { - WT_ERR(__wt_calloc_def(session, 1, &upd)); + WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); if (page_del == NULL) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index fe2623b055b..10ea6cd019c 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -13,9 +13,6 @@ static int __btree_page_sizes(WT_SESSION_IMPL *); static int __btree_preload(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int); -static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t); -static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int); - /* * __wt_btree_open -- * Open a Btree. @@ -623,153 +620,98 @@ __btree_page_sizes(WT_SESSION_IMPL *session) btree = S2BT(session); cfg = btree->dhandle->cfg; + /* + * Get the allocation size. Allocation sizes must be a power-of-two, + * nothing else makes sense. + */ WT_RET(__wt_direct_io_size_check( session, cfg, "allocation_size", &btree->allocsize)); + if (!__wt_ispo2(btree->allocsize)) + WT_RET_MSG(session, + EINVAL, "the allocation size must be a power of two"); + + /* + * Get the internal/leaf page sizes. + * All page sizes must be in units of the allocation size. 
+ */ WT_RET(__wt_direct_io_size_check( session, cfg, "internal_page_max", &btree->maxintlpage)); - WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval)); - btree->maxintlitem = (uint32_t)cval.val; WT_RET(__wt_direct_io_size_check( session, cfg, "leaf_page_max", &btree->maxleafpage)); - WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval)); - btree->maxleafitem = (uint32_t)cval.val; - - WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); - btree->split_pct = (int)cval.val; + if (btree->maxintlpage < btree->allocsize || + btree->maxintlpage % btree->allocsize != 0 || + btree->maxleafpage < btree->allocsize || + btree->maxleafpage % btree->allocsize != 0) + WT_RET_MSG(session, EINVAL, + "page sizes must be a multiple of the page allocation " + "size (%" PRIu32 "B)", btree->allocsize); /* * When a page is forced to split, we want at least 50 entries on its * parent. - */ - WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); - btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage); - - /* + * * Don't let pages grow to more than half the cache size. Otherwise, * with very small caches, we can end up in a situation where nothing * can be evicted. Take care getting the cache size: with a shared * cache, it may not have been set. */ + WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); + btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage); cache_size = S2C(session)->cache_size; if (cache_size > 0) btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2); - /* Allocation sizes must be a power-of-two, nothing else makes sense. */ - if (!__wt_ispo2(btree->allocsize)) - WT_RET_MSG(session, - EINVAL, "the allocation size must be a power of two"); - - /* All page sizes must be in units of the allocation size. 
*/ - if (btree->maxintlpage < btree->allocsize || - btree->maxintlpage % btree->allocsize != 0 || - btree->maxleafpage < btree->allocsize || - btree->maxleafpage % btree->allocsize != 0) - WT_RET_MSG(session, EINVAL, - "page sizes must be a multiple of the page allocation " - "size (%" PRIu32 "B)", btree->allocsize); - /* - * Set the split percentage: reconciliation splits to a smaller-than- - * maximum page size so we don't split every time a new entry is added. + * Get the split percentage (reconciliation splits pages into smaller + * than the maximum page size chunks so we don't split every time a + * new entry is added). Determine how large newly split pages will be. */ + WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); + btree->split_pct = (int)cval.val; intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); /* - * Default values for internal and leaf page items: make sure at least - * 8 items fit on split pages. - */ - if (btree->maxintlitem == 0) - btree->maxintlitem = intl_split_size / 8; - if (btree->maxleafitem == 0) - btree->maxleafitem = leaf_split_size / 8; - - /* - * If raw compression is configured, the application owns page layout, - * it's not our problem. Hopefully the application chose well. + * Get the maximum internal/leaf page key/value sizes. + * + * In historic versions of WiredTiger, the maximum internal/leaf page + * key/value sizes were set by the internal_item_max and leaf_item_max + * configuration strings. Look for those strings if we don't find the + * newer ones. */ - if (btree->compressor != NULL && - btree->compressor->compress_raw != NULL) - return (0); - - /* Check we can fit at least 2 items on a page. 
*/ - if (btree->maxintlitem > btree->maxintlpage / 2) - return (pse1(session, "internal", - btree->maxintlpage, btree->maxintlitem)); - if (btree->maxleafitem > btree->maxleafpage / 2) - return (pse1(session, "leaf", - btree->maxleafpage, btree->maxleafitem)); + WT_RET(__wt_config_gets(session, cfg, "internal_key_max", &cval)); + btree->maxintlkey = (uint32_t)cval.val; + if (btree->maxintlkey == 0) { + WT_RET( + __wt_config_gets(session, cfg, "internal_item_max", &cval)); + btree->maxintlkey = (uint32_t)cval.val; + } + WT_RET(__wt_config_gets(session, cfg, "leaf_key_max", &cval)); + btree->maxleafkey = (uint32_t)cval.val; + WT_RET(__wt_config_gets(session, cfg, "leaf_value_max", &cval)); + btree->maxleafvalue = (uint32_t)cval.val; + if (btree->maxleafkey == 0 && btree->maxleafvalue == 0) { + WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval)); + btree->maxleafkey = (uint32_t)cval.val; + btree->maxleafvalue = (uint32_t)cval.val; + } /* - * Take into account the size of a split page: + * Default/maximum for internal and leaf page keys: split-page / 10. + * Default for leaf page values: split-page / 2. * - * Make it a separate error message so it's clear what went wrong. + * It's difficult for applications to configure this in any exact way as + * they have to duplicate our calculation of how many keys must fit on a + * page, and given a split-percentage and page header, that isn't easy + * to do. If the maximum internal key value is too large for the page, + * reset it to the default. 
*/ - if (btree->maxintlitem > intl_split_size / 2) - return (pse2(session, "internal", - btree->maxintlpage, btree->maxintlitem, btree->split_pct)); - if (btree->maxleafitem > leaf_split_size / 2) - return (pse2(session, "leaf", - btree->maxleafpage, btree->maxleafitem, btree->split_pct)); + if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10) + btree->maxintlkey = intl_split_size / 10; + if (btree->maxleafkey == 0) + btree->maxleafkey = leaf_split_size / 10; + if (btree->maxleafvalue == 0) + btree->maxleafvalue = leaf_split_size / 2; return (0); } - -/* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. - */ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ - uintmax_t a; - uint32_t split_size; - - /* - * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so - * we don't waste space when we write). - */ - a = maxpagesize; /* Don't overflow. */ - split_size = (uint32_t) - WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize); - - /* - * If the result of that calculation is the same as the allocation unit - * (that happens if the maximum size is the same size as an allocation - * unit, use a percentage of the maximum page size). - */ - if (split_size == btree->allocsize) - split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); - - return (split_size); -} - -/* - * pse1 -- - * Page size error message 1. - */ -static int -pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl) -{ - WT_RET_MSG(session, EINVAL, - "%s page size (%" PRIu32 "B) too small for the maximum item size " - "(%" PRIu32 "B); the page must be able to hold at least 2 items", - type, max, ovfl); -} - -/* - * pse2 -- - * Page size error message 2. 
- */ -static int -pse2(WT_SESSION_IMPL *session, - const char *type, uint32_t max, uint32_t ovfl, int pct) -{ - WT_RET_MSG(session, EINVAL, - "%s page size (%" PRIu32 "B) too small for the maximum item size " - "(%" PRIu32 "B), because of the split percentage (%d %%); a split " - "page must be able to hold at least 2 items", - type, max, ovfl, pct); -} diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index b2767e74bac..799f0cca3ee 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -227,8 +227,8 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, WT_INTL_INDEX_SET(page, pindex); if (alloc_refs) for (i = 0; i < pindex->entries; ++i) { - WT_ERR(__wt_calloc_def( - session, 1, &pindex->index[i])); + WT_ERR(__wt_calloc_one( + session, &pindex->index[i])); size += sizeof(WT_REF); } if (0) { diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 6e70c9ea2b6..96b63f3f8f0 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -491,8 +491,8 @@ __slvg_trk_init(WT_SESSION_IMPL *session, WT_DECL_RET; WT_TRACK *trk; - WT_RET(__wt_calloc_def(session, 1, &trk)); - WT_ERR(__wt_calloc_def(session, 1, &trk->shared)); + WT_RET(__wt_calloc_one(session, &trk)); + WT_ERR(__wt_calloc_one(session, &trk->shared)); trk->shared->ref = 1; trk->ss = ss; @@ -519,7 +519,7 @@ __slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp) { WT_TRACK *trk; - WT_RET(__wt_calloc_def(session, 1, &trk)); + WT_RET(__wt_calloc_one(session, &trk)); trk->shared = orig->shared; trk->ss = orig->ss; @@ -1181,7 +1181,7 @@ __slvg_col_build_internal( ref->home = page; ref->page = NULL; - WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); + WT_ERR(__wt_calloc_one(session, &addr)); WT_ERR(__wt_strndup( session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; @@ -1826,7 +1826,7 @@ __slvg_row_build_internal( ref->home = page; ref->page = NULL; - WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); + 
WT_ERR(__wt_calloc_one(session, &addr)); WT_ERR(__wt_strndup( session, trk->trk_addr, trk->trk_addr_size, &addr->addr)); addr->size = trk->trk_addr_size; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index e25f0b73e01..c6b97733b69 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -310,7 +310,7 @@ __split_ref_instantiate(WT_SESSION_IMPL *session, sizeof(WT_ADDR) + addr->size); else { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); - WT_RET(__wt_calloc_def(session, 1, &addr)); + WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( session, unpack.data, unpack.size, &addr->addr)) != 0) { __wt_free(session, addr); @@ -444,7 +444,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) pindex->index[pindex->entries - 1]; for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1, i = 0; i < children; ++alloc_refp, ++i) { - WT_ERR(__wt_calloc_def(session, 1, alloc_refp)); + WT_ERR(__wt_calloc_one(session, alloc_refp)); WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF)); } @@ -747,7 +747,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, /* In some cases, the underlying WT_REF has not yet been allocated. */ if (*refp == NULL) { - WT_RET(__wt_calloc_def(session, 1, refp)); + WT_RET(__wt_calloc_one(session, refp)); WT_MEMSIZE_ADD(incr, sizeof(WT_REF)); } ref = *refp; @@ -768,7 +768,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, * would have to avoid freeing the memory, and it's not worth * the confusion. */ - WT_RET(__wt_calloc_def(session, 1, &addr)); + WT_RET(__wt_calloc_one(session, &addr)); WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR)); ref->addr = addr; addr->size = multi->addr.size; @@ -1081,7 +1081,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) * * The new reference is visible to readers once the split completes. 
*/ - WT_ERR(__wt_calloc_def(session, 1, &split_ref[0])); + WT_ERR(__wt_calloc_one(session, &split_ref[0])); child = split_ref[0]; *child = *ref; child->state = WT_REF_MEM; @@ -1112,12 +1112,12 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp) * The second page in the split is a new WT_REF/page pair. */ WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 0, &right)); - WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins)); - WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins[0])); + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD)); WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD *)); - WT_ERR(__wt_calloc_def(session, 1, &split_ref[1])); + WT_ERR(__wt_calloc_one(session, &split_ref[1])); child = split_ref[1]; child->page = right; child->state = WT_REF_MEM; diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 3da0bcf346c..c08e9d9218b 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -32,10 +32,11 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth); - WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem); WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage); - WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem); + WT_STAT_SET(stats, btree_maxintlkey, btree->maxintlkey); WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(stats, btree_maxleafkey, btree->maxleafkey); + WT_STAT_SET(stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. 
*/ if (!F_ISSET(cst, WT_CONN_STAT_ALL)) diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index e0036d14cbb..e7fb75dc8cb 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -19,7 +19,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) conn = S2C(session); - WT_RET(__wt_calloc_def(session, 1, &modify)); + WT_RET(__wt_calloc_one(session, &modify)); /* * Select a spinlock for the page; let the barrier immediately below diff --git a/src/config/config_api.c b/src/config/config_api.c index 42f4c117b81..0c920af0d0e 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -84,7 +84,7 @@ wiredtiger_config_parser_open(WT_SESSION *wt_session, *config_parserp = NULL; session = (WT_SESSION_IMPL *)wt_session; - WT_RET(__wt_calloc_def(session, 1, &config_parser)); + WT_RET(__wt_calloc_one(session, &config_parser)); config_parser->iface = stds; config_parser->session = session; diff --git a/src/config/config_check.c b/src/config/config_check.c index c6fd6bbd75b..18300da8282 100644 --- a/src/config/config_check.c +++ b/src/config/config_check.c @@ -122,7 +122,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, * The new base value is the previous base value, a separator and the * new configuration string. 
*/ - WT_ERR(__wt_calloc_def(session, 1, &entry)); + WT_ERR(__wt_calloc_one(session, &entry)); entry->method = (*epp)->method; WT_ERR(__wt_calloc_def(session, strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p)); diff --git a/src/config/config_def.c b/src/config/config_def.c index 23f7b27338f..750d9843279 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -138,12 +138,15 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "huffman_value", "string", NULL, NULL }, { "id", "string", NULL, NULL }, { "internal_item_max", "int", "min=0", NULL }, + { "internal_key_max", "int", "min=0", NULL }, { "internal_key_truncate", "boolean", NULL, NULL }, { "internal_page_max", "int", "min=512B,max=512MB", NULL }, { "key_format", "format", NULL, NULL }, { "key_gap", "int", "min=0", NULL }, { "leaf_item_max", "int", "min=0", NULL }, + { "leaf_key_max", "int", "min=0", NULL }, { "leaf_page_max", "int", "min=512B,max=512MB", NULL }, + { "leaf_value_max", "int", "min=0", NULL }, { "memory_page_max", "int", "min=512B,max=10TB", NULL }, { "os_cache_dirty_max", "int", "min=0", NULL }, { "os_cache_max", "int", "min=0", NULL }, @@ -227,12 +230,15 @@ static const WT_CONFIG_CHECK confchk_session_create[] = { { "huffman_value", "string", NULL, NULL }, { "immutable", "boolean", NULL, NULL }, { "internal_item_max", "int", "min=0", NULL }, + { "internal_key_max", "int", "min=0", NULL }, { "internal_key_truncate", "boolean", NULL, NULL }, { "internal_page_max", "int", "min=512B,max=512MB", NULL }, { "key_format", "format", NULL, NULL }, { "key_gap", "int", "min=0", NULL }, { "leaf_item_max", "int", "min=0", NULL }, + { "leaf_key_max", "int", "min=0", NULL }, { "leaf_page_max", "int", "min=512B,max=512MB", NULL }, + { "leaf_value_max", "int", "min=0", NULL }, { "lsm", "category", NULL, confchk_lsm_subconfigs }, { "memory_page_max", "int", "min=512B,max=10TB", NULL }, { "os_cache_dirty_max", "int", "min=0", NULL }, @@ -567,11 +573,12 @@ static const WT_CONFIG_ENTRY 
config_entries[] = { "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=," "checksum=uncompressed,collator=,columns=,dictionary=0," "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0" - ",internal_key_truncate=,internal_page_max=4KB,key_format=u," - "key_gap=10,leaf_item_max=0,leaf_page_max=32KB," - "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0," - "prefix_compression=0,prefix_compression_min=4,split_pct=75," - "value_format=u,version=(major=0,minor=0)", + ",internal_key_max=0,internal_key_truncate=,internal_page_max=4KB" + ",key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0," + "leaf_page_max=32KB,leaf_value_max=0,memory_page_max=5MB," + "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0," + "prefix_compression_min=4,split_pct=75,value_format=u," + "version=(major=0,minor=0)", confchk_file_meta }, { "index.meta", @@ -604,8 +611,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "block_compressor=,cache_resident=0,checksum=uncompressed," "colgroups=,collator=,columns=,dictionary=0,exclusive=0," "extractor=,format=btree,huffman_key=,huffman_value=,immutable=0," - "internal_item_max=0,internal_key_truncate=,internal_page_max=4KB" - ",key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=32KB," + "internal_item_max=0,internal_key_max=0,internal_key_truncate=," + "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," + "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=," "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB," "merge_max=15,merge_min=0),memory_page_max=5MB," diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 8d104729733..551c3037f7b 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -292,7 +292,7 @@ __conn_add_collator(WT_CONNECTION *wt_conn, WT_ERR_MSG(session, EINVAL, "invalid name for a collator: %s", name); - WT_ERR(__wt_calloc_def(session, 1, &ncoll)); + WT_ERR(__wt_calloc_one(session, 
&ncoll)); WT_ERR(__wt_strdup(session, name, &ncoll->name)); ncoll->collator = collator; @@ -363,7 +363,7 @@ __conn_add_compressor(WT_CONNECTION *wt_conn, WT_ERR_MSG(session, EINVAL, "invalid name for a compressor: %s", name); - WT_ERR(__wt_calloc_def(session, 1, &ncomp)); + WT_ERR(__wt_calloc_one(session, &ncomp)); WT_ERR(__wt_strdup(session, name, &ncomp->name)); ncomp->compressor = compressor; @@ -428,7 +428,7 @@ __conn_add_data_source(WT_CONNECTION *wt_conn, CONNECTION_API_CALL(conn, session, add_data_source, config, cfg); WT_UNUSED(cfg); - WT_ERR(__wt_calloc_def(session, 1, &ndsrc)); + WT_ERR(__wt_calloc_one(session, &ndsrc)); WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix)); ndsrc->dsrc = dsrc; @@ -497,7 +497,7 @@ __conn_add_extractor(WT_CONNECTION *wt_conn, WT_ERR_MSG(session, EINVAL, "invalid name for an extractor: %s", name); - WT_ERR(__wt_calloc_def(session, 1, &nextractor)); + WT_ERR(__wt_calloc_one(session, &nextractor)); WT_ERR(__wt_strdup(session, name, &nextractor->name)); nextractor->extractor = extractor; @@ -1490,7 +1490,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_RET(__wt_library_init()); - WT_RET(__wt_calloc_def(NULL, 1, &conn)); + WT_RET(__wt_calloc_one(NULL, &conn)); conn->iface = stdc; /* diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 079bd05ff1e..61bd4447abf 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -83,7 +83,7 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, conn->cache == NULL || (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL)); - WT_RET(__wt_calloc_def(session, 1, &conn->cache)); + WT_RET(__wt_calloc_one(session, &conn->cache)); cache = conn->cache; diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index c7558eea5fb..dcc37da3b3b 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -81,7 +81,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) if 
(__wt_process.cache_pool == NULL) { WT_ASSERT(session, !reconfiguring); /* Create a cache pool. */ - WT_ERR(__wt_calloc_def(session, 1, &cp)); + WT_ERR(__wt_calloc_one(session, &cp)); created = 1; cp->name = pool_name; pool_name = NULL; /* Belongs to the cache pool now. */ diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 088ff2f3d2c..cfd99ac1f8f 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -187,7 +187,7 @@ __conn_dhandle_get(WT_SESSION_IMPL *session, * then initialize the data handle. Exclusively lock the data handle * before inserting it in the list. */ - WT_RET(__wt_calloc_def(session, 1, &dhandle)); + WT_RET(__wt_calloc_one(session, &dhandle)); WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); @@ -196,7 +196,7 @@ __conn_dhandle_get(WT_SESSION_IMPL *session, if (ckpt != NULL) WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint)); - WT_ERR(__wt_calloc_def(session, 1, &btree)); + WT_ERR(__wt_calloc_one(session, &btree)); dhandle->handle = btree; btree->dhandle = dhandle; diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 618a0934ce1..6a1a63b5abe 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -341,7 +341,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) /* * Logging is on, allocate the WT_LOG structure and open the log file. 
*/ - WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log)); + WT_RET(__wt_calloc_one(session, &conn->log)); log = conn->log; WT_RET(__wt_spin_init(session, &log->log_lock, "log")); WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot")); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 41bfaea7ee3..2c03fc55b85 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -126,7 +126,7 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, cb = NULL; - WT_RET(__wt_calloc_def(session, 1, &cb)); + WT_RET(__wt_calloc_one(session, &cb)); cursor = &cb->iface; *cursor = iface; cursor->session = &session->iface; diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index 5e7ca487ae2..b37736d1b43 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -49,7 +49,7 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, WT_UNUSED(uri); - WT_RET(__wt_calloc_def(session, 1, &cconfig)); + WT_RET(__wt_calloc_one(session, &cconfig)); cursor = &cconfig->iface; *cursor = iface; diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index 096a0e27f8d..f16cc9b33f0 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -474,7 +474,7 @@ __wt_curds_open( data_source = NULL; metaconf = NULL; - WT_RET(__wt_calloc_def(session, 1, &data_source)); + WT_RET(__wt_calloc_one(session, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 5760752d406..55b47d13a6d 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -372,7 +372,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) session = (WT_SESSION_IMPL *)child->session; - WT_RET(__wt_calloc_def(session, 1, &cdump)); + WT_RET(__wt_calloc_one(session, &cdump)); cursor = &cdump->iface; *cursor = iface; cursor->session = child->session; @@ -385,7 +385,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) 
F_SET(cursor, F_ISSET(child, WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) { - WT_ERR(__wt_calloc_def(session, 1, &json)); + WT_ERR(__wt_calloc_one(session, &json)); cursor->json_private = child->json_private = json; } diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index b516b5c58b1..2b31f75cf08 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -383,7 +383,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session, namesize = (size_t)(columns - idxname); WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx)); - WT_RET(__wt_calloc_def(session, 1, &cindex)); + WT_RET(__wt_calloc_one(session, &cindex)); cursor = &cindex->iface; *cursor = iface; diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index bdb19d05c01..0d375ee4a52 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -336,12 +336,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session, log = conn->log; cl = NULL; - WT_RET(__wt_calloc_def(session, 1, &cl)); + WT_RET(__wt_calloc_one(session, &cl)); cursor = &cl->iface; *cursor = iface; cursor->session = &session->iface; - WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn)); - WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn)); + WT_ERR(__wt_calloc_one(session, &cl->cur_lsn)); + WT_ERR(__wt_calloc_one(session, &cl->next_lsn)); WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue)); diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index d6c76c48ab9..e1e08c307fc 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -423,7 +423,7 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, WT_DECL_RET; WT_CONFIG_ITEM cval; - WT_RET(__wt_calloc_def(session, 1, &mdc)); + WT_RET(__wt_calloc_one(session, &mdc)); cursor = &mdc->iface; *cursor = iface; @@ -445,7 +445,9 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, } if (0) { -err: 
__wt_free(session, mdc); +err: if (mdc->file_cursor != NULL) + WT_TRET(mdc->file_cursor->close(mdc->file_cursor)); + __wt_free(session, mdc); } return (ret); } diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index cc12077024f..74237c6ffdc 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -503,7 +503,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, conn = S2C(session); - WT_ERR(__wt_calloc_def(session, 1, &cst)); + WT_ERR(__wt_calloc_one(session, &cst)); cursor = &cst->iface; *cursor = iface; cursor->session = &session->iface; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 50d76609411..1825d641c49 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -878,7 +878,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session, return (ret); } - WT_RET(__wt_calloc_def(session, 1, &ctable)); + WT_RET(__wt_calloc_one(session, &ctable)); cursor = &ctable->iface; *cursor = iface; diff --git a/src/docs/error-handling.dox b/src/docs/error-handling.dox index bced608434b..cf268f80500 100644 --- a/src/docs/error-handling.dox +++ b/src/docs/error-handling.dox @@ -47,7 +47,7 @@ This error is returned when an error is not covered by a specific error return. This error indicates an operation did not find a value to return. This includes cursor search and other operations where no record matched the cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove. @par <code>WT_PANIC</code> -This error indicates an underlying problem that requires the application exit and restart. +This error indicates an underlying problem that requires the application exit and restart. The application can exit immediately when \c WT_PANIC is returned from a WiredTiger interface, no further WiredTiger calls are required. 
@if IGNORE_BUILT_BY_API_ERR_END @endif diff --git a/src/docs/examples.dox b/src/docs/examples.dox index 53bd3589362..9b86df099e3 100644 --- a/src/docs/examples.dox +++ b/src/docs/examples.dox @@ -22,9 +22,6 @@ extractors and cursor types. @example ex_extractor.c Shows how to extend WiredTiger with a more complex custom extractor. -@example ex_file.c -Shows how to use file objects. - @example ex_hello.c This is an example of how to create and open a database. diff --git a/src/docs/tune-page-sizes.dox b/src/docs/tune-page-sizes.dox index b3fd20f6276..130e047a02d 100644 --- a/src/docs/tune-page-sizes.dox +++ b/src/docs/tune-page-sizes.dox @@ -1,42 +1,127 @@ -/*! @page tune_page_sizes Page and overflow item sizes - -There are four page and item size configuration values: \c internal_page_max, -\c internal_item_max, \c leaf_page_max and \c leaf_item_max. All four are -specified to the WT_SESSION::create method, that is, they are configurable -on a per-file basis. - -The \c internal_page_max and \c leaf_page_max configuration values specify -the maximum size for Btree internal and leaf pages. That is, when an -internal or leaf page grows past the specified size, it splits into -multiple pages. Generally, internal pages should be sized to fit into -the system's on-chip caches in order to minimize cache misses when -searching the tree, while leaf pages should be sized to maximize I/O -performance (if reading from disk is necessary, it is usually desirable -to read a large amount of data, assuming some locality of reference in -the application's access pattern). - -The \c internal_item_max and \c leaf_item_max configuration values specify -the maximum size at which an object will be stored on-page. Larger items -will be stored separately in the file from the page where the item logically -appears. Referencing overflow items is more expensive than referencing -on-page items, requiring additional I/O if the object is not already cached. 
-For this reason, it is important to avoid creating large numbers of overflow -items that are repeatedly referenced, and the maximum item size should -probably be increased if many overflow items are being created. Because -pages must be large enough to store any item that is not an overflow item, -increasing the size of the overflow items may also require increasing the -page sizes. - -With respect to compression, page and item sizes do not necessarily reflect -the actual size of the page or item on disk, if block compression has been -configured. Block compression in WiredTiger happens within the disk I/O -subsystem, and so a page might split even if subsequent compression would -result in a resulting page size that would be small enough to leave as a -single page. In other words, page and overflow sizes are based on in-memory -sizes, not disk sizes. - -There are two other, related configuration values, also settable by the -WT_SESSION::create method. They are \c allocation_size and \c split_pct. +/*! @page tune_page_sizes Page and overflow key/value sizes + +There are seven page and key/value size configuration strings: + +- allocation size (\c allocation_size), +- page sizes (\c internal_page_max and \c leaf_page_max), +- key and value sizes (\c internal_key_max, \c leaf_key_max and \c leaf_value_max), and the +- page-split percentage (\c split_pct). + +All seven are specified to the WT_SESSION::create method, in other +words, they are configurable on a per-file basis. + +Applications commonly configure page sizes, based on their workload's +typical key and value size. Once the correct page size has been chosen, +appropriate defaults for the other configuration values are derived from +the page sizes, and relatively few applications will need to modify the +other page and key/value size configuration options. 
+ +An example of configuring page and key/value sizes: + +@snippet ex_all.c Create a table and configure the page size + +@section tune_page_sizes_sizes Page, key and value sizes + +The \c internal_page_max and \c leaf_page_max configuration values +specify a maximum size for Btree internal and leaf pages. That is, when +an internal or leaf page grows past that size, it splits into multiple +pages. Generally, internal pages should be sized to fit into on-chip +caches in order to minimize cache misses when searching the tree, while +leaf pages should be sized to maximize I/O performance (if reading from +disk is necessary, it is usually desirable to read a large amount of +data, assuming some locality of reference in the application's access +pattern). + +The default page size configurations (4KB for \c internal_page_max, 32KB +for \c leaf_page_max) are appropriate for applications with relatively +small keys and values. + +- Applications doing full-table scans through out-of-memory workloads +might increase both internal and leaf page sizes to transfer more data +per I/O. +- Applications focused on read/write amplification might decrease the page +size to better match the underlying storage block size. + +When block compression has been configured, configured page sizes will +not match the actual size of the page on disk. Block compression in +WiredTiger happens within the I/O subsystem, and so a page might split +even if subsequent compression would produce a page size +small enough to leave as a single page. In other words, page sizes are +based on in-memory sizes, not on-disk sizes. Applications needing to +write specific sized blocks may want to consider implementing a +WT_COMPRESSOR::compress_raw function. + +The page sizes also determine the default size of overflow items, that +is, keys and values too large to easily store on a page. 
Overflow items +are stored separately in the file from the page where the item logically +appears, and so reading or writing an overflow item is more expensive +than an on-page item, normally requiring additional I/O. Additionally, +overflow values are not cached in memory. This means overflow items +won't affect the caching behavior of the application, but it also means +that each time an overflow value is read, it is re-read from disk. + +For both of these reasons, applications should avoid creating large +numbers of commonly referenced overflow items. This is especially +important for keys, as keys on internal pages are referenced during +random searches, not just during data retrieval. Generally, +applications should make every attempt to avoid creating overflow keys. + +- Applications with large keys and values, and concerned with latency, +might increase the page size to avoid creating overflow items, in order +to avoid the additional cost of retrieving them. + +- Applications with large keys and values, doing random searches, might +decrease the page size to avoid wasting cache space on overflow items +that aren't likely to be needed. + +- Applications with large keys and values, doing table scans, might +increase the page size to avoid creating overflow items, as the overflow +items must be read into memory in all cases, anyway. + +The \c internal_key_max, \c leaf_key_max and \c leaf_value_max +configuration values allow applications to change the size at which a +key or value will be treated as an overflow item. + +The value of \c internal_key_max is relative to the maximum internal +page size. Because the number of keys on an internal page determines +the depth of the tree, the \c internal_key_max value can only be +adjusted within a certain range, and the configured value will be +automatically adjusted by WiredTiger, if necessary to ensure a +reasonable number of keys fit on an internal page. 
+ +The values of \c leaf_key_max and \c leaf_value_max are not relative to +the maximum leaf page size. If either is larger than the maximum page +size, the page size will be ignored when the larger keys and values are +being written, and a larger page will be created as necessary. + +Most applications should not need to tune the maximum key and value +sizes. Applications requiring a small page size, but also having +latency concerns such that the additional work to retrieve an overflow +item is an issue, may find them useful. + +An example of configuring a large leaf overflow value: + +@snippet ex_all.c Create a table and configure a large leaf value max + +@section tune_page_sizes_split_percentage Split percentage + +The \c split_pct configuration string configures the size of a split +page. When a page grows sufficiently large that it must be written as +multiple disk blocks, the newly written block size is \c split_pct +percent of the maximum page size. This value should be selected to +avoid creating a large number of tiny pages or repeatedly splitting +whenever new entries are inserted. For example, if the maximum page +size is 1MB, a \c split_pct value of 10% would potentially result in +creating a large number of 100KB pages, which may not be optimal for +future I/O. Or, if the maximum page size is 1MB, a \c split_pct value +of 90% would potentially result in repeatedly splitting pages as the +split pages grow to 1MB over and over. The default value for \c +split_pct is 75%, intended to keep large pages relatively large, while +still giving split pages room to grow. + +Most applications should not need to tune the split percentage size. + +@section tune_page_sizes_allocation_size Allocation size The \c allocation_size configuration value is the underlying unit of allocation for the file. As the unit of file allocation, it sets the @@ -46,25 +131,12 @@ is set to 4KB, an overflow item of 18,000 bytes requires 5 allocation units and wastes about 2KB of space. 
If the allocation size is 16KB, the same overflow item would waste more than 10KB. -The default allocation size is 4KB, chosen for compatibility with virtual -memory page sizes and direct I/O requirements on common server platforms. - -The last configuration value is \c split_pct, which configures the size -of a split page. When a page grows sufficiently large that it must be -written as multiple disk blocks, the newly written block size is \c -split_pct percent of the maximum page size. This value should be -selected to avoid creating a large number of tiny pages or repeatedly -splitting whenever new entries are inserted. For example, if the -maximum page size is 1MB, a \c split_pct value of 10% would potentially -result in creating a large number of 100KB pages, which may not be -optimal for future I/O. Or, if the maximum page size is 1MB, a \c -split_pct value of 90% would potentially result in repeatedly splitting -pages as the split pages grow to 1MB over and over. The default value -for \c split_pct is 75%, intended to keep large pages relatively large, -while still giving split pages room to grow. - -An example of configuring page sizes: +The default allocation size is 4KB, chosen for compatibility with +virtual memory page sizes and direct I/O requirements on common server +platforms. -@snippet ex_file.c file create +Most applications should not need to tune the allocation size; it is +primarily intended for applications coping with the specific +requirements some file systems make to support features like direct I/O. - */ +*/ diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 0e750ae0ca1..0fb858643fd 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -17,6 +17,14 @@ Collators, compressors and extractors can now be disabled with an explicit using the name \c "none" for a collator, compressor or extractor will need to be updated. 
</dd> + +<dt>maximum keys and value sizes +<dd> +The WT_SESSION::create \c internal_item_max and \c leaf_item_max +configuration strings are now deprecated in favor of the +\c internal_key_max, \c leaf_key_max, and \c leaf_value_max +configuration strings. See @ref tune_page_sizes for more information. +</dd> </dl> @section version_241 Upgrading to Version 2.4.1 diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index fa3bfa50eb0..bc791de6d0f 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -206,7 +206,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ - WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); + WT_RET(__wt_calloc_one(session, &addr)); *addr = mod->mod_replace; mod->mod_replace.addr = NULL; mod->mod_replace.size = 0; diff --git a/src/include/btree.h b/src/include/btree.h index 907b36c9ed4..e7c1826bda9 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -83,9 +83,10 @@ struct __wt_btree { uint32_t allocsize; /* Allocation size */ uint32_t maxintlpage; /* Internal page max size */ - uint32_t maxintlitem; /* Internal page max item size */ + uint32_t maxintlkey; /* Internal page max key size */ uint32_t maxleafpage; /* Leaf page max size */ - uint32_t maxleafitem; /* Leaf page max item size */ + uint32_t maxleafkey; /* Leaf page max key size */ + uint32_t maxleafvalue; /* Leaf page max value size */ uint64_t maxmempage; /* In memory page max size */ void *huffman_key; /* Key huffman encoding */ diff --git a/src/include/extern.h b/src/include/extern.h index 5a5601160c8..00bbdaf746c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -122,7 +122,6 @@ extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int 
__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on); -extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); @@ -496,6 +495,7 @@ extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags); +extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); diff --git a/src/include/misc.h b/src/include/misc.h index c861dff18bc..c2abaa08057 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -65,11 +65,13 @@ #define WT_SKIP_PROBABILITY (UINT32_MAX >> 2) /* - * __wt_calloc_def -- - * Simple calls don't need separate sizeof arguments. + * __wt_calloc_def, __wt_calloc_one -- + * Most calloc calls don't need separate count or sizeof arguments. 
*/ #define __wt_calloc_def(session, number, addr) \ __wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr) +#define __wt_calloc_one(session, addr) \ + __wt_calloc(session, (size_t)1, sizeof(**(addr)), addr) /* * __wt_realloc_def -- diff --git a/src/include/stat.h b/src/include/stat.h index 37df43adfee..69fa0ba8e4f 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -287,10 +287,11 @@ struct __wt_dsrc_stats { WT_STATS btree_entries; WT_STATS btree_fixed_len; WT_STATS btree_maximum_depth; - WT_STATS btree_maxintlitem; + WT_STATS btree_maxintlkey; WT_STATS btree_maxintlpage; - WT_STATS btree_maxleafitem; + WT_STATS btree_maxleafkey; WT_STATS btree_maxleafpage; + WT_STATS btree_maxleafvalue; WT_STATS btree_overflow; WT_STATS btree_row_internal; WT_STATS btree_row_leaf; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 100bc771798..9aa219eccfc 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -762,6 +762,18 @@ struct __wt_session { /*! The connection for this session. */ WT_CONNECTION *connection; + /* + * Don't expose app_private to non-C language bindings - they have + * their own way to attach data to an operation. + */ +#if !defined(SWIG) + /*! + * A location for applications to store information that will be + * available in callbacks taking a WT_SESSION handle. + */ + void *app_private; +#endif + /*! * Close the session handle. * @@ -997,12 +1009,12 @@ struct __wt_session { * @config{immutable, configure the index to be immutable - that is an * index is not changed by any update to a record in the table., a * boolean flag; default \c false.} - * @config{internal_item_max, the largest key stored within an internal - * node\, in bytes. If non-zero\, any key larger than the specified - * size will be stored as an overflow item (which may require additional - * I/O to access). 
If zero\, a default size is chosen that permits at - * least 8 keys per internal page., an integer greater than or equal to - * 0; default \c 0.} + * @config{internal_key_max, the largest key stored in an internal + * node\, in bytes. If set\, keys larger than the specified size are + * stored as overflow items (which may require additional I/O to + * access). The default and the maximum allowed value are both one-tenth + * the size of a newly split internal page., an integer greater than or + * equal to 0; default \c 0.} * @config{internal_key_truncate, configure internal key truncation\, * discarding unnecessary trailing bytes on internal keys (ignored for * custom collators)., a boolean flag; default \c true.} @@ -1020,12 +1032,11 @@ struct __wt_session { * row-store files: keys of type \c 'r' are record numbers and records * referenced by record number are stored in column-store files., a * format string; default \c u.} - * @config{leaf_item_max, the largest key or value stored within a leaf - * node\, in bytes. If non-zero\, any key or value larger than the - * specified size will be stored as an overflow item (which may require - * additional I/O to access). If zero\, a default size is chosen that - * permits at least 4 key and value pairs per leaf page., an integer - * greater than or equal to 0; default \c 0.} + * @config{leaf_key_max, the largest key stored in a leaf node\, in + * bytes. If set\, keys larger than the specified size are stored as + * overflow items (which may require additional I/O to access). 
The + * default value is one-tenth the size of a newly split leaf page., an + * integer greater than or equal to 0; default \c 0.} * @config{leaf_page_max, the maximum page size for leaf nodes\, in * bytes; the size must be a multiple of the allocation size\, and is * significant for applications wanting to maximize sequential data @@ -1033,6 +1044,13 @@ struct __wt_session { * uncompressed data\, that is\, the limit is applied before any block * compression is done., an integer between 512B and 512MB; default \c * 32KB.} + * @config{leaf_value_max, the largest value stored in a leaf node\, in + * bytes. If set\, values larger than the specified size are stored as + * overflow items (which may require additional I/O to access). If the + * size is larger than the maximum leaf page size\, the page size is + * temporarily ignored when large values are written. The default is + * one-half the size of a newly split leaf page., an integer greater + * than or equal to 0; default \c 0.} * @config{lsm = (, options only relevant for LSM data sources., a set * of related configuration options defined below.} * @config{ auto_throttle, Throttle inserts into @@ -2056,6 +2074,11 @@ struct __wt_event_handler { * Callback to handle error messages; by default, error messages are * written to the stderr stream. * + * Errors that require the application to exit and restart will have + * their \c error value set to \c WT_PANIC. The application can exit + * immediately when \c WT_PANIC is passed to an error handler, there + * is no reason to return into WiredTiger. + * * Error handler returns are not ignored: if the handler returns * non-zero, the error may cause the WiredTiger function posting the * event to fail, and may even cause operation or library failure. @@ -2526,7 +2549,9 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); /*! * WiredTiger library panic. * This error indicates an underlying problem that requires the application exit - * and restart. 
+ * and restart. The application can exit immediately when \c WT_PANIC is + * returned from a WiredTiger interface, no further WiredTiger calls are + * required. */ #define WT_PANIC -31804 /*! @cond internal */ @@ -2642,7 +2667,7 @@ struct __wt_compressor { * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified, * the destination buffer will be at least the size returned by that * method; otherwise, the destination buffer will be at least as large - * as \c src_len. + * as the length of the data to compress. * * If compression would not shrink the data or the \c dst buffer is not * large enough to hold the compressed data, the callback should set @@ -2712,10 +2737,8 @@ struct __wt_compressor { * On entry, \c dst points to the destination buffer with a length * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified, * the destination buffer will be at least the size returned by that - * method; otherwise, the destination buffer will be at least the - * maximum size for the page being written (that is, when writing a - * row-store leaf page, the destination buffer will be at least as - * large as the \c leaf_page_max configuration value). + * method; otherwise, the destination buffer will be at least as large + * as the length of the data to compress. * * After successful completion, the callback should return \c 0, and * set \c result_slotsp to the number of byte strings encoded and @@ -3378,130 +3401,132 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_BTREE_FIXED_LEN 2023 /*! btree: maximum tree depth */ #define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024 -/*! btree: maximum internal page item size */ -#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025 +/*! btree: maximum internal page key size */ +#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2025 /*! btree: maximum internal page size */ #define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026 -/*! 
btree: maximum leaf page item size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027 +/*! btree: maximum leaf page key size */ +#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2027 /*! btree: maximum leaf page size */ #define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028 +/*! btree: maximum leaf page value size */ +#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2029 /*! btree: overflow pages */ -#define WT_STAT_DSRC_BTREE_OVERFLOW 2029 +#define WT_STAT_DSRC_BTREE_OVERFLOW 2030 /*! btree: row-store internal pages */ -#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030 +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2031 /*! btree: row-store leaf pages */ -#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031 +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2032 /*! cache: bytes read into cache */ -#define WT_STAT_DSRC_CACHE_BYTES_READ 2032 +#define WT_STAT_DSRC_CACHE_BYTES_READ 2033 /*! cache: bytes written from cache */ -#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033 +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2034 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034 +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2035 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2036 /*! cache: modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036 +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2037 /*! cache: data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2038 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038 +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2039 /*! cache: internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2040 /*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2040 +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2041 /*! 
cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2041 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2042 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2042 +#define WT_STAT_DSRC_CACHE_READ 2043 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2043 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2044 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2044 +#define WT_STAT_DSRC_CACHE_WRITE 2045 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2045 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2046 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2046 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2047 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2047 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2048 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2048 +#define WT_STAT_DSRC_COMPRESS_READ 2049 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2049 +#define WT_STAT_DSRC_COMPRESS_WRITE 2050 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2050 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2051 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2051 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2052 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2052 +#define WT_STAT_DSRC_CURSOR_CREATE 2053 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2053 +#define WT_STAT_DSRC_CURSOR_INSERT 2054 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2054 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2055 /*! 
cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2055 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2056 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2056 +#define WT_STAT_DSRC_CURSOR_NEXT 2057 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2057 +#define WT_STAT_DSRC_CURSOR_PREV 2058 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2058 +#define WT_STAT_DSRC_CURSOR_REMOVE 2059 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2059 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2060 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2060 +#define WT_STAT_DSRC_CURSOR_RESET 2061 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2061 +#define WT_STAT_DSRC_CURSOR_SEARCH 2062 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2062 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2063 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2063 +#define WT_STAT_DSRC_CURSOR_UPDATE 2064 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2064 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2065 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2065 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2066 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2066 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2067 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2067 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2068 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2068 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2069 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2069 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2070 /*! 
reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2070 +#define WT_STAT_DSRC_REC_DICTIONARY 2071 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2071 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2072 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2072 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2073 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2073 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2074 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2075 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2076 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2077 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2077 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2078 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2078 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2079 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2079 +#define WT_STAT_DSRC_REC_PAGES 2080 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2081 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2081 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2082 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2082 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2083 /*! 
session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2083 +#define WT_STAT_DSRC_SESSION_COMPACT 2084 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2085 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2086 /*! @} */ /* * Statistics section: END diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 103a506287d..dd60ad926d8 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -322,14 +322,15 @@ __clsm_deleted_encode(WT_SESSION_IMPL *session, * Decode values that start with the tombstone. */ static inline void -__clsm_deleted_decode(WT_ITEM *value) +__clsm_deleted_decode(WT_CURSOR_LSM *clsm, WT_ITEM *value) { /* * Take care with this check: when an LSM cursor is used for a merge, * and/or to create a Bloom filter, it is valid to return the tombstone * value. */ - if (value->size > __tombstone.size && + if (!F_ISSET(clsm, WT_CLSM_MERGE) && + value->size > __tombstone.size && memcmp(value->data, __tombstone.data, __tombstone.size) == 0) --value->size; } @@ -840,7 +841,7 @@ retry: /* err: WT_TRET(__clsm_leave(clsm)); API_END(session, ret); if (ret == 0) - __clsm_deleted_decode(&cursor->value); + __clsm_deleted_decode(clsm, &cursor->value); return (ret); } @@ -928,7 +929,7 @@ retry: /* err: WT_TRET(__clsm_leave(clsm)); API_END(session, ret); if (ret == 0) - __clsm_deleted_decode(&cursor->value); + __clsm_deleted_decode(clsm, &cursor->value); return (ret); } @@ -1087,7 +1088,7 @@ __clsm_search(WT_CURSOR *cursor) err: WT_TRET(__clsm_leave(clsm)); API_END(session, ret); if (ret == 0) - __clsm_deleted_decode(&cursor->value); + __clsm_deleted_decode(clsm, &cursor->value); return (ret); } @@ -1173,8 +1174,7 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) * smallest cursor larger than the search key, or it is NULL if the * search key is larger than any record in the tree. 
*/ - if (!exact) - cmp = 1; + cmp = exact ? 0 : 1; /* * If we land on a deleted item, try going forwards or backwards to @@ -1189,7 +1189,9 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) clsm->current = closest; closest = NULL; deleted = __clsm_deleted(clsm, &cursor->value); - if (deleted && (ret = cursor->next(cursor)) == 0) { + if (!deleted) + __clsm_deleted_decode(clsm, &cursor->value); + else if ((ret = cursor->next(cursor)) == 0) { cmp = 1; deleted = 0; } @@ -1197,8 +1199,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) } if (deleted) { clsm->current = NULL; - if ((ret = cursor->prev(cursor)) == 0) - cmp = -1; + WT_ERR(cursor->prev(cursor)); + cmp = -1; } *exactp = cmp; @@ -1210,7 +1212,6 @@ err: WT_TRET(__clsm_leave(clsm)); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); if (ret == 0) { F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - __clsm_deleted_decode(&cursor->value); } else clsm->current = NULL; @@ -1460,7 +1461,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session, ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); WT_RET(ret); - WT_ERR(__wt_calloc_def(session, 1, &clsm)); + WT_ERR(__wt_calloc_one(session, &clsm)); cursor = &clsm->iface; *cursor = iface; diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 1356d336f6e..248ac70c61e 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -645,7 +645,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts)); - WT_RET(__wt_calloc_def(session, 1, &entry)); + WT_RET(__wt_calloc_one(session, &entry)); entry->type = type; entry->flags = flags; entry->lsm_tree = lsm_tree; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 9ed605724ce..8989e979a44 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -311,7 +311,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) lsm_tree->name, verb, lsm_tree->chunk[verb]->id)); } - WT_ERR(__wt_calloc_def(session, 1, &chunk)); + 
WT_ERR(__wt_calloc_one(session, &chunk)); created_chunk = 1; chunk->id = dest_id; diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index bf03588c066..7fd77b64720 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -91,8 +91,8 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); - WT_ERR(__wt_calloc_def( - session, 1, &chunk)); + WT_ERR( + __wt_calloc_one(session, &chunk)); lsm_tree->chunk[nchunks++] = chunk; chunk->id = (uint32_t)lv.val; WT_ERR(__wt_lsm_tree_chunk_name(session, @@ -136,7 +136,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_ERR(__wt_realloc_def(session, &lsm_tree->old_alloc, nchunks + 1, &lsm_tree->old_chunks)); - WT_ERR(__wt_calloc_def(session, 1, &chunk)); + WT_ERR(__wt_calloc_one(session, &chunk)); lsm_tree->old_chunks[nchunks++] = chunk; WT_ERR(__wt_strndup(session, lk.str, lk.len, &chunk->uri)); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 888f12bdd94..e7b1d7f9d2c 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -332,7 +332,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); - WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + WT_RET(__wt_calloc_one(session, &lsm_tree)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); @@ -551,7 +551,7 @@ __lsm_tree_open( return (ret); /* Try to open the tree. 
*/ - WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); + WT_RET(__wt_calloc_one(session, &lsm_tree)); WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); @@ -820,7 +820,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) "merge throttle %ld", lsm_tree->name, new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle)); - WT_ERR(__wt_calloc_def(session, 1, &chunk)); + WT_ERR(__wt_calloc_one(session, &chunk)); chunk->id = new_id; chunk->switch_txn = WT_TXN_NONE; lsm_tree->chunk[lsm_tree->nchunks++] = chunk; @@ -1011,7 +1011,7 @@ __wt_lsm_tree_truncate( locked = 1; /* Create the new chunk. */ - WT_ERR(__wt_calloc_def(session, 1, &chunk)); + WT_ERR(__wt_calloc_one(session, &chunk)); chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1); WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c index 91410c54c04..cb9fe314beb 100644 --- a/src/os_posix/os_dlopen.c +++ b/src/os_posix/os_dlopen.c @@ -17,7 +17,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp) WT_DECL_RET; WT_DLH *dlh; - WT_RET(__wt_calloc_def(session, 1, &dlh)); + WT_RET(__wt_calloc_one(session, &dlh)); WT_ERR(__wt_strdup(session, path, &dlh->name)); if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL) diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index 3a76cceb3f0..479a61db795 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -22,7 +22,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * !!! * This function MUST handle a NULL session handle. 
*/ - WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond)); + WT_RET(__wt_calloc_one(session, &cond)); WT_ERR(pthread_mutex_init(&cond->mtx, NULL)); diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c index 1a692f71dce..c6cfa9412a7 100644 --- a/src/os_posix/os_mtx_rw.c +++ b/src/os_posix/os_mtx_rw.c @@ -53,7 +53,7 @@ __wt_rwlock_alloc( WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name)); - WT_RET(__wt_calloc_def(session, 1, &rwlock)); + WT_RET(__wt_calloc_one(session, &rwlock)); rwlock->name = name; diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 736ed2be377..a0da1952101 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -145,7 +145,7 @@ setupfh: WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); #endif - WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); + WT_ERR(__wt_calloc_one(session, &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); fh->fd = fd; fh->ref = 1; diff --git a/src/os_win/os_dir.c b/src/os_win/os_dir.c index 076c64670d4..ab332e01186 100644 --- a/src/os_win/os_dir.c +++ b/src/os_win/os_dir.c @@ -38,7 +38,7 @@ __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, path[pathlen - 1] = '\0'; } - WT_ERR(__wt_scr_alloc(session, 0, &pathbuf)); + WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf)); WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path)); dirallocsz = 0; @@ -96,7 +96,7 @@ err: if (findhandle != INVALID_HANDLE_VALUE) (void)FindClose(findhandle); __wt_free(session, path); - __wt_buf_free(session, pathbuf); + __wt_scr_free(&pathbuf); if (ret == 0) return (0); diff --git a/src/os_win/os_dlopen.c b/src/os_win/os_dlopen.c index ebc90edd2b2..3fdd0c74b1f 100644 --- a/src/os_win/os_dlopen.c +++ b/src/os_win/os_dlopen.c @@ -17,7 +17,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp) WT_DECL_RET; WT_DLH *dlh; - WT_RET(__wt_calloc_def(session, 1, &dlh)); + WT_RET(__wt_calloc_one(session, &dlh)); WT_ERR(__wt_strdup(session, path, 
&dlh->name)); /* NULL means load from the current binary */ diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 9c9907bd8be..a33ab4e5c37 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -21,7 +21,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * !!! * This function MUST handle a NULL session handle. */ - WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond)); + WT_RET(__wt_calloc_one(session, &cond)); InitializeCriticalSection(&cond->mtx); diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c index 6bdbaa3f065..f9d47c5be5d 100644 --- a/src/os_win/os_open.c +++ b/src/os_win/os_open.c @@ -130,7 +130,7 @@ __wt_open(WT_SESSION_IMPL *session, "open failed for secondary handle: %s", path); setupfh: - WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); + WT_ERR(__wt_calloc_one(session, &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); fh->filehandle = filehandle; fh->filehandle_secondary = filehandle_secondary; diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c index efbbd5d9adb..a35a3555458 100644 --- a/src/packing/pack_stream.c +++ b/src/packing/pack_stream.c @@ -30,7 +30,7 @@ wiredtiger_pack_start(WT_SESSION *wt_session, WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - WT_RET(__wt_calloc_def(session, 1, &ps)); + WT_RET(__wt_calloc_one(session, &ps)); WT_ERR(__pack_init(session, &ps->pack, format)); ps->p = ps->start = buffer; ps->end = ps->p + len; diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c index 92282393a23..fdf8ee6d68b 100644 --- a/src/reconcile/rec_track.c +++ b/src/reconcile/rec_track.c @@ -21,7 +21,7 @@ static int __ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) { - return (__wt_calloc_def(session, 1, &page->modify->ovfl_track)); + return (__wt_calloc_one(session, &page->modify->ovfl_track)); } /* diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index c72447ae841..839ab028afd 100644 --- a/src/reconcile/rec_write.c +++ 
b/src/reconcile/rec_write.c @@ -96,16 +96,15 @@ typedef struct { * image size. * * First, the sizes of the page we're building. If WiredTiger is doing - * page layout, page_size is the same as page_size_max. We accumulate - * the maximum page size of raw data and when we reach that size, we - * split the page into multiple chunks, eventually compressing those - * chunks. When the application is doing page layout (raw compression - * is configured), page_size can continue to grow past page_size_max, - * and we keep accumulating raw data until the raw compression callback - * accepts it. + * page layout, page_size is the same as page_size_orig. We accumulate + * a "page size" of raw data and when we reach that size, we split the + * page into multiple chunks, eventually compressing those chunks. When + * the application is doing page layout (raw compression is configured), + * page_size can continue to grow past page_size_orig, and we keep + * accumulating raw data until the raw compression callback accepts it. */ - uint32_t page_size; /* Current page size */ - uint32_t page_size_max; /* Maximum on-disk page size */ + uint32_t page_size; /* Set page size */ + uint32_t page_size_orig; /* Saved set page size */ /* * Second, the split size: if we're doing the page layout, split to a @@ -202,9 +201,8 @@ typedef struct { * because we've already been forced to split. 
*/ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ - SPLIT_MAX=1, /* Next: the maximum page boundary */ - SPLIT_TRACKING_OFF=2, /* No boundary checks */ - SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ + SPLIT_TRACKING_OFF=1, /* No boundary checks */ + SPLIT_TRACKING_RAW=2 } /* Underlying compression decides */ bnd_state; /* @@ -591,7 +589,7 @@ __rec_write_init(WT_SESSION_IMPL *session, page = ref->page; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { - WT_RET(__wt_calloc_def(session, 1, &r)); + WT_RET(__wt_calloc_one(session, &r)); *(WT_RECONCILE **)reconcilep = r; session->reconcile_cleanup = __rec_destroy_session; @@ -1284,7 +1282,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) */ WT_ASSERT(session, r->space_avail >= size); WT_ASSERT(session, - WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size)); + WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->dsk.memsize)); r->entries += v; r->space_avail -= size; @@ -1543,6 +1541,37 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. + */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + uintmax_t a; + uint32_t split_size; + + /* + * Ideally, the split page size is some percentage of the maximum page + * size rounded to an allocation unit (round to an allocation unit so + * we don't waste space when we write). + */ + a = maxpagesize; /* Don't overflow. */ + split_size = (uint32_t) + WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize); + + /* + * If the result of that calculation is the same as the allocation unit + * (that happens if the maximum size is the same size as an allocation + * unit, use a percentage of the maximum page size). 
+ */ + if (split_size == btree->allocsize) + split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); + + return (split_size); +} + +/* * __rec_split_init -- * Initialization for the reconciliation split functions. */ @@ -1576,7 +1605,7 @@ __rec_split_init(WT_SESSION_IMPL *session, * we don't want to increment our way up to the amount of data needed by * the application to successfully compress to the target page size. */ - r->page_size = r->page_size_max = max; + r->page_size = r->page_size_orig = max; if (r->raw_compression) r->page_size *= 10; @@ -1632,11 +1661,11 @@ __rec_split_init(WT_SESSION_IMPL *session, r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); } else if (page->type == WT_PAGE_COL_FIX) { - r->split_size = r->page_size_max; + r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); } else { - r->split_size = __wt_split_page_size(btree, r->page_size_max); + r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); } @@ -1853,17 +1882,45 @@ err: __wt_scr_free(&update); } /* + * __rec_split_grow -- + * Grow the split buffer. + */ +static int +__rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) +{ + WT_BM *bm; + WT_BTREE *btree; + size_t corrected_page_size, len; + + btree = S2BT(session); + bm = btree->bm; + + len = WT_PTRDIFF(r->first_free, r->dsk.mem); + corrected_page_size = len + add_len; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size)); + r->first_free = (uint8_t *)r->dsk.mem + len; + WT_ASSERT(session, corrected_page_size >= len); + r->space_avail = corrected_page_size - len; + WT_ASSERT(session, r->space_avail >= add_len); + return (0); +} + +/* * __rec_split -- * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper" * has 3 doubled letters in a row? Sweet-tooth does, too.) 
*/ static int -__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) { - WT_BTREE *btree; WT_BOUNDARY *last, *next; + WT_BTREE *btree; WT_PAGE_HEADER *dsk; - uint32_t len; + size_t len; + + btree = S2BT(session); + dsk = r->dsk.mem; /* * We should never split during salvage, and we're about to drop core @@ -1874,45 +1931,20 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - /* - * Handle page-buffer size tracking; we have to do this work in every - * reconciliation loop, and I don't want to repeat the code that many - * times. - */ - btree = S2BT(session); - dsk = r->dsk.mem; - /* Hitting a page boundary resets the dictionary, in all cases. */ __rec_dictionary_reset(r); - /* - * There are 3 cases we have to handle. - * - * #1 - * About to cross a split boundary: save current boundary information - * and return. - * - * #2 - * About to cross the maximum boundary: use saved boundary information - * to write all of the split pages. - * - * #3 - * About to cross a split boundary, but we've either already done the - * split thing when we approached the maximum boundary, in which - * case we write the page and keep going, or we were never tracking - * split boundaries at all. - * - * Cases #1 and #2 are the hard ones: we're called when we're about to - * cross each split boundary, and we save information away so we can - * split if we have to. We're also called when we're about to cross - * the maximum page boundary: in that case, we do the actual split and - * clean up all the previous boundaries, then keep going. - */ switch (r->bnd_state) { - case SPLIT_BOUNDARY: /* Case #1 */ + case SPLIT_BOUNDARY: + /* We can get here if the first key/value pair won't fit. */ + if (r->entries == 0) + break; + /* - * Save the information about where we are when the split would - * have happened. 
+ * About to cross a split boundary but not yet forced to split + * into multiple pages. If we have to split, this is one of the + * split points, save information about where we are when the + * split would have happened. */ WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next++]; @@ -1939,37 +1971,50 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * Set the space available to another split-size chunk, if we * have one. If we don't have room for another split chunk, - * add whatever space remains in the maximum page size, and - * hope it's enough. + * add whatever space remains in this page. */ len = WT_PTRDIFF32(r->first_free, dsk); if (len + r->split_size <= r->page_size) r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); else { - r->bnd_state = SPLIT_MAX; + WT_ASSERT(session, r->page_size >= + (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); r->space_avail = r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); } - break; - case SPLIT_MAX: /* Case #2 */ + + /* If the next object fits into this page, we're good to go. */ + if (r->space_avail >= next_len) + return (0); + /* - * It didn't all fit into a single page. + * We're going to have to split and create multiple pages. * * Cycle through the saved split-point information, writing the - * split chunks we have tracked. + * split chunks we have tracked. The underlying fixup function + * sets the space available and other information, and copied + * any unwritten chunk of data to the beginning of the buffer. */ WT_RET(__rec_split_fixup(session, r)); - - /* We're done saving split chunks. */ - r->bnd_state = SPLIT_TRACKING_OFF; break; - case SPLIT_TRACKING_OFF: /* Case #3 */ + case SPLIT_TRACKING_OFF: + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current data if + * we haven't already consumed a reasonable portion of the page. 
+ */ + if (r->entries == 0) + break; + if (WT_PTRDIFF(r->first_free, r->dsk.mem) < r->page_size / 2) + break; + /* - * It didn't all fit, but either we've already noticed it and - * are now processing the rest of the page at the split-size - * boundaries, or the split size was the same as the page size, - * so we never bothered with saving split-point information. + * The key/value pairs didn't fit into a single page, but either + * we've already noticed that and are now processing the rest of + * the pairs at split size boundaries, or the split size was the + * same as the page size, and we never bothered with split point + * information at all. */ WT_RET(__rec_split_bnd_grow(session, r)); last = &r->bnd[r->bnd_next++]; @@ -2007,6 +2052,24 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) case SPLIT_TRACKING_RAW: WT_ILLEGAL_VALUE(session); } + + /* + * Overflow values can be larger than the maximum page size but still be + * "on-page". If the next key/value pair is larger than space available + * after a split has happened (in other words, larger than the maximum + * page size), create a page sized to hold that one key/value pair. This + * generally splits the page into key/value pairs before a large object, + * the object, and key/value pairs after the object. It's possible other + * key/value pairs will also be aggregated onto the bigger page before + * or after, if the page happens to hold them, but it won't necessarily + * happen that way. + */ + if (r->space_avail < next_len) + WT_RET(__rec_split_grow(session, r, next_len)); + + /* We're done saving split chunks. */ + r->bnd_state = SPLIT_TRACKING_OFF; + return (0); } @@ -2015,8 +2078,8 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) * Handle the raw compression page reconciliation bookkeeping. 
*/ static int -__rec_split_raw_worker( - WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows) +__rec_split_raw_worker(WT_SESSION_IMPL *session, + WT_RECONCILE *r, size_t next_len, int no_more_rows) { WT_BM *bm; WT_BOUNDARY *last, *next; @@ -2048,6 +2111,12 @@ __rec_split_raw_worker( next = last + 1; /* + * We can get here if the first key/value pair won't fit. + */ + if (r->entries == 0) + goto split_grow; + + /* * Build arrays of offsets and cumulative counts of cells and rows in * the page: the offset is the byte offset to the possible split-point * (adjusted for an initial chunk that cannot be compressed), entries @@ -2150,27 +2219,29 @@ __rec_split_raw_worker( WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP); /* - * Allocate a destination buffer. If there's a pre-size function, use - * it to determine the destination buffer's minimum size, otherwise the - * destination buffer is documented to be at least the maximum object - * size. + * Allocate a destination buffer. If there's a pre-size function, call + * it to determine the destination buffer's size, else the destination + * buffer is documented to be at least the source size. (We can't use + * the target page size, any single key/value could be larger than the + * page size. Don't bother figuring out a minimum, just use the source + * size.) * - * The destination buffer really only needs to be large enough for the - * target block size, corrected for the requirements of the underlying - * block manager. If the target block size is 8KB, that's a multiple - * of 512B and so the underlying block manager is fine with it. But... - * we don't control what the pre_size method returns us as a required - * size, and we don't want to document the compress_raw method has to - * skip bytes in the buffer because that's confusing, so do something - * more complicated. 
First, find out how much space the compress_raw - * function might need, either the value returned from pre_size, or the - * maximum object size. Add the compress-skip bytes, and then correct - * that value for the underlying block manager. As a result, we have - * a destination buffer that's the right "object" size when calling the - * compress_raw method, and there are bytes in the header just for us. + * The destination buffer needs to be large enough for the final block + * size, corrected for the requirements of the underlying block manager. + * If the final block size is 8KB, that's a multiple of 512B and so the + * underlying block manager is fine with it. But... we don't control + * what the pre_size method returns us as a required size, and we don't + * want to document the compress_raw method has to skip bytes in the + * buffer because that's confusing, so do something more complicated. + * First, find out how much space the compress_raw function might need, + * either the value returned from pre_size, or the initial source size. + * Add the compress-skip bytes, and then correct that value for the + * underlying block manager. As a result, we have a destination buffer + * that's large enough when calling the compress_raw method, and there + * are bytes in the header just for us. */ if (compressor->pre_size == NULL) - result_len = r->page_size_max; + result_len = (size_t)r->raw_offsets[slots]; else WT_RET(compressor->pre_size(compressor, wt_session, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, @@ -2185,7 +2256,7 @@ __rec_split_raw_worker( */ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP); ret = compressor->compress_raw(compressor, wt_session, - r->page_size_max, btree->split_pct, + r->page_size_orig, btree->split_pct, WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets, slots, (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP, @@ -2296,15 +2367,16 @@ no_slots: * Note use of memmove, the source and destination buffers can * overlap. 
*/ - len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk + - r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP); + len = WT_PTRDIFF( + r->first_free, (uint8_t *)dsk + dsk_dst->mem_size); dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len); r->entries -= r->raw_entries[result_slots - 1]; r->first_free = dsk_start + len; - r->space_avail = - r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); + r->space_avail += r->raw_offsets[result_slots]; + WT_ASSERT(session, r->first_free + r->space_avail <= + (uint8_t *)r->dsk.mem + r->dsk.memsize); /* * Set the key for the next block (before writing the block, a @@ -2358,15 +2430,14 @@ no_slots: */ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary); - len = WT_PTRDIFF(r->first_free, r->dsk.mem); - corrected_page_size = r->page_size * 2; - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size)); +split_grow: /* + * Double the page size and make sure we accommodate at least + * one more record. The reason for the latter is that we may + * be here because there's a large key/value pair that won't + * fit in our initial page buffer, even at its expanded size. + */ r->page_size *= 2; - r->first_free = (uint8_t *)r->dsk.mem + len; - r->space_avail = - r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); - return (0); + return (__rec_split_grow(session, r, r->page_size + next_len)); } /* We have a block, update the boundary counter. */ @@ -2438,9 +2509,9 @@ err: __wt_scr_free(&tmp); * Raw compression split routine. 
*/ static inline int -__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) { - return (__rec_split_raw_worker(session, r, 0)); + return (__rec_split_raw_worker(session, r, next_len, 0)); } /* @@ -2456,7 +2527,6 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Adjust the boundary information based on our split status. */ switch (r->bnd_state) { case SPLIT_BOUNDARY: - case SPLIT_MAX: /* * We never split, the reconciled page fit into a maximum page * size. Change the first boundary slot to represent the full @@ -2516,7 +2586,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* We're done reconciling - write the final page */ if (r->raw_compression && r->entries != 0) { while (r->entries != 0) - WT_RET(__rec_split_raw_worker(session, r, 1)); + WT_RET(__rec_split_raw_worker(session, r, 0, 1)); } else WT_RET(__rec_split_finish_std(session, r)); @@ -2553,7 +2623,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * WT_PAGE_HEADER header onto the scratch buffer, most of the header * information remains unchanged between the pages. */ - WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp)); + WT_RET(__wt_scr_alloc(session, r->page_size, &tmp)); dsk = tmp->mem; memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE); @@ -2595,8 +2665,10 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) r->entries -= r->total_entries; r->first_free = dsk_start + len; + WT_ASSERT(session, + r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); r->space_avail = - (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len; + r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); err: __wt_scr_free(&tmp); return (ret); @@ -2905,17 +2977,17 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. 
*/ - while (key->len + val->len > r->space_avail) + if (key->len + val->len > r->space_avail) { if (r->raw_compression) - WT_RET(__rec_split_raw(session, r)); + WT_RET( + __rec_split_raw(session, r, key->len + val->len)); else { - WT_RET(__rec_split(session, r)); + WT_RET(__rec_split(session, r, key->len + val->len)); /* * Turn off prefix compression until a full key written - * to the new page, and (unless we're already working - * with an overflow key), rebuild the key without prefix - * compression. + * to the new page, and (unless already working with an + * overflow key), rebuild the key without compression. */ if (r->key_pfx_compress_conf) { r->key_pfx_compress = 0; @@ -2924,6 +2996,7 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) session, r, NULL, 0, &ovfl_key)); } } + } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -2968,7 +3041,7 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) __rec_incr(session, r, cbulk->entry, __bitstr_size( (size_t)cbulk->entry * btree->bitcnt)); - WT_RET(__rec_split(session, r)); + WT_RET(__rec_split(session, r, 0)); } cbulk->entry = 0; cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); @@ -3048,11 +3121,10 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) session, r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - while (val->len > r->space_avail) - if (r->raw_compression) - WT_RET(__rec_split_raw(session, r)); - else - WT_RET(__rec_split(session, r)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -3171,11 +3243,10 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. 
*/ - while (val->len > r->space_avail) - if (r->raw_compression) - WT_ERR(__rec_split_raw(session, r)); - else - WT_ERR(__rec_split(session, r)); + if (val->len > r->space_avail) + WT_ERR(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -3217,11 +3288,10 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - while (val->len > r->space_avail) - if (r->raw_compression) - WT_RET(__rec_split_raw(session, r)); - else - WT_RET(__rec_split(session, r)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -3298,7 +3368,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); - WT_RET(__rec_split(session, r)); + WT_RET(__rec_split(session, r, 0)); /* Calculate the number of entries per page. */ entry = 0; @@ -3442,11 +3512,10 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - while (val->len > r->space_avail) - if (r->raw_compression) - WT_RET(__rec_split_raw(session, r)); - else - WT_RET(__rec_split(session, r)); + if (val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, val->len) : + __rec_split(session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -4034,24 +4103,25 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = 0; /* Boundary: split or write the page. 
*/ - while (key->len + val->len > r->space_avail) { - if (r->raw_compression) { - WT_ERR(__rec_split_raw(session, r)); - continue; - } - - /* - * In one path above, we copied address blocks from the - * page rather than building the actual key. In that - * case, we have to build the actual key now because we - * are about to promote it. - */ - if (key_onpage_ovfl) { - WT_ERR(__wt_buf_set(session, - r->cur, WT_IKEY_DATA(ikey), ikey->size)); - key_onpage_ovfl = 0; + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) + WT_ERR(__rec_split_raw( + session, r, key->len + val->len)); + else { + /* + * In one path above, we copied address blocks + * from the page rather than building the actual + * key. In that case, we have to build the key + * now because we are about to promote it. + */ + if (key_onpage_ovfl) { + WT_ERR(__wt_buf_set(session, r->cur, + WT_IKEY_DATA(ikey), ikey->size)); + key_onpage_ovfl = 0; + } + WT_ERR(__rec_split( + session, r, key->len + val->len)); } - WT_ERR(__rec_split(session, r)); } /* Copy the key and value onto the page. */ @@ -4102,11 +4172,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r, addr->addr, addr->size, __rec_vtype(addr), 0); /* Boundary: split or write the page. */ - while (key->len + val->len > r->space_avail) - if (r->raw_compression) - WT_RET(__rec_split_raw(session, r)); - else - WT_RET(__rec_split(session, r)); + if (key->len + val->len > r->space_avail) + WT_RET(r->raw_compression ? + __rec_split_raw(session, r, key->len + val->len) : + __rec_split(session, r, key->len + val->len)); /* Copy the key and value onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -4140,7 +4209,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, size_t size; uint64_t slvg_skip; uint32_t i; - int dictionary, onpage_ovfl, ovfl_key; + int dictionary, key_onpage_ovfl, ovfl_key; const void *p; void *copy; @@ -4369,9 +4438,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * If the key is an overflow key that hasn't been removed, use * the original backing blocks. */ - onpage_ovfl = kpack != NULL && + key_onpage_ovfl = kpack != NULL && kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; - if (onpage_ovfl) { + if (key_onpage_ovfl) { key->buf.data = cell; key->buf.size = __wt_cell_total_len(kpack); key->cell_len = 0; @@ -4435,36 +4504,39 @@ build: } /* Boundary: split or write the page. */ - while (key->len + val->len > r->space_avail) { - if (r->raw_compression) { - WT_ERR(__rec_split_raw(session, r)); - continue; - } - - /* - * In one path above, we copied address blocks from the - * page rather than building the actual key. In that - * case, we have to build the actual key now because we - * are about to promote it. - */ - if (onpage_ovfl) { - WT_ERR(__wt_dsk_cell_data_ref( - session, WT_PAGE_ROW_LEAF, kpack, r->cur)); - onpage_ovfl = 0; - } - WT_ERR(__rec_split(session, r)); + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) + WT_ERR(__rec_split_raw( + session, r, key->len + val->len)); + else { + /* + * In one path above, we copied address blocks + * from the page rather than building the actual + * key. In that case, we have to build the key + * now because we are about to promote it. + */ + if (key_onpage_ovfl) { + WT_ERR(__wt_dsk_cell_data_ref(session, + WT_PAGE_ROW_LEAF, kpack, r->cur)); + key_onpage_ovfl = 0; + } + WT_ERR(__rec_split( + session, r, key->len + val->len)); - /* - * Turn off prefix compression until a full key written - * to the new page, and (unless we're already working - * with an overflow key), rebuild the key without prefix - * compression. 
- */ - if (r->key_pfx_compress_conf) { - r->key_pfx_compress = 0; - if (!ovfl_key) - WT_ERR(__rec_cell_build_leaf_key( - session, r, NULL, 0, &ovfl_key)); + /* + * Turn off prefix compression until a full key + * written to the new page, and (unless already + * working with an overflow key), rebuild the + * key without compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_ERR( + __rec_cell_build_leaf_key( + session, + r, NULL, 0, &ovfl_key)); + } } } @@ -4529,24 +4601,28 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. */ - while (key->len + val->len > r->space_avail) { - if (r->raw_compression) { - WT_RET(__rec_split_raw(session, r)); - continue; - } - WT_RET(__rec_split(session, r)); + if (key->len + val->len > r->space_avail) { + if (r->raw_compression) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + else { + WT_RET(__rec_split( + session, r, key->len + val->len)); - /* - * Turn off prefix compression until a full key written - * to the new page, and (unless we're already working - * with an overflow key), rebuild the key without prefix - * compression. - */ - if (r->key_pfx_compress_conf) { - r->key_pfx_compress = 0; - if (!ovfl_key) - WT_RET(__rec_cell_build_leaf_key( - session, r, NULL, 0, &ovfl_key)); + /* + * Turn off prefix compression until a full key + * written to the new page, and (unless already + * working with an overflow key), rebuild the + * key without compression. + */ + if (r->key_pfx_compress_conf) { + r->key_pfx_compress = 0; + if (!ovfl_key) + WT_RET( + __rec_cell_build_leaf_key( + session, + r, NULL, 0, &ovfl_key)); + } } } @@ -5064,7 +5140,7 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session, WT_RET(__wt_buf_set(session, &key->buf, data, size)); /* Create an overflow object if the data won't fit. 
*/ - if (size > btree->maxintlitem) { + if (size > btree->maxintlkey) { WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal); *is_ovflp = 1; @@ -5159,7 +5235,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session, key->buf.data, (uint32_t)key->buf.size, &key->buf)); /* Create an overflow object if the data won't fit. */ - if (key->buf.size > btree->maxleafitem) { + if (key->buf.size > btree->maxleafkey) { /* * Overflow objects aren't prefix compressed -- rebuild any * object that was prefix compressed. @@ -5246,7 +5322,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session, val->buf.data, (uint32_t)val->buf.size, &val->buf)); /* Create an overflow object if the data won't fit. */ - if (val->buf.size > btree->maxleafitem) { + if (val->buf.size > btree->maxleafvalue) { WT_STAT_FAST_DATA_INCR(session, rec_overflow_value); return (__rec_cell_build_ovfl( diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index f5937381cbb..4699fdeee02 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -83,7 +83,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) goto err; } - WT_ERR(__wt_calloc_def(session, 1, &colgroup)); + WT_ERR(__wt_calloc_one(session, &colgroup)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &colgroup->name)); colgroup->config = cgconfig; @@ -319,7 +319,7 @@ __wt_schema_open_index(WT_SESSION_IMPL *session, if (table->indices[i] == NULL) { WT_ERR(cursor->get_value(cursor, &idxconf)); - WT_ERR(__wt_calloc_def(session, 1, &idx)); + WT_ERR(__wt_calloc_one(session, &idx)); WT_ERR(__wt_strdup(session, uri, &idx->name)); WT_ERR(__wt_strdup(session, idxconf, &idx->config)); WT_ERR(__open_index(session, table, idx)); @@ -392,7 +392,7 @@ __wt_schema_open_table(WT_SESSION_IMPL *session, WT_ERR(cursor->search(cursor)); WT_ERR(cursor->get_value(cursor, &tconfig)); - WT_ERR(__wt_calloc_def(session, 1, &table)); + WT_ERR(__wt_calloc_one(session, &table)); table->name = tablename; tablename = NULL; 
table->name_hash = __wt_hash_city64(name, namelen); diff --git a/src/session/session_api.c b/src/session/session_api.c index dc3c7d7041f..8f460dcc29f 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -953,6 +953,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, { static const WT_SESSION stds = { NULL, + NULL, __session_close, __session_reconfigure, __session_open_cursor, diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index e28e277d5f6..85483c7c8ae 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -20,7 +20,7 @@ __session_add_dhandle( WT_DATA_HANDLE_CACHE *dhandle_cache; uint64_t bucket; - WT_RET(__wt_calloc_def(session, 1, &dhandle_cache)); + WT_RET(__wt_calloc_one(session, &dhandle_cache)); dhandle_cache->dhandle = session->dhandle; bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; diff --git a/src/support/huffman.c b/src/support/huffman.c index 5a06b72d33e..9625e879381 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -306,7 +306,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session, combined_nodes = leaves = NULL; node = node2 = tempnode = NULL; - WT_RET(__wt_calloc_def(session, 1, &huffman)); + WT_RET(__wt_calloc_one(session, &huffman)); /* * The frequency table is 4B pairs of symbol and frequency. The symbol @@ -381,8 +381,8 @@ __wt_huffman_open(WT_SESSION_IMPL *session, symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare); /* We need two node queues to build the tree. */ - WT_ERR(__wt_calloc_def(session, 1, &leaves)); - WT_ERR(__wt_calloc_def(session, 1, &combined_nodes)); + WT_ERR(__wt_calloc_one(session, &leaves)); + WT_ERR(__wt_calloc_one(session, &combined_nodes)); /* * Adding the leaves to the queue. 
@@ -393,7 +393,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session, */ for (i = 0; i < symcnt; ++i) if (indexed_freqs[i].frequency > 0) { - WT_ERR(__wt_calloc_def(session, 1, &tempnode)); + WT_ERR(__wt_calloc_one(session, &tempnode)); tempnode->symbol = (uint8_t)indexed_freqs[i].symbol; tempnode->weight = indexed_freqs[i].frequency; WT_ERR(node_queue_enqueue(session, leaves, tempnode)); @@ -431,7 +431,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session, * In every second run, we have both node and node2 initialized. */ if (node != NULL && node2 != NULL) { - WT_ERR(__wt_calloc_def(session, 1, &tempnode)); + WT_ERR(__wt_calloc_one(session, &tempnode)); /* The new weight is the sum of the two weights. */ tempnode->weight = node->weight + node2->weight; @@ -845,7 +845,7 @@ node_queue_enqueue( NODE_QUEUE_ELEM *elem; /* Allocating a new linked list element */ - WT_RET(__wt_calloc_def(session, 1, &elem)); + WT_RET(__wt_calloc_one(session, &elem)); /* It holds the tree node, and has no next element yet */ elem->node = node; diff --git a/src/support/scratch.c b/src/support/scratch.c index ca2cdac8377..e4df04a36ed 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -216,7 +216,7 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp WT_ASSERT(session, slot != NULL); best = slot; - WT_ERR(__wt_calloc_def(session, 1, best)); + WT_ERR(__wt_calloc_one(session, best)); /* Scratch buffers must be aligned. 
*/ F_SET(*best, WT_ITEM_ALIGNED); diff --git a/src/support/stat.c b/src/support/stat.c index 21d56238f4a..19aa9170c5b 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -30,11 +30,11 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) stats->btree_column_variable.desc = "btree: column-store variable-size leaf pages"; stats->btree_fixed_len.desc = "btree: fixed-record size"; - stats->btree_maxintlitem.desc = - "btree: maximum internal page item size"; + stats->btree_maxintlkey.desc = "btree: maximum internal page key size"; stats->btree_maxintlpage.desc = "btree: maximum internal page size"; - stats->btree_maxleafitem.desc = "btree: maximum leaf page item size"; + stats->btree_maxleafkey.desc = "btree: maximum leaf page key size"; stats->btree_maxleafpage.desc = "btree: maximum leaf page size"; + stats->btree_maxleafvalue.desc = "btree: maximum leaf page value size"; stats->btree_maximum_depth.desc = "btree: maximum tree depth"; stats->btree_entries.desc = "btree: number of key/value pairs"; stats->btree_overflow.desc = "btree: overflow pages"; @@ -154,10 +154,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg) stats->btree_column_deleted.v = 0; stats->btree_column_variable.v = 0; stats->btree_fixed_len.v = 0; - stats->btree_maxintlitem.v = 0; + stats->btree_maxintlkey.v = 0; stats->btree_maxintlpage.v = 0; - stats->btree_maxleafitem.v = 0; + stats->btree_maxleafkey.v = 0; stats->btree_maxleafpage.v = 0; + stats->btree_maxleafvalue.v = 0; stats->btree_maximum_depth.v = 0; stats->btree_entries.v = 0; stats->btree_overflow.v = 0; diff --git a/test/format/format.h b/test/format/format.h index 8020d18d716..902cea6cc5d 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -89,6 +89,8 @@ extern WT_EXTENSION_API *wt_api; #undef M #define M(v) ((v) * 1000000) /* Million */ +#undef KILOBYTE +#define KILOBYTE(v) ((v) * 1024) #undef MEGABYTE #define MEGABYTE(v) ((v) * 1048576) #undef GIGABYTE diff --git a/test/format/ops.c b/test/format/ops.c index 
28f1079b30d..bbaeabcc479 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -52,7 +52,7 @@ wts_ops(void) WT_SESSION *session; pthread_t backup_tid, compact_tid; uint64_t thread_ops; - uint32_t i, tenths; + uint32_t i, fourths; int ret, running; conn = g.wts_conn; @@ -72,16 +72,16 @@ wts_ops(void) * There are two mechanisms to specify the length of the run, a number * of operations or a timer. If the former, each thread does an equal * share of the total operations (and make sure that it's not 0). If - * the latter, calculate how many tenth-of-a-second sleeps until this + * the latter, calculate how many fourth-of-a-second sleeps until this * part of the run finishes. */ if (g.c_timer == 0) { - tenths = 0; + fourths = 0; if (g.c_ops < g.c_threads) g.c_ops = g.c_threads; thread_ops = g.c_ops / g.c_threads; } else { - tenths = (g.c_timer * 10 * 60) / FORMAT_OPERATION_REPS; + fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS; thread_ops = 0; } @@ -141,7 +141,7 @@ wts_ops(void) /* Tell the thread if it's done. */ if (thread_ops == 0) { - if (tenths == 0) + if (fourths == 0) tinfo[i].quit = 1; } else if (tinfo[i].ops >= thread_ops) @@ -151,8 +151,8 @@ wts_ops(void) if (!running) break; (void)usleep(250000); /* 1/4th of a second */ - if (tenths != 0) - --tenths; + if (fourths != 0) + --fourths; } free(tinfo); diff --git a/test/format/util.c b/test/format/util.c index b043475842e..4880dfbbdd0 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -27,17 +27,24 @@ #include "format.h" +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif + static inline uint32_t kv_len(uint64_t keyno, uint32_t min, uint32_t max) { /* - * We want to focus on relatively small key/value items, but admitting - * the possibility of larger items. Pick a size close to the minimum - * most of the time, only roll the dice for a really big item 1 in 20 - * times. 
(The configuration can force large key/value minimum sizes, - * where every key/value item will be an overflow.) + * Focus on relatively small key/value items, admitting the possibility + * of larger items. Pick a size close to the minimum most of the time, + * only create a larger item 1 in 20 times, and a really big item 1 in + * 1000 times. (Configuration can force large key/value minimum sizes, + * where every key/value item is an overflow.) */ - if (keyno % 20 != 0 && max > min + 20) + if (keyno % 1000 == 0 && max < KILOBYTE(80)) { + min = KILOBYTE(80); + max = KILOBYTE(100); + } else if (keyno % 20 != 0 && max > min + 20) max = min + 20; return (MMRAND(min, max)); } @@ -65,13 +72,14 @@ void key_gen_setup(uint8_t **keyp) { uint8_t *key; - size_t i; + size_t i, len; *keyp = NULL; - if ((key = malloc(g.c_key_max)) == NULL) + len = MAX(KILOBYTE(100), g.c_key_max); + if ((key = malloc(len)) == NULL) die(errno, "malloc"); - for (i = 0; i < g.c_key_max; ++i) + for (i = 0; i < len; ++i) key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]); *keyp = key; } @@ -118,7 +126,7 @@ val_gen_setup(uint8_t **valp) * into the buffer by a few extra bytes, used to generate different * data for column-store run-length encoded files. */ - len = g.c_value_max + 20; + len = MAX(KILOBYTE(100), g.c_value_max) + 20; if ((val = malloc(len)) == NULL) die(errno, "malloc"); for (i = 0; i < len; ++i) diff --git a/test/format/wts.c b/test/format/wts.c index 29b40eda74d..21e7806e982 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -173,12 +173,15 @@ wts_create(void) { WT_CONNECTION *conn; WT_SESSION *session; - uint32_t maxintlpage, maxintlitem, maxleafpage, maxleafitem; + uint32_t maxintlpage, maxintlkey, maxleafpage, maxleafkey, maxleafvalue; int ret; char config[4096], *end, *p; conn = g.wts_conn; + p = config; + end = config + sizeof(config); + /* * Ensure that we can service at least one operation per-thread * concurrently without filling the cache with pinned pages. 
We @@ -197,23 +200,30 @@ wts_create(void) if (maxleafpage > 512) maxleafpage >>= 1; } - maxintlitem = MMRAND(maxintlpage / 50, maxintlpage / 40); - if (maxintlitem < 40) - maxintlitem = 40; - maxleafitem = MMRAND(maxleafpage / 50, maxleafpage / 40); - if (maxleafitem < 40) - maxleafitem = 40; - - p = config; - end = config + sizeof(config); p += snprintf(p, (size_t)(end - p), "key_format=%s," "allocation_size=512,%s" - "internal_page_max=%d,internal_item_max=%d," - "leaf_page_max=%d,leaf_item_max=%d", + "internal_page_max=%d,leaf_page_max=%d", (g.type == ROW) ? "u" : "r", g.c_firstfit ? "block_allocation=first," : "", - maxintlpage, maxintlitem, maxleafpage, maxleafitem); + maxintlpage, maxleafpage); + + /* + * Configure the maximum key/value sizes, but leave it as the default + * if we come up with something crazy. + */ + maxintlkey = MMRAND(maxintlpage / 50, maxintlpage / 40); + if (maxintlkey > 20) + p += snprintf(p, (size_t)(end - p), + ",internal_key_max=%d", maxintlkey); + maxleafkey = MMRAND(maxleafpage / 50, maxleafpage / 40); + if (maxleafkey > 20) + p += snprintf(p, (size_t)(end - p), + ",leaf_key_max=%d", maxleafkey); + maxleafvalue = MMRAND(maxleafpage * 10, maxleafpage / 40); + if (maxleafvalue > 40 && maxleafvalue < 100 * 1024) + p += snprintf(p, (size_t)(end - p), + ",leaf_value_max=%d", maxleafvalue); switch (g.type) { case FIX: diff --git a/test/suite/run.py b/test/suite/run.py index 32dc8835d4b..a29f7af2212 100644 --- a/test/suite/run.py +++ b/test/suite/run.py @@ -82,6 +82,7 @@ Options:\n\ -g | --gdb all subprocesses (like calls to wt) use gdb\n\ -h | --help show this message\n\ -j N | --parallel N run all tests in parallel using N processes\n\ + -l | --long run the entire test suite\n\ -p | --preserve preserve output files in WT_TEST/<testname>\n\ -t | --timestamp name WT_TEST according to timestamp\n\ -v N | --verbose N set verboseness to N (0<=N<=3, default=1)\n\ @@ -219,7 +220,7 @@ if __name__ == '__main__': tests = unittest.TestSuite() # 
Turn numbers and ranges into test module names - preserve = timestamp = debug = gdbSub = False + preserve = timestamp = debug = gdbSub = longtest = False parallel = 0 configfile = None configwrite = False @@ -243,6 +244,15 @@ if __name__ == '__main__': if option == '-debug' or option == 'd': debug = True continue + if option == '-gdb' or option == 'g': + gdbSub = True + continue + if option == '-help' or option == 'h': + usage() + sys.exit(True) + if option == '-long' or option == 'l': + longtest = True + continue if option == '-parallel' or option == 'j': if parallel != 0 or len(args) == 0: usage() @@ -255,12 +265,6 @@ if __name__ == '__main__': if option == '-timestamp' or option == 't': timestamp = True continue - if option == '-gdb' or option == 'g': - gdbSub = True - continue - if option == '-help' or option == 'h': - usage() - sys.exit(True) if option == '-verbose' or option == 'v': if len(args) == 0: usage() @@ -292,7 +296,7 @@ if __name__ == '__main__': # All global variables should be set before any test classes are loaded. # That way, verbose printing can be done at the class definition level. 
wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub, - verbose, dirarg) + verbose, dirarg, longtest) # Without any tests listed as arguments, do discovery if len(testargs) == 0: diff --git a/test/suite/test_lsm02.py b/test/suite/test_lsm02.py index 2b3d48f8f30..41d82d8ad0d 100644 --- a/test/suite/test_lsm02.py +++ b/test/suite/test_lsm02.py @@ -54,9 +54,12 @@ class test_lsm02(wttest.WiredTigerTestCase): v = '\x14\x14' self.add_key(self.uri, 'k1', v) self.verify_key_exists(self.uri, 'k1', v) - v += 'a' * 1000 + v = '\x14\x14\0\0\0\0\0\0' self.add_key(self.uri, 'k2', v) self.verify_key_exists(self.uri, 'k2', v) + v += 'a' * 1000 + self.add_key(self.uri, 'k3', v) + self.verify_key_exists(self.uri, 'k3', v) def test_lsm_rename01(self): self.session.create(self.uri, 'key_format=S,value_format=S') diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index 8d561763091..d2ebb796d28 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -33,7 +33,7 @@ import fnmatch, os, shutil, run, time from suite_subprocess import suite_subprocess from wiredtiger import wiredtiger_open, stat -from wtscenario import multiply_scenarios, number_scenarios +from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios import wttest class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): @@ -55,7 +55,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): create_params = 'key_format=r,value_format=8t')), ] - scenarios = number_scenarios(multiply_scenarios('.', types, ckpt_list)) + scenarios = number_scenarios(prune_scenarios(multiply_scenarios('.', types, ckpt_list), 1, 100)) # Overrides WiredTigerTestCase def setUpConnectionOpen(self, dir): diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index 32165c380a9..d83bf6ce5f8 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -32,7 +32,7 @@ import fnmatch, os, shutil, time from suite_subprocess import suite_subprocess from 
wiredtiger import wiredtiger_open -from wtscenario import multiply_scenarios, number_scenarios +from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios import wttest class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): @@ -81,8 +81,19 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): txn3s = [('t3c', dict(txn3='commit')), ('t3r', dict(txn3='rollback'))] txn4s = [('t4c', dict(txn4='commit')), ('t4r', dict(txn4='rollback'))] - scenarios = number_scenarios(multiply_scenarios('.', types, - op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) + all_scenarios = multiply_scenarios('.', types, + op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s) + + # This test generates thousands of potential scenarios. + # For default runs, we'll use a small subset of them, for + # long runs (when --long is set) we'll set a much larger limit. + scenarios = number_scenarios(prune_scenarios(all_scenarios, 20, 5000)) + + # Each check_log() call takes a second, so we don't call it for + # every scenario, we'll limit it to the value of checklog_calls. + checklog_calls = 100 if wttest.islongtest() else 2 + checklog_mod = (len(scenarios) / checklog_calls + 1) + # scenarios = number_scenarios(multiply_scenarios('.', types, # op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) [:3] # Overrides WiredTigerTestCase @@ -253,10 +264,8 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): # Check the state after each commit/rollback. self.check_all(current, committed) - # Check the log state after the entire op completes and run recovery. - # check_log() takes over a second to run, so we don't want to run it - # for all scenarios, rather, we run it about 100 times overall. - if self.scenario_number % (len(test_txn02.scenarios) / 100 + 1) == 0: + # check_log() is slow, we don't run it on every scenario. 
+ if self.scenario_number % test_txn02.checklog_mod == 0: self.check_log(committed) if __name__ == '__main__': diff --git a/test/suite/wtscenario.py b/test/suite/wtscenario.py index a8fd4031ceb..70497102bb0 100644 --- a/test/suite/wtscenario.py +++ b/test/suite/wtscenario.py @@ -102,17 +102,29 @@ def prune_sorter_key(scene): p = scene[1]['P'] return p * scene[1]['_rand'] -def prune_scenarios(scenes, count = -1): +def prune_resort_key(scene): + """ + Used by prune_scenerios to extract the original ordering key for sorting. + """ + return scene[1]['_order'] + +def set_long_run(islong): + global _is_long_run + _is_long_run = islong + +def prune_scenarios(scenes, default_count = -1, long_count = -1): """ Use listed probabilities for pruning the list of scenarios. That is, the highest probability (value of P in the scendario) - are chosen more often. With a second argument, only the - given number of scenarios are returned. With no second argument, - only scenarios with P > .5 are returned half the time, etc. + are chosen more often. With just one argument, only scenarios + with P > .5 are returned half the time, etc. A second argument + limits the number of scenarios. When a third argument is present, + it is a separate limit for a long run. """ + global _is_long_run r = suite_random.suite_random() result = [] - if count == -1: + if default_count == -1: # Missing second arg - return those with P == .3 at # 30% probability, for example. for scene in scenes: @@ -123,25 +135,41 @@ def prune_scenarios(scenes, count = -1): result.append(scene) return result else: - # With second arg, we want exactly 'count' items - # returned. So we'll sort them all and choose + # With at least a second arg present, we'll want a specific count + # of items returned. So we'll sort them all and choose # the top number. Not the most efficient solution, # but it's easy. 
+ if _is_long_run and long_count != -1: + count = long_count + else: + count = default_count + + l = len(scenes) + if l <= count: + return scenes + if count == 0: + return [] + order = 0 for scene in scenes: scene[1]['_rand'] = r.rand_float() - scenes = sorted(scenes, key=prune_sorter_key) + scene[1]['_order'] = order + order += 1 + scenes = sorted(scenes, key=prune_sorter_key) # random sort driven by P + scenes = scenes[l-count:l] # truncate to get best + scenes = sorted(scenes, key=prune_resort_key) # original order for scene in scenes: del scene[1]['_rand'] - l = len(scenes) - return scenes[l-count:l] + del scene[1]['_order'] + return scenes def number_scenarios(scenes): """ - Add a 'scenario_number' variable to each scenario. + Add a 'scenario_number' and 'scenario_name' variable to each scenario. The hash table for each scenario is altered! """ count = 0 for scene in scenes: + scene[1]['scenario_name'] = scene[0] scene[1]['scenario_number'] = count count += 1 return scenes diff --git a/test/suite/wttest.py b/test/suite/wttest.py index d1705434988..4de09a143b2 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -37,7 +37,7 @@ except ImportError: from contextlib import contextmanager import os, re, shutil, sys, time, traceback - +import wtscenario import wiredtiger def shortenWithEllipsis(s, maxlen): @@ -141,17 +141,20 @@ class WiredTigerTestCase(unittest.TestCase): @staticmethod def globalSetup(preserveFiles = False, useTimestamp = False, - gdbSub = False, verbose = 1, dirarg = None): + gdbSub = False, verbose = 1, dirarg = None, + longtest = False): WiredTigerTestCase._preserveFiles = preserveFiles d = 'WT_TEST' if dirarg == None else dirarg if useTimestamp: d += '.' 
+ time.strftime('%Y%m%d-%H%M%S', time.localtime()) shutil.rmtree(d, ignore_errors=True) os.makedirs(d) + wtscenario.set_long_run(longtest) WiredTigerTestCase._parentTestdir = d WiredTigerTestCase._origcwd = os.getcwd() WiredTigerTestCase._resultfile = open(os.path.join(d, 'results.txt'), "w", 0) # unbuffered WiredTigerTestCase._gdbSubprocess = gdbSub + WiredTigerTestCase._longtest = longtest WiredTigerTestCase._verbose = verbose WiredTigerTestCase._dupout = os.dup(sys.stdout.fileno()) WiredTigerTestCase._stdout = sys.stdout @@ -182,8 +185,9 @@ class WiredTigerTestCase(unittest.TestCase): # is used, then each scenario is given a number, which can # help distinguish tests. scen = '' - if hasattr(self, 'scenario_number'): - scen = '(scenario ' + str(self.scenario_number) + ')' + if hasattr(self, 'scenario_number') and hasattr(self, 'scenario_name'): + scen = '(scenario ' + str(self.scenario_number) + \ + ': ' + self.scenario_name + ')' return self.simpleName() + scen def simpleName(self): @@ -283,7 +287,7 @@ class WiredTigerTestCase(unittest.TestCase): self.pr('preserving directory ' + self.testdir) if not passed and not skipped: - print "ERROR in " + self.testsubdir + print "ERROR in " + str(self) self.pr('FAIL') self.prexception(excinfo) self.pr('preserving directory ' + self.testdir) @@ -431,6 +435,23 @@ class WiredTigerTestCase(unittest.TestCase): def className(self): return self.__class__.__name__ + +def longtest(description): + """ + Used as a function decorator, for example, @wttest.longtest("description"). + The decorator indicates that this test function should only be included + when running the test suite with the --long option. 
+ """ + def runit_decorator(func): + return func + if not WiredTigerTestCase._longtest: + return unittest.skip(description + ' (enable with --long)') + else: + return runit_decorator + +def islongtest(): + return WiredTigerTestCase._longtest + def runsuite(suite, parallel): suite_to_run = suite if parallel > 1: diff --git a/tools/stat_data.py b/tools/stat_data.py index 50528dbd26a..3d192be7566 100644 --- a/tools/stat_data.py +++ b/tools/stat_data.py @@ -34,10 +34,11 @@ no_scale_per_second_list = [ 'btree: column-store variable-size deleted values', 'btree: column-store variable-size leaf pages', 'btree: fixed-record size', - 'btree: maximum internal page item size', + 'btree: maximum internal page key size', 'btree: maximum internal page size', - 'btree: maximum leaf page item size', + 'btree: maximum leaf page key size', 'btree: maximum leaf page size', + 'btree: maximum leaf page value size', 'btree: maximum tree depth', 'btree: number of key/value pairs', 'btree: overflow pages', |