author     Keith Bostic <keith@wiredtiger.com>  2014-12-15 09:35:54 -0500
committer  Keith Bostic <keith@wiredtiger.com>  2014-12-15 09:35:54 -0500
commit     980165614f114dbcf02344ba7209ae77369bcb80 (patch)
tree       952a89a49aa758ec177ed9ce491524d0c1c79c1f
parent     4c26d2324bae1d7030b0142d50dbd2ccf11ddeb6 (diff)
parent     5cf21acf8fd66876e71334cc09deac0a09e8ea91 (diff)
download   mongo-980165614f114dbcf02344ba7209ae77369bcb80.tar.gz

Merge branch 'develop' into cursor-reconfigure

Conflicts:
	src/cursor/cur_metadata.c
-rw-r--r--  bench/wtperf/runners/small-lsm.wtperf | 2
-rw-r--r--  dist/api_data.py | 37
-rw-r--r--  dist/api_err.py | 4
-rw-r--r--  dist/stat_data.py | 10
-rw-r--r--  examples/c/Makefile.am | 1
-rw-r--r--  examples/c/ex_all.c | 14
-rw-r--r--  examples/c/ex_backup.c | 2
-rw-r--r--  examples/c/ex_file.c | 72
-rw-r--r--  src/async/async_api.c | 4
-rw-r--r--  src/async/async_worker.c | 2
-rw-r--r--  src/block/block_mgr.c | 2
-rw-r--r--  src/block/block_open.c | 2
-rw-r--r--  src/block/block_session.c | 2
-rw-r--r--  src/bloom/bloom.c | 2
-rw-r--r--  src/btree/bt_delete.c | 4
-rw-r--r--  src/btree/bt_handle.c | 182
-rw-r--r--  src/btree/bt_page.c | 4
-rw-r--r--  src/btree/bt_slvg.c | 10
-rw-r--r--  src/btree/bt_split.c | 16
-rw-r--r--  src/btree/bt_stat.c | 5
-rw-r--r--  src/btree/row_modify.c | 2
-rw-r--r--  src/config/config_api.c | 2
-rw-r--r--  src/config/config_check.c | 2
-rw-r--r--  src/config/config_def.c | 22
-rw-r--r--  src/conn/conn_api.c | 10
-rw-r--r--  src/conn/conn_cache.c | 2
-rw-r--r--  src/conn/conn_cache_pool.c | 2
-rw-r--r--  src/conn/conn_dhandle.c | 4
-rw-r--r--  src/conn/conn_log.c | 2
-rw-r--r--  src/cursor/cur_backup.c | 2
-rw-r--r--  src/cursor/cur_config.c | 2
-rw-r--r--  src/cursor/cur_ds.c | 2
-rw-r--r--  src/cursor/cur_dump.c | 4
-rw-r--r--  src/cursor/cur_index.c | 2
-rw-r--r--  src/cursor/cur_log.c | 6
-rw-r--r--  src/cursor/cur_metadata.c | 6
-rw-r--r--  src/cursor/cur_stat.c | 2
-rw-r--r--  src/cursor/cur_table.c | 2
-rw-r--r--  src/docs/error-handling.dox | 2
-rw-r--r--  src/docs/examples.dox | 3
-rw-r--r--  src/docs/tune-page-sizes.dox | 190
-rw-r--r--  src/docs/upgrading.dox | 8
-rw-r--r--  src/evict/evict_page.c | 2
-rw-r--r--  src/include/btree.h | 5
-rw-r--r--  src/include/extern.h | 2
-rw-r--r--  src/include/misc.h | 6
-rw-r--r--  src/include/stat.h | 5
-rw-r--r--  src/include/wiredtiger.in | 183
-rw-r--r--  src/lsm/lsm_cursor.c | 25
-rw-r--r--  src/lsm/lsm_manager.c | 2
-rw-r--r--  src/lsm/lsm_merge.c | 2
-rw-r--r--  src/lsm/lsm_meta.c | 6
-rw-r--r--  src/lsm/lsm_tree.c | 8
-rw-r--r--  src/os_posix/os_dlopen.c | 2
-rw-r--r--  src/os_posix/os_mtx_cond.c | 2
-rw-r--r--  src/os_posix/os_mtx_rw.c | 2
-rw-r--r--  src/os_posix/os_open.c | 2
-rw-r--r--  src/os_win/os_dir.c | 4
-rw-r--r--  src/os_win/os_dlopen.c | 2
-rw-r--r--  src/os_win/os_mtx_cond.c | 2
-rw-r--r--  src/os_win/os_open.c | 2
-rw-r--r--  src/packing/pack_stream.c | 2
-rw-r--r--  src/reconcile/rec_track.c | 2
-rw-r--r--  src/reconcile/rec_write.c | 496
-rw-r--r--  src/schema/schema_open.c | 6
-rw-r--r--  src/session/session_api.c | 1
-rw-r--r--  src/session/session_dhandle.c | 2
-rw-r--r--  src/support/huffman.c | 12
-rw-r--r--  src/support/scratch.c | 2
-rw-r--r--  src/support/stat.c | 11
-rw-r--r--  test/format/format.h | 2
-rw-r--r--  test/format/ops.c | 14
-rw-r--r--  test/format/util.c | 28
-rw-r--r--  test/format/wts.c | 36
-rw-r--r--  test/suite/run.py | 20
-rw-r--r--  test/suite/test_lsm02.py | 5
-rw-r--r--  test/suite/test_sweep01.py | 4
-rw-r--r--  test/suite/test_txn02.py | 23
-rw-r--r--  test/suite/wtscenario.py | 50
-rw-r--r--  test/suite/wttest.py | 31
-rw-r--r--  tools/stat_data.py | 5
81 files changed, 927 insertions, 745 deletions
diff --git a/bench/wtperf/runners/small-lsm.wtperf b/bench/wtperf/runners/small-lsm.wtperf
index 1b00d18d76b..8c7f65bb8b0 100644
--- a/bench/wtperf/runners/small-lsm.wtperf
+++ b/bench/wtperf/runners/small-lsm.wtperf
@@ -1,6 +1,6 @@
# wtperf options file: small lsm configuration
conn_config="cache_size=500MB"
-table_config="lsm=(chunk_size=5MB),type=lsm,os_cache_dirty_max=16MB"
+table_config="lsm=(chunk_size=10MB),type=lsm"
icount=500000
report_interval=5
run_time=120
diff --git a/dist/api_data.py b/dist/api_data.py
index 2f7757dce6b..bf1346c187c 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -207,17 +207,26 @@ file_config = format_meta + [
block compression is done''',
min='512B', max='512MB'),
Config('internal_item_max', '0', r'''
- the largest key stored within an internal node, in bytes. If
- non-zero, any key larger than the specified size will be
- stored as an overflow item (which may require additional I/O
- to access). If zero, a default size is chosen that permits at
- least 8 keys per internal page''',
- min=0),
+ historic term for internal_key_max''',
+ min=0, undoc=True),
+ Config('internal_key_max', '0', r'''
+ the largest key stored in an internal node, in bytes. If set, keys
+ larger than the specified size are stored as overflow items (which
+ may require additional I/O to access). The default and the maximum
+ allowed value are both one-tenth the size of a newly split internal
+ page''',
+ min='0'),
Config('key_gap', '10', r'''
the maximum gap between instantiated keys in a Btree leaf page,
constraining the number of keys processed to instantiate a
random Btree leaf page key''',
min='0', undoc=True),
+ Config('leaf_key_max', '0', r'''
+ the largest key stored in a leaf node, in bytes. If set, keys
+ larger than the specified size are stored as overflow items (which
+ may require additional I/O to access). The default value is
+ one-tenth the size of a newly split leaf page''',
+ min='0'),
Config('leaf_page_max', '32KB', r'''
the maximum page size for leaf nodes, in bytes; the size must
be a multiple of the allocation size, and is significant for
@@ -226,13 +235,17 @@ file_config = format_meta + [
data, that is, the limit is applied before any block compression
is done''',
min='512B', max='512MB'),
+ Config('leaf_value_max', '0', r'''
+ the largest value stored in a leaf node, in bytes. If set, values
+ larger than the specified size are stored as overflow items (which
+ may require additional I/O to access). If the size is larger than
+ the maximum leaf page size, the page size is temporarily ignored
+ when large values are written. The default is one-half the size of
+ a newly split leaf page''',
+ min='0'),
Config('leaf_item_max', '0', r'''
- the largest key or value stored within a leaf node, in bytes.
- If non-zero, any key or value larger than the specified size
- will be stored as an overflow item (which may require additional
- I/O to access). If zero, a default size is chosen that permits
- at least 4 key and value pairs per leaf page''',
- min=0),
+ historic term for leaf_key_max and leaf_value_max''',
+ min=0, undoc=True),
Config('memory_page_max', '5MB', r'''
the maximum size a page can grow to in memory before being
reconciled to disk. The specified size will be adjusted to a lower
diff --git a/dist/api_err.py b/dist/api_err.py
index 0c61a41ff28..cb2c8cc588e 100644
--- a/dist/api_err.py
+++ b/dist/api_err.py
@@ -42,7 +42,9 @@ errors = [
Error('WT_PANIC', -31804,
'WiredTiger library panic', '''
This error indicates an underlying problem that requires the
- application exit and restart.'''),
+ application exit and restart. The application can exit
+ immediately when \c WT_PANIC is returned from a WiredTiger
+ interface; no further WiredTiger calls are required.'''),
Error('WT_RESTART', -31805,
'restart the operation (internal)', undoc=True),
]
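
The stronger wording gives callers a simple rule for handling panics. A minimal sketch of how an application might act on it, assuming a hypothetical handle_op_result() helper (illustrative only, not part of this commit): on WT_PANIC the process exits immediately, with no further WiredTiger calls.

/*
 * Illustrative only: act on the documented WT_PANIC behavior; the helper
 * name is hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

#include <wiredtiger.h>

static void
handle_op_result(int ret)
{
	if (ret == 0)
		return;
	if (ret == WT_PANIC) {
		/*
		 * The library has panicked: exit immediately, no further
		 * WiredTiger calls (not even close) are required.
		 */
		fprintf(stderr,
		    "WiredTiger panic: %s\n", wiredtiger_strerror(ret));
		exit(EXIT_FAILURE);
	}
	fprintf(stderr, "WiredTiger error: %s\n", wiredtiger_strerror(ret));
}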
diff --git a/dist/stat_data.py b/dist/stat_data.py
index bd628e7418a..d1d3dd7e5ea 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -348,14 +348,16 @@ dsrc_stats = [
BtreeStat('btree_fixed_len', 'fixed-record size', 'no_aggregate,no_scale'),
BtreeStat('btree_maximum_depth',
'maximum tree depth', 'max_aggregate,no_scale'),
- BtreeStat('btree_maxintlitem',
- 'maximum internal page item size', 'no_aggregate,no_scale'),
+ BtreeStat('btree_maxintlkey',
+ 'maximum internal page key size', 'no_aggregate,no_scale'),
BtreeStat('btree_maxintlpage',
'maximum internal page size', 'no_aggregate,no_scale'),
- BtreeStat('btree_maxleafitem',
- 'maximum leaf page item size', 'no_aggregate,no_scale'),
+ BtreeStat('btree_maxleafkey',
+ 'maximum leaf page key size', 'no_aggregate,no_scale'),
BtreeStat('btree_maxleafpage',
'maximum leaf page size', 'no_aggregate,no_scale'),
+ BtreeStat('btree_maxleafvalue',
+ 'maximum leaf page value size', 'no_aggregate,no_scale'),
BtreeStat('btree_overflow', 'overflow pages', 'no_scale'),
BtreeStat('btree_row_internal', 'row-store internal pages', 'no_scale'),
BtreeStat('btree_row_leaf', 'row-store leaf pages', 'no_scale'),
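
The renamed statistics are read through an ordinary statistics cursor. A minimal sketch, assuming statistics are enabled on the connection and a table named table:mytable exists (both assumptions, as is the print_size_stats() helper); it walks the data-source statistics and prints the maximum page/key/value entries by description, so it does not depend on the generated statistics key constants.

/* Illustrative only: print the per-data-source size maximums. */
#include <stdio.h>
#include <string.h>

#include <wiredtiger.h>

static int
print_size_stats(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	const char *desc, *pvalue;
	int64_t value;
	int ret;

	if ((ret = session->open_cursor(session,
	    "statistics:table:mytable", NULL, NULL, &cursor)) != 0)
		return (ret);
	while ((ret = cursor->next(cursor)) == 0) {
		if ((ret = cursor->get_value(
		    cursor, &desc, &pvalue, &value)) != 0)
			break;
		if (strstr(desc, "maximum") != NULL)
			printf("%s: %s\n", desc, pvalue);
	}
	(void)cursor->close(cursor);
	return (ret == WT_NOTFOUND ? 0 : ret);
}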
diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am
index 17beba4a470..382c5912fef 100644
--- a/examples/c/Makefile.am
+++ b/examples/c/Makefile.am
@@ -13,7 +13,6 @@ noinst_PROGRAMS = \
ex_data_source \
ex_extending \
ex_extractor \
- ex_file \
ex_hello \
ex_log \
ex_pack \
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index db418deed9d..cf5fb363c2f 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -524,6 +524,20 @@ session_ops(WT_SESSION *session)
/*! [Create a table with columns] */
ret = session->drop(session, "table:mytable", NULL);
+ /*! [Create a table and configure the page size] */
+ ret = session->create(session,
+ "table:mytable", "key_format=S,value_format=S"
+ "internal_page_max=16KB,leaf_page_max=1MB,leaf_value_max=64KB");
+ /*! [Create a table and configure the page size] */
+ ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a table and configure a large leaf value max] */
+ ret = session->create(session,
+ "table:mytable", "key_format=S,value_format=S"
+ "leaf_page_max=16KB,leaf_value_max=256KB");
+ /*! [Create a table and configure a large leaf value max] */
+ ret = session->drop(session, "table:mytable", NULL);
+
/*
* This example code gets run, and the compression libraries might not
* be loaded, causing the create to fail. The documentation requires
diff --git a/examples/c/ex_backup.c b/examples/c/ex_backup.c
index fb5c5b9d299..ea572c8810b 100644
--- a/examples/c/ex_backup.c
+++ b/examples/c/ex_backup.c
@@ -125,7 +125,7 @@ compare_backups(int i)
* That way we can compare the full and incremental each time through.
*/
static int
-setup_directories()
+setup_directories(void)
{
int i, ret;
char buf[1024];
diff --git a/examples/c/ex_file.c b/examples/c/ex_file.c
deleted file mode 100644
index 4170d1b099d..00000000000
--- a/examples/c/ex_file.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*-
- * Public Domain 2008-2014 WiredTiger, Inc.
- *
- * This is free and unencumbered software released into the public domain.
- *
- * Anyone is free to copy, modify, publish, use, compile, sell, or
- * distribute this software, either in source code form or as a compiled
- * binary, for any purpose, commercial or non-commercial, and by any
- * means.
- *
- * In jurisdictions that recognize copyright laws, the author or authors
- * of this software dedicate any and all copyright interest in the
- * software to the public domain. We make this dedication for the benefit
- * of the public at large and to the detriment of our heirs and
- * successors. We intend this dedication to be an overt act of
- * relinquishment in perpetuity of all present and future rights to this
- * software under copyright law.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ex_file.c
- * This is an example demonstrating how to configure an individual file.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <wiredtiger.h>
-
-static const char *home;
-
-int
-main(void)
-{
- WT_CONNECTION *conn;
- WT_SESSION *session;
- int ret;
-
- /*
- * Create a clean test directory for this run of the test program if the
- * environment variable isn't already set (as is done by make check).
- */
- if (getenv("WIREDTIGER_HOME") == NULL) {
- home = "WT_HOME";
- ret = system("rm -rf WT_HOME && mkdir WT_HOME");
- } else
- home = NULL;
-
- if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0 ||
- (ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
- fprintf(stderr, "Error connecting to %s: %s\n",
- home, wiredtiger_strerror(ret));
- return (ret);
- }
- /* Note: further error checking omitted for clarity. */
-
- /*! [file create] */
- ret = session->create(session, "file:example",
- "key_format=u,"
- "internal_page_max=32KB,internal_item_max=1KB,"
- "leaf_page_max=1MB,leaf_item_max=32KB");
- /*! [file create] */
-
- return (conn->close(conn, NULL) == 0 ? ret : EXIT_FAILURE);
-}
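
With ex_file.c removed, a stand-alone version of that example written against the new configuration names might look like the following sketch (illustrative only; it reuses the structure of the deleted file, and the key/value maximums chosen are arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <wiredtiger.h>

static const char *home;

int
main(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	/*
	 * Create a clean test directory for this run of the test program if
	 * the environment variable isn't already set (as is done by make
	 * check).
	 */
	if (getenv("WIREDTIGER_HOME") == NULL) {
		home = "WT_HOME";
		ret = system("rm -rf WT_HOME && mkdir WT_HOME");
	} else
		home = NULL;

	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0 ||
	    (ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
		fprintf(stderr, "Error connecting to %s: %s\n",
		    home, wiredtiger_strerror(ret));
		return (ret);
	}
	/* Note: further error checking omitted for clarity. */

	/* Configure an individual file using the new size names. */
	ret = session->create(session, "file:example",
	    "key_format=u,"
	    "internal_page_max=32KB,internal_key_max=1KB,"
	    "leaf_page_max=1MB,leaf_key_max=32KB,leaf_value_max=128KB");

	return (conn->close(conn, NULL) == 0 ? ret : EXIT_FAILURE);
}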
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 3cb78e80b09..6aeb404bccd 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -54,7 +54,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
WT_RET(
__wt_open_internal_session(conn, "async-cursor", 1, 1, &session));
__wt_spin_lock(session, &async->ops_lock);
- WT_ERR(__wt_calloc_def(session, 1, &af));
+ WT_ERR(__wt_calloc_one(session, &af));
WT_ERR(__wt_strdup(session, uri, &af->uri));
WT_ERR(__wt_strdup(session, config, &af->config));
af->uri_hash = uri_hash;
@@ -232,7 +232,7 @@ __async_start(WT_SESSION_IMPL *session)
/*
* Async is on, allocate the WT_ASYNC structure and initialize the ops.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async));
+ WT_RET(__wt_calloc_one(session, &conn->async));
async = conn->async;
STAILQ_INIT(&async->formatqh);
WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index 7a88ac9dd6e..ecf052fc3bf 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -150,7 +150,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
* We didn't find one in our cache. Open one and cache it.
* Insert it at the head expecting LRU usage.
*/
- WT_RET(__wt_calloc_def(session, 1, &ac));
+ WT_RET(__wt_calloc_one(session, &ac));
WT_ERR(wt_session->open_cursor(
wt_session, op->format->uri, NULL, op->format->config, &c));
ac->cfg_hash = op->format->cfg_hash;
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 4f7f2898de5..a9b3b07904d 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -419,7 +419,7 @@ __wt_block_manager_open(WT_SESSION_IMPL *session,
*bmp = NULL;
- WT_RET(__wt_calloc_def(session, 1, &bm));
+ WT_RET(__wt_calloc_one(session, &bm));
__bm_method_set(bm, 0);
WT_ERR(__wt_block_open(session, filename, cfg,
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 7b68c59c766..0abe9cffc5f 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -128,7 +128,7 @@ __wt_block_open(WT_SESSION_IMPL *session,
}
/* Basic structure allocation, initialization. */
- WT_ERR(__wt_calloc_def(session, 1, &block));
+ WT_ERR(__wt_calloc_one(session, &block));
block->ref = 1;
TAILQ_INSERT_HEAD(&conn->blockqh, block, q);
diff --git a/src/block/block_session.c b/src/block/block_session.c
index fa56b72f49b..90fe0af562a 100644
--- a/src/block/block_session.c
+++ b/src/block/block_session.c
@@ -152,7 +152,7 @@ __block_ext_discard(WT_SESSION_IMPL *session, u_int max)
static int
__block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
{
- return (__wt_calloc(session, 1, sizeof(WT_SIZE), szp));
+ return (__wt_calloc_one(session, szp));
}
/*
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index b8fecfe0efd..5f7a8f47c21 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -28,7 +28,7 @@ __bloom_init(WT_SESSION_IMPL *session,
*bloomp = NULL;
- WT_RET(__wt_calloc_def(session, 1, &bloom));
+ WT_RET(__wt_calloc_one(session, &bloom));
WT_ERR(__wt_strdup(session, uri, &bloom->uri));
len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 2fc1b0d5460..a58ed5d66e9 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -117,7 +117,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
* Record the change in the transaction structure and set the change's
* transaction ID.
*/
- WT_ERR(__wt_calloc_def(session, 1, &ref->page_del));
+ WT_ERR(__wt_calloc_one(session, &ref->page_del));
ref->page_del->txnid = session->txn.id;
WT_ERR(__wt_txn_modify_ref(session, ref));
@@ -306,7 +306,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* deleted items.
*/
for (i = 0; i < page->pg_row_entries; ++i) {
- WT_ERR(__wt_calloc_def(session, 1, &upd));
+ WT_ERR(__wt_calloc_one(session, &upd));
WT_UPDATE_DELETED_SET(upd);
if (page_del == NULL)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index fe2623b055b..10ea6cd019c 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -13,9 +13,6 @@ static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_preload(WT_SESSION_IMPL *);
static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
-static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
-static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
-
/*
* __wt_btree_open --
* Open a Btree.
@@ -623,153 +620,98 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
btree = S2BT(session);
cfg = btree->dhandle->cfg;
+ /*
+ * Get the allocation size. Allocation sizes must be a power-of-two,
+ * nothing else makes sense.
+ */
WT_RET(__wt_direct_io_size_check(
session, cfg, "allocation_size", &btree->allocsize));
+ if (!__wt_ispo2(btree->allocsize))
+ WT_RET_MSG(session,
+ EINVAL, "the allocation size must be a power of two");
+
+ /*
+ * Get the internal/leaf page sizes.
+ * All page sizes must be in units of the allocation size.
+ */
WT_RET(__wt_direct_io_size_check(
session, cfg, "internal_page_max", &btree->maxintlpage));
- WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
- btree->maxintlitem = (uint32_t)cval.val;
WT_RET(__wt_direct_io_size_check(
session, cfg, "leaf_page_max", &btree->maxleafpage));
- WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
- btree->maxleafitem = (uint32_t)cval.val;
-
- WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
- btree->split_pct = (int)cval.val;
+ if (btree->maxintlpage < btree->allocsize ||
+ btree->maxintlpage % btree->allocsize != 0 ||
+ btree->maxleafpage < btree->allocsize ||
+ btree->maxleafpage % btree->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "page sizes must be a multiple of the page allocation "
+ "size (%" PRIu32 "B)", btree->allocsize);
/*
* When a page is forced to split, we want at least 50 entries on its
* parent.
- */
- WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
- btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
-
- /*
+ *
* Don't let pages grow to more than half the cache size. Otherwise,
* with very small caches, we can end up in a situation where nothing
* can be evicted. Take care getting the cache size: with a shared
* cache, it may not have been set.
*/
+ WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
+ btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
cache_size = S2C(session)->cache_size;
if (cache_size > 0)
btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
- /* Allocation sizes must be a power-of-two, nothing else makes sense. */
- if (!__wt_ispo2(btree->allocsize))
- WT_RET_MSG(session,
- EINVAL, "the allocation size must be a power of two");
-
- /* All page sizes must be in units of the allocation size. */
- if (btree->maxintlpage < btree->allocsize ||
- btree->maxintlpage % btree->allocsize != 0 ||
- btree->maxleafpage < btree->allocsize ||
- btree->maxleafpage % btree->allocsize != 0)
- WT_RET_MSG(session, EINVAL,
- "page sizes must be a multiple of the page allocation "
- "size (%" PRIu32 "B)", btree->allocsize);
-
/*
- * Set the split percentage: reconciliation splits to a smaller-than-
- * maximum page size so we don't split every time a new entry is added.
+ * Get the split percentage (reconciliation splits pages into smaller
+ * than the maximum page size chunks so we don't split every time a
+ * new entry is added). Determine how large newly split pages will be.
*/
+ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+ btree->split_pct = (int)cval.val;
intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
/*
- * Default values for internal and leaf page items: make sure at least
- * 8 items fit on split pages.
- */
- if (btree->maxintlitem == 0)
- btree->maxintlitem = intl_split_size / 8;
- if (btree->maxleafitem == 0)
- btree->maxleafitem = leaf_split_size / 8;
-
- /*
- * If raw compression is configured, the application owns page layout,
- * it's not our problem. Hopefully the application chose well.
+ * Get the maximum internal/leaf page key/value sizes.
+ *
+ * In historic versions of WiredTiger, the maximum internal/leaf page
+ * key/value sizes were set by the internal_item_max and leaf_item_max
+ * configuration strings. Look for those strings if we don't find the
+ * newer ones.
*/
- if (btree->compressor != NULL &&
- btree->compressor->compress_raw != NULL)
- return (0);
-
- /* Check we can fit at least 2 items on a page. */
- if (btree->maxintlitem > btree->maxintlpage / 2)
- return (pse1(session, "internal",
- btree->maxintlpage, btree->maxintlitem));
- if (btree->maxleafitem > btree->maxleafpage / 2)
- return (pse1(session, "leaf",
- btree->maxleafpage, btree->maxleafitem));
+ WT_RET(__wt_config_gets(session, cfg, "internal_key_max", &cval));
+ btree->maxintlkey = (uint32_t)cval.val;
+ if (btree->maxintlkey == 0) {
+ WT_RET(
+ __wt_config_gets(session, cfg, "internal_item_max", &cval));
+ btree->maxintlkey = (uint32_t)cval.val;
+ }
+ WT_RET(__wt_config_gets(session, cfg, "leaf_key_max", &cval));
+ btree->maxleafkey = (uint32_t)cval.val;
+ WT_RET(__wt_config_gets(session, cfg, "leaf_value_max", &cval));
+ btree->maxleafvalue = (uint32_t)cval.val;
+ if (btree->maxleafkey == 0 && btree->maxleafvalue == 0) {
+ WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
+ btree->maxleafkey = (uint32_t)cval.val;
+ btree->maxleafvalue = (uint32_t)cval.val;
+ }
/*
- * Take into account the size of a split page:
+ * Default/maximum for internal and leaf page keys: split-page / 10.
+ * Default for leaf page values: split-page / 2.
*
- * Make it a separate error message so it's clear what went wrong.
+ * It's difficult for applications to configure this in any exact way as
+ * they have to duplicate our calculation of how many keys must fit on a
+ * page, and given a split-percentage and page header, that isn't easy
+ * to do. If the maximum internal key value is too large for the page,
+ * reset it to the default.
*/
- if (btree->maxintlitem > intl_split_size / 2)
- return (pse2(session, "internal",
- btree->maxintlpage, btree->maxintlitem, btree->split_pct));
- if (btree->maxleafitem > leaf_split_size / 2)
- return (pse2(session, "leaf",
- btree->maxleafpage, btree->maxleafitem, btree->split_pct));
+ if (btree->maxintlkey == 0 || btree->maxintlkey > intl_split_size / 10)
+ btree->maxintlkey = intl_split_size / 10;
+ if (btree->maxleafkey == 0)
+ btree->maxleafkey = leaf_split_size / 10;
+ if (btree->maxleafvalue == 0)
+ btree->maxleafvalue = leaf_split_size / 2;
return (0);
}
-
-/*
- * __wt_split_page_size --
- * Split page size calculation: we don't want to repeatedly split every
- * time a new entry is added, so we split to a smaller-than-maximum page size.
- */
-uint32_t
-__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
-{
- uintmax_t a;
- uint32_t split_size;
-
- /*
- * Ideally, the split page size is some percentage of the maximum page
- * size rounded to an allocation unit (round to an allocation unit so
- * we don't waste space when we write).
- */
- a = maxpagesize; /* Don't overflow. */
- split_size = (uint32_t)
- WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
-
- /*
- * If the result of that calculation is the same as the allocation unit
- * (that happens if the maximum size is the same size as an allocation
- * unit, use a percentage of the maximum page size).
- */
- if (split_size == btree->allocsize)
- split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
-
- return (split_size);
-}
-
-/*
- * pse1 --
- * Page size error message 1.
- */
-static int
-pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
-{
- WT_RET_MSG(session, EINVAL,
- "%s page size (%" PRIu32 "B) too small for the maximum item size "
- "(%" PRIu32 "B); the page must be able to hold at least 2 items",
- type, max, ovfl);
-}
-
-/*
- * pse2 --
- * Page size error message 2.
- */
-static int
-pse2(WT_SESSION_IMPL *session,
- const char *type, uint32_t max, uint32_t ovfl, int pct)
-{
- WT_RET_MSG(session, EINVAL,
- "%s page size (%" PRIu32 "B) too small for the maximum item size "
- "(%" PRIu32 "B), because of the split percentage (%d %%); a split "
- "page must be able to hold at least 2 items",
- type, max, ovfl, pct);
-}
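
To see what the new defaulting logic produces for a stock configuration, the following self-contained sketch mirrors the calculation above (the split-size computation is re-implemented locally, assuming WT_ALIGN rounds up to the next multiple of the allocation size, which must be a power of two). With a 4KB allocation size, 4KB internal pages, 32KB leaf pages and a 75% split percentage, a newly split leaf page is 24KB, so leaf_key_max defaults to roughly 2.4KB and leaf_value_max to 12KB.

/* Illustrative only: mirror the key/value-size defaulting above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Round up to a multiple of v; assumes v is a power of two. */
#define	ALIGN_UP(n, v)							\
	((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))

static uint32_t
split_page_size(uint32_t maxpagesize, uint32_t allocsize, int split_pct)
{
	uintmax_t a;
	uint32_t split_size;

	a = maxpagesize;				/* Don't overflow. */
	split_size = (uint32_t)
	    ALIGN_UP((a * (unsigned int)split_pct) / 100, allocsize);
	if (split_size == allocsize)		/* Tiny maximum page size. */
		split_size =
		    (uint32_t)((a * (unsigned int)split_pct) / 100);
	return (split_size);
}

int
main(void)
{
	uint32_t allocsize = 4 * 1024;		/* allocation_size=4KB */
	uint32_t maxintlpage = 4 * 1024;	/* internal_page_max=4KB */
	uint32_t maxleafpage = 32 * 1024;	/* leaf_page_max=32KB */
	int split_pct = 75;			/* split_pct=75 */
	uint32_t intl_split, leaf_split;

	intl_split = split_page_size(maxintlpage, allocsize, split_pct);
	leaf_split = split_page_size(maxleafpage, allocsize, split_pct);

	printf("internal_key_max default/maximum: %" PRIu32 "\n",
	    intl_split / 10);
	printf("leaf_key_max default: %" PRIu32 "\n", leaf_split / 10);
	printf("leaf_value_max default: %" PRIu32 "\n", leaf_split / 2);
	return (0);
}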
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index b2767e74bac..799f0cca3ee 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -227,8 +227,8 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
WT_INTL_INDEX_SET(page, pindex);
if (alloc_refs)
for (i = 0; i < pindex->entries; ++i) {
- WT_ERR(__wt_calloc_def(
- session, 1, &pindex->index[i]));
+ WT_ERR(__wt_calloc_one(
+ session, &pindex->index[i]));
size += sizeof(WT_REF);
}
if (0) {
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 6e70c9ea2b6..96b63f3f8f0 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -491,8 +491,8 @@ __slvg_trk_init(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_TRACK *trk;
- WT_RET(__wt_calloc_def(session, 1, &trk));
- WT_ERR(__wt_calloc_def(session, 1, &trk->shared));
+ WT_RET(__wt_calloc_one(session, &trk));
+ WT_ERR(__wt_calloc_one(session, &trk->shared));
trk->shared->ref = 1;
trk->ss = ss;
@@ -519,7 +519,7 @@ __slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp)
{
WT_TRACK *trk;
- WT_RET(__wt_calloc_def(session, 1, &trk));
+ WT_RET(__wt_calloc_one(session, &trk));
trk->shared = orig->shared;
trk->ss = orig->ss;
@@ -1181,7 +1181,7 @@ __slvg_col_build_internal(
ref->home = page;
ref->page = NULL;
- WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_calloc_one(session, &addr));
WT_ERR(__wt_strndup(
session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
addr->size = trk->trk_addr_size;
@@ -1826,7 +1826,7 @@ __slvg_row_build_internal(
ref->home = page;
ref->page = NULL;
- WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_calloc_one(session, &addr));
WT_ERR(__wt_strndup(
session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
addr->size = trk->trk_addr_size;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index e25f0b73e01..c6b97733b69 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -310,7 +310,7 @@ __split_ref_instantiate(WT_SESSION_IMPL *session,
sizeof(WT_ADDR) + addr->size);
else {
__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
- WT_RET(__wt_calloc_def(session, 1, &addr));
+ WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
session, unpack.data, unpack.size, &addr->addr)) != 0) {
__wt_free(session, addr);
@@ -444,7 +444,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
pindex->index[pindex->entries - 1];
for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
i = 0; i < children; ++alloc_refp, ++i) {
- WT_ERR(__wt_calloc_def(session, 1, alloc_refp));
+ WT_ERR(__wt_calloc_one(session, alloc_refp));
WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
}
@@ -747,7 +747,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
/* In some cases, the underlying WT_REF has not yet been allocated. */
if (*refp == NULL) {
- WT_RET(__wt_calloc_def(session, 1, refp));
+ WT_RET(__wt_calloc_one(session, refp));
WT_MEMSIZE_ADD(incr, sizeof(WT_REF));
}
ref = *refp;
@@ -768,7 +768,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
* would have to avoid freeing the memory, and it's not worth
* the confusion.
*/
- WT_RET(__wt_calloc_def(session, 1, &addr));
+ WT_RET(__wt_calloc_one(session, &addr));
WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR));
ref->addr = addr;
addr->size = multi->addr.size;
@@ -1081,7 +1081,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
*
* The new reference is visible to readers once the split completes.
*/
- WT_ERR(__wt_calloc_def(session, 1, &split_ref[0]));
+ WT_ERR(__wt_calloc_one(session, &split_ref[0]));
child = split_ref[0];
*child = *ref;
child->state = WT_REF_MEM;
@@ -1112,12 +1112,12 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref, int *splitp)
* The second page in the split is a new WT_REF/page pair.
*/
WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 0, &right));
- WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins));
- WT_ERR(__wt_calloc_def(session, 1, &right->pg_row_ins[0]));
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD));
WT_MEMSIZE_ADD(right_incr, sizeof(WT_INSERT_HEAD *));
- WT_ERR(__wt_calloc_def(session, 1, &split_ref[1]));
+ WT_ERR(__wt_calloc_one(session, &split_ref[1]));
child = split_ref[1];
child->page = right;
child->state = WT_REF_MEM;
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 3da0bcf346c..c08e9d9218b 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -32,10 +32,11 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
- WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
+ WT_STAT_SET(stats, btree_maxintlkey, btree->maxintlkey);
WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
if (!F_ISSET(cst, WT_CONN_STAT_ALL))
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index e0036d14cbb..e7fb75dc8cb 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -19,7 +19,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
conn = S2C(session);
- WT_RET(__wt_calloc_def(session, 1, &modify));
+ WT_RET(__wt_calloc_one(session, &modify));
/*
* Select a spinlock for the page; let the barrier immediately below
diff --git a/src/config/config_api.c b/src/config/config_api.c
index 42f4c117b81..0c920af0d0e 100644
--- a/src/config/config_api.c
+++ b/src/config/config_api.c
@@ -84,7 +84,7 @@ wiredtiger_config_parser_open(WT_SESSION *wt_session,
*config_parserp = NULL;
session = (WT_SESSION_IMPL *)wt_session;
- WT_RET(__wt_calloc_def(session, 1, &config_parser));
+ WT_RET(__wt_calloc_one(session, &config_parser));
config_parser->iface = stds;
config_parser->session = session;
diff --git a/src/config/config_check.c b/src/config/config_check.c
index c6fd6bbd75b..18300da8282 100644
--- a/src/config/config_check.c
+++ b/src/config/config_check.c
@@ -122,7 +122,7 @@ __wt_configure_method(WT_SESSION_IMPL *session,
* The new base value is the previous base value, a separator and the
* new configuration string.
*/
- WT_ERR(__wt_calloc_def(session, 1, &entry));
+ WT_ERR(__wt_calloc_one(session, &entry));
entry->method = (*epp)->method;
WT_ERR(__wt_calloc_def(session,
strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 23f7b27338f..750d9843279 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -138,12 +138,15 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
{ "huffman_value", "string", NULL, NULL },
{ "id", "string", NULL, NULL },
{ "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_max", "int", "min=0", NULL },
{ "internal_key_truncate", "boolean", NULL, NULL },
{ "internal_page_max", "int", "min=512B,max=512MB", NULL },
{ "key_format", "format", NULL, NULL },
{ "key_gap", "int", "min=0", NULL },
{ "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_key_max", "int", "min=0", NULL },
{ "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "leaf_value_max", "int", "min=0", NULL },
{ "memory_page_max", "int", "min=512B,max=10TB", NULL },
{ "os_cache_dirty_max", "int", "min=0", NULL },
{ "os_cache_max", "int", "min=0", NULL },
@@ -227,12 +230,15 @@ static const WT_CONFIG_CHECK confchk_session_create[] = {
{ "huffman_value", "string", NULL, NULL },
{ "immutable", "boolean", NULL, NULL },
{ "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_max", "int", "min=0", NULL },
{ "internal_key_truncate", "boolean", NULL, NULL },
{ "internal_page_max", "int", "min=512B,max=512MB", NULL },
{ "key_format", "format", NULL, NULL },
{ "key_gap", "int", "min=0", NULL },
{ "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_key_max", "int", "min=0", NULL },
{ "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "leaf_value_max", "int", "min=0", NULL },
{ "lsm", "category", NULL, confchk_lsm_subconfigs },
{ "memory_page_max", "int", "min=512B,max=10TB", NULL },
{ "os_cache_dirty_max", "int", "min=0", NULL },
@@ -567,11 +573,12 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=,"
"checksum=uncompressed,collator=,columns=,dictionary=0,"
"format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0"
- ",internal_key_truncate=,internal_page_max=4KB,key_format=u,"
- "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
- "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
- "prefix_compression=0,prefix_compression_min=4,split_pct=75,"
- "value_format=u,version=(major=0,minor=0)",
+ ",internal_key_max=0,internal_key_truncate=,internal_page_max=4KB"
+ ",key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0,"
+ "leaf_page_max=32KB,leaf_value_max=0,memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,"
+ "prefix_compression_min=4,split_pct=75,value_format=u,"
+ "version=(major=0,minor=0)",
confchk_file_meta
},
{ "index.meta",
@@ -604,8 +611,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"block_compressor=,cache_resident=0,checksum=uncompressed,"
"colgroups=,collator=,columns=,dictionary=0,exclusive=0,"
"extractor=,format=btree,huffman_key=,huffman_value=,immutable=0,"
- "internal_item_max=0,internal_key_truncate=,internal_page_max=4KB"
- ",key_format=u,key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "internal_item_max=0,internal_key_max=0,internal_key_truncate=,"
+ "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0,"
+ "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0,"
"lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=,"
"bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB,"
"merge_max=15,merge_min=0),memory_page_max=5MB,"
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 8d104729733..551c3037f7b 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -292,7 +292,7 @@ __conn_add_collator(WT_CONNECTION *wt_conn,
WT_ERR_MSG(session, EINVAL,
"invalid name for a collator: %s", name);
- WT_ERR(__wt_calloc_def(session, 1, &ncoll));
+ WT_ERR(__wt_calloc_one(session, &ncoll));
WT_ERR(__wt_strdup(session, name, &ncoll->name));
ncoll->collator = collator;
@@ -363,7 +363,7 @@ __conn_add_compressor(WT_CONNECTION *wt_conn,
WT_ERR_MSG(session, EINVAL,
"invalid name for a compressor: %s", name);
- WT_ERR(__wt_calloc_def(session, 1, &ncomp));
+ WT_ERR(__wt_calloc_one(session, &ncomp));
WT_ERR(__wt_strdup(session, name, &ncomp->name));
ncomp->compressor = compressor;
@@ -428,7 +428,7 @@ __conn_add_data_source(WT_CONNECTION *wt_conn,
CONNECTION_API_CALL(conn, session, add_data_source, config, cfg);
WT_UNUSED(cfg);
- WT_ERR(__wt_calloc_def(session, 1, &ndsrc));
+ WT_ERR(__wt_calloc_one(session, &ndsrc));
WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix));
ndsrc->dsrc = dsrc;
@@ -497,7 +497,7 @@ __conn_add_extractor(WT_CONNECTION *wt_conn,
WT_ERR_MSG(session, EINVAL,
"invalid name for an extractor: %s", name);
- WT_ERR(__wt_calloc_def(session, 1, &nextractor));
+ WT_ERR(__wt_calloc_one(session, &nextractor));
WT_ERR(__wt_strdup(session, name, &nextractor->name));
nextractor->extractor = extractor;
@@ -1490,7 +1490,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_RET(__wt_library_init());
- WT_RET(__wt_calloc_def(NULL, 1, &conn));
+ WT_RET(__wt_calloc_one(NULL, &conn));
conn->iface = stdc;
/*
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 079bd05ff1e..61bd4447abf 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -83,7 +83,7 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, conn->cache == NULL ||
(F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));
- WT_RET(__wt_calloc_def(session, 1, &conn->cache));
+ WT_RET(__wt_calloc_one(session, &conn->cache));
cache = conn->cache;
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index c7558eea5fb..dcc37da3b3b 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -81,7 +81,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
if (__wt_process.cache_pool == NULL) {
WT_ASSERT(session, !reconfiguring);
/* Create a cache pool. */
- WT_ERR(__wt_calloc_def(session, 1, &cp));
+ WT_ERR(__wt_calloc_one(session, &cp));
created = 1;
cp->name = pool_name;
pool_name = NULL; /* Belongs to the cache pool now. */
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 088ff2f3d2c..cfd99ac1f8f 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -187,7 +187,7 @@ __conn_dhandle_get(WT_SESSION_IMPL *session,
* then initialize the data handle. Exclusively lock the data handle
* before inserting it in the list.
*/
- WT_RET(__wt_calloc_def(session, 1, &dhandle));
+ WT_RET(__wt_calloc_one(session, &dhandle));
WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
@@ -196,7 +196,7 @@ __conn_dhandle_get(WT_SESSION_IMPL *session,
if (ckpt != NULL)
WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));
- WT_ERR(__wt_calloc_def(session, 1, &btree));
+ WT_ERR(__wt_calloc_one(session, &btree));
dhandle->handle = btree;
btree->dhandle = dhandle;
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 618a0934ce1..6a1a63b5abe 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -341,7 +341,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
/*
* Logging is on, allocate the WT_LOG structure and open the log file.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log));
+ WT_RET(__wt_calloc_one(session, &conn->log));
log = conn->log;
WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 41bfaea7ee3..2c03fc55b85 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -126,7 +126,7 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
cb = NULL;
- WT_RET(__wt_calloc_def(session, 1, &cb));
+ WT_RET(__wt_calloc_one(session, &cb));
cursor = &cb->iface;
*cursor = iface;
cursor->session = &session->iface;
diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c
index 5e7ca487ae2..b37736d1b43 100644
--- a/src/cursor/cur_config.c
+++ b/src/cursor/cur_config.c
@@ -49,7 +49,7 @@ __wt_curconfig_open(WT_SESSION_IMPL *session,
WT_UNUSED(uri);
- WT_RET(__wt_calloc_def(session, 1, &cconfig));
+ WT_RET(__wt_calloc_one(session, &cconfig));
cursor = &cconfig->iface;
*cursor = iface;
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index 096a0e27f8d..f16cc9b33f0 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -474,7 +474,7 @@ __wt_curds_open(
data_source = NULL;
metaconf = NULL;
- WT_RET(__wt_calloc_def(session, 1, &data_source));
+ WT_RET(__wt_calloc_one(session, &data_source));
cursor = &data_source->iface;
*cursor = iface;
cursor->session = &session->iface;
diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c
index 5760752d406..55b47d13a6d 100644
--- a/src/cursor/cur_dump.c
+++ b/src/cursor/cur_dump.c
@@ -372,7 +372,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
session = (WT_SESSION_IMPL *)child->session;
- WT_RET(__wt_calloc_def(session, 1, &cdump));
+ WT_RET(__wt_calloc_one(session, &cdump));
cursor = &cdump->iface;
*cursor = iface;
cursor->session = child->session;
@@ -385,7 +385,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
F_SET(cursor, F_ISSET(child,
WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT));
if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
- WT_ERR(__wt_calloc_def(session, 1, &json));
+ WT_ERR(__wt_calloc_one(session, &json));
cursor->json_private = child->json_private = json;
}
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index b516b5c58b1..2b31f75cf08 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -383,7 +383,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
namesize = (size_t)(columns - idxname);
WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx));
- WT_RET(__wt_calloc_def(session, 1, &cindex));
+ WT_RET(__wt_calloc_one(session, &cindex));
cursor = &cindex->iface;
*cursor = iface;
diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c
index bdb19d05c01..0d375ee4a52 100644
--- a/src/cursor/cur_log.c
+++ b/src/cursor/cur_log.c
@@ -336,12 +336,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
log = conn->log;
cl = NULL;
- WT_RET(__wt_calloc_def(session, 1, &cl));
+ WT_RET(__wt_calloc_one(session, &cl));
cursor = &cl->iface;
*cursor = iface;
cursor->session = &session->iface;
- WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn));
- WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn));
+ WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
+ WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c
index d6c76c48ab9..e1e08c307fc 100644
--- a/src/cursor/cur_metadata.c
+++ b/src/cursor/cur_metadata.c
@@ -423,7 +423,7 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_CONFIG_ITEM cval;
- WT_RET(__wt_calloc_def(session, 1, &mdc));
+ WT_RET(__wt_calloc_one(session, &mdc));
cursor = &mdc->iface;
*cursor = iface;
@@ -445,7 +445,9 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session,
}
if (0) {
-err: __wt_free(session, mdc);
+err: if (mdc->file_cursor != NULL)
+ WT_TRET(mdc->file_cursor->close(mdc->file_cursor));
+ __wt_free(session, mdc);
}
return (ret);
}
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index cc12077024f..74237c6ffdc 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -503,7 +503,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ERR(__wt_calloc_def(session, 1, &cst));
+ WT_ERR(__wt_calloc_one(session, &cst));
cursor = &cst->iface;
*cursor = iface;
cursor->session = &session->iface;
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 50d76609411..1825d641c49 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -878,7 +878,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
return (ret);
}
- WT_RET(__wt_calloc_def(session, 1, &ctable));
+ WT_RET(__wt_calloc_one(session, &ctable));
cursor = &ctable->iface;
*cursor = iface;
diff --git a/src/docs/error-handling.dox b/src/docs/error-handling.dox
index bced608434b..cf268f80500 100644
--- a/src/docs/error-handling.dox
+++ b/src/docs/error-handling.dox
@@ -47,7 +47,7 @@ This error is returned when an error is not covered by a specific error return.
This error indicates an operation did not find a value to return. This includes cursor search and other operations where no record matched the cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove.
@par <code>WT_PANIC</code>
-This error indicates an underlying problem that requires the application exit and restart.
+This error indicates an underlying problem that requires the application exit and restart. The application can exit immediately when \c WT_PANIC is returned from a WiredTiger interface; no further WiredTiger calls are required.
@if IGNORE_BUILT_BY_API_ERR_END
@endif
diff --git a/src/docs/examples.dox b/src/docs/examples.dox
index 53bd3589362..9b86df099e3 100644
--- a/src/docs/examples.dox
+++ b/src/docs/examples.dox
@@ -22,9 +22,6 @@ extractors and cursor types.
@example ex_extractor.c
Shows how to extend WiredTiger with a more complex custom extractor.
-@example ex_file.c
-Shows how to use file objects.
-
@example ex_hello.c
This is an example of how to create and open a database.
diff --git a/src/docs/tune-page-sizes.dox b/src/docs/tune-page-sizes.dox
index b3fd20f6276..130e047a02d 100644
--- a/src/docs/tune-page-sizes.dox
+++ b/src/docs/tune-page-sizes.dox
@@ -1,42 +1,127 @@
-/*! @page tune_page_sizes Page and overflow item sizes
-
-There are four page and item size configuration values: \c internal_page_max,
-\c internal_item_max, \c leaf_page_max and \c leaf_item_max. All four are
-specified to the WT_SESSION::create method, that is, they are configurable
-on a per-file basis.
-
-The \c internal_page_max and \c leaf_page_max configuration values specify
-the maximum size for Btree internal and leaf pages. That is, when an
-internal or leaf page grows past the specified size, it splits into
-multiple pages. Generally, internal pages should be sized to fit into
-the system's on-chip caches in order to minimize cache misses when
-searching the tree, while leaf pages should be sized to maximize I/O
-performance (if reading from disk is necessary, it is usually desirable
-to read a large amount of data, assuming some locality of reference in
-the application's access pattern).
-
-The \c internal_item_max and \c leaf_item_max configuration values specify
-the maximum size at which an object will be stored on-page. Larger items
-will be stored separately in the file from the page where the item logically
-appears. Referencing overflow items is more expensive than referencing
-on-page items, requiring additional I/O if the object is not already cached.
-For this reason, it is important to avoid creating large numbers of overflow
-items that are repeatedly referenced, and the maximum item size should
-probably be increased if many overflow items are being created. Because
-pages must be large enough to store any item that is not an overflow item,
-increasing the size of the overflow items may also require increasing the
-page sizes.
-
-With respect to compression, page and item sizes do not necessarily reflect
-the actual size of the page or item on disk, if block compression has been
-configured. Block compression in WiredTiger happens within the disk I/O
-subsystem, and so a page might split even if subsequent compression would
-result in a resulting page size that would be small enough to leave as a
-single page. In other words, page and overflow sizes are based on in-memory
-sizes, not disk sizes.
-
-There are two other, related configuration values, also settable by the
-WT_SESSION::create method. They are \c allocation_size and \c split_pct.
+/*! @page tune_page_sizes Page and overflow key/value sizes
+
+There are seven page and key/value size configuration strings:
+
+- allocation size (\c allocation_size),
+- page sizes (\c internal_page_max and \c leaf_page_max),
+- key and value sizes (\c internal_key_max, \c leaf_key_max and \c leaf_value_max), and the
+- page-split percentage (\c split_pct).
+
+All seven are specified to the WT_SESSION::create method; in other
+words, they are configurable on a per-file basis.
+
+Applications commonly configure page sizes, based on their workload's
+typical key and value size. Once the correct page size has been chosen,
+appropriate defaults for the other configuration values are derived from
+the page sizes, and relatively few applications will need to modify the
+other page and key/value size configuration options.
+
+An example of configuring page and key/value sizes:
+
+@snippet ex_all.c Create a table and configure the page size
+
+@section tune_page_sizes_sizes Page, key and value sizes
+
+The \c internal_page_max and \c leaf_page_max configuration values
+specify a maximum size for Btree internal and leaf pages. That is, when
+an internal or leaf page grows past that size, it splits into multiple
+pages. Generally, internal pages should be sized to fit into on-chip
+caches in order to minimize cache misses when searching the tree, while
+leaf pages should be sized to maximize I/O performance (if reading from
+disk is necessary, it is usually desirable to read a large amount of
+data, assuming some locality of reference in the application's access
+pattern).
+
+The default page size configurations (4KB for \c internal_page_max, 32KB
+for \c leaf_page_max) are appropriate for applications with relatively
+small keys and values.
+
+- Applications doing full-table scans through out-of-memory workloads
+might increase both internal and leaf page sizes to transfer more data
+per I/O.
+- Applications focused on read/write amplification might decrease the page
+size to better match the underlying storage block size.
+
+When block compression has been configured, configured page sizes will
+not match the actual size of the page on disk. Block compression in
+WiredTiger happens within the I/O subsystem, and so a page might split
+even if subsequent compression would result in a resulting page size
+small enough to leave as a single page. In other words, page sizes are
+based on in-memory sizes, not on-disk sizes. Applications needing to
+write specific sized blocks may want to consider implementing a
+WT_COMPRESSOR::compress_raw function.
+
+The page sizes also determine the default size of overflow items, that
+is, keys and values too large to easily store on a page. Overflow items
+are stored separately in the file from the page where the item logically
+appears, and so reading or writing an overflow item is more expensive
+than an on-page item, normally requiring additional I/O. Additionally,
+overflow values are not cached in memory. This means overflow items
+won't affect the caching behavior of the application, but it also means
+that each time an overflow value is read, it is re-read from disk.
+
+For both of these reasons, applications should avoid creating large
+numbers of commonly referenced overflow items. This is especially
+important for keys, as keys on internal pages are referenced during
+random searches, not just during data retrieval. Generally,
+applications should make every attempt to avoid creating overflow keys.
+
+- Applications with large keys and values, and concerned with latency,
+might increase the page size to avoid creating overflow items, in order
+to avoid the additional cost of retrieving them.
+
+- Applications with large keys and values, doing random searches, might
+decrease the page size to avoid wasting cache space on overflow items
+that aren't likely to be needed.
+
+- Applications with large keys and values, doing table scans, might
+increase the page size to avoid creating overflow items, as the overflow
+items must be read into memory in all cases, anyway.
+
+The \c internal_key_max, \c leaf_key_max and \c leaf_value_max
+configuration values allow applications to change the size at which a
+key or value will be treated as an overflow item.
+
+The value of \c internal_key_max is relative to the maximum internal
+page size. Because the number of keys on an internal page determines
+the depth of the tree, the \c internal_key_max value can only be
+adjusted within a certain range, and the configured value will be
+automatically adjusted by WiredTiger, if necessary to ensure a
+reasonable number of keys fit on an internal page.
+
+The values of \c leaf_key_max and \c leaf_value_max are not relative to
+the maximum leaf page size. If either is larger than the maximum page
+size, the page size will be ignored when the larger keys and values are
+being written, and a larger page will be created as necessary.
+
+Most applications should not need to tune the maximum key and value
+sizes. Applications requiring a small page size, but also having
+latency concerns such that the additional work to retrieve an overflow
+item is an issue, may find them useful.
+
+An example of configuring a large leaf overflow value:
+
+@snippet ex_all.c Create a table and configure a large leaf value max
+
+@section tune_page_sizes_split_percentage Split percentage
+
+The \c split_pct configuration string configures the size of a split
+page. When a page grows sufficiently large that it must be written as
+multiple disk blocks, the newly written block size is \c split_pct
+percent of the maximum page size. This value should be selected to
+avoid creating a large number of tiny pages or repeatedly splitting
+whenever new entries are inserted. For example, if the maximum page
+size is 1MB, a \c split_pct value of 10% would potentially result in
+creating a large number of 100KB pages, which may not be optimal for
+future I/O. Or, if the maximum page size is 1MB, a \c split_pct value
+of 90% would potentially result in repeatedly splitting pages as the
+split pages grow to 1MB over and over. The default value for \c
+split_pct is 75%, intended to keep large pages relatively large, while
+still giving split pages room to grow.
+
+Most applications should not need to tune the split percentage size.
+
+@section tune_page_sizes_allocation_size Allocation size
The \c allocation_size configuration value is the underlying unit of
allocation for the file. As the unit of file allocation, it sets the
@@ -46,25 +131,12 @@ is set to 4KB, an overflow item of 18,000 bytes requires 5 allocation
units and wastes about 2KB of space. If the allocation size is 16KB,
the same overflow item would waste more than 10KB.
-The default allocation size is 4KB, chosen for compatibility with virtual
-memory page sizes and direct I/O requirements on common server platforms.
-
-The last configuration value is \c split_pct, which configures the size
-of a split page. When a page grows sufficiently large that it must be
-written as multiple disk blocks, the newly written block size is \c
-split_pct percent of the maximum page size. This value should be
-selected to avoid creating a large number of tiny pages or repeatedly
-splitting whenever new entries are inserted. For example, if the
-maximum page size is 1MB, a \c split_pct value of 10% would potentially
-result in creating a large number of 100KB pages, which may not be
-optimal for future I/O. Or, if the maximum page size is 1MB, a \c
-split_pct value of 90% would potentially result in repeatedly splitting
-pages as the split pages grow to 1MB over and over. The default value
-for \c split_pct is 75%, intended to keep large pages relatively large,
-while still giving split pages room to grow.
-
-An example of configuring page sizes:
+The default allocation size is 4KB, chosen for compatibility with
+virtual memory page sizes and direct I/O requirements on common server
+platforms.
-@snippet ex_file.c file create
+Most applications should not need to tune the allocation size; it is
+primarily intended for applications coping with the specific
+requirements some file systems make to support features like direct I/O.
- */
+*/
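
A further illustration (not one of the documented snippets; the table name and sizes are arbitrary) combining the settings described on this page, the allocation size, the page maximums, the key/value maximums and the split percentage, in a single WT_SESSION::create call; anything left unset falls back to the derived defaults described above.

/* Illustrative fragment only: session and ret are assumed to exist. */
ret = session->create(session, "table:sizes",
    "key_format=S,value_format=S,"
    "allocation_size=4KB,"
    "internal_page_max=16KB,internal_key_max=1KB,"
    "leaf_page_max=128KB,leaf_key_max=4KB,leaf_value_max=64KB,"
    "split_pct=80");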
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 0e750ae0ca1..0fb858643fd 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -17,6 +17,14 @@ Collators, compressors and extractors can now be disabled with an explicit
using the name \c "none" for a collator, compressor or extractor will need to
be updated.
</dd>
+
+<dt>maximum keys and value sizes
+<dd>
+The WT_SESSION::create \c internal_item_max and \c leaf_item_max
+configuration strings are now deprecated in favor of the
+\c internal_key_max, \c leaf_key_max, and \c leaf_value_max
+configuration strings. See @ref tune_page_sizes for more information.
+</dd>
</dl>
@section version_241 Upgrading to Version 2.4.1
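
The mapping from the deprecated names is mechanical. A sketch of an old configuration and one possible equivalent using the new names (illustrative fragment only; the table names and sizes are arbitrary); because leaf_item_max previously bounded both keys and values, the equivalent sets leaf_key_max and leaf_value_max to the same value.

/* Before: deprecated names. */
ret = session->create(session, "table:old",
    "key_format=S,value_format=S,"
    "internal_item_max=1KB,leaf_item_max=32KB");

/* After: new names carrying the same limits. */
ret = session->create(session, "table:new",
    "key_format=S,value_format=S,"
    "internal_key_max=1KB,leaf_key_max=32KB,leaf_value_max=32KB");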
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index fa3bfa50eb0..bc791de6d0f 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -206,7 +206,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
* Publish: a barrier to ensure the structure fields are set
* before the state change makes the page available to readers.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_RET(__wt_calloc_one(session, &addr));
*addr = mod->mod_replace;
mod->mod_replace.addr = NULL;
mod->mod_replace.size = 0;
diff --git a/src/include/btree.h b/src/include/btree.h
index 907b36c9ed4..e7c1826bda9 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -83,9 +83,10 @@ struct __wt_btree {
uint32_t allocsize; /* Allocation size */
uint32_t maxintlpage; /* Internal page max size */
- uint32_t maxintlitem; /* Internal page max item size */
+ uint32_t maxintlkey; /* Internal page max key size */
uint32_t maxleafpage; /* Leaf page max size */
- uint32_t maxleafitem; /* Leaf page max item size */
+ uint32_t maxleafkey; /* Leaf page max key size */
+ uint32_t maxleafvalue; /* Leaf page max value size */
uint64_t maxmempage; /* In memory page max size */
void *huffman_key; /* Key huffman encoding */
diff --git a/src/include/extern.h b/src/include/extern.h
index 5a5601160c8..00bbdaf746c 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -122,7 +122,6 @@ extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on);
-extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
@@ -496,6 +495,7 @@ extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags);
+extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
diff --git a/src/include/misc.h b/src/include/misc.h
index c861dff18bc..c2abaa08057 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -65,11 +65,13 @@
#define WT_SKIP_PROBABILITY (UINT32_MAX >> 2)
/*
- * __wt_calloc_def --
- * Simple calls don't need separate sizeof arguments.
+ * __wt_calloc_def, __wt_calloc_one --
+ * Most calloc calls don't need separate count or sizeof arguments.
*/
#define __wt_calloc_def(session, number, addr) \
__wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr)
+#define __wt_calloc_one(session, addr) \
+ __wt_calloc(session, (size_t)1, sizeof(**(addr)), addr)
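As a sketch of how callers change with the new macro (WT_ADDR is just a
representative structure type, borrowed from the eviction change earlier in
this diff; the snippet assumes it runs inside a function returning int):

	WT_ADDR *addr;

	/* Old style: explicit count and sizeof arguments. */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));

	/* New style: the macro derives the size from the pointer type. */
	WT_RET(__wt_calloc_one(session, &addr));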
/*
* __wt_realloc_def --
diff --git a/src/include/stat.h b/src/include/stat.h
index 37df43adfee..69fa0ba8e4f 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -287,10 +287,11 @@ struct __wt_dsrc_stats {
WT_STATS btree_entries;
WT_STATS btree_fixed_len;
WT_STATS btree_maximum_depth;
- WT_STATS btree_maxintlitem;
+ WT_STATS btree_maxintlkey;
WT_STATS btree_maxintlpage;
- WT_STATS btree_maxleafitem;
+ WT_STATS btree_maxleafkey;
WT_STATS btree_maxleafpage;
+ WT_STATS btree_maxleafvalue;
WT_STATS btree_overflow;
WT_STATS btree_row_internal;
WT_STATS btree_row_leaf;
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 100bc771798..9aa219eccfc 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -762,6 +762,18 @@ struct __wt_session {
/*! The connection for this session. */
WT_CONNECTION *connection;
+ /*
+ * Don't expose app_private to non-C language bindings - they have
+ * their own way to attach data to an operation.
+ */
+#if !defined(SWIG)
+ /*!
+ * A location for applications to store information that will be
+ * available in callbacks taking a WT_SESSION handle.
+ */
+ void *app_private;
+#endif
+
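For illustration, a sketch of attaching application state to the new field;
APP_SESSION_STATE and the cleanup policy are assumptions, not part of the
API (requires <stdlib.h> and <errno.h>):

	/* Hypothetical per-session application state. */
	typedef struct { WT_CURSOR *cached_cursor; } APP_SESSION_STATE;

	APP_SESSION_STATE *state;
	if ((state = calloc(1, sizeof(*state))) == NULL)
		return (errno);
	session->app_private = state;	/* visible in callbacks taking this WT_SESSION */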
/*!
* Close the session handle.
*
@@ -997,12 +1009,12 @@ struct __wt_session {
* @config{immutable, configure the index to be immutable - that is an
* index is not changed by any update to a record in the table., a
* boolean flag; default \c false.}
- * @config{internal_item_max, the largest key stored within an internal
- * node\, in bytes. If non-zero\, any key larger than the specified
- * size will be stored as an overflow item (which may require additional
- * I/O to access). If zero\, a default size is chosen that permits at
- * least 8 keys per internal page., an integer greater than or equal to
- * 0; default \c 0.}
+ * @config{internal_key_max, the largest key stored in an internal
+ * node\, in bytes. If set\, keys larger than the specified size are
+ * stored as overflow items (which may require additional I/O to
+ * access). The default and the maximum allowed value are both one-tenth
+ * the size of a newly split internal page., an integer greater than or
+ * equal to 0; default \c 0.}
* @config{internal_key_truncate, configure internal key truncation\,
* discarding unnecessary trailing bytes on internal keys (ignored for
* custom collators)., a boolean flag; default \c true.}
@@ -1020,12 +1032,11 @@ struct __wt_session {
* row-store files: keys of type \c 'r' are record numbers and records
* referenced by record number are stored in column-store files., a
* format string; default \c u.}
- * @config{leaf_item_max, the largest key or value stored within a leaf
- * node\, in bytes. If non-zero\, any key or value larger than the
- * specified size will be stored as an overflow item (which may require
- * additional I/O to access). If zero\, a default size is chosen that
- * permits at least 4 key and value pairs per leaf page., an integer
- * greater than or equal to 0; default \c 0.}
+ * @config{leaf_key_max, the largest key stored in a leaf node\, in
+ * bytes. If set\, keys larger than the specified size are stored as
+ * overflow items (which may require additional I/O to access). The
+ * default value is one-tenth the size of a newly split leaf page., an
+ * integer greater than or equal to 0; default \c 0.}
* @config{leaf_page_max, the maximum page size for leaf nodes\, in
* bytes; the size must be a multiple of the allocation size\, and is
* significant for applications wanting to maximize sequential data
@@ -1033,6 +1044,13 @@ struct __wt_session {
* uncompressed data\, that is\, the limit is applied before any block
* compression is done., an integer between 512B and 512MB; default \c
* 32KB.}
+ * @config{leaf_value_max, the largest value stored in a leaf node\, in
+ * bytes. If set\, values larger than the specified size are stored as
+ * overflow items (which may require additional I/O to access). If the
+ * size is larger than the maximum leaf page size\, the page size is
+ * temporarily ignored when large values are written. The default is
+ * one-half the size of a newly split leaf page., an integer greater
+ * than or equal to 0; default \c 0.}
* @config{lsm = (, options only relevant for LSM data sources., a set
* of related configuration options defined below.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;auto_throttle, Throttle inserts into
@@ -2056,6 +2074,11 @@ struct __wt_event_handler {
* Callback to handle error messages; by default, error messages are
* written to the stderr stream.
*
+ * Errors that require the application to exit and restart will have
+ * their \c error value set to \c WT_PANIC. The application can exit
+	 * immediately when \c WT_PANIC is passed to an error handler; there
+ * is no reason to return into WiredTiger.
+ *
* Error handler returns are not ignored: if the handler returns
* non-zero, the error may cause the WiredTiger function posting the
* event to fail, and may even cause operation or library failure.
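For illustration, a sketch of an error handler acting on this guarantee; the
handler name and its logging are illustrative, only the WT_PANIC behavior
follows from the text above (requires <stdio.h>, <stdlib.h> and wiredtiger.h):

	static int
	handle_wiredtiger_error(WT_EVENT_HANDLER *handler,
	    WT_SESSION *session, int error, const char *message)
	{
		(void)handler;
		(void)session;

		fprintf(stderr, "WiredTiger error %d: %s\n", error, message);

		/* On WT_PANIC there is no reason to return into WiredTiger. */
		if (error == WT_PANIC)
			exit(EXIT_FAILURE);
		return (0);
	}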
@@ -2526,7 +2549,9 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
/*!
* WiredTiger library panic.
* This error indicates an underlying problem that requires the application exit
- * and restart.
+ * and restart. The application can exit immediately when \c WT_PANIC is
+ * returned from a WiredTiger interface; no further WiredTiger calls are
+ * required.
*/
#define WT_PANIC -31804
/*! @cond internal */
@@ -2642,7 +2667,7 @@ struct __wt_compressor {
* of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
* the destination buffer will be at least the size returned by that
* method; otherwise, the destination buffer will be at least as large
- * as \c src_len.
+ * as the length of the data to compress.
*
* If compression would not shrink the data or the \c dst buffer is not
* large enough to hold the compressed data, the callback should set
@@ -2712,10 +2737,8 @@ struct __wt_compressor {
* On entry, \c dst points to the destination buffer with a length
* of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
* the destination buffer will be at least the size returned by that
- * method; otherwise, the destination buffer will be at least the
- * maximum size for the page being written (that is, when writing a
- * row-store leaf page, the destination buffer will be at least as
- * large as the \c leaf_page_max configuration value).
+ * method; otherwise, the destination buffer will be at least as large
+ * as the length of the data to compress.
*
* After successful completion, the callback should return \c 0, and
* set \c result_slotsp to the number of byte strings encoded and
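For illustration, a minimal WT_COMPRESSOR::pre_size callback consistent with
the sizing described above; treating the source length as the worst case is
an assumption about a particular compressor, not a requirement of the
interface:

	static int
	my_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
	    uint8_t *src, size_t src_len, size_t *result_lenp)
	{
		(void)compressor;
		(void)session;
		(void)src;

		/* Ask for a destination buffer at least as large as the source. */
		*result_lenp = src_len;
		return (0);
	}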
@@ -3378,130 +3401,132 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_BTREE_FIXED_LEN 2023
/*! btree: maximum tree depth */
#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024
-/*! btree: maximum internal page item size */
-#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025
+/*! btree: maximum internal page key size */
+#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2025
/*! btree: maximum internal page size */
#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026
-/*! btree: maximum leaf page item size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027
+/*! btree: maximum leaf page key size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2027
/*! btree: maximum leaf page size */
#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028
+/*! btree: maximum leaf page value size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2029
/*! btree: overflow pages */
-#define WT_STAT_DSRC_BTREE_OVERFLOW 2029
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2030
/*! btree: row-store internal pages */
-#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2031
/*! btree: row-store leaf pages */
-#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2032
/*! cache: bytes read into cache */
-#define WT_STAT_DSRC_CACHE_BYTES_READ 2032
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2033
/*! cache: bytes written from cache */
-#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2034
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2035
/*! cache: unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2036
/*! cache: modified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2037
/*! cache: data source pages selected for eviction unable to be evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2038
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2039
/*! cache: internal pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2040
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2040
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2041
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2041
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2042
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2042
+#define WT_STAT_DSRC_CACHE_READ 2043
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2043
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2044
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2044
+#define WT_STAT_DSRC_CACHE_WRITE 2045
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2045
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2046
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2046
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2047
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2047
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2048
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2048
+#define WT_STAT_DSRC_COMPRESS_READ 2049
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2049
+#define WT_STAT_DSRC_COMPRESS_WRITE 2050
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2050
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2051
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2051
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2052
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2052
+#define WT_STAT_DSRC_CURSOR_CREATE 2053
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2053
+#define WT_STAT_DSRC_CURSOR_INSERT 2054
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2054
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2055
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2055
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2056
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2056
+#define WT_STAT_DSRC_CURSOR_NEXT 2057
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2057
+#define WT_STAT_DSRC_CURSOR_PREV 2058
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2058
+#define WT_STAT_DSRC_CURSOR_REMOVE 2059
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2059
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2060
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2060
+#define WT_STAT_DSRC_CURSOR_RESET 2061
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2061
+#define WT_STAT_DSRC_CURSOR_SEARCH 2062
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2062
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2063
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2063
+#define WT_STAT_DSRC_CURSOR_UPDATE 2064
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2064
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2065
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2065
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2066
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2066
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2067
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2067
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2068
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2068
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2069
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2069
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2070
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2070
+#define WT_STAT_DSRC_REC_DICTIONARY 2071
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2071
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2072
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2072
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2073
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2073
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2074
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2075
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2076
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2077
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2077
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2078
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2078
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2079
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2079
+#define WT_STAT_DSRC_REC_PAGES 2080
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2081
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2081
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2082
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2082
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2083
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2083
+#define WT_STAT_DSRC_SESSION_COMPACT 2084
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2085
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2086
/*! @} */
/*
* Statistics section: END
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 103a506287d..dd60ad926d8 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -322,14 +322,15 @@ __clsm_deleted_encode(WT_SESSION_IMPL *session,
* Decode values that start with the tombstone.
*/
static inline void
-__clsm_deleted_decode(WT_ITEM *value)
+__clsm_deleted_decode(WT_CURSOR_LSM *clsm, WT_ITEM *value)
{
/*
* Take care with this check: when an LSM cursor is used for a merge,
* and/or to create a Bloom filter, it is valid to return the tombstone
* value.
*/
- if (value->size > __tombstone.size &&
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
+ value->size > __tombstone.size &&
memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
--value->size;
}
@@ -840,7 +841,7 @@ retry: /*
err: WT_TRET(__clsm_leave(clsm));
API_END(session, ret);
if (ret == 0)
- __clsm_deleted_decode(&cursor->value);
+ __clsm_deleted_decode(clsm, &cursor->value);
return (ret);
}
@@ -928,7 +929,7 @@ retry: /*
err: WT_TRET(__clsm_leave(clsm));
API_END(session, ret);
if (ret == 0)
- __clsm_deleted_decode(&cursor->value);
+ __clsm_deleted_decode(clsm, &cursor->value);
return (ret);
}
@@ -1087,7 +1088,7 @@ __clsm_search(WT_CURSOR *cursor)
err: WT_TRET(__clsm_leave(clsm));
API_END(session, ret);
if (ret == 0)
- __clsm_deleted_decode(&cursor->value);
+ __clsm_deleted_decode(clsm, &cursor->value);
return (ret);
}
@@ -1173,8 +1174,7 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
* smallest cursor larger than the search key, or it is NULL if the
* search key is larger than any record in the tree.
*/
- if (!exact)
- cmp = 1;
+ cmp = exact ? 0 : 1;
/*
* If we land on a deleted item, try going forwards or backwards to
@@ -1189,7 +1189,9 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
clsm->current = closest;
closest = NULL;
deleted = __clsm_deleted(clsm, &cursor->value);
- if (deleted && (ret = cursor->next(cursor)) == 0) {
+ if (!deleted)
+ __clsm_deleted_decode(clsm, &cursor->value);
+ else if ((ret = cursor->next(cursor)) == 0) {
cmp = 1;
deleted = 0;
}
@@ -1197,8 +1199,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
}
if (deleted) {
clsm->current = NULL;
- if ((ret = cursor->prev(cursor)) == 0)
- cmp = -1;
+ WT_ERR(cursor->prev(cursor));
+ cmp = -1;
}
*exactp = cmp;
@@ -1210,7 +1212,6 @@ err: WT_TRET(__clsm_leave(clsm));
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
if (ret == 0) {
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- __clsm_deleted_decode(&cursor->value);
} else
clsm->current = NULL;
@@ -1460,7 +1461,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
WT_RET(ret);
- WT_ERR(__wt_calloc_def(session, 1, &clsm));
+ WT_ERR(__wt_calloc_one(session, &clsm));
cursor = &clsm->iface;
*cursor = iface;
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index 1356d336f6e..248ac70c61e 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -645,7 +645,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts));
- WT_RET(__wt_calloc_def(session, 1, &entry));
+ WT_RET(__wt_calloc_one(session, &entry));
entry->type = type;
entry->flags = flags;
entry->lsm_tree = lsm_tree;
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 9ed605724ce..8989e979a44 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -311,7 +311,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
}
- WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ WT_ERR(__wt_calloc_one(session, &chunk));
created_chunk = 1;
chunk->id = dest_id;
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index bf03588c066..7fd77b64720 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -91,8 +91,8 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ERR(__wt_realloc_def(session,
&lsm_tree->chunk_alloc,
nchunks + 1, &lsm_tree->chunk));
- WT_ERR(__wt_calloc_def(
- session, 1, &chunk));
+ WT_ERR(
+ __wt_calloc_one(session, &chunk));
lsm_tree->chunk[nchunks++] = chunk;
chunk->id = (uint32_t)lv.val;
WT_ERR(__wt_lsm_tree_chunk_name(session,
@@ -136,7 +136,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ERR(__wt_realloc_def(session,
&lsm_tree->old_alloc, nchunks + 1,
&lsm_tree->old_chunks));
- WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ WT_ERR(__wt_calloc_one(session, &chunk));
lsm_tree->old_chunks[nchunks++] = chunk;
WT_ERR(__wt_strndup(session,
lk.str, lk.len, &chunk->uri));
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 888f12bdd94..e7b1d7f9d2c 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -332,7 +332,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
WT_RET_MSG(session, EINVAL,
"LSM trees cannot be configured as column stores");
- WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ WT_RET(__wt_calloc_one(session, &lsm_tree));
WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
@@ -551,7 +551,7 @@ __lsm_tree_open(
return (ret);
/* Try to open the tree. */
- WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ WT_RET(__wt_calloc_one(session, &lsm_tree));
WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));
WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
@@ -820,7 +820,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
"merge throttle %ld", lsm_tree->name,
new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));
- WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ WT_ERR(__wt_calloc_one(session, &chunk));
chunk->id = new_id;
chunk->switch_txn = WT_TXN_NONE;
lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
@@ -1011,7 +1011,7 @@ __wt_lsm_tree_truncate(
locked = 1;
/* Create the new chunk. */
- WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ WT_ERR(__wt_calloc_one(session, &chunk));
chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c
index 91410c54c04..cb9fe314beb 100644
--- a/src/os_posix/os_dlopen.c
+++ b/src/os_posix/os_dlopen.c
@@ -17,7 +17,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
WT_DECL_RET;
WT_DLH *dlh;
- WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_RET(__wt_calloc_one(session, &dlh));
WT_ERR(__wt_strdup(session, path, &dlh->name));
if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL)
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index 3a76cceb3f0..479a61db795 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -22,7 +22,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
* !!!
* This function MUST handle a NULL session handle.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+ WT_RET(__wt_calloc_one(session, &cond));
WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c
index 1a692f71dce..c6cfa9412a7 100644
--- a/src/os_posix/os_mtx_rw.c
+++ b/src/os_posix/os_mtx_rw.c
@@ -53,7 +53,7 @@ __wt_rwlock_alloc(
WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
- WT_RET(__wt_calloc_def(session, 1, &rwlock));
+ WT_RET(__wt_calloc_one(session, &rwlock));
rwlock->name = name;
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index 736ed2be377..a0da1952101 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -145,7 +145,7 @@ setupfh:
WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
#endif
- WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_calloc_one(session, &fh));
WT_ERR(__wt_strdup(session, name, &fh->name));
fh->fd = fd;
fh->ref = 1;
diff --git a/src/os_win/os_dir.c b/src/os_win/os_dir.c
index 076c64670d4..ab332e01186 100644
--- a/src/os_win/os_dir.c
+++ b/src/os_win/os_dir.c
@@ -38,7 +38,7 @@ __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
path[pathlen - 1] = '\0';
}
- WT_ERR(__wt_scr_alloc(session, 0, &pathbuf));
+ WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path));
dirallocsz = 0;
@@ -96,7 +96,7 @@ err:
if (findhandle != INVALID_HANDLE_VALUE)
(void)FindClose(findhandle);
__wt_free(session, path);
- __wt_buf_free(session, pathbuf);
+ __wt_scr_free(&pathbuf);
if (ret == 0)
return (0);
diff --git a/src/os_win/os_dlopen.c b/src/os_win/os_dlopen.c
index ebc90edd2b2..3fdd0c74b1f 100644
--- a/src/os_win/os_dlopen.c
+++ b/src/os_win/os_dlopen.c
@@ -17,7 +17,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
WT_DECL_RET;
WT_DLH *dlh;
- WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_RET(__wt_calloc_one(session, &dlh));
WT_ERR(__wt_strdup(session, path, &dlh->name));
/* NULL means load from the current binary */
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 9c9907bd8be..a33ab4e5c37 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -21,7 +21,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
* !!!
* This function MUST handle a NULL session handle.
*/
- WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+ WT_RET(__wt_calloc_one(session, &cond));
InitializeCriticalSection(&cond->mtx);
diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c
index 6bdbaa3f065..f9d47c5be5d 100644
--- a/src/os_win/os_open.c
+++ b/src/os_win/os_open.c
@@ -130,7 +130,7 @@ __wt_open(WT_SESSION_IMPL *session,
"open failed for secondary handle: %s", path);
setupfh:
- WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_calloc_one(session, &fh));
WT_ERR(__wt_strdup(session, name, &fh->name));
fh->filehandle = filehandle;
fh->filehandle_secondary = filehandle_secondary;
diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c
index efbbd5d9adb..a35a3555458 100644
--- a/src/packing/pack_stream.c
+++ b/src/packing/pack_stream.c
@@ -30,7 +30,7 @@ wiredtiger_pack_start(WT_SESSION *wt_session,
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
- WT_RET(__wt_calloc_def(session, 1, &ps));
+ WT_RET(__wt_calloc_one(session, &ps));
WT_ERR(__pack_init(session, &ps->pack, format));
ps->p = ps->start = buffer;
ps->end = ps->p + len;
diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c
index 92282393a23..fdf8ee6d68b 100644
--- a/src/reconcile/rec_track.c
+++ b/src/reconcile/rec_track.c
@@ -21,7 +21,7 @@
static int
__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- return (__wt_calloc_def(session, 1, &page->modify->ovfl_track));
+ return (__wt_calloc_one(session, &page->modify->ovfl_track));
}
/*
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index c72447ae841..839ab028afd 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -96,16 +96,15 @@ typedef struct {
* image size.
*
* First, the sizes of the page we're building. If WiredTiger is doing
- * page layout, page_size is the same as page_size_max. We accumulate
- * the maximum page size of raw data and when we reach that size, we
- * split the page into multiple chunks, eventually compressing those
- * chunks. When the application is doing page layout (raw compression
- * is configured), page_size can continue to grow past page_size_max,
- * and we keep accumulating raw data until the raw compression callback
- * accepts it.
+ * page layout, page_size is the same as page_size_orig. We accumulate
+ * a "page size" of raw data and when we reach that size, we split the
+ * page into multiple chunks, eventually compressing those chunks. When
+ * the application is doing page layout (raw compression is configured),
+ * page_size can continue to grow past page_size_orig, and we keep
+ * accumulating raw data until the raw compression callback accepts it.
*/
- uint32_t page_size; /* Current page size */
- uint32_t page_size_max; /* Maximum on-disk page size */
+ uint32_t page_size; /* Set page size */
+ uint32_t page_size_orig; /* Saved set page size */
/*
* Second, the split size: if we're doing the page layout, split to a
@@ -202,9 +201,8 @@ typedef struct {
* because we've already been forced to split.
*/
enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
- SPLIT_MAX=1, /* Next: the maximum page boundary */
- SPLIT_TRACKING_OFF=2, /* No boundary checks */
- SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ SPLIT_TRACKING_OFF=1, /* No boundary checks */
+ SPLIT_TRACKING_RAW=2 } /* Underlying compression decides */
bnd_state;
/*
@@ -591,7 +589,7 @@ __rec_write_init(WT_SESSION_IMPL *session,
page = ref->page;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
- WT_RET(__wt_calloc_def(session, 1, &r));
+ WT_RET(__wt_calloc_one(session, &r));
*(WT_RECONCILE **)reconcilep = r;
session->reconcile_cleanup = __rec_destroy_session;
@@ -1284,7 +1282,7 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
*/
WT_ASSERT(session, r->space_avail >= size);
WT_ASSERT(session,
- WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+ WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->dsk.memsize));
r->entries += v;
r->space_avail -= size;
@@ -1543,6 +1541,37 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
+ * __wt_split_page_size --
+ * Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+ uintmax_t a;
+ uint32_t split_size;
+
+ /*
+ * Ideally, the split page size is some percentage of the maximum page
+ * size rounded to an allocation unit (round to an allocation unit so
+ * we don't waste space when we write).
+ */
+ a = maxpagesize; /* Don't overflow. */
+ split_size = (uint32_t)
+ WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
+
+ /*
+ * If the result of that calculation is the same as the allocation unit
+	 * (that happens if the maximum page size is the same size as an
+	 * allocation unit), use a percentage of the maximum page size.
+ */
+ if (split_size == btree->allocsize)
+ split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
+
+ return (split_size);
+}
+
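+As a standalone sketch of the arithmetic above, using assumed values (32KB
+maximum page, 75% split percentage, 4KB allocation size; ALIGN_UP mirrors
+the rounding WT_ALIGN performs):
+
+	#include <stdint.h>
+	#include <stdio.h>
+
+	/* Round n up to a multiple of v. */
+	#define ALIGN_UP(n, v)	((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
+
+	int
+	main(void)
+	{
+		uintmax_t maxpagesize = 32 * 1024, split_pct = 75, allocsize = 4 * 1024;
+		uintmax_t split_size = ALIGN_UP((maxpagesize * split_pct) / 100, allocsize);
+
+		/* Prints 24576: split chunks are written at 24KB, with room to grow to 32KB. */
+		printf("%ju\n", split_size);
+		return (0);
+	}
+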
+/*
* __rec_split_init --
* Initialization for the reconciliation split functions.
*/
@@ -1576,7 +1605,7 @@ __rec_split_init(WT_SESSION_IMPL *session,
* we don't want to increment our way up to the amount of data needed by
* the application to successfully compress to the target page size.
*/
- r->page_size = r->page_size_max = max;
+ r->page_size = r->page_size_orig = max;
if (r->raw_compression)
r->page_size *= 10;
@@ -1632,11 +1661,11 @@ __rec_split_init(WT_SESSION_IMPL *session,
r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
}
else if (page->type == WT_PAGE_COL_FIX) {
- r->split_size = r->page_size_max;
+ r->split_size = r->page_size;
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
} else {
- r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->split_size = __wt_split_page_size(btree, r->page_size);
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
}
@@ -1853,17 +1882,45 @@ err: __wt_scr_free(&update);
}
/*
+ * __rec_split_grow --
+ * Grow the split buffer.
+ */
+static int
+__rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ size_t corrected_page_size, len;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = len + add_len;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ WT_ASSERT(session, corrected_page_size >= len);
+ r->space_avail = corrected_page_size - len;
+ WT_ASSERT(session, r->space_avail >= add_len);
+ return (0);
+}
+
+/*
* __rec_split --
* Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
* has 3 doubled letters in a row? Sweet-tooth does, too.)
*/
static int
-__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
{
- WT_BTREE *btree;
WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
WT_PAGE_HEADER *dsk;
- uint32_t len;
+ size_t len;
+
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
/*
* We should never split during salvage, and we're about to drop core
@@ -1874,45 +1931,20 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
"%s page too large, attempted split during salvage",
__wt_page_type_string(r->page->type));
- /*
- * Handle page-buffer size tracking; we have to do this work in every
- * reconciliation loop, and I don't want to repeat the code that many
- * times.
- */
- btree = S2BT(session);
- dsk = r->dsk.mem;
-
/* Hitting a page boundary resets the dictionary, in all cases. */
__rec_dictionary_reset(r);
- /*
- * There are 3 cases we have to handle.
- *
- * #1
- * About to cross a split boundary: save current boundary information
- * and return.
- *
- * #2
- * About to cross the maximum boundary: use saved boundary information
- * to write all of the split pages.
- *
- * #3
- * About to cross a split boundary, but we've either already done the
- * split thing when we approached the maximum boundary, in which
- * case we write the page and keep going, or we were never tracking
- * split boundaries at all.
- *
- * Cases #1 and #2 are the hard ones: we're called when we're about to
- * cross each split boundary, and we save information away so we can
- * split if we have to. We're also called when we're about to cross
- * the maximum page boundary: in that case, we do the actual split and
- * clean up all the previous boundaries, then keep going.
- */
switch (r->bnd_state) {
- case SPLIT_BOUNDARY: /* Case #1 */
+ case SPLIT_BOUNDARY:
+ /* We can get here if the first key/value pair won't fit. */
+ if (r->entries == 0)
+ break;
+
/*
- * Save the information about where we are when the split would
- * have happened.
+ * About to cross a split boundary but not yet forced to split
+ * into multiple pages. If we have to split, this is one of the
+ * split points, save information about where we are when the
+ * split would have happened.
*/
WT_RET(__rec_split_bnd_grow(session, r));
last = &r->bnd[r->bnd_next++];
@@ -1939,37 +1971,50 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/*
* Set the space available to another split-size chunk, if we
* have one. If we don't have room for another split chunk,
- * add whatever space remains in the maximum page size, and
- * hope it's enough.
+ * add whatever space remains in this page.
*/
len = WT_PTRDIFF32(r->first_free, dsk);
if (len + r->split_size <= r->page_size)
r->space_avail =
r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
else {
- r->bnd_state = SPLIT_MAX;
+ WT_ASSERT(session, r->page_size >=
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len));
r->space_avail = r->page_size -
(WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
}
- break;
- case SPLIT_MAX: /* Case #2 */
+
+ /* If the next object fits into this page, we're good to go. */
+ if (r->space_avail >= next_len)
+ return (0);
+
/*
- * It didn't all fit into a single page.
+ * We're going to have to split and create multiple pages.
*
* Cycle through the saved split-point information, writing the
- * split chunks we have tracked.
+ * split chunks we have tracked. The underlying fixup function
+		 * sets the space available and other information, and copies
+ * any unwritten chunk of data to the beginning of the buffer.
*/
WT_RET(__rec_split_fixup(session, r));
-
- /* We're done saving split chunks. */
- r->bnd_state = SPLIT_TRACKING_OFF;
break;
- case SPLIT_TRACKING_OFF: /* Case #3 */
+ case SPLIT_TRACKING_OFF:
+ /*
+ * We can get here if the first key/value pair won't fit.
+ * Additionally, grow the buffer to contain the current data if
+ * we haven't already consumed a reasonable portion of the page.
+ */
+ if (r->entries == 0)
+ break;
+ if (WT_PTRDIFF(r->first_free, r->dsk.mem) < r->page_size / 2)
+ break;
+
/*
- * It didn't all fit, but either we've already noticed it and
- * are now processing the rest of the page at the split-size
- * boundaries, or the split size was the same as the page size,
- * so we never bothered with saving split-point information.
+ * The key/value pairs didn't fit into a single page, but either
+ * we've already noticed that and are now processing the rest of
+ * the pairs at split size boundaries, or the split size was the
+ * same as the page size, and we never bothered with split point
+ * information at all.
*/
WT_RET(__rec_split_bnd_grow(session, r));
last = &r->bnd[r->bnd_next++];
@@ -2007,6 +2052,24 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
case SPLIT_TRACKING_RAW:
WT_ILLEGAL_VALUE(session);
}
+
+ /*
+ * Overflow values can be larger than the maximum page size but still be
+ * "on-page". If the next key/value pair is larger than space available
+ * after a split has happened (in other words, larger than the maximum
+ * page size), create a page sized to hold that one key/value pair. This
+ * generally splits the page into key/value pairs before a large object,
+ * the object, and key/value pairs after the object. It's possible other
+ * key/value pairs will also be aggregated onto the bigger page before
+ * or after, if the page happens to hold them, but it won't necessarily
+ * happen that way.
+ */
+ if (r->space_avail < next_len)
+ WT_RET(__rec_split_grow(session, r, next_len));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+
return (0);
}
@@ -2015,8 +2078,8 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* Handle the raw compression page reconciliation bookkeeping.
*/
static int
-__rec_split_raw_worker(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+__rec_split_raw_worker(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, size_t next_len, int no_more_rows)
{
WT_BM *bm;
WT_BOUNDARY *last, *next;
@@ -2048,6 +2111,12 @@ __rec_split_raw_worker(
next = last + 1;
/*
+ * We can get here if the first key/value pair won't fit.
+ */
+ if (r->entries == 0)
+ goto split_grow;
+
+ /*
* Build arrays of offsets and cumulative counts of cells and rows in
* the page: the offset is the byte offset to the possible split-point
* (adjusted for an initial chunk that cannot be compressed), entries
@@ -2150,27 +2219,29 @@ __rec_split_raw_worker(
WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
/*
- * Allocate a destination buffer. If there's a pre-size function, use
- * it to determine the destination buffer's minimum size, otherwise the
- * destination buffer is documented to be at least the maximum object
- * size.
+ * Allocate a destination buffer. If there's a pre-size function, call
+ * it to determine the destination buffer's size, else the destination
+ * buffer is documented to be at least the source size. (We can't use
+	 * the target page size; any single key/value could be larger than the
+	 * page size. Don't bother figuring out a minimum; just use the source
+ * size.)
*
- * The destination buffer really only needs to be large enough for the
- * target block size, corrected for the requirements of the underlying
- * block manager. If the target block size is 8KB, that's a multiple
- * of 512B and so the underlying block manager is fine with it. But...
- * we don't control what the pre_size method returns us as a required
- * size, and we don't want to document the compress_raw method has to
- * skip bytes in the buffer because that's confusing, so do something
- * more complicated. First, find out how much space the compress_raw
- * function might need, either the value returned from pre_size, or the
- * maximum object size. Add the compress-skip bytes, and then correct
- * that value for the underlying block manager. As a result, we have
- * a destination buffer that's the right "object" size when calling the
- * compress_raw method, and there are bytes in the header just for us.
+ * The destination buffer needs to be large enough for the final block
+ * size, corrected for the requirements of the underlying block manager.
+ * If the final block size is 8KB, that's a multiple of 512B and so the
+ * underlying block manager is fine with it. But... we don't control
+ * what the pre_size method returns us as a required size, and we don't
+ * want to document the compress_raw method has to skip bytes in the
+ * buffer because that's confusing, so do something more complicated.
+ * First, find out how much space the compress_raw function might need,
+ * either the value returned from pre_size, or the initial source size.
+ * Add the compress-skip bytes, and then correct that value for the
+ * underlying block manager. As a result, we have a destination buffer
+ * that's large enough when calling the compress_raw method, and there
+ * are bytes in the header just for us.
*/
if (compressor->pre_size == NULL)
- result_len = r->page_size_max;
+ result_len = (size_t)r->raw_offsets[slots];
else
WT_RET(compressor->pre_size(compressor, wt_session,
(uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
@@ -2185,7 +2256,7 @@ __rec_split_raw_worker(
*/
memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
ret = compressor->compress_raw(compressor, wt_session,
- r->page_size_max, btree->split_pct,
+ r->page_size_orig, btree->split_pct,
WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
r->raw_offsets, slots,
(uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
@@ -2296,15 +2367,16 @@ no_slots:
* Note use of memmove, the source and destination buffers can
* overlap.
*/
- len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
- r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ len = WT_PTRDIFF(
+ r->first_free, (uint8_t *)dsk + dsk_dst->mem_size);
dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
(void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
r->entries -= r->raw_entries[result_slots - 1];
r->first_free = dsk_start + len;
- r->space_avail =
- r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ r->space_avail += r->raw_offsets[result_slots];
+ WT_ASSERT(session, r->first_free + r->space_avail <=
+ (uint8_t *)r->dsk.mem + r->dsk.memsize);
/*
* Set the key for the next block (before writing the block, a
@@ -2358,15 +2430,14 @@ no_slots:
*/
WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
- len = WT_PTRDIFF(r->first_free, r->dsk.mem);
- corrected_page_size = r->page_size * 2;
- WT_RET(bm->write_size(bm, session, &corrected_page_size));
- WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+split_grow: /*
+ * Double the page size and make sure we accommodate at least
+ * one more record. The reason for the latter is that we may
+ * be here because there's a large key/value pair that won't
+ * fit in our initial page buffer, even at its expanded size.
+ */
r->page_size *= 2;
- r->first_free = (uint8_t *)r->dsk.mem + len;
- r->space_avail =
- r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
- return (0);
+ return (__rec_split_grow(session, r, r->page_size + next_len));
}
/* We have a block, update the boundary counter. */
@@ -2438,9 +2509,9 @@ err: __wt_scr_free(&tmp);
* Raw compression split routine.
*/
static inline int
-__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
{
- return (__rec_split_raw_worker(session, r, 0));
+ return (__rec_split_raw_worker(session, r, next_len, 0));
}
/*
@@ -2456,7 +2527,6 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/* Adjust the boundary information based on our split status. */
switch (r->bnd_state) {
case SPLIT_BOUNDARY:
- case SPLIT_MAX:
/*
* We never split, the reconciled page fit into a maximum page
* size. Change the first boundary slot to represent the full
@@ -2516,7 +2586,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/* We're done reconciling - write the final page */
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0)
- WT_RET(__rec_split_raw_worker(session, r, 1));
+ WT_RET(__rec_split_raw_worker(session, r, 0, 1));
} else
WT_RET(__rec_split_finish_std(session, r));
@@ -2553,7 +2623,7 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* WT_PAGE_HEADER header onto the scratch buffer, most of the header
* information remains unchanged between the pages.
*/
- WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
+ WT_RET(__wt_scr_alloc(session, r->page_size, &tmp));
dsk = tmp->mem;
memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
@@ -2595,8 +2665,10 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
r->entries -= r->total_entries;
r->first_free = dsk_start + len;
+ WT_ASSERT(session,
+ r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len));
r->space_avail =
- (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len;
+ r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
err: __wt_scr_free(&tmp);
return (ret);
@@ -2905,17 +2977,17 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
cursor->value.data, cursor->value.size, (uint64_t)0));
/* Boundary: split or write the page. */
- while (key->len + val->len > r->space_avail)
+ if (key->len + val->len > r->space_avail) {
if (r->raw_compression)
- WT_RET(__rec_split_raw(session, r));
+ WT_RET(
+ __rec_split_raw(session, r, key->len + val->len));
else {
- WT_RET(__rec_split(session, r));
+ WT_RET(__rec_split(session, r, key->len + val->len));
/*
* Turn off prefix compression until a full key written
- * to the new page, and (unless we're already working
- * with an overflow key), rebuild the key without prefix
- * compression.
+ * to the new page, and (unless already working with an
+ * overflow key), rebuild the key without compression.
*/
if (r->key_pfx_compress_conf) {
r->key_pfx_compress = 0;
@@ -2924,6 +2996,7 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
session, r, NULL, 0, &ovfl_key));
}
}
+ }
/* Copy the key/value pair onto the page. */
__rec_copy_incr(session, r, key);
@@ -2968,7 +3041,7 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
__rec_incr(session, r, cbulk->entry,
__bitstr_size(
(size_t)cbulk->entry * btree->bitcnt));
- WT_RET(__rec_split(session, r));
+ WT_RET(__rec_split(session, r, 0));
}
cbulk->entry = 0;
cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
@@ -3048,11 +3121,10 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
/* Boundary: split or write the page. */
- while (val->len > r->space_avail)
- if (r->raw_compression)
- WT_RET(__rec_split_raw(session, r));
- else
- WT_RET(__rec_split(session, r));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
if (btree->dictionary)
@@ -3171,11 +3243,10 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
CHILD_RELEASE_ERR(session, hazard, ref);
/* Boundary: split or write the page. */
- while (val->len > r->space_avail)
- if (r->raw_compression)
- WT_ERR(__rec_split_raw(session, r));
- else
- WT_ERR(__rec_split(session, r));
+ if (val->len > r->space_avail)
+ WT_ERR(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -3217,11 +3288,10 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr->addr, addr->size, __rec_vtype(addr), r->recno);
/* Boundary: split or write the page. */
- while (val->len > r->space_avail)
- if (r->raw_compression)
- WT_RET(__rec_split_raw(session, r));
- else
- WT_RET(__rec_split(session, r));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
__rec_copy_incr(session, r, val);
@@ -3298,7 +3368,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
__rec_incr(session, r, entry,
__bitstr_size((size_t)entry * btree->bitcnt));
- WT_RET(__rec_split(session, r));
+ WT_RET(__rec_split(session, r, 0));
/* Calculate the number of entries per page. */
entry = 0;
@@ -3442,11 +3512,10 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
session, r, value->data, value->size, rle));
/* Boundary: split or write the page. */
- while (val->len > r->space_avail)
- if (r->raw_compression)
- WT_RET(__rec_split_raw(session, r));
- else
- WT_RET(__rec_split(session, r));
+ if (val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, val->len) :
+ __rec_split(session, r, val->len));
/* Copy the value onto the page. */
if (!deleted && !overflow_type && btree->dictionary)
@@ -4034,24 +4103,25 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->cell_zero = 0;
/* Boundary: split or write the page. */
- while (key->len + val->len > r->space_avail) {
- if (r->raw_compression) {
- WT_ERR(__rec_split_raw(session, r));
- continue;
- }
-
- /*
- * In one path above, we copied address blocks from the
- * page rather than building the actual key. In that
- * case, we have to build the actual key now because we
- * are about to promote it.
- */
- if (key_onpage_ovfl) {
- WT_ERR(__wt_buf_set(session,
- r->cur, WT_IKEY_DATA(ikey), ikey->size));
- key_onpage_ovfl = 0;
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(
+ session, r, key->len + val->len));
+ else {
+ /*
+ * In one path above, we copied address blocks
+ * from the page rather than building the actual
+ * key. In that case, we have to build the key
+ * now because we are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session, r->cur,
+ WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(
+ session, r, key->len + val->len));
}
- WT_ERR(__rec_split(session, r));
}
/* Copy the key and value onto the page. */
@@ -4102,11 +4172,10 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r, addr->addr, addr->size, __rec_vtype(addr), 0);
/* Boundary: split or write the page. */
- while (key->len + val->len > r->space_avail)
- if (r->raw_compression)
- WT_RET(__rec_split_raw(session, r));
- else
- WT_RET(__rec_split(session, r));
+ if (key->len + val->len > r->space_avail)
+ WT_RET(r->raw_compression ?
+ __rec_split_raw(session, r, key->len + val->len) :
+ __rec_split(session, r, key->len + val->len));
/* Copy the key and value onto the page. */
__rec_copy_incr(session, r, key);
@@ -4140,7 +4209,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
size_t size;
uint64_t slvg_skip;
uint32_t i;
- int dictionary, onpage_ovfl, ovfl_key;
+ int dictionary, key_onpage_ovfl, ovfl_key;
const void *p;
void *copy;
@@ -4369,9 +4438,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* If the key is an overflow key that hasn't been removed, use
* the original backing blocks.
*/
- onpage_ovfl = kpack != NULL &&
+ key_onpage_ovfl = kpack != NULL &&
kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
- if (onpage_ovfl) {
+ if (key_onpage_ovfl) {
key->buf.data = cell;
key->buf.size = __wt_cell_total_len(kpack);
key->cell_len = 0;
@@ -4435,36 +4504,39 @@ build:
}
/* Boundary: split or write the page. */
- while (key->len + val->len > r->space_avail) {
- if (r->raw_compression) {
- WT_ERR(__rec_split_raw(session, r));
- continue;
- }
-
- /*
- * In one path above, we copied address blocks from the
- * page rather than building the actual key. In that
- * case, we have to build the actual key now because we
- * are about to promote it.
- */
- if (onpage_ovfl) {
- WT_ERR(__wt_dsk_cell_data_ref(
- session, WT_PAGE_ROW_LEAF, kpack, r->cur));
- onpage_ovfl = 0;
- }
- WT_ERR(__rec_split(session, r));
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(
+ session, r, key->len + val->len));
+ else {
+ /*
+ * In one path above, we copied address blocks
+ * from the page rather than building the actual
+ * key. In that case, we have to build the key
+ * now because we are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_ROW_LEAF, kpack, r->cur));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(
+ session, r, key->len + val->len));
- /*
- * Turn off prefix compression until a full key written
- * to the new page, and (unless we're already working
- * with an overflow key), rebuild the key without prefix
- * compression.
- */
- if (r->key_pfx_compress_conf) {
- r->key_pfx_compress = 0;
- if (!ovfl_key)
- WT_ERR(__rec_cell_build_leaf_key(
- session, r, NULL, 0, &ovfl_key));
+ /*
+ * Turn off prefix compression until a full key
+ * written to the new page, and (unless already
+ * working with an overflow key), rebuild the
+ * key without compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(
+ __rec_cell_build_leaf_key(
+ session,
+ r, NULL, 0, &ovfl_key));
+ }
}
}
@@ -4529,24 +4601,28 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
/* Boundary: split or write the page. */
- while (key->len + val->len > r->space_avail) {
- if (r->raw_compression) {
- WT_RET(__rec_split_raw(session, r));
- continue;
- }
- WT_RET(__rec_split(session, r));
+ if (key->len + val->len > r->space_avail) {
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(
+ session, r, key->len + val->len));
+ else {
+ WT_RET(__rec_split(
+ session, r, key->len + val->len));
- /*
- * Turn off prefix compression until a full key written
- * to the new page, and (unless we're already working
- * with an overflow key), rebuild the key without prefix
- * compression.
- */
- if (r->key_pfx_compress_conf) {
- r->key_pfx_compress = 0;
- if (!ovfl_key)
- WT_RET(__rec_cell_build_leaf_key(
- session, r, NULL, 0, &ovfl_key));
+ /*
+ * Turn off prefix compression until a full key
+ * written to the new page, and (unless already
+ * working with an overflow key), rebuild the
+ * key without compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(
+ __rec_cell_build_leaf_key(
+ session,
+ r, NULL, 0, &ovfl_key));
+ }
}
}
@@ -5064,7 +5140,7 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session,
WT_RET(__wt_buf_set(session, &key->buf, data, size));
/* Create an overflow object if the data won't fit. */
- if (size > btree->maxintlitem) {
+ if (size > btree->maxintlkey) {
WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal);
*is_ovflp = 1;
@@ -5159,7 +5235,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
key->buf.data, (uint32_t)key->buf.size, &key->buf));
/* Create an overflow object if the data won't fit. */
- if (key->buf.size > btree->maxleafitem) {
+ if (key->buf.size > btree->maxleafkey) {
/*
* Overflow objects aren't prefix compressed -- rebuild any
* object that was prefix compressed.
@@ -5246,7 +5322,7 @@ __rec_cell_build_val(WT_SESSION_IMPL *session,
val->buf.data, (uint32_t)val->buf.size, &val->buf));
/* Create an overflow object if the data won't fit. */
- if (val->buf.size > btree->maxleafitem) {
+ if (val->buf.size > btree->maxleafvalue) {
WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);
return (__rec_cell_build_ovfl(
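
The renamed thresholds above (maxintlkey, maxleafkey, maxleafvalue) correspond to the internal_key_max, leaf_key_max and leaf_value_max creation-time settings this change uses in test/format/wts.c: keys and values now get separate overflow limits instead of the old shared per-item size. A minimal sketch of setting them through the Python API follows; the sizes, directory, and table name are illustrative only.

    # Sketch: keys/values larger than these limits become overflow items.
    import os
    from wiredtiger import wiredtiger_open

    home = 'WT_EXAMPLE'                      # scratch directory for the example
    os.path.isdir(home) or os.mkdir(home)
    conn = wiredtiger_open(home, 'create')
    session = conn.open_session()
    session.create('table:example',
        'key_format=S,value_format=S,'
        'internal_page_max=16KB,leaf_page_max=64KB,'
        'internal_key_max=400,leaf_key_max=1KB,leaf_value_max=32KB')
    conn.close()
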
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index f5937381cbb..4699fdeee02 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -83,7 +83,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
goto err;
}
- WT_ERR(__wt_calloc_def(session, 1, &colgroup));
+ WT_ERR(__wt_calloc_one(session, &colgroup));
WT_ERR(__wt_strndup(
session, buf->data, buf->size, &colgroup->name));
colgroup->config = cgconfig;
@@ -319,7 +319,7 @@ __wt_schema_open_index(WT_SESSION_IMPL *session,
if (table->indices[i] == NULL) {
WT_ERR(cursor->get_value(cursor, &idxconf));
- WT_ERR(__wt_calloc_def(session, 1, &idx));
+ WT_ERR(__wt_calloc_one(session, &idx));
WT_ERR(__wt_strdup(session, uri, &idx->name));
WT_ERR(__wt_strdup(session, idxconf, &idx->config));
WT_ERR(__open_index(session, table, idx));
@@ -392,7 +392,7 @@ __wt_schema_open_table(WT_SESSION_IMPL *session,
WT_ERR(cursor->search(cursor));
WT_ERR(cursor->get_value(cursor, &tconfig));
- WT_ERR(__wt_calloc_def(session, 1, &table));
+ WT_ERR(__wt_calloc_one(session, &table));
table->name = tablename;
tablename = NULL;
table->name_hash = __wt_hash_city64(name, namelen);
diff --git a/src/session/session_api.c b/src/session/session_api.c
index dc3c7d7041f..8f460dcc29f 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -953,6 +953,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
{
static const WT_SESSION stds = {
NULL,
+ NULL,
__session_close,
__session_reconfigure,
__session_open_cursor,
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index e28e277d5f6..85483c7c8ae 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -20,7 +20,7 @@ __session_add_dhandle(
WT_DATA_HANDLE_CACHE *dhandle_cache;
uint64_t bucket;
- WT_RET(__wt_calloc_def(session, 1, &dhandle_cache));
+ WT_RET(__wt_calloc_one(session, &dhandle_cache));
dhandle_cache->dhandle = session->dhandle;
bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
diff --git a/src/support/huffman.c b/src/support/huffman.c
index 5a06b72d33e..9625e879381 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -306,7 +306,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
combined_nodes = leaves = NULL;
node = node2 = tempnode = NULL;
- WT_RET(__wt_calloc_def(session, 1, &huffman));
+ WT_RET(__wt_calloc_one(session, &huffman));
/*
* The frequency table is 4B pairs of symbol and frequency. The symbol
@@ -381,8 +381,8 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
/* We need two node queues to build the tree. */
- WT_ERR(__wt_calloc_def(session, 1, &leaves));
- WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
+ WT_ERR(__wt_calloc_one(session, &leaves));
+ WT_ERR(__wt_calloc_one(session, &combined_nodes));
/*
* Adding the leaves to the queue.
@@ -393,7 +393,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
*/
for (i = 0; i < symcnt; ++i)
if (indexed_freqs[i].frequency > 0) {
- WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+ WT_ERR(__wt_calloc_one(session, &tempnode));
tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
tempnode->weight = indexed_freqs[i].frequency;
WT_ERR(node_queue_enqueue(session, leaves, tempnode));
@@ -431,7 +431,7 @@ __wt_huffman_open(WT_SESSION_IMPL *session,
* In every second run, we have both node and node2 initialized.
*/
if (node != NULL && node2 != NULL) {
- WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+ WT_ERR(__wt_calloc_one(session, &tempnode));
/* The new weight is the sum of the two weights. */
tempnode->weight = node->weight + node2->weight;
@@ -845,7 +845,7 @@ node_queue_enqueue(
NODE_QUEUE_ELEM *elem;
/* Allocating a new linked list element */
- WT_RET(__wt_calloc_def(session, 1, &elem));
+ WT_RET(__wt_calloc_one(session, &elem));
/* It holds the tree node, and has no next element yet */
elem->node = node;
diff --git a/src/support/scratch.c b/src/support/scratch.c
index ca2cdac8377..e4df04a36ed 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -216,7 +216,7 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
WT_ASSERT(session, slot != NULL);
best = slot;
- WT_ERR(__wt_calloc_def(session, 1, best));
+ WT_ERR(__wt_calloc_one(session, best));
/* Scratch buffers must be aligned. */
F_SET(*best, WT_ITEM_ALIGNED);
diff --git a/src/support/stat.c b/src/support/stat.c
index 21d56238f4a..19aa9170c5b 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -30,11 +30,11 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
stats->btree_column_variable.desc =
"btree: column-store variable-size leaf pages";
stats->btree_fixed_len.desc = "btree: fixed-record size";
- stats->btree_maxintlitem.desc =
- "btree: maximum internal page item size";
+ stats->btree_maxintlkey.desc = "btree: maximum internal page key size";
stats->btree_maxintlpage.desc = "btree: maximum internal page size";
- stats->btree_maxleafitem.desc = "btree: maximum leaf page item size";
+ stats->btree_maxleafkey.desc = "btree: maximum leaf page key size";
stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
+ stats->btree_maxleafvalue.desc = "btree: maximum leaf page value size";
stats->btree_maximum_depth.desc = "btree: maximum tree depth";
stats->btree_entries.desc = "btree: number of key/value pairs";
stats->btree_overflow.desc = "btree: overflow pages";
@@ -154,10 +154,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg)
stats->btree_column_deleted.v = 0;
stats->btree_column_variable.v = 0;
stats->btree_fixed_len.v = 0;
- stats->btree_maxintlitem.v = 0;
+ stats->btree_maxintlkey.v = 0;
stats->btree_maxintlpage.v = 0;
- stats->btree_maxleafitem.v = 0;
+ stats->btree_maxleafkey.v = 0;
stats->btree_maxleafpage.v = 0;
+ stats->btree_maxleafvalue.v = 0;
stats->btree_maximum_depth.v = 0;
stats->btree_entries.v = 0;
stats->btree_overflow.v = 0;
diff --git a/test/format/format.h b/test/format/format.h
index 8020d18d716..902cea6cc5d 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -89,6 +89,8 @@ extern WT_EXTENSION_API *wt_api;
#undef M
#define M(v) ((v) * 1000000) /* Million */
+#undef KILOBYTE
+#define KILOBYTE(v) ((v) * 1024)
#undef MEGABYTE
#define MEGABYTE(v) ((v) * 1048576)
#undef GIGABYTE
diff --git a/test/format/ops.c b/test/format/ops.c
index 28f1079b30d..bbaeabcc479 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -52,7 +52,7 @@ wts_ops(void)
WT_SESSION *session;
pthread_t backup_tid, compact_tid;
uint64_t thread_ops;
- uint32_t i, tenths;
+ uint32_t i, fourths;
int ret, running;
conn = g.wts_conn;
@@ -72,16 +72,16 @@ wts_ops(void)
* There are two mechanisms to specify the length of the run, a number
* of operations or a timer. If the former, each thread does an equal
* share of the total operations (and make sure that it's not 0). If
- * the latter, calculate how many tenth-of-a-second sleeps until this
+ * the latter, calculate how many fourth-of-a-second sleeps until this
* part of the run finishes.
*/
if (g.c_timer == 0) {
- tenths = 0;
+ fourths = 0;
if (g.c_ops < g.c_threads)
g.c_ops = g.c_threads;
thread_ops = g.c_ops / g.c_threads;
} else {
- tenths = (g.c_timer * 10 * 60) / FORMAT_OPERATION_REPS;
+ fourths = (g.c_timer * 4 * 60) / FORMAT_OPERATION_REPS;
thread_ops = 0;
}
@@ -141,7 +141,7 @@ wts_ops(void)
/* Tell the thread if it's done. */
if (thread_ops == 0) {
- if (tenths == 0)
+ if (fourths == 0)
tinfo[i].quit = 1;
} else
if (tinfo[i].ops >= thread_ops)
@@ -151,8 +151,8 @@ wts_ops(void)
if (!running)
break;
(void)usleep(250000); /* 1/4th of a second */
- if (tenths != 0)
- --tenths;
+ if (fourths != 0)
+ --fourths;
}
free(tinfo);
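
For the timer-driven case above, g.c_timer is a run length in minutes, each poll sleeps a quarter of a second, and the total is divided across FORMAT_OPERATION_REPS repetitions of the operations phase. A quick back-of-the-envelope sketch; the FORMAT_OPERATION_REPS value is assumed here for illustration, not taken from the source:

    # Quarter-second poll arithmetic from wts_ops(), restated.
    FORMAT_OPERATION_REPS = 3          # assumed for the example
    def polls_per_rep(timer_minutes):
        # 4 polls per second, spread across the repetitions.
        return (timer_minutes * 4 * 60) // FORMAT_OPERATION_REPS

    for minutes in (1, 10, 30):
        fourths = polls_per_rep(minutes)
        print('%2d-minute run: %5d polls per rep (~%d seconds each)'
              % (minutes, fourths, fourths // 4))
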
diff --git a/test/format/util.c b/test/format/util.c
index b043475842e..4880dfbbdd0 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -27,17 +27,24 @@
#include "format.h"
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
static inline uint32_t
kv_len(uint64_t keyno, uint32_t min, uint32_t max)
{
/*
- * We want to focus on relatively small key/value items, but admitting
- * the possibility of larger items. Pick a size close to the minimum
- * most of the time, only roll the dice for a really big item 1 in 20
- * times. (The configuration can force large key/value minimum sizes,
- * where every key/value item will be an overflow.)
+ * Focus on relatively small key/value items, admitting the possibility
+ * of larger items. Pick a size close to the minimum most of the time,
+ * only create a larger item 1 in 20 times, and a really big item 1 in
+ * 1000 times. (Configuration can force large key/value minimum sizes,
+ * where every key/value item is an overflow.)
*/
- if (keyno % 20 != 0 && max > min + 20)
+ if (keyno % 1000 == 0 && max < KILOBYTE(80)) {
+ min = KILOBYTE(80);
+ max = KILOBYTE(100);
+ } else if (keyno % 20 != 0 && max > min + 20)
max = min + 20;
return (MMRAND(min, max));
}
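
Restated, the sizing policy in kv_len() above is: stay within 20 bytes of the configured minimum most of the time, use the full configured range about 1 time in 20, and force an 80KB-100KB item about 1 time in 1000. A small self-contained sketch of the same policy (the uniform pick stands in for the test's MMRAND helper):

    import random

    def kilobyte(v):
        return v * 1024

    def kv_len(keyno, lo, hi):
        # 1 in 1000: a really big, overflow-sized item.
        if keyno % 1000 == 0 and hi < kilobyte(80):
            lo, hi = kilobyte(80), kilobyte(100)
        # Otherwise, 19 times in 20: stay close to the configured minimum.
        elif keyno % 20 != 0 and hi > lo + 20:
            hi = lo + 20
        return random.randint(lo, hi)    # stand-in for MMRAND(min, max)

    sizes = [kv_len(i, 32, 4096) for i in range(1, 5001)]
    print('min %d bytes, max %d bytes' % (min(sizes), max(sizes)))

This occasional 100KB item is also why key_gen_setup() and val_gen_setup() below size their buffers at no less than KILOBYTE(100).
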
@@ -65,13 +72,14 @@ void
key_gen_setup(uint8_t **keyp)
{
uint8_t *key;
- size_t i;
+ size_t i, len;
*keyp = NULL;
- if ((key = malloc(g.c_key_max)) == NULL)
+ len = MAX(KILOBYTE(100), g.c_key_max);
+ if ((key = malloc(len)) == NULL)
die(errno, "malloc");
- for (i = 0; i < g.c_key_max; ++i)
+ for (i = 0; i < len; ++i)
key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]);
*keyp = key;
}
@@ -118,7 +126,7 @@ val_gen_setup(uint8_t **valp)
* into the buffer by a few extra bytes, used to generate different
* data for column-store run-length encoded files.
*/
- len = g.c_value_max + 20;
+ len = MAX(KILOBYTE(100), g.c_value_max) + 20;
if ((val = malloc(len)) == NULL)
die(errno, "malloc");
for (i = 0; i < len; ++i)
diff --git a/test/format/wts.c b/test/format/wts.c
index 29b40eda74d..21e7806e982 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -173,12 +173,15 @@ wts_create(void)
{
WT_CONNECTION *conn;
WT_SESSION *session;
- uint32_t maxintlpage, maxintlitem, maxleafpage, maxleafitem;
+ uint32_t maxintlpage, maxintlkey, maxleafpage, maxleafkey, maxleafvalue;
int ret;
char config[4096], *end, *p;
conn = g.wts_conn;
+ p = config;
+ end = config + sizeof(config);
+
/*
* Ensure that we can service at least one operation per-thread
* concurrently without filling the cache with pinned pages. We
@@ -197,23 +200,30 @@ wts_create(void)
if (maxleafpage > 512)
maxleafpage >>= 1;
}
- maxintlitem = MMRAND(maxintlpage / 50, maxintlpage / 40);
- if (maxintlitem < 40)
- maxintlitem = 40;
- maxleafitem = MMRAND(maxleafpage / 50, maxleafpage / 40);
- if (maxleafitem < 40)
- maxleafitem = 40;
-
- p = config;
- end = config + sizeof(config);
p += snprintf(p, (size_t)(end - p),
"key_format=%s,"
"allocation_size=512,%s"
- "internal_page_max=%d,internal_item_max=%d,"
- "leaf_page_max=%d,leaf_item_max=%d",
+ "internal_page_max=%d,leaf_page_max=%d",
(g.type == ROW) ? "u" : "r",
g.c_firstfit ? "block_allocation=first," : "",
- maxintlpage, maxintlitem, maxleafpage, maxleafitem);
+ maxintlpage, maxleafpage);
+
+ /*
+	 * Configure the maximum key/value sizes, but leave them at the
+	 * defaults if we come up with something crazy.
+ */
+ maxintlkey = MMRAND(maxintlpage / 50, maxintlpage / 40);
+ if (maxintlkey > 20)
+ p += snprintf(p, (size_t)(end - p),
+ ",internal_key_max=%d", maxintlkey);
+ maxleafkey = MMRAND(maxleafpage / 50, maxleafpage / 40);
+ if (maxleafkey > 20)
+ p += snprintf(p, (size_t)(end - p),
+ ",leaf_key_max=%d", maxleafkey);
+ maxleafvalue = MMRAND(maxleafpage * 10, maxleafpage / 40);
+ if (maxleafvalue > 40 && maxleafvalue < 100 * 1024)
+ p += snprintf(p, (size_t)(end - p),
+ ",leaf_value_max=%d", maxleafvalue);
switch (g.type) {
case FIX:
diff --git a/test/suite/run.py b/test/suite/run.py
index 32dc8835d4b..a29f7af2212 100644
--- a/test/suite/run.py
+++ b/test/suite/run.py
@@ -82,6 +82,7 @@ Options:\n\
-g | --gdb all subprocesses (like calls to wt) use gdb\n\
-h | --help show this message\n\
-j N | --parallel N run all tests in parallel using N processes\n\
+ -l | --long run the entire test suite\n\
-p | --preserve preserve output files in WT_TEST/<testname>\n\
-t | --timestamp name WT_TEST according to timestamp\n\
-v N | --verbose N set verboseness to N (0<=N<=3, default=1)\n\
@@ -219,7 +220,7 @@ if __name__ == '__main__':
tests = unittest.TestSuite()
# Turn numbers and ranges into test module names
- preserve = timestamp = debug = gdbSub = False
+ preserve = timestamp = debug = gdbSub = longtest = False
parallel = 0
configfile = None
configwrite = False
@@ -243,6 +244,15 @@ if __name__ == '__main__':
if option == '-debug' or option == 'd':
debug = True
continue
+ if option == '-gdb' or option == 'g':
+ gdbSub = True
+ continue
+ if option == '-help' or option == 'h':
+ usage()
+ sys.exit(True)
+ if option == '-long' or option == 'l':
+ longtest = True
+ continue
if option == '-parallel' or option == 'j':
if parallel != 0 or len(args) == 0:
usage()
@@ -255,12 +265,6 @@ if __name__ == '__main__':
if option == '-timestamp' or option == 't':
timestamp = True
continue
- if option == '-gdb' or option == 'g':
- gdbSub = True
- continue
- if option == '-help' or option == 'h':
- usage()
- sys.exit(True)
if option == '-verbose' or option == 'v':
if len(args) == 0:
usage()
@@ -292,7 +296,7 @@ if __name__ == '__main__':
# All global variables should be set before any test classes are loaded.
# That way, verbose printing can be done at the class definition level.
wttest.WiredTigerTestCase.globalSetup(preserve, timestamp, gdbSub,
- verbose, dirarg)
+ verbose, dirarg, longtest)
# Without any tests listed as arguments, do discovery
if len(testargs) == 0:
diff --git a/test/suite/test_lsm02.py b/test/suite/test_lsm02.py
index 2b3d48f8f30..41d82d8ad0d 100644
--- a/test/suite/test_lsm02.py
+++ b/test/suite/test_lsm02.py
@@ -54,9 +54,12 @@ class test_lsm02(wttest.WiredTigerTestCase):
v = '\x14\x14'
self.add_key(self.uri, 'k1', v)
self.verify_key_exists(self.uri, 'k1', v)
- v += 'a' * 1000
+ v = '\x14\x14\0\0\0\0\0\0'
self.add_key(self.uri, 'k2', v)
self.verify_key_exists(self.uri, 'k2', v)
+ v += 'a' * 1000
+ self.add_key(self.uri, 'k3', v)
+ self.verify_key_exists(self.uri, 'k3', v)
def test_lsm_rename01(self):
self.session.create(self.uri, 'key_format=S,value_format=S')
diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py
index 8d561763091..d2ebb796d28 100644
--- a/test/suite/test_sweep01.py
+++ b/test/suite/test_sweep01.py
@@ -33,7 +33,7 @@
import fnmatch, os, shutil, run, time
from suite_subprocess import suite_subprocess
from wiredtiger import wiredtiger_open, stat
-from wtscenario import multiply_scenarios, number_scenarios
+from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios
import wttest
class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
@@ -55,7 +55,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
create_params = 'key_format=r,value_format=8t')),
]
- scenarios = number_scenarios(multiply_scenarios('.', types, ckpt_list))
+ scenarios = number_scenarios(prune_scenarios(multiply_scenarios('.', types, ckpt_list), 1, 100))
# Overrides WiredTigerTestCase
def setUpConnectionOpen(self, dir):
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
index 32165c380a9..d83bf6ce5f8 100644
--- a/test/suite/test_txn02.py
+++ b/test/suite/test_txn02.py
@@ -32,7 +32,7 @@
import fnmatch, os, shutil, time
from suite_subprocess import suite_subprocess
from wiredtiger import wiredtiger_open
-from wtscenario import multiply_scenarios, number_scenarios
+from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios
import wttest
class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
@@ -81,8 +81,19 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
txn3s = [('t3c', dict(txn3='commit')), ('t3r', dict(txn3='rollback'))]
txn4s = [('t4c', dict(txn4='commit')), ('t4r', dict(txn4='rollback'))]
- scenarios = number_scenarios(multiply_scenarios('.', types,
- op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s))
+ all_scenarios = multiply_scenarios('.', types,
+ op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)
+
+ # This test generates thousands of potential scenarios.
+    # For default runs, we'll use a small subset of them; for
+    # long runs (when --long is set) we'll set a much larger limit.
+ scenarios = number_scenarios(prune_scenarios(all_scenarios, 20, 5000))
+
+ # Each check_log() call takes a second, so we don't call it for
+    # every scenario; we limit it to the value of checklog_calls.
+ checklog_calls = 100 if wttest.islongtest() else 2
+ checklog_mod = (len(scenarios) / checklog_calls + 1)
+
# scenarios = number_scenarios(multiply_scenarios('.', types,
# op1s, txn1s, op2s, txn2s, op3s, txn3s, op4s, txn4s)) [:3]
# Overrides WiredTigerTestCase
@@ -253,10 +264,8 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
# Check the state after each commit/rollback.
self.check_all(current, committed)
- # Check the log state after the entire op completes and run recovery.
- # check_log() takes over a second to run, so we don't want to run it
- # for all scenarios, rather, we run it about 100 times overall.
- if self.scenario_number % (len(test_txn02.scenarios) / 100 + 1) == 0:
+ # check_log() is slow, we don't run it on every scenario.
+ if self.scenario_number % test_txn02.checklog_mod == 0:
self.check_log(committed)
if __name__ == '__main__':
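
The checklog_mod arithmetic above spreads the expensive check_log() calls roughly evenly over however many scenarios survive pruning, instead of recomputing the modulus inline as the removed code did. A small sketch of the same calculation with illustrative counts:

    # Run check_log() on roughly 'checklog_calls' of the scenarios.
    def checked_scenarios(num_scenarios, checklog_calls):
        checklog_mod = num_scenarios // checklog_calls + 1
        return [n for n in range(num_scenarios) if n % checklog_mod == 0]

    for total, budget in ((20, 2), (5000, 100)):
        hits = checked_scenarios(total, budget)
        print('%d scenarios, budget %d -> %d log checks'
              % (total, budget, len(hits)))
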
diff --git a/test/suite/wtscenario.py b/test/suite/wtscenario.py
index a8fd4031ceb..70497102bb0 100644
--- a/test/suite/wtscenario.py
+++ b/test/suite/wtscenario.py
@@ -102,17 +102,29 @@ def prune_sorter_key(scene):
p = scene[1]['P']
return p * scene[1]['_rand']
-def prune_scenarios(scenes, count = -1):
+def prune_resort_key(scene):
+ """
+    Used by prune_scenarios to extract the original ordering key for sorting.
+ """
+ return scene[1]['_order']
+
+def set_long_run(islong):
+ global _is_long_run
+ _is_long_run = islong
+
+def prune_scenarios(scenes, default_count = -1, long_count = -1):
"""
Use listed probabilities for pruning the list of scenarios.
     That is, scenarios with the highest probability (value of P in the scenario)
- are chosen more often. With a second argument, only the
- given number of scenarios are returned. With no second argument,
- only scenarios with P > .5 are returned half the time, etc.
+ are chosen more often. With just one argument, only scenarios
+ with P > .5 are returned half the time, etc. A second argument
+ limits the number of scenarios. When a third argument is present,
+ it is a separate limit for a long run.
"""
+ global _is_long_run
r = suite_random.suite_random()
result = []
- if count == -1:
+ if default_count == -1:
# Missing second arg - return those with P == .3 at
# 30% probability, for example.
for scene in scenes:
@@ -123,25 +135,41 @@ def prune_scenarios(scenes, count = -1):
result.append(scene)
return result
else:
- # With second arg, we want exactly 'count' items
- # returned. So we'll sort them all and choose
+ # With at least a second arg present, we'll want a specific count
+ # of items returned. So we'll sort them all and choose
# the top number. Not the most efficient solution,
# but it's easy.
+ if _is_long_run and long_count != -1:
+ count = long_count
+ else:
+ count = default_count
+
+ l = len(scenes)
+ if l <= count:
+ return scenes
+ if count == 0:
+ return []
+ order = 0
for scene in scenes:
scene[1]['_rand'] = r.rand_float()
- scenes = sorted(scenes, key=prune_sorter_key)
+ scene[1]['_order'] = order
+ order += 1
+ scenes = sorted(scenes, key=prune_sorter_key) # random sort driven by P
+ scenes = scenes[l-count:l] # truncate to get best
+ scenes = sorted(scenes, key=prune_resort_key) # original order
for scene in scenes:
del scene[1]['_rand']
- l = len(scenes)
- return scenes[l-count:l]
+ del scene[1]['_order']
+ return scenes
def number_scenarios(scenes):
"""
- Add a 'scenario_number' variable to each scenario.
+ Add a 'scenario_number' and 'scenario_name' variable to each scenario.
The hash table for each scenario is altered!
"""
count = 0
for scene in scenes:
+ scene[1]['scenario_name'] = scene[0]
scene[1]['scenario_number'] = count
count += 1
return scenes
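
A standalone restatement of the pruning idea may help: each scenario carries a probability P, pruning keeps the best randomly-weighted draws up to a count (the default count or, on a --long run, the larger long count), and hands them back in their original order. This sketch illustrates the shape only and is not the suite's implementation:

    import random

    def prune(scenes, default_count, long_count=-1, is_long_run=False):
        count = long_count if (is_long_run and long_count != -1) else default_count
        if count < 0 or len(scenes) <= count:
            return scenes
        keyed = [(s[1].get('P', 1.0) * random.random(), i, s)
                 for i, s in enumerate(scenes)]
        keep = sorted(keyed, reverse=True)[:count]               # best weighted draws
        return [s for _, _, s in sorted(keep, key=lambda t: t[1])]  # original order

    scenes = [('s%d' % i, {'P': p})
              for i, p in enumerate((1.0, 0.5, 0.5, 0.3, 0.1, 0.1))]
    print([name for name, _ in prune(scenes, 2)])            # default run: two kept
    print([name for name, _ in prune(scenes, 2, 100, True)]) # --long run: all six kept
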
diff --git a/test/suite/wttest.py b/test/suite/wttest.py
index d1705434988..4de09a143b2 100644
--- a/test/suite/wttest.py
+++ b/test/suite/wttest.py
@@ -37,7 +37,7 @@ except ImportError:
from contextlib import contextmanager
import os, re, shutil, sys, time, traceback
-
+import wtscenario
import wiredtiger
def shortenWithEllipsis(s, maxlen):
@@ -141,17 +141,20 @@ class WiredTigerTestCase(unittest.TestCase):
@staticmethod
def globalSetup(preserveFiles = False, useTimestamp = False,
- gdbSub = False, verbose = 1, dirarg = None):
+ gdbSub = False, verbose = 1, dirarg = None,
+ longtest = False):
WiredTigerTestCase._preserveFiles = preserveFiles
d = 'WT_TEST' if dirarg == None else dirarg
if useTimestamp:
d += '.' + time.strftime('%Y%m%d-%H%M%S', time.localtime())
shutil.rmtree(d, ignore_errors=True)
os.makedirs(d)
+ wtscenario.set_long_run(longtest)
WiredTigerTestCase._parentTestdir = d
WiredTigerTestCase._origcwd = os.getcwd()
WiredTigerTestCase._resultfile = open(os.path.join(d, 'results.txt'), "w", 0) # unbuffered
WiredTigerTestCase._gdbSubprocess = gdbSub
+ WiredTigerTestCase._longtest = longtest
WiredTigerTestCase._verbose = verbose
WiredTigerTestCase._dupout = os.dup(sys.stdout.fileno())
WiredTigerTestCase._stdout = sys.stdout
@@ -182,8 +185,9 @@ class WiredTigerTestCase(unittest.TestCase):
# is used, then each scenario is given a number, which can
# help distinguish tests.
scen = ''
- if hasattr(self, 'scenario_number'):
- scen = '(scenario ' + str(self.scenario_number) + ')'
+ if hasattr(self, 'scenario_number') and hasattr(self, 'scenario_name'):
+ scen = '(scenario ' + str(self.scenario_number) + \
+ ': ' + self.scenario_name + ')'
return self.simpleName() + scen
def simpleName(self):
@@ -283,7 +287,7 @@ class WiredTigerTestCase(unittest.TestCase):
self.pr('preserving directory ' + self.testdir)
if not passed and not skipped:
- print "ERROR in " + self.testsubdir
+ print "ERROR in " + str(self)
self.pr('FAIL')
self.prexception(excinfo)
self.pr('preserving directory ' + self.testdir)
@@ -431,6 +435,23 @@ class WiredTigerTestCase(unittest.TestCase):
def className(self):
return self.__class__.__name__
+
+def longtest(description):
+ """
+ Used as a function decorator, for example, @wttest.longtest("description").
+ The decorator indicates that this test function should only be included
+ when running the test suite with the --long option.
+ """
+ def runit_decorator(func):
+ return func
+ if not WiredTigerTestCase._longtest:
+ return unittest.skip(description + ' (enable with --long)')
+ else:
+ return runit_decorator
+
+def islongtest():
+ return WiredTigerTestCase._longtest
+
def runsuite(suite, parallel):
suite_to_run = suite
if parallel > 1:
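
For reference, the decorator added above is applied to individual test methods; a usage sketch follows (the class and method names are made up, and this assumes the file lives in test/suite next to wttest.py):

    import wttest

    class test_example(wttest.WiredTigerTestCase):
        def test_small(self):
            pass                     # always runs

        @wttest.longtest('exhaustive variant, takes minutes')
        def test_exhaustive(self):
            pass                     # skipped unless run.py is given --long

Because the decorator is evaluated while the test module is being imported, globalSetup() has to see the --long flag before any test classes are loaded, which is why run.py sets the global flags before test discovery.
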
diff --git a/tools/stat_data.py b/tools/stat_data.py
index 50528dbd26a..3d192be7566 100644
--- a/tools/stat_data.py
+++ b/tools/stat_data.py
@@ -34,10 +34,11 @@ no_scale_per_second_list = [
'btree: column-store variable-size deleted values',
'btree: column-store variable-size leaf pages',
'btree: fixed-record size',
- 'btree: maximum internal page item size',
+ 'btree: maximum internal page key size',
'btree: maximum internal page size',
- 'btree: maximum leaf page item size',
+ 'btree: maximum leaf page key size',
'btree: maximum leaf page size',
+ 'btree: maximum leaf page value size',
'btree: maximum tree depth',
'btree: number of key/value pairs',
'btree: overflow pages',