summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/async/async_api.c2
-rw-r--r--src/async/async_op.c2
-rw-r--r--src/async/async_worker.c2
-rw-r--r--src/block/block_addr.c53
-rw-r--r--src/block/block_ckpt.c2
-rw-r--r--src/block/block_compact.c94
-rw-r--r--src/block/block_ext.c2
-rw-r--r--src/block/block_map.c2
-rw-r--r--src/block/block_mgr.c18
-rw-r--r--src/block/block_open.c71
-rw-r--r--src/block/block_read.c2
-rw-r--r--src/block/block_session.c2
-rw-r--r--src/block/block_slvg.c2
-rw-r--r--src/block/block_vrfy.c2
-rw-r--r--src/block/block_write.c2
-rw-r--r--src/bloom/bloom.c6
-rw-r--r--src/btree/bt_compact.c44
-rw-r--r--src/btree/bt_curnext.c169
-rw-r--r--src/btree/bt_curprev.c56
-rw-r--r--src/btree/bt_cursor.c52
-rw-r--r--src/btree/bt_debug.c10
-rw-r--r--src/btree/bt_delete.c2
-rw-r--r--src/btree/bt_discard.c2
-rw-r--r--src/btree/bt_handle.c38
-rw-r--r--src/btree/bt_huffman.c12
-rw-r--r--src/btree/bt_io.c2
-rw-r--r--src/btree/bt_misc.c18
-rw-r--r--src/btree/bt_ovfl.c2
-rw-r--r--src/btree/bt_page.c10
-rw-r--r--src/btree/bt_read.c6
-rw-r--r--src/btree/bt_rebalance.c486
-rw-r--r--src/btree/bt_ret.c2
-rw-r--r--src/btree/bt_slvg.c66
-rw-r--r--src/btree/bt_split.c312
-rw-r--r--src/btree/bt_stat.c6
-rw-r--r--src/btree/bt_sync.c2
-rw-r--r--src/btree/bt_upgrade.c2
-rw-r--r--src/btree/bt_vrfy.c31
-rw-r--r--src/btree/bt_vrfy_dsk.c16
-rw-r--r--src/btree/bt_walk.c170
-rw-r--r--src/btree/col_modify.c2
-rw-r--r--src/btree/col_srch.c122
-rw-r--r--src/btree/row_key.c2
-rw-r--r--src/btree/row_modify.c2
-rw-r--r--src/btree/row_srch.c229
-rw-r--r--src/cache/cache_las.c27
-rw-r--r--src/config/config.c8
-rw-r--r--src/config/config_api.c2
-rw-r--r--src/config/config_check.c2
-rw-r--r--src/config/config_collapse.c2
-rw-r--r--src/config/config_def.c39
-rw-r--r--src/config/config_ext.c2
-rw-r--r--src/config/config_upgrade.c2
-rw-r--r--src/conn/api_version.c2
-rw-r--r--src/conn/conn_api.c10
-rw-r--r--src/conn/conn_cache.c2
-rw-r--r--src/conn/conn_cache_pool.c2
-rw-r--r--src/conn/conn_ckpt.c2
-rw-r--r--src/conn/conn_dhandle.c60
-rw-r--r--src/conn/conn_handle.c2
-rw-r--r--src/conn/conn_log.c46
-rw-r--r--src/conn/conn_open.c2
-rw-r--r--src/conn/conn_stat.c6
-rw-r--r--src/conn/conn_sweep.c12
-rw-r--r--src/cursor/cur_backup.c25
-rw-r--r--src/cursor/cur_bulk.c196
-rw-r--r--src/cursor/cur_config.c2
-rw-r--r--src/cursor/cur_ds.c2
-rw-r--r--src/cursor/cur_dump.c2
-rw-r--r--src/cursor/cur_file.c6
-rw-r--r--src/cursor/cur_index.c2
-rw-r--r--src/cursor/cur_join.c19
-rw-r--r--src/cursor/cur_json.c15
-rw-r--r--src/cursor/cur_log.c2
-rw-r--r--src/cursor/cur_metadata.c10
-rw-r--r--src/cursor/cur_stat.c9
-rw-r--r--src/cursor/cur_std.c2
-rw-r--r--src/cursor/cur_table.c16
-rwxr-xr-xsrc/docs/build-javadoc.sh2
-rw-r--r--src/docs/command-line.dox41
-rw-r--r--src/docs/license.dox21
-rw-r--r--src/docs/programming.dox5
-rw-r--r--src/docs/rebalance.dox14
-rw-r--r--src/docs/schema.dox4
-rw-r--r--src/docs/spell.ok3
-rw-r--r--src/docs/style/footer.html4
-rwxr-xr-xsrc/docs/tools/doxfilter.py2
-rwxr-xr-xsrc/docs/tools/fixlinks.py2
-rw-r--r--src/docs/top/main.dox8
-rw-r--r--src/docs/tune-bulk-load.dox8
-rw-r--r--src/docs/upgrading.dox28
-rw-r--r--src/docs/wtperf.dox23
-rw-r--r--src/evict/evict_file.c4
-rw-r--r--src/evict/evict_lru.c110
-rw-r--r--src/evict/evict_page.c16
-rw-r--r--src/include/api.h2
-rw-r--r--src/include/async.h2
-rw-r--r--src/include/bitstring.i2
-rw-r--r--src/include/block.h9
-rw-r--r--src/include/bloom.h2
-rw-r--r--src/include/btmem.h21
-rw-r--r--src/include/btree.h14
-rw-r--r--src/include/btree.i116
-rw-r--r--src/include/btree_cmp.i2
-rw-r--r--src/include/buf.i14
-rw-r--r--src/include/cache.h2
-rw-r--r--src/include/cache.i2
-rw-r--r--src/include/cell.i2
-rw-r--r--src/include/column.i36
-rw-r--r--src/include/compact.h2
-rw-r--r--src/include/config.h41
-rw-r--r--src/include/connection.h3
-rw-r--r--src/include/cursor.h40
-rw-r--r--src/include/cursor.i8
-rw-r--r--src/include/dhandle.h6
-rw-r--r--src/include/dlh.h2
-rw-r--r--src/include/error.h2
-rw-r--r--src/include/extern.h63
-rw-r--r--src/include/flags.h60
-rw-r--r--src/include/gcc.h54
-rw-r--r--src/include/hardware.h2
-rw-r--r--src/include/intpack.i2
-rw-r--r--src/include/lint.h2
-rw-r--r--src/include/log.h11
-rw-r--r--src/include/log.i2
-rw-r--r--src/include/lsm.h2
-rw-r--r--src/include/meta.h6
-rw-r--r--src/include/misc.h5
-rw-r--r--src/include/misc.i2
-rw-r--r--src/include/msvc.h2
-rw-r--r--src/include/mutex.h2
-rw-r--r--src/include/mutex.i2
-rw-r--r--src/include/os.h2
-rw-r--r--src/include/os_windows.h2
-rw-r--r--src/include/packing.i23
-rw-r--r--src/include/posix.h2
-rw-r--r--src/include/schema.h28
-rw-r--r--src/include/serial.i2
-rw-r--r--src/include/session.h21
-rw-r--r--src/include/stat.h6
-rw-r--r--src/include/txn.h2
-rw-r--r--src/include/txn.i6
-rw-r--r--src/include/verify_build.h2
-rw-r--r--src/include/wiredtiger.in58
-rw-r--r--src/include/wiredtiger_ext.h2
-rw-r--r--src/include/wt_internal.h2
-rw-r--r--src/log/log.c24
-rw-r--r--src/log/log_auto.c96
-rw-r--r--src/log/log_slot.c4
-rw-r--r--src/lsm/lsm_cursor.c8
-rw-r--r--src/lsm/lsm_cursor_bulk.c4
-rw-r--r--src/lsm/lsm_manager.c2
-rw-r--r--src/lsm/lsm_merge.c11
-rw-r--r--src/lsm/lsm_meta.c2
-rw-r--r--src/lsm/lsm_stat.c37
-rw-r--r--src/lsm/lsm_tree.c36
-rw-r--r--src/lsm/lsm_work_unit.c13
-rw-r--r--src/lsm/lsm_worker.c2
-rw-r--r--src/meta/meta_apply.c52
-rw-r--r--src/meta/meta_ckpt.c2
-rw-r--r--src/meta/meta_ext.c2
-rw-r--r--src/meta/meta_table.c141
-rw-r--r--src/meta/meta_track.c21
-rw-r--r--src/meta/meta_turtle.c18
-rw-r--r--src/os_posix/os_abort.c2
-rw-r--r--src/os_posix/os_alloc.c2
-rw-r--r--src/os_posix/os_dir.c2
-rw-r--r--src/os_posix/os_dlopen.c2
-rw-r--r--src/os_posix/os_errno.c2
-rw-r--r--src/os_posix/os_exist.c2
-rw-r--r--src/os_posix/os_fallocate.c2
-rw-r--r--src/os_posix/os_filesize.c2
-rw-r--r--src/os_posix/os_flock.c2
-rw-r--r--src/os_posix/os_fsync.c28
-rw-r--r--src/os_posix/os_ftruncate.c2
-rw-r--r--src/os_posix/os_getenv.c2
-rw-r--r--src/os_posix/os_getline.c2
-rw-r--r--src/os_posix/os_getopt.c2
-rw-r--r--src/os_posix/os_map.c14
-rw-r--r--src/os_posix/os_mtx_cond.c2
-rw-r--r--src/os_posix/os_mtx_rw.c2
-rw-r--r--src/os_posix/os_once.c2
-rw-r--r--src/os_posix/os_open.c2
-rw-r--r--src/os_posix/os_pagesize.c19
-rw-r--r--src/os_posix/os_path.c2
-rw-r--r--src/os_posix/os_priv.c2
-rw-r--r--src/os_posix/os_remove.c2
-rw-r--r--src/os_posix/os_rename.c2
-rw-r--r--src/os_posix/os_rw.c2
-rw-r--r--src/os_posix/os_sleep.c2
-rw-r--r--src/os_posix/os_stdio.c2
-rw-r--r--src/os_posix/os_strtouq.c2
-rw-r--r--src/os_posix/os_thread.c2
-rw-r--r--src/os_posix/os_time.c2
-rw-r--r--src/os_posix/os_yield.c2
-rw-r--r--src/os_win/os_dir.c2
-rw-r--r--src/os_win/os_dlopen.c2
-rw-r--r--src/os_win/os_errno.c2
-rw-r--r--src/os_win/os_exist.c2
-rw-r--r--src/os_win/os_fallocate.c2
-rw-r--r--src/os_win/os_filesize.c2
-rw-r--r--src/os_win/os_flock.c2
-rw-r--r--src/os_win/os_fsync.c4
-rw-r--r--src/os_win/os_ftruncate.c2
-rw-r--r--src/os_win/os_getenv.c2
-rw-r--r--src/os_win/os_map.c2
-rw-r--r--src/os_win/os_mtx_cond.c2
-rw-r--r--src/os_win/os_once.c2
-rw-r--r--src/os_win/os_open.c2
-rw-r--r--src/os_win/os_pagesize.c23
-rw-r--r--src/os_win/os_path.c2
-rw-r--r--src/os_win/os_priv.c2
-rw-r--r--src/os_win/os_remove.c2
-rw-r--r--src/os_win/os_rename.c2
-rw-r--r--src/os_win/os_rw.c2
-rw-r--r--src/os_win/os_sleep.c2
-rw-r--r--src/os_win/os_snprintf.c2
-rw-r--r--src/os_win/os_thread.c2
-rw-r--r--src/os_win/os_time.c2
-rw-r--r--src/os_win/os_vsnprintf.c2
-rw-r--r--src/os_win/os_yield.c2
-rw-r--r--src/packing/pack_api.c2
-rw-r--r--src/packing/pack_impl.c4
-rw-r--r--src/packing/pack_stream.c2
-rw-r--r--src/reconcile/rec_track.c2
-rw-r--r--src/reconcile/rec_write.c229
-rw-r--r--src/schema/schema_create.c94
-rw-r--r--src/schema/schema_drop.c4
-rw-r--r--src/schema/schema_list.c6
-rw-r--r--src/schema/schema_open.c15
-rw-r--r--src/schema/schema_plan.c2
-rw-r--r--src/schema/schema_project.c2
-rw-r--r--src/schema/schema_rename.c4
-rw-r--r--src/schema/schema_stat.c2
-rw-r--r--src/schema/schema_truncate.c48
-rw-r--r--src/schema/schema_util.c2
-rw-r--r--src/schema/schema_worker.c6
-rw-r--r--src/session/session_api.c229
-rw-r--r--src/session/session_compact.c15
-rw-r--r--src/session/session_dhandle.c12
-rw-r--r--src/session/session_salvage.c2
-rw-r--r--src/support/cksum.c2
-rw-r--r--src/support/crypto.c2
-rw-r--r--src/support/err.c2
-rw-r--r--src/support/filename.c60
-rw-r--r--src/support/global.c31
-rw-r--r--src/support/hash_city.c8
-rw-r--r--src/support/hash_fnv.c2
-rw-r--r--src/support/hazard.c2
-rw-r--r--src/support/hex.c27
-rw-r--r--src/support/huffman.c28
-rw-r--r--src/support/pow.c2
-rw-r--r--src/support/rand.c25
-rw-r--r--src/support/scratch.c2
-rw-r--r--src/support/stat.c36
-rw-r--r--src/txn/txn.c84
-rw-r--r--src/txn/txn_ckpt.c35
-rw-r--r--src/txn/txn_ext.c2
-rw-r--r--src/txn/txn_log.c27
-rw-r--r--src/txn/txn_nsnap.c2
-rw-r--r--src/txn/txn_recover.c4
-rw-r--r--src/utilities/util.h3
-rw-r--r--src/utilities/util_backup.c2
-rw-r--r--src/utilities/util_compact.c2
-rw-r--r--src/utilities/util_cpyright.c4
-rw-r--r--src/utilities/util_create.c2
-rw-r--r--src/utilities/util_drop.c2
-rw-r--r--src/utilities/util_dump.c2
-rw-r--r--src/utilities/util_list.c75
-rw-r--r--src/utilities/util_load.c2
-rw-r--r--src/utilities/util_load.h2
-rw-r--r--src/utilities/util_load_json.c2
-rw-r--r--src/utilities/util_loadtext.c2
-rw-r--r--src/utilities/util_main.c8
-rw-r--r--src/utilities/util_misc.c2
-rw-r--r--src/utilities/util_printlog.c17
-rw-r--r--src/utilities/util_read.c2
-rw-r--r--src/utilities/util_rebalance.c63
-rw-r--r--src/utilities/util_rename.c2
-rw-r--r--src/utilities/util_salvage.c2
-rw-r--r--src/utilities/util_stat.c2
-rw-r--r--src/utilities/util_upgrade.c2
-rw-r--r--src/utilities/util_verbose.c2
-rw-r--r--src/utilities/util_verify.c2
-rw-r--r--src/utilities/util_write.c2
285 files changed, 3949 insertions, 1664 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c
index dc26f2d11c3..fea8714176b 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/async/async_op.c b/src/async/async_op.c
index 7661a4383d6..130c704757b 100644
--- a/src/async/async_op.c
+++ b/src/async/async_op.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index 6a5ec5feeb0..e692bc619a9 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_addr.c b/src/block/block_addr.c
index 6d50e5f0f4e..b1f2fd9454a 100644
--- a/src/block/block_addr.c
+++ b/src/block/block_addr.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -14,7 +14,7 @@
* caller's buffer reference so it can be called repeatedly to load a buffer.
*/
static int
-__block_buffer_to_addr(WT_BLOCK *block,
+__block_buffer_to_addr(uint32_t allocsize,
const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
{
uint64_t o, s, c;
@@ -39,8 +39,8 @@ __block_buffer_to_addr(WT_BLOCK *block,
*offsetp = 0;
*sizep = *cksump = 0;
} else {
- *offsetp = (wt_off_t)(o + 1) * block->allocsize;
- *sizep = (uint32_t)s * block->allocsize;
+ *offsetp = (wt_off_t)(o + 1) * allocsize;
+ *sizep = (uint32_t)s * allocsize;
*cksump = (uint32_t)c;
}
return (0);
@@ -80,7 +80,8 @@ int
__wt_block_buffer_to_addr(WT_BLOCK *block,
const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
{
- return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+ return (__block_buffer_to_addr(
+ block->allocsize, &p, offsetp, sizep, cksump));
}
/*
@@ -139,12 +140,12 @@ __wt_block_addr_string(WT_SESSION_IMPL *session,
}
/*
- * __wt_block_buffer_to_ckpt --
+ * __block_buffer_to_ckpt --
* Convert a checkpoint cookie into its components.
*/
-int
-__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
- WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+static int
+__block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci)
{
uint64_t a;
const uint8_t **pp;
@@ -154,13 +155,13 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
pp = &p;
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->root_offset, &ci->root_size, &ci->root_cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
WT_RET(__wt_vunpack_uint(pp, 0, &a));
ci->file_size = (wt_off_t)a;
@@ -171,6 +172,32 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
}
/*
+ * __wt_block_buffer_to_ckpt --
+ * Convert a checkpoint cookie into its components, block manager version.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ return (__block_buffer_to_ckpt(session, block->allocsize, p, ci));
+}
+
+/*
+ * __wt_block_ckpt_decode --
+ * Convert a checkpoint cookie into its components, external utility
+ * version.
+ */
+int
+__wt_block_ckpt_decode(WT_SESSION *wt_session,
+ size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci));
+}
+
+/*
* __wt_block_ckpt_to_buffer --
* Convert the components into its checkpoint cookie.
*/
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index adbcf0e3fdc..03059c8f23a 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_compact.c b/src/block/block_compact.c
index d45d0a96da7..8c9be4f029c 100644
--- a/src/block/block_compact.c
+++ b/src/block/block_compact.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool);
/*
* __wt_block_compact_start --
@@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
/* Switch to first-fit allocation. */
__wt_block_configure_first_fit(block, true);
- block->compact_pct_tenths = 0;
-
return (0);
}
@@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
int
__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
+ WT_DECL_RET;
+
WT_UNUSED(session);
/* Restore the original allocation plan. */
__wt_block_configure_first_fit(block, false);
- block->compact_pct_tenths = 0;
+ /* Dump the results of the compaction pass. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __block_dump_avail(session, block, false);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
- return (0);
+ return (ret);
}
/*
@@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
if (fh->size <= WT_MEGABYTE)
return (0);
+ /*
+ * Reset the compaction state information. This is done here, not in the
+ * compaction "start" routine, because this function is called first to
+ * determine if compaction is useful.
+ */
+ block->compact_pct_tenths = 0;
+ block->compact_pages_reviewed = 0;
+ block->compact_pages_skipped = 0;
+ block->compact_pages_written = 0;
+
__wt_spin_lock(session, &block->live_lock);
+ /* Dump the current state of the file. */
if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
- WT_ERR(__block_dump_avail(session, block));
+ WT_ERR(__block_dump_avail(session, block, true));
- /* Sum the available bytes in the first 80% and 90% of the file. */
+ /* Sum the available bytes in the initial 80% and 90% of the file. */
avail_eighty = avail_ninety = 0;
ninety = fh->size - fh->size / 10;
eighty = fh->size - ((fh->size / 10) * 2);
@@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
avail_eighty += ext->size;
}
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "80%% of the file",
- block->name,
- (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "90%% of the file",
- block->name,
- (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
- "90%% of the file to perform compaction, compaction %s",
- block->name,
- (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
- *skipp ? "skipped" : "proceeding"));
-
/*
* Skip files where we can't recover at least 1MB.
*
@@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
block->compact_pct_tenths = 1;
}
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "80%% of the file",
+ block->name,
+ (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file",
+ block->name,
+ (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
+ "90%% of the file to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
err: __wt_spin_unlock(session, &block->live_lock);
return (ret);
@@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
}
__wt_spin_unlock(session, &block->live_lock);
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ ++block->compact_pages_reviewed;
+ if (*skipp)
+ ++block->compact_pages_skipped;
+ else
+ ++block->compact_pages_written;
+ }
+
return (ret);
}
@@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
* Dump out the avail list so we can see what compaction will look like.
*/
static int
-__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start)
{
WT_EXTLIST *el;
WT_EXT *ext;
@@ -196,6 +220,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
size = block->fh->size;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "============ %s",
+ start ? "testing for compaction" : "ending compaction pass"));
+
+ if (!start) {
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages reviewed: %" PRIuMAX,
+ block->compact_pages_reviewed));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages skipped: %" PRIuMAX, block->compact_pages_skipped));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages written: %" PRIuMAX, block->compact_pages_written));
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
"file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
"%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
(uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
@@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
#ifdef __VERBOSE_OUTPUT_PERCENTILE
+ /*
+ * The verbose output always displays 10% buckets, running this code
+ * as well also displays 1% buckets.
+ */
for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
v = percentile[i] * 512;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index a56df220390..ab5d5604087 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_map.c b/src/block/block_map.c
index 6dc270760d6..3d04a492269 100644
--- a/src/block/block_map.c
+++ b/src/block/block_map.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 7260cab75d9..6e2dc775362 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -221,6 +221,18 @@ __bm_free(WT_BM *bm,
}
/*
+ * __bm_is_mapped --
+ * Return if the file is mapped into memory.
+ */
+static bool
+__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_UNUSED(session);
+
+ return (bm->map == NULL ? false : true);
+}
+
+/*
* __bm_stat --
* Block-manager statistics.
*/
@@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->free = (int (*)(WT_BM *,
WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = (int (*)
@@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->salvage_valid = (int (*)(WT_BM *,
WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync =
(int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly;
@@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->compact_skip = __bm_compact_skip;
bm->compact_start = __bm_compact_start;
bm->free = __bm_free;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = __bm_salvage_end;
bm->salvage_next = __bm_salvage_next;
bm->salvage_start = __bm_salvage_start;
bm->salvage_valid = __bm_salvage_valid;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync = __bm_sync;
bm->verify_addr = __bm_verify_addr;
diff --git a/src/block/block_open.c b/src/block/block_open.c
index 7cf12d36066..dd0f3f0716a 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -11,36 +11,13 @@
static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
/*
- * __wt_block_manager_truncate --
- * Truncate a file.
+ * __wt_block_manager_drop --
+ * Drop a file.
*/
int
-__wt_block_manager_truncate(
- WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+__wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename)
{
- WT_DECL_RET;
- WT_FH *fh;
-
- /* Open the underlying file handle. */
- WT_RET(__wt_open(
- session, filename, false, false, WT_FILE_TYPE_DATA, &fh));
-
- /* Truncate the file. */
- WT_ERR(__wt_block_truncate(session, fh, (wt_off_t)0));
-
- /* Write out the file's meta-data. */
- WT_ERR(__wt_desc_init(session, fh, allocsize));
-
- /*
- * Ensure the truncated file has made it to disk, then the upper-level
- * is never surprised.
- */
- WT_ERR(__wt_fsync(session, fh));
-
- /* Close the file handle. */
-err: WT_TRET(__wt_close(session, &fh));
-
- return (ret);
+ return (__wt_remove_if_exists(session, filename));
}
/*
@@ -405,27 +382,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
* Reading from the live system's structure normally requires locking,
* but it's an 8B statistics read, there's no need.
*/
- stats->allocation_size = block->allocsize;
- stats->block_checkpoint_size = (int64_t)block->live.ckpt_size;
- stats->block_magic = WT_BLOCK_MAGIC;
- stats->block_major = WT_BLOCK_MAJOR_VERSION;
- stats->block_minor = WT_BLOCK_MINOR_VERSION;
- stats->block_reuse_bytes = (int64_t)block->live.avail.bytes;
- stats->block_size = block->fh->size;
+ WT_STAT_WRITE(stats, allocation_size, block->allocsize);
+ WT_STAT_WRITE(
+ stats, block_checkpoint_size, (int64_t)block->live.ckpt_size);
+ WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_WRITE(
+ stats, block_reuse_bytes, (int64_t)block->live.avail.bytes);
+ WT_STAT_WRITE(stats, block_size, block->fh->size);
}
/*
* __wt_block_manager_size --
- * Set the size statistic for a file.
+ * Return the size of a live block handle.
*/
int
-__wt_block_manager_size(
- WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats)
+__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep)
{
- wt_off_t filesize;
-
- WT_RET(__wt_filesize_name(session, filename, false, &filesize));
- stats->block_size = filesize;
+ WT_UNUSED(session);
+ *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size;
return (0);
}
+
+/*
+ * __wt_block_manager_named_size --
+ * Return the size of a named file.
+ */
+int
+__wt_block_manager_named_size(
+ WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
+{
+ return (__wt_filesize_name(session, name, false, sizep));
+}
diff --git a/src/block/block_read.c b/src/block/block_read.c
index ca7797f17af..0e5911ecf2a 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_session.c b/src/block/block_session.c
index 6683fdd20ce..268adb530cf 100644
--- a/src/block/block_session.c
+++ b/src/block/block_session.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index 9f3093c741d..ef22c727db4 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index 9904dcccd14..35c7a2c218c 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 26efac54080..23f4d7650b9 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index e3a21f25dc1..505630f12cf 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -130,8 +130,8 @@ __bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner)
c = NULL;
WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c));
- /* XXX Layering violation: bump the cache priority for Bloom filters. */
- ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW;
+ /* Bump the cache priority for Bloom filters. */
+ __wt_evict_priority_set(session, WT_EVICT_INT_SKEW);
bloom->c = c;
return (0);
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 8044d4f852d..12df19a7e04 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
WT_BM *bm;
WT_DECL_RET;
+ WT_MULTI *multi;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
size_t addr_size;
+ uint32_t i;
const uint8_t *addr;
*skipp = true; /* Default skip. */
@@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* If the page is clean, test the original addresses.
- * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * If the page is a replacement, test the replacement addresses.
* Ignore empty pages, they get merged into the parent.
*/
if (mod == NULL || mod->rec_result == 0) {
__wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL)
return (0);
- WT_RET(
+ return (
bm->compact_page_skip(bm, session, addr, addr_size, skipp));
- } else if (mod->rec_result == WT_PM_REC_REPLACE) {
- /*
- * The page's modification information can change underfoot if
- * the page is being reconciled, serialize with reconciliation.
- */
+ }
+
+ /*
+ * The page's modification information can change underfoot if the page
+ * is being reconciled, serialize with reconciliation.
+ */
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_RET(__wt_fair_lock(session, &page->page_lock));
+ if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ if (multi->disk_image != NULL)
+ continue;
+ if ((ret = bm->compact_page_skip(bm, session,
+ multi->addr.addr, multi->addr.size, skipp)) != 0)
+ break;
+ if (!*skipp)
+ break;
+ }
+
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_TRET(__wt_fair_unlock(session, &page->page_lock));
- WT_RET(ret);
- }
- return (0);
+
+ return (ret);
}
/*
@@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
if (skip)
continue;
- session->compaction = true;
+ session->compact_state = WT_COMPACT_SUCCESS;
+
/* Rewrite the page: mark the page and tree dirty. */
WT_ERR(__wt_page_modify_init(session, ref->page));
__wt_page_modify_set(session, ref->page);
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 55843d1cae5..63b2e2abebc 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -31,13 +31,12 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
return (WT_NOTFOUND);
/*
- * This code looks different from the cursor-previous code. The append
- * list appears on the last page of the tree, but it may be preceded by
- * other rows, which means the cursor's recno will be set to a value and
- * we simply want to increment it. If the cursor's recno is NOT set,
- * we're starting our iteration in a tree that has only appended items.
- * In that case, recno will be 0 and happily enough the increment will
- * set it to 1, which is correct.
+ * This code looks different from the cursor-previous code. The append
+ * list may be preceded by other rows, which means the cursor's recno
+ * will be set to a value and we simply want to increment it. If the
+ * cursor's recno is NOT set, we're starting an iteration in a tree with
+ * only appended items. In that case, recno will be 0 and happily enough
+ * the increment will set it to 1, which is correct.
*/
__cursor_set_recno(cbt, cbt->recno + 1);
@@ -368,6 +367,140 @@ new_insert: if ((ins = cbt->ins) != NULL) {
/* NOTREACHED */
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __cursor_key_order_check_col --
+ * Check key ordering for column-store cursor movements.
+ *
+ * Compares the cursor's current record number (cbt->recno) with the last
+ * one recorded (cbt->lastrecno). If nothing was recorded (WT_RECNO_OOB),
+ * or the new record number is strictly larger for a next (strictly
+ * smaller for a prev), it becomes the new last record number and the
+ * function returns 0. Otherwise, including when the two record numbers
+ * are equal, the pair is reported through WT_PANIC_RET with EINVAL.
+ */
+static int
+__cursor_key_order_check_col(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ int cmp;
+
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastrecno != WT_RECNO_OOB) {
+ if (cbt->lastrecno < cbt->recno)
+ cmp = -1;
+ if (cbt->lastrecno > cbt->recno)
+ cmp = 1;
+ }
+
+ if (cbt->lastrecno == WT_RECNO_OOB ||
+ (next && cmp < 0) || (!next && cmp > 0)) {
+ cbt->lastrecno = cbt->recno;
+ return (0);
+ }
+
+ WT_PANIC_RET(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then "
+ "key %" PRIu64,
+ next ? "next" : "prev", cbt->lastrecno, cbt->recno);
+}
+
+/*
+ * __cursor_key_order_check_row --
+ * Check key ordering for row-store cursor movements.
+ *
+ * Compares the last key recorded in cbt->lastkey with the cursor's
+ * current key (cbt->iface.key) using the btree's collator; the compare
+ * is skipped when no last key is recorded (size of 0). If there is no
+ * last key, or the last key sorts before the current key for a next
+ * (after it for a prev), the current key is copied into cbt->lastkey and
+ * the result of that copy is returned. Otherwise, including when the keys
+ * compare equal, two scratch buffers are allocated to hold printable
+ * forms of the keys and the pair is reported through WT_PANIC_ERR with
+ * EINVAL. The scratch buffers are released at the err label.
+ */
+static int
+__cursor_key_order_check_row(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ WT_BTREE *btree;
+ WT_ITEM *key;
+ WT_DECL_RET;
+ WT_DECL_ITEM(a);
+ WT_DECL_ITEM(b);
+ int cmp;
+
+ btree = S2BT(session);
+ key = &cbt->iface.key;
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastkey->size != 0)
+ WT_RET(__wt_compare(
+ session, btree->collator, cbt->lastkey, key, &cmp));
+
+ if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0))
+ return (__wt_buf_set(session,
+ cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+
+ WT_PANIC_ERR(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %s then key %s",
+ next ? "next" : "prev",
+ __wt_buf_set_printable(
+ session, cbt->lastkey->data, cbt->lastkey->size, a),
+ __wt_buf_set_printable(session, key->data, key->size, b));
+
+err: __wt_scr_free(session, &a);
+ __wt_scr_free(session, &b);
+
+ return (ret);
+}
+
+/*
+ * __wt_cursor_key_order_check --
+ * Check key ordering for cursor movements.
+ *
+ * Dispatches on the type of the page the cursor references: fixed- and
+ * variable-length column-store leaf pages go to the record-number check,
+ * row-store leaf pages go to the key check, and any other page type is
+ * handed to WT_ILLEGAL_VALUE. The next argument is true for a
+ * cursor-next movement and false for a cursor-prev movement; it is
+ * passed through unchanged.
+ */
+int
+__wt_cursor_key_order_check(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ switch (cbt->ref->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ return (__cursor_key_order_check_col(session, cbt, next));
+ case WT_PAGE_ROW_LEAF:
+ return (__cursor_key_order_check_row(session, cbt, next));
+ WT_ILLEGAL_VALUE(session);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_cursor_key_order_init --
+ * Initialize key ordering checks for cursor movements after a successful
+ * search.
+ *
+ * For column-store leaf pages the cursor's current record number is
+ * saved in cbt->lastrecno and 0 is returned. For row-store leaf pages
+ * the cursor's current key is copied into cbt->lastkey and the result
+ * of that copy is returned. Any other page type is handed to
+ * WT_ILLEGAL_VALUE.
+ */
+int
+__wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ /*
+ * Cursor searches set the position for cursor movements, set the
+ * last-key value for diagnostic checking.
+ */
+ switch (cbt->ref->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ cbt->lastrecno = cbt->recno;
+ return (0);
+ case WT_PAGE_ROW_LEAF:
+ return (__wt_buf_set(session,
+ cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
+ WT_ILLEGAL_VALUE(session);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_cursor_key_order_reset --
+ * Turn off key ordering checks for cursor movements.
+ *
+ * Sets the saved row-store key's size to 0 and the saved column-store
+ * record number to WT_RECNO_OOB; the ordering checks treat both values
+ * as "no previous key" and accept whatever key is returned next.
+ */
+void
+__wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt)
+{
+ /*
+ * Clear the last-key returned, it doesn't apply.
+ */
+ cbt->lastkey->size = 0;
+ cbt->lastrecno = WT_RECNO_OOB;
+}
+#endif
+
/*
* __wt_btcur_iterate_setup --
* Initialize a cursor for iteration, usually based on a search.
@@ -393,10 +526,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
* If we don't have a search page, then we're done, we're starting at
* the beginning or end of the tree, not as a result of a search.
*/
- if (cbt->ref == NULL)
+ if (cbt->ref == NULL) {
+#ifdef HAVE_DIAGNOSTIC
+ __wt_cursor_key_order_reset(cbt);
+#endif
return;
- page = cbt->ref->page;
+ }
+ page = cbt->ref->page;
if (page->type == WT_PAGE_ROW_LEAF) {
/*
* For row-store pages, we need a single item that tells us the
@@ -468,7 +605,6 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
*/
for (newpage = false;; newpage = true) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;
- WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
switch (page->type) {
@@ -502,9 +638,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
break;
/*
- * The last page in a column-store has appended entries.
- * We handle it separately from the usual cursor code:
- * it's only that one page and it's in a simple format.
+ * Column-store pages may have appended entries. Handle
+ * it separately from the usual cursor code, it's in a
+ * simple format.
*/
if (page->type != WT_PAGE_ROW_LEAF &&
(cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
@@ -531,6 +667,11 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, true));
+#endif
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 1d23b976edd..a083ec4016e 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -139,10 +139,20 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
return (WT_NOTFOUND);
} else {
+ /* Move to the previous record in the append list, if any. */
+ if (cbt->ins != NULL &&
+ cbt->recno <= WT_INSERT_RECNO(cbt->ins))
+ WT_RET(__cursor_skip_prev(cbt));
+
/*
* Handle the special case of leading implicit records, that is,
- * there aren't any records in the tree not on the append list,
- * and the first record on the append list isn't record 1.
+ * there aren't any records in the page not on the append list,
+ * and the append list's first record isn't the first record on
+ * the page. (Although implemented as a test of the page values,
+ * this is really a test for a tree where the first inserted
+ * record wasn't record 1, any other page with only an append
+ * list will have a first page record number matching the first
+ * record in the append list.)
*
* The "right" place to handle this is probably in our caller.
* The high-level cursor-previous routine would:
@@ -156,27 +166,26 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
* into our caller. Anyway, if this code breaks for any reason,
* that's the way I'd go.
*
- * If we're not pointing to a WT_INSERT entry, or we can't find
- * a WT_INSERT record that precedes our record name-space, check
- * if there are any records on the page. If there aren't, then
- * we're in the magic zone, keep going until we get to a record
- * number of 1.
+ * If we're not pointing to a WT_INSERT entry (we didn't find a
+ * WT_INSERT record preceding our record name-space), check if
+ * we've reached the beginning of this page, a possibility if a
+ * page had a large number of items appended, and then split.
+ * If not, check if there are any records on the page. If there
+ * aren't, then we're in the magic zone, keep going until we get
+ * to a record number matching the first record on the page.
*/
- if (cbt->ins != NULL &&
- cbt->recno <= WT_INSERT_RECNO(cbt->ins))
- WT_RET(__cursor_skip_prev(cbt));
if (cbt->ins == NULL &&
- (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
+ (cbt->recno == page->pg_fix_recno ||
+ __col_fix_last_recno(page) != 0))
return (WT_NOTFOUND);
}
/*
- * This code looks different from the cursor-next code. The append
- * list appears on the last page of the tree and contains the last
- * records in the tree. If we're iterating through the tree, starting
- * at the last record in the tree, by definition we're starting a new
- * iteration and we set the record number to the last record found in
- * the tree. Otherwise, decrement the record.
+ * This code looks different from the cursor-next code. The append list
+ * may be preceded by other rows. If we're iterating through the tree,
+ * starting at the last record in the tree, by definition we're starting
+ * a new iteration and we set the record number to the last record found
+ * on the page. Otherwise, decrement the record.
*/
if (newpage)
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
@@ -556,12 +565,11 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
*/
for (newpage = false;; newpage = true) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;
- WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));
/*
- * The last page in a column-store has appended entries.
- * We handle it separately from the usual cursor code:
- * it's only that one page and it's in a simple format.
+ * Column-store pages may have appended entries. Handle it
+ * separately from the usual cursor code, it's in a simple
+ * format.
*/
if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
(cbt->ins_head = WT_COL_APPEND(page)) != NULL)
@@ -618,6 +626,10 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
+#endif
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index f2bf2978320..c11b7d35de6 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
static inline int
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
- return (btree->type == BTREE_COL_FIX &&
- !F_ISSET(cbt, WT_CBT_MAX_RECORD));
+ /*
+ * When there's no exact match, column-store search returns the key
+ * nearest the searched-for key (continuing past keys smaller than the
+ * searched-for key to return the next-largest key). Therefore, if the
+ * returned comparison is -1, the searched-for key was larger than any
+ * row on the page's standard information or column-store insert list.
+ *
+ * If the returned comparison is NOT -1, there was a row equal to or
+ * larger than the searched-for key, and we implicitly create missing
+ * rows.
+ */
+ return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}
/*
@@ -344,6 +354,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
} else
ret = WT_NOTFOUND;
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_init(session, cbt));
+#endif
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
@@ -454,6 +469,11 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
exact = -1;
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_init(session, cbt));
+#endif
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
@@ -502,19 +522,14 @@ retry: WT_RET(__cursor_func_init(cbt, true));
case BTREE_COL_VAR:
/*
* If WT_CURSTD_APPEND is set, insert a new record (ignoring
- * the application's record number). First we search for the
- * maximum possible record number so the search ends on the
- * last page. The real record number is assigned by the
- * serialized append operation.
+ * the application's record number). The real record number
+ * is assigned by the serialized append operation.
*/
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = UINT64_MAX;
+ cbt->iface.recno = WT_RECNO_OOB;
WT_ERR(__cursor_col_search(session, cbt, NULL));
- if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = WT_RECNO_OOB;
-
/*
* If not overwriting, fail if the key exists. Creating a
* record past the end of the tree in a fixed-length
@@ -830,6 +845,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
+ wt_off_t size;
uint64_t skip;
session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -866,10 +882,12 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
* !!!
* Ideally, the number would be prime to avoid restart issues.
*/
- if (cbt->next_random_sample_size != 0)
+ if (cbt->next_random_sample_size != 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
cbt->next_random_leaf_skip = (uint64_t)
- ((btree->bm->block->fh->size / btree->allocsize) /
+ ((size / btree->allocsize) /
cbt->next_random_sample_size) + 1;
+ }
/*
* Choose a leaf page from the tree.
@@ -1225,6 +1243,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
{
cbt->row_key = &cbt->_row_key;
cbt->tmp = &cbt->_tmp;
+
+#ifdef HAVE_DIAGNOSTIC
+ cbt->lastkey = &cbt->_lastkey;
+ cbt->lastrecno = WT_RECNO_OOB;
+#endif
}
/*
@@ -1250,6 +1273,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_buf_free(session, &cbt->_lastkey);
+#endif
return (ret);
}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index d52a94a6da2..7c7f8cab855 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
static inline void
__debug_hex_byte(WT_DBG *ds, uint8_t v)
{
- static const char hex[] = "0123456789abcdef";
-
- __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
+ __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]);
}
/*
@@ -678,8 +676,12 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", evict-lru");
if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
__dmsg(ds, ", overflow-keys");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
+ __dmsg(ds, ", split-block");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE))
+ __dmsg(ds, ", update-ignore");
if (mod != NULL)
switch (mod->rec_result) {
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 9dd72108e4b..ba16dd204e8 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 54b07513089..795111d53f9 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index a6330326954..7f0f37d95d6 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -105,14 +105,23 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
WT_ERR(__wt_btree_tree_open(
session, root_addr, root_addr_size));
- /* Warm the cache, if possible. */
- WT_WITH_PAGE_INDEX(session,
- ret = __btree_preload(session));
- WT_ERR(ret);
-
- /* Get the last record number in a column-store file. */
- if (btree->type != BTREE_ROW)
- WT_ERR(__btree_get_last_recno(session));
+ /*
+ * Rebalance uses the cache, but only wants the root
+ * page, nothing else.
+ */
+ if (!F_ISSET(btree, WT_BTREE_REBALANCE)) {
+ /* Warm the cache, if possible. */
+ WT_WITH_PAGE_INDEX(session,
+ ret = __btree_preload(session));
+ WT_ERR(ret);
+
+ /*
+ * Get the last record number in a column-store
+ * file.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
}
}
@@ -514,7 +523,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
/* Bulk loads require a leaf page for reconciliation: create it now. */
if (F_ISSET(btree, WT_BTREE_BULK)) {
- WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ WT_ERR(__wt_btree_new_leaf_page(session, 1, &leaf));
ref->page = leaf;
ref->state = WT_REF_MEM;
WT_ERR(__wt_page_modify_init(session, leaf));
@@ -538,7 +547,8 @@ err: if (leaf != NULL)
* Create an empty leaf page.
*/
int
-__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+__wt_btree_new_leaf_page(
+ WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep)
{
WT_BTREE *btree;
@@ -547,15 +557,15 @@ __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
switch (btree->type) {
case BTREE_COL_FIX:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_COL_FIX, 1, 0, false, pagep));
+ session, WT_PAGE_COL_FIX, recno, 0, false, pagep));
break;
case BTREE_COL_VAR:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_COL_VAR, 1, 0, false, pagep));
+ session, WT_PAGE_COL_VAR, recno, 0, false, pagep));
break;
case BTREE_ROW:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_ROW_LEAF, 0, 0, false, pagep));
+ session, WT_PAGE_ROW_LEAF, WT_RECNO_OOB, 0, false, pagep));
break;
WT_ILLEGAL_VALUE(session);
}
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index d9ff9616072..2c0238545fb 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
for (tp = table, lineno = 1; (ret =
fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF;
++tp, ++lineno) {
- if (lineno > entries)
+ /*
+ * Entries is 0-based, that is, there are (entries +1) possible
+ * values that can be configured. The line number is 1-based, so
+ * adjust the test for too many entries, and report (entries +1)
+ * in the error as the maximum possible number of entries.
+ */
+ if (lineno > entries + 1)
WT_ERR_MSG(session, EINVAL,
"Huffman table file %.*s is corrupted, "
"more than %" PRIu32 " entries",
- (int)ip->len, ip->str, entries);
+ (int)ip->len, ip->str, entries + 1);
if (ret != 2)
WT_ERR_MSG(session, EINVAL,
"line %u of Huffman table file %.*s is corrupted: "
diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c
index 6481f514323..aaf906ca785 100644
--- a/src/btree/bt_io.c
+++ b/src/btree/bt_io.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index a60499ef8b7..7f188502a0a 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -129,3 +129,19 @@ __wt_addr_string(WT_SESSION_IMPL *session,
}
return (buf->data);
}
+
+/*
+ * __wt_buf_set_printable --
+ * Set the contents of the buffer to a printable representation of a
+ * byte string.
+ *
+ * The conversion is done by __wt_raw_to_esc_hex on the size bytes at p.
+ * If that call returns non-zero, the buffer is pointed at the constant
+ * string "[Error]" instead, so the function always has something to
+ * return. The return value is the buffer's data pointer, which lets the
+ * call be used directly as a "%s" argument.
+ */
+const char *
+__wt_buf_set_printable(
+ WT_SESSION_IMPL *session, const void *p, size_t size, WT_ITEM *buf)
+{
+ if (__wt_raw_to_esc_hex(session, p, size, buf)) {
+ buf->data = "[Error]";
+ buf->size = strlen("[Error]");
+ }
+ return (buf->data);
+}
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index 651cbc8d4ad..fbe361e000a 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 8808f0b1a85..9fa0145bbdd 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF **refp, *ref;
- uint32_t i;
+ uint32_t hint, i;
btree = S2BT(session);
dsk = page->dsk;
@@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp++;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
ref->addr = cell;
@@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF *ref, **refp;
- uint32_t i;
+ uint32_t hint, i;
bool overflow_keys;
btree = S2BT(session);
@@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
overflow_keys = false;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index c50f97bbe14..ac9faef4ff2 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -379,7 +379,9 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
if (addr == NULL) {
WT_ASSERT(session, previous_state == WT_REF_DELETED);
- WT_ERR(__wt_btree_new_leaf_page(session, &page));
+ WT_ERR(__wt_btree_new_leaf_page(session,
+ btree->type == BTREE_ROW ? WT_RECNO_OOB : ref->key.recno,
+ &page));
ref->page = page;
goto done;
}
diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c
new file mode 100644
index 00000000000..86360e83ddf
--- /dev/null
+++ b/src/btree/bt_rebalance.c
@@ -0,0 +1,486 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Shared rebalance information: the state handed to each of the rebalance
+ * helpers in this file. The tree walks append to the leaf-page list and
+ * to the free list, __rebalance_internal moves the leaf references into
+ * a newly created root page, and __rebalance_discard frees whatever the
+ * two lists still hold.
+ */
+typedef struct {
+ WT_REF **leaf; /* List of leaf pages */
+ size_t leaf_next; /* Next entry */
+ size_t leaf_allocated; /* Allocated bytes */
+
+ WT_ADDR *fl; /* List of objects to free */
+ size_t fl_next; /* Next entry */
+ size_t fl_allocated; /* Allocated bytes */
+
+ WT_PAGE *root; /* Created root page */
+
+ uint8_t type; /* Internal page type */
+
+#define WT_REBALANCE_PROGRESS_INTERVAL 100
+ uint64_t progress; /* Progress counter */
+
+ WT_ITEM *tmp1; /* Temporary buffers */
+ WT_ITEM *tmp2;
+} WT_REBALANCE_STUFF;
+
+/*
+ * __rebalance_discard --
+ * Free the allocated information.
+ *
+ * Walks the leaf list from the last used entry down to the first,
+ * releasing each reference with __wt_free_ref, then frees the array
+ * itself. The free list is handled the same way, releasing each entry's
+ * address buffer before freeing the array. Both "next" counters are 0
+ * when the function returns.
+ *
+ * NOTE(review): __rebalance_internal sets leaf-list slots to NULL after
+ * moving the references into the root page, so __wt_free_ref is
+ * presumably expected to accept a NULL reference here -- confirm.
+ */
+static void
+__rebalance_discard(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
+{
+ while (rs->leaf_next > 0) {
+ --rs->leaf_next;
+ __wt_free_ref(
+ session, rs->leaf[rs->leaf_next], rs->type, false);
+ }
+ __wt_free(session, rs->leaf);
+
+ while (rs->fl_next > 0) {
+ --rs->fl_next;
+ __wt_free(session, rs->fl[rs->fl_next].addr);
+ }
+ __wt_free(session, rs->fl);
+}
+
+/*
+ * __rebalance_leaf_append --
+ * Add a new entry to the list of leaf pages.
+ *
+ * Grows the leaf list by one slot and fills it with a newly allocated
+ * WT_REF in the WT_REF_DISK state, carrying a private copy of the block
+ * address (addr, addr_len) tagged with addr_type. When recno is
+ * WT_RECNO_OOB the reference is given a row-store key built from (key,
+ * key_len); otherwise the key arguments are only used for the verbose
+ * message and the reference's key is the record number.
+ *
+ * The new reference is stored in the list before its address and key
+ * are allocated, so if a later step fails the partially initialized
+ * reference is still reachable from the list.
+ *
+ * NOTE(review): addr_len and addr_type are narrowed to uint8_t when
+ * stored; presumably block addresses never exceed 255 bytes -- confirm.
+ */
+static int
+__rebalance_leaf_append(WT_SESSION_IMPL *session,
+ const uint8_t *key, size_t key_len, uint64_t recno,
+ const uint8_t *addr, size_t addr_len, u_int addr_type,
+ WT_REBALANCE_STUFF *rs)
+{
+ WT_ADDR *copy_addr;
+ WT_REF *copy;
+
+ WT_RET(__wt_verbose(session, WT_VERB_REBALANCE,
+ "rebalance leaf-list append %s, %s",
+ __wt_buf_set_printable(session, key, key_len, rs->tmp2),
+ __wt_addr_string(session, addr, addr_len, rs->tmp1)));
+
+ /* Allocate and initialize a new leaf page reference. */
+ WT_RET(__wt_realloc_def(
+ session, &rs->leaf_allocated, rs->leaf_next + 1, &rs->leaf));
+ WT_RET(__wt_calloc_one(session, &copy));
+ rs->leaf[rs->leaf_next++] = copy;
+
+ copy->page = NULL;
+ copy->home = NULL;
+ copy->pindex_hint = 0;
+ copy->state = WT_REF_DISK;
+
+ WT_RET(__wt_calloc_one(session, &copy_addr));
+ copy->addr = copy_addr;
+ WT_RET(__wt_strndup(session, addr, addr_len, &copy_addr->addr));
+ copy_addr->size = (uint8_t)addr_len;
+ copy_addr->type = (uint8_t)addr_type;
+
+ if (recno == WT_RECNO_OOB)
+ WT_RET(__wt_row_ikey(session, 0, key, key_len, copy));
+ else
+ copy->key.recno = recno;
+
+ copy->page_del = NULL;
+ return (0);
+}
+
+/*
+ * __rebalance_fl_append --
+ * Add a new entry to the free list.
+ *
+ * Grows the free list by one slot and stores a private copy of the
+ * block address (addr, addr_len) in it; the entry's type is set to 0.
+ * The slot is claimed before the address is copied, so if the copy
+ * fails the slot is still counted in the list.
+ *
+ * NOTE(review): addr_len is narrowed to uint8_t when stored; presumably
+ * block addresses never exceed 255 bytes -- confirm.
+ */
+static int
+__rebalance_fl_append(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_len, WT_REBALANCE_STUFF *rs)
+{
+ WT_ADDR *copy;
+
+ WT_RET(__wt_realloc_def(
+ session, &rs->fl_allocated, rs->fl_next + 1, &rs->fl));
+ copy = &rs->fl[rs->fl_next++];
+
+ WT_RET(__wt_strndup(session, addr, addr_len, &copy->addr));
+ copy->size = (uint8_t)addr_len;
+ copy->type = 0;
+
+ return (0);
+}
+
+/*
+ * __rebalance_internal --
+ * Build an in-memory page that references all of the leaf pages we've
+ * found.
+ *
+ * Fails with ENOTSUP if the leaf list holds more entries than fit in a
+ * 32-bit count. Otherwise allocates one internal page of type rs->type
+ * with a slot per leaf reference, makes the btree's root reference its
+ * parent, marks it modified, and moves every reference from the leaf
+ * list into the page's index (setting each reference's home page and
+ * clearing the leaf-list slot). On success the page is saved in
+ * rs->root; if marking the page modified fails, the page is discarded.
+ */
+static int
+__rebalance_internal(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF **refp;
+ uint32_t i, leaf_next;
+
+ btree = S2BT(session);
+
+ /*
+ * There's a limit to the number of pages we can rebalance: the number
+ * of elements on a page is a 4B quantity and it's technically possible
+ * there could be more pages than that in a tree.
+ */
+ if (rs->leaf_next > UINT32_MAX)
+ WT_RET_MSG(session, ENOTSUP,
+ "too many leaf pages to rebalance, %" WT_SIZET_FMT " pages "
+ "exceeds the maximum of %" PRIu32,
+ rs->leaf_next, UINT32_MAX);
+ leaf_next = (uint32_t)rs->leaf_next;
+
+ /*
+ * Allocate a root (internal) page of the tree's type and fill it in:
+ * a column-store internal page starts at record number 1, any other
+ * page type is given a starting record number of 0.
+ */
+ WT_RET(__wt_page_alloc(session, rs->type,
+ rs->type == WT_PAGE_COL_INT ? 1 : 0, leaf_next, false, &page));
+ page->pg_intl_parent_ref = &btree->root;
+ WT_ERR(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+
+ pindex = WT_INTL_INDEX_GET_SAFE(page);
+ for (refp = pindex->index, i = 0; i < leaf_next; ++i) {
+ rs->leaf[i]->home = page;
+ *refp++ = rs->leaf[i];
+ rs->leaf[i] = NULL;
+ }
+
+ rs->root = page;
+ return (0);
+
+err: __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __rebalance_free_original --
+ * Free the tracked internal pages and overflow keys.
+ *
+ * Walks the free list in order and, for each entry, emits a verbose
+ * message and hands the entry's block address to __wt_btree_block_free.
+ * The list itself is left untouched. Each call is wrapped in WT_RET, so
+ * the walk is expected to stop at the first failure.
+ */
+static int
+__rebalance_free_original(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
+{
+ WT_ADDR *addr;
+ uint64_t i;
+
+ for (i = 0; i < rs->fl_next; ++i) {
+ addr = &rs->fl[i];
+
+ WT_RET(__wt_verbose(session, WT_VERB_REBALANCE,
+ "rebalance discarding %s",
+ __wt_addr_string(
+ session, addr->addr, addr->size, rs->tmp1)));
+
+ WT_RET(__wt_btree_block_free(session, addr->addr, addr->size));
+ }
+ return (0);
+}
+
+/*
+ * __rebalance_col_walk --
+ * Walk a column-store page and its descendants.
+ *
+ * dsk is the disk image of an internal page. Each address cell on it is
+ * either an internal page, which is read into a scratch buffer, walked
+ * recursively and then added to the free list, or a leaf page, which is
+ * added to the leaf list keyed by the record number stored in the cell.
+ * The scratch buffer is released on every exit path.
+ */
+static int
+__rebalance_col_walk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint32_t i;
+
+ btree = S2BT(session);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Report progress periodically. */
+ if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0)
+ WT_ERR(__wt_progress(session, NULL, rs->progress));
+
+ /*
+ * Walk the page's cells: every cell is expected to be a location
+ * cookie (a WT_CELL_ADDR_XXX item) for an internal or leaf page, any
+ * other cell type is treated as an illegal value.
+ */
+ WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ switch (unpack.type) {
+ case WT_CELL_ADDR_INT:
+ /* An internal page: read it and recursively walk it. */
+ WT_ERR(__wt_bt_read(
+ session, buf, unpack.data, unpack.size));
+ WT_ERR(__rebalance_col_walk(session, buf->data, rs));
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append internal page: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+ break;
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ WT_ERR(__rebalance_leaf_append(session,
+ NULL, 0, unpack.v, unpack.data, unpack.size,
+ unpack.type == WT_CELL_ADDR_LEAF ?
+ WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(session, &buf);
+ return (ret);
+}
+
+/*
+ * __rebalance_row_leaf_key --
+ * Acquire a copy of the key for a leaf page.
+ *
+ * Reads the block at (addr, addr_len) into rs->tmp1, builds an in-memory
+ * page from it, copies the key of the page's first row into the key
+ * buffer and discards the in-memory page again. The return value is the
+ * result of the key copy, or the error from the read or page build.
+ *
+ * NOTE(review): the first row is referenced unconditionally, so the leaf
+ * page is presumably never empty -- confirm.
+ */
+static int
+__rebalance_row_leaf_key(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_len, WT_ITEM *key, WT_REBALANCE_STUFF *rs)
+{
+ WT_PAGE *page;
+ WT_DECL_RET;
+
+ /*
+ * We need the first key from a leaf page. Leaf pages are relatively
+ * complex (Huffman encoding, prefix compression, and so on), do the
+ * work to instantiate the page and copy the first key to the buffer.
+ */
+ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len));
+ WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page));
+ ret = __wt_row_leaf_key_copy(session, page, &page->pg_row_d[0], key);
+ __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __rebalance_row_walk --
+ * Walk a row-store page and its descendants.
+ *
+ * dsk is the disk image of an internal page. Key cells are remembered
+ * (overflow keys are also added to the free list) and apply to the
+ * address cell that follows. Internal-page addresses are added to the
+ * free list and then read and walked recursively; deleted leaf pages
+ * are ignored; leaf-page addresses are added to the leaf list together
+ * with a key chosen as described in the leaf-page case below. Both
+ * scratch buffers are released on every exit path.
+ */
+static int
+__rebalance_row_walk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK key, unpack;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(leafkey);
+ WT_DECL_RET;
+ size_t len;
+ uint32_t i;
+ bool first_cell;
+ const void *p;
+
+ btree = S2BT(session);
+ WT_CLEAR(key); /* [-Werror=maybe-uninitialized] */
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_scr_alloc(session, 0, &leafkey));
+
+ /* Report progress periodically. */
+ if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0)
+ WT_ERR(__wt_progress(session, NULL, rs->progress));
+
+ /*
+ * Walk the page, instantiating keys: the page contains sorted key and
+ * location cookie pairs. Keys are on-page/overflow items and location
+ * cookies are WT_CELL_ADDR_XXX items.
+ */
+ first_cell = true;
+ WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+ __wt_cell_unpack(cell, &unpack);
+ switch (unpack.type) {
+ case WT_CELL_KEY:
+ key = unpack;
+ break;
+ case WT_CELL_KEY_OVFL:
+ /*
+ * Any overflow key that references an internal page is
+ * of no further use, schedule its blocks to be freed.
+ *
+ * We could potentially use the same overflow key being
+ * freed here for the internal page we're creating, but
+ * that's more work to get reconciliation to understand
+ * and overflow keys are (well, should be), uncommon.
+ */
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append overflow key: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+
+ key = unpack;
+ break;
+ case WT_CELL_ADDR_DEL:
+ /*
+ * A deleted leaf page: we're rebalancing this tree,
+ * which means no transaction can be active in it,
+ * which means no deleted leaf page is interesting,
+ * ignore it.
+ */
+ first_cell = false;
+ break;
+ case WT_CELL_ADDR_INT:
+ /* An internal page, schedule its blocks to be freed. */
+ WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE,
+ "free-list append internal page: %s",
+ __wt_addr_string(
+ session, unpack.data, unpack.size, rs->tmp1)));
+ WT_ERR(__rebalance_fl_append(
+ session, unpack.data, unpack.size, rs));
+
+ /* Read and recursively walk the page. */
+ WT_ERR(__wt_bt_read(
+ session, buf, unpack.data, unpack.size));
+ WT_ERR(__rebalance_row_walk(session, buf->data, rs));
+ break;
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ /*
+ * A leaf page.
+ * We can't trust the 0th key on an internal page (we
+ * often don't store them in reconciliation because it
+ * saves space), get it from the underlying leaf page.
+ * Else, if the internal page key is an overflow key,
+ * instantiate it and use it.
+ * Else, we can use the internal page's key as is, it's
+ * sufficient for the page.
+ */
+ if (first_cell) {
+ WT_ERR(__rebalance_row_leaf_key(session,
+ unpack.data, unpack.size, leafkey, rs));
+ p = leafkey->data;
+ len = leafkey->size;
+ } else if (key.type == WT_CELL_KEY_OVFL) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_INT, &key, leafkey));
+ p = leafkey->data;
+ len = leafkey->size;
+ } else {
+ p = key.data;
+ len = key.size;
+ }
+ WT_ERR(__rebalance_leaf_append(session,
+ p, len, WT_RECNO_OOB, unpack.data, unpack.size,
+ unpack.type == WT_CELL_ADDR_LEAF ?
+ WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs));
+
+ first_cell = false;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(session, &buf);
+ __wt_scr_free(session, &leafkey);
+ return (ret);
+}
+
+/*
+ * __wt_bt_rebalance --
+ * Rebalance the last checkpoint in the file.
+ *
+ * Walks the disk image of the root page (the row-store or column-store walk
+ * is chosen from the root page's type), builds a new root page with
+ * __rebalance_internal, schedules the original blocks to be freed and then
+ * replaces the btree's root page with the new one. Returns 0 without doing
+ * any work if the root page has no disk image, and fails with EINVAL if the
+ * tree is marked modified. The cfg argument is not used.
+ */
+int
+__wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_REBALANCE_STUFF *rs, _rstuff;
+ bool evict_reset;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+
+ /*
+ * If the tree has never been written to disk, we're done, rebalance
+ * walks disk images, not in-memory pages. For the same reason, the
+ * tree has to be clean.
+ */
+ if (btree->root.page->dsk == NULL)
+ return (0);
+ if (btree->modified)
+ WT_RET_MSG(session, EINVAL,
+ "tree is modified, only clean trees may be rebalanced");
+
+ WT_CLEAR(_rstuff);
+ rs = &_rstuff;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp2));
+
+ /* Set the internal page tree type. */
+ rs->type = btree->root.page->type;
+
+ /*
+ * Get exclusive access to the file. (Not required, the only page in the
+ * cache is the root page, and that cannot be evicted; however, this way
+ * eviction ignores the tree entirely.)
+ *
+ * NOTE(review): evict_reset is passed to this call by address but is
+ * never read in this function, and no matching call releasing the
+ * exclusive access appears below -- confirm that is intended.
+ */
+ WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
+
+ /* Recursively walk the tree. */
+ switch (rs->type) {
+ case WT_PAGE_ROW_INT:
+ WT_ERR(
+ __rebalance_row_walk(session, btree->root.page->dsk, rs));
+ break;
+ case WT_PAGE_COL_INT:
+ WT_ERR(
+ __rebalance_col_walk(session, btree->root.page->dsk, rs));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Build a new root page. */
+ WT_ERR(__rebalance_internal(session, rs));
+
+ /*
+ * Schedule the free of the original blocks (they shouldn't actually be
+ * freed until the next checkpoint completes).
+ */
+ WT_ERR(__rebalance_free_original(session, rs));
+
+ /*
+ * Swap the old root page for our newly built root page, writing the new
+ * root page as part of a checkpoint will finish the rebalance.
+ *
+ * Clearing rs->root afterwards keeps the cleanup code below from
+ * discarding the page that was just installed.
+ */
+ __wt_page_out(session, &btree->root.page);
+ btree->root.page = rs->root;
+ rs->root = NULL;
+
+err: /* Discard any leftover root page we created. */
+ if (rs->root != NULL) {
+ __wt_page_modify_clear(session, rs->root);
+ __wt_page_out(session, &rs->root);
+ }
+
+ /* Discard any leftover leaf and internal page information. */
+ __rebalance_discard(session, rs);
+
+ __wt_scr_free(session, &rs->tmp1);
+ __wt_scr_free(session, &rs->tmp2);
+
+ return (ret);
+}
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index c7a4b8e22f4..ebc0499f6a2 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 756ffd98f3a..8d78bda79fb 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -595,22 +595,18 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session,
WT_ERR(__wt_row_leaf_key_copy(session, page,
&page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop));
- if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
- WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
- trk->row_start.data, trk->row_start.size));
- WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
- "%s start key %.*s",
- __wt_addr_string(session,
- trk->trk_addr, trk->trk_addr_size, ss->tmp2),
- (int)ss->tmp1->size, (char *)ss->tmp1->data));
- WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
- trk->row_stop.data, trk->row_stop.size));
- WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
- "%s stop key %.*s",
- __wt_addr_string(session,
- trk->trk_addr, trk->trk_addr_size, ss->tmp2),
- (int)ss->tmp1->size, (char *)ss->tmp1->data));
- }
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s start key %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_buf_set_printable(session,
+ trk->row_start.data, trk->row_start.size, ss->tmp2)));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s stop key %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_buf_set_printable(session,
+ trk->row_stop.data, trk->row_stop.size, ss->tmp2)));
/* Row-store pages can contain overflow items. */
WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
@@ -1807,7 +1803,7 @@ err: if (page != NULL)
*/
static int
__slvg_row_build_internal(
- WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
{
WT_ADDR *addr;
WT_DECL_RET;
@@ -1821,7 +1817,7 @@ __slvg_row_build_internal(
/* Allocate a row-store root (internal) page and fill it in. */
WT_RET(__wt_page_alloc(
- session, WT_PAGE_ROW_INT, 0, leaf_cnt, true, &page));
+ session, WT_PAGE_ROW_INT, WT_RECNO_OOB, leaf_cnt, true, &page));
WT_ERR(__slvg_modify_init(session, page));
pindex = WT_INTL_INDEX_GET_SAFE(page);
@@ -1937,16 +1933,12 @@ __slvg_row_build_leaf(
btree->collator, key, &trk->row_start, &cmp));
if (cmp >= 0)
break;
- if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
- WT_ERR(__wt_buf_set_printable(session,
- ss->tmp1, key->data, key->size));
- WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
- "%s merge discarding leading key %.*s",
- __wt_addr_string(session,
- trk->trk_addr, trk->trk_addr_size,
- ss->tmp2), (int)ss->tmp1->size,
- (char *)ss->tmp1->data));
- }
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding leading key %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_buf_set_printable(
+ session, key->data, key->size, ss->tmp2)));
++skip_start;
}
if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
@@ -1961,16 +1953,12 @@ __slvg_row_build_leaf(
btree->collator, key, &trk->row_stop, &cmp));
if (cmp < 0)
break;
- if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
- WT_ERR(__wt_buf_set_printable(session,
- ss->tmp1, key->data, key->size));
- WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
- "%s merge discarding trailing key %.*s",
- __wt_addr_string(session,
- trk->trk_addr, trk->trk_addr_size,
- ss->tmp2), (int)ss->tmp1->size,
- (char *)ss->tmp1->data));
- }
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding trailing key %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_buf_set_printable(
+ session, key->data, key->size, ss->tmp2)));
++skip_stop;
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 12f4197e9e7..102265c0a8f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -15,6 +15,22 @@
} while (0)
/*
+ * A note on error handling: main split functions first allocate/initialize new
+ * structures; failures during that period are handled by discarding the memory
+ * and returning an error code, the caller knows the split didn't happen and
+ * proceeds accordingly. Second, split functions update the tree, and a failure
+ * in that period is catastrophic, any partial update to the tree requires a
+ * panic, we can't recover. Third, once the split is complete and the tree has
+ * been fully updated, we have to ignore most errors, the split is complete and
+ * correct, callers have to proceed accordingly.
+ */
+typedef enum {
+ WT_ERR_IGNORE, /* Ignore minor errors */
+ WT_ERR_PANIC, /* Panic on all errors */
+ WT_ERR_RETURN /* Clean up and return error */
+} WT_SPLIT_ERROR_PHASE;
+
+/*
* __split_oldest_gen --
* Calculate the oldest active split generation.
*/
@@ -512,8 +528,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
WT_REF **child_refp, *ref, **root_refp;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
- uint64_t split_gen;
+ uint64_t recno, split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
@@ -539,7 +556,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -589,8 +606,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
+
+ recno = root->type == WT_PAGE_COL_INT ?
+ (*root_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, root->type, 0, slots, false, &child));
+ session, root->type, recno, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -605,12 +625,10 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*root_refp)->key.recno;
+ ref->key.recno = recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (root->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*root_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
@@ -623,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
- * the page that has an page index entry for the WT_REF is about
+ * the page that has a page index entry for the WT_REF is about
* to change.
*/
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
@@ -641,7 +659,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, false);
@@ -661,7 +679,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__split_ref_step2(session, alloc_index, false));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -687,15 +705,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_page_modify_set(session, root);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during root page split to deepen the tree");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during root page split "
@@ -721,19 +739,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t parent_decr, size;
uint64_t split_gen;
- uint32_t i, j;
+ uint32_t hint, i, j;
uint32_t deleted_entries, parent_entries, result_entries;
uint32_t *deleted_refs;
- bool complete, empty_parent;
+ bool empty_parent;
parent = ref->home;
alloc_index = pindex = NULL;
parent_decr = 0;
parent_entries = 0;
- complete = empty_parent = false;
+ empty_parent = false;
+ complete = WT_ERR_RETURN;
/* The parent page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, parent));
@@ -751,7 +771,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* array anyway. Switch them to the special split state, so that any
* reading thread will restart.
*/
- WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
@@ -791,28 +811,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
+ *
+ * Update the WT_REF's page-index hint as we go. This can race with a
+ * thread setting the hint based on an older page-index, and the change
+ * isn't backed out in the case of an error, so there are ways for the
+ * hint to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ for (alloc_refp = alloc_index->index,
+ hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
+ ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- else if (next_ref->state != WT_REF_SPLIT)
+ else if (next_ref->state != WT_REF_SPLIT) {
/* Skip refs we have marked for deletion. */
+ next_ref->pindex_hint = hint++;
*alloc_refp++ = next_ref;
+ }
}
/* Check that we filled in all the entries. */
WT_ASSERT(session,
alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -853,16 +885,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_FULL_BARRIER();
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -946,7 +970,8 @@ err: __wt_scr_free(session, &scr);
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete) {
+ switch (complete) {
+ case WT_ERR_RETURN:
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
@@ -954,20 +979,28 @@ err: __wt_scr_free(session, &scr);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
-
/*
* The split couldn't proceed because the parent would be empty,
* return EBUSY so our caller knows to unlock the WT_REF that's
* being deleted, but don't be noisy, there's nothing wrong.
*/
if (empty_parent)
- return (EBUSY);
+ ret = EBUSY;
+ break;
+ case WT_ERR_PANIC:
+ __wt_err(session, ret, "fatal error during parent page split");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page "
+ "split");
+ ret = 0;
+ }
+ break;
}
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ return (ret);
}
/*
@@ -983,8 +1016,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
WT_REF **alloc_refp;
WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
- uint64_t split_gen;
+ uint64_t recno, split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
@@ -1012,7 +1046,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1081,8 +1115,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
+
+ recno = page->type == WT_PAGE_COL_INT ?
+ (*page_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, page->type, 0, slots, false, &child));
+ session, page->type, recno, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -1097,12 +1134,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
parent_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*page_refp)->key.recno;
+ ref->key.recno = recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (page->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*page_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
@@ -1133,7 +1168,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, true);
@@ -1157,7 +1192,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__split_ref_step2(session, alloc_index, true));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/*
* Push out the changes: not required for correctness, but no reason
@@ -1193,16 +1228,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_page_modify_set(session, page);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during internal page split");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during internal page "
@@ -1654,10 +1689,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_DECL_RET;
WT_DECL_ITEM(key);
WT_INSERT *ins, **insp, *moved_ins, *prev_ins;
- WT_INSERT_HEAD *ins_head;
+ WT_INSERT_HEAD *ins_head, *tmp_ins_head;
WT_PAGE *page, *right;
WT_REF *child, *split_ref[2] = { NULL, NULL };
size_t page_decr, parent_incr, right_incr;
+ uint8_t type;
int i;
WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
@@ -1666,6 +1702,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
+ type = page->type;
/*
* Assert splitting makes sense; specifically assert the page is dirty,
@@ -1679,9 +1716,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
/* Find the last item on the page. */
- ins_head = page->pg_row_entries == 0 ?
- WT_ROW_INSERT_SMALLEST(page) :
- WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ if (type == WT_PAGE_ROW_LEAF)
+ ins_head = page->pg_row_entries == 0 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ else
+ ins_head = WT_COL_APPEND(page);
moved_ins = WT_SKIP_LAST(ins_head);
/*
@@ -1692,14 +1732,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* The new WT_REF is not quite identical: we have to instantiate a key,
* and the new reference is visible to readers once the split completes.
*
- * The key-instantiation code checks for races, leave the key fields
- * zeroed we don't trigger them.
- *
* Don't copy any deleted page state: we may be splitting a page that
* was instantiated after a truncate and that history should not be
* carried onto these new child pages.
*/
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
+ parent_incr += sizeof(WT_REF);
child = split_ref[0];
child->page = ref->page;
child->home = ref->home;
@@ -1713,49 +1751,82 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
ref->addr = NULL;
- /*
- * Copy the first key from the original page into first ref in the new
- * parent. Pages created in memory always have a "smallest" insert
- * list, so look there first. If we don't find one, get the first key
- * from the disk image.
- *
- * We can't just use the key from the original ref: it may have been
- * suffix-compressed, and after the split the truncated key may not be
- * valid.
- */
- WT_ERR(__wt_scr_alloc(session, 0, &key));
- if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
- key->data = WT_INSERT_KEY(ins);
- key->size = WT_INSERT_KEY_SIZE(ins);
+ if (type == WT_PAGE_ROW_LEAF) {
+ /*
+ * Copy the first key from the original page into first ref in
+ * the new parent. Pages created in memory always have a
+ * "smallest" insert list, so look there first. If we don't
+ * find one, get the first key from the disk image.
+ *
+ * We can't just use the key from the original ref: it may have
+ * been suffix-compressed, and after the split the truncated key
+ * may not be valid.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ if ((ins =
+ WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ } else
+ WT_ERR(__wt_row_leaf_key(
+ session, page, &page->pg_row_d[0], key, true));
+ WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
+ parent_incr += sizeof(WT_IKEY) + key->size;
+ __wt_scr_free(session, &key);
} else
- WT_ERR(__wt_row_leaf_key(
- session, page, &page->pg_row_d[0], key, true));
- WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
- parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size;
- __wt_scr_free(session, &key);
+ child->key.recno = ref->key.recno;
/*
* The second page in the split is a new WT_REF/page pair.
*/
- WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, false, &right));
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
+ if (type == WT_PAGE_ROW_LEAF)
+ WT_ERR(__wt_page_alloc(session,
+ type, WT_RECNO_OOB, 0, false, &right));
+ else
+ WT_ERR(__wt_page_alloc(session,
+ type, WT_INSERT_RECNO(moved_ins), 0, false, &right));
+
+ /*
+ * The new page is dirty by definition, column-store splits update the
+ * page-modify structure, so create it now.
+ */
+ WT_ERR(__wt_page_modify_init(session, right));
+ __wt_page_modify_set(session, right);
+
+ if (type == WT_PAGE_ROW_LEAF) {
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
+ } else {
+ WT_ERR(__wt_calloc_one(session, &right->modify->mod_append));
+ WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0]));
+ }
right_incr += sizeof(WT_INSERT_HEAD);
right_incr += sizeof(WT_INSERT_HEAD *);
WT_ERR(__wt_calloc_one(session, &split_ref[1]));
+ parent_incr += sizeof(WT_REF);
child = split_ref[1];
child->page = right;
child->state = WT_REF_MEM;
- WT_ERR(__wt_row_ikey(session, 0,
- WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
- child));
- parent_incr +=
- sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
- /* The new page is dirty by definition. */
- WT_ERR(__wt_page_modify_init(session, right));
- __wt_page_modify_set(session, right);
+ if (type == WT_PAGE_ROW_LEAF) {
+ WT_ERR(__wt_row_ikey(session, 0,
+ WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
+ child));
+ parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
+ } else
+ child->key.recno = WT_INSERT_RECNO(moved_ins);
+
+ /*
+ * Allocation operations completed, we're going to split.
+ *
+ * Record the split column-store page record, used in reconciliation.
+ */
+ if (type != WT_PAGE_ROW_LEAF) {
+ WT_ASSERT(session,
+ page->modify->mod_split_recno == WT_RECNO_OOB);
+ page->modify->mod_split_recno = child->key.recno;
+ }
/*
* We modified the page above, which will have set the first dirty
@@ -1779,15 +1850,16 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd));
/*
- * Allocation operations completed, move the last insert list item from
- * the original page to the new page.
+ * Move the last insert list item from the original page to the new
+ * page.
*
* First, update the item to the new child page. (Just append the entry
* for simplicity, the previous skip list pointers originally allocated
* can be ignored.)
*/
- right->pg_row_ins[0]->head[0] =
- right->pg_row_ins[0]->tail[0] = moved_ins;
+ tmp_ins_head = type == WT_PAGE_ROW_LEAF ?
+ right->pg_row_ins[0] : right->modify->mod_append[0];
+ tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins;
/*
* Remove the entry from the orig page (i.e truncate the skip list).
@@ -1872,34 +1944,40 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_cache_page_inmem_incr(session, right, right_incr);
/*
- * Split into the parent. After this, the original page is no
+ * Split into the parent. On successful return, the original page is no
* longer locked, so we cannot safely look at it.
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, false, true)) != 0) {
- /*
- * Move the insert list element back to the original page list.
- * For simplicity, the previous skip list pointers originally
- * allocated can be ignored, just append the entry to the end of
- * the level 0 list. As before, we depend on the list having
- * multiple elements and ignore the edge cases small lists have.
- */
- right->pg_row_ins[0]->head[0] =
- right->pg_row_ins[0]->tail[0] = NULL;
- ins_head->tail[0]->next[0] = moved_ins;
- ins_head->tail[0] = moved_ins;
+ session, ref, split_ref, 2, parent_incr, false, true)) == 0)
+ return (0);
- /*
- * We marked the new page dirty; we're going to discard it, but
- * first mark it clean and fix up the cache statistics.
- */
- __wt_page_modify_clear(session, right);
+ /*
+ * Failure: reset the split column-store page record. The local page
+ * pointer was cleared before the split attempt, so reach the original
+ * page through the WT_REF instead of dereferencing a NULL pointer.
+ */
+ ref->page->modify->mod_split_recno = WT_RECNO_OOB;
- WT_ERR(ret);
- }
+ /*
+ * Clear the allocated page's reference to the moved insert list element
+ * so it's not freed when we discard the page.
+ *
+ * Move the element back to the original page list. For simplicity, the
+ * previous skip list pointers originally allocated can be ignored, just
+ * append the entry to the end of the level 0 list. As before, we depend
+ * on the list having multiple elements and ignore the edge cases small
+ * lists have.
+ */
+ if (type == WT_PAGE_ROW_LEAF)
+ right->pg_row_ins[0]->head[0] =
+ right->pg_row_ins[0]->tail[0] = NULL;
+ else
+ right->modify->mod_append[0]->head[0] =
+ right->modify->mod_append[0]->tail[0] = NULL;
- return (0);
+ ins_head->tail[0]->next[0] = moved_ins;
+ ins_head->tail[0] = moved_ins;
err: if (split_ref[0] != NULL) {
/*
@@ -1907,15 +1985,23 @@ err: if (split_ref[0] != NULL) {
*/
ref->addr = split_ref[0]->addr;
- __wt_free(session, split_ref[0]->key.ikey);
+ if (type == WT_PAGE_ROW_LEAF)
+ __wt_free(session, split_ref[0]->key.ikey);
__wt_free(session, split_ref[0]);
}
if (split_ref[1] != NULL) {
- __wt_free(session, split_ref[1]->key.ikey);
+ if (type == WT_PAGE_ROW_LEAF)
+ __wt_free(session, split_ref[1]->key.ikey);
__wt_free(session, split_ref[1]);
}
- if (right != NULL)
+ if (right != NULL) {
+ /*
+ * We marked the new page dirty; we're going to discard it,
+ * but first mark it clean and fix up the cache statistics.
+ */
+ __wt_page_modify_clear(session, right);
__wt_page_out(session, &right);
+ }
__wt_scr_free(session, &key);
return (ret);
}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 5dd75835b0b..3d5abf34147 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
- WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 86607d8f187..5cbd8d1e996 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
index 6b403595ecc..a9ff16ad496 100644
--- a/src/btree/bt_upgrade.c
+++ b/src/btree/bt_upgrade.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index d745210bdce..ae2c20be1b6 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -30,8 +30,7 @@ typedef struct {
u_int depth, depth_internal[100], depth_leaf[100];
- WT_ITEM *tmp1; /* Temporary buffer */
- WT_ITEM *tmp2; /* Temporary buffer */
+ WT_ITEM *tmp1, *tmp2, *tmp3, *tmp4; /* Temporary buffers */
} WT_VSTUFF;
static void __verify_checkpoint_reset(WT_VSTUFF *);
@@ -170,6 +169,8 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));
/* Check configuration strings. */
WT_ERR(__verify_config(session, cfg, vs));
@@ -251,6 +252,8 @@ err: /* Inform the underlying block manager we're done. */
__wt_scr_free(session, &vs->max_addr);
__wt_scr_free(session, &vs->tmp1);
__wt_scr_free(session, &vs->tmp2);
+ __wt_scr_free(session, &vs->tmp3);
+ __wt_scr_free(session, &vs->tmp4);
return (ret);
}
@@ -570,10 +573,14 @@ __verify_row_int_key_order(WT_SESSION_IMPL *session,
WT_RET_MSG(session, WT_ERROR,
"the internal key in entry %" PRIu32 " on the page at %s "
"sorts before the last key appearing on page %s, earlier "
- "in the tree",
+ "in the tree: %s, %s",
entry,
__wt_page_addr_string(session, ref, vs->tmp1),
- (char *)vs->max_addr->data);
+ (char *)vs->max_addr->data,
+ __wt_buf_set_printable(session,
+ item.data, item.size, vs->tmp2),
+ __wt_buf_set_printable(session,
+ vs->max_key->data, vs->max_key->size, vs->tmp3));
/* Update the largest key we've seen to the key just checked. */
WT_RET(__wt_buf_set(session, vs->max_key, item.data, item.size));
@@ -628,11 +635,15 @@ __verify_row_leaf_key_order(
btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp));
if (cmp < 0)
WT_RET_MSG(session, WT_ERROR,
- "the first key on the page at %s sorts equal to or "
- "less than a key appearing on the page at %s, "
- "earlier in the tree",
- __wt_page_addr_string(session, ref, vs->tmp1),
- (char *)vs->max_addr->data);
+ "the first key on the page at %s sorts equal to "
+ "or less than the last key appearing on the page "
+ "at %s, earlier in the tree: %s, %s",
+ __wt_page_addr_string(session, ref, vs->tmp2),
+ (char *)vs->max_addr->data,
+ __wt_buf_set_printable(session,
+ vs->tmp1->data, vs->tmp1->size, vs->tmp3),
+ __wt_buf_set_printable(session,
+ vs->max_key->data, vs->max_key->size, vs->tmp4));
}
/* Update the largest key we've seen to the last key on this page. */
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index a703fbd540d..5480a25b5ec 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -197,6 +197,8 @@ __verify_dsk_row(
WT_DECL_ITEM(current);
WT_DECL_ITEM(last_ovfl);
WT_DECL_ITEM(last_pfx);
+ WT_DECL_ITEM(tmp1);
+ WT_DECL_ITEM(tmp2);
WT_DECL_RET;
WT_ITEM *last;
enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
@@ -213,6 +215,8 @@ __verify_dsk_row(
WT_ERR(__wt_scr_alloc(session, 0, &current));
WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp2));
last = last_ovfl;
end = (uint8_t *)dsk + dsk->mem_size;
@@ -402,8 +406,12 @@ key_compare: /*
if (cmp >= 0)
WT_ERR_VRFY(session,
"the %" PRIu32 " and %" PRIu32 " keys on "
- "page at %s are incorrectly sorted",
- cell_num - 2, cell_num, tag);
+ "page at %s are incorrectly sorted: %s, %s",
+ cell_num - 2, cell_num, tag,
+ __wt_buf_set_printable(session,
+ last->data, last->size, tmp1),
+ __wt_buf_set_printable(session,
+ current->data, current->size, tmp2));
}
/*
@@ -464,6 +472,8 @@ err: if (ret == 0)
__wt_scr_free(session, &current);
__wt_scr_free(session, &last_pfx);
__wt_scr_free(session, &last_ovfl);
+ __wt_scr_free(session, &tmp1);
+ __wt_scr_free(session, &tmp2);
return (ret);
}
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index abb18529041..49a59b89552 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,11 +9,11 @@
#include "wt_internal.h"
/*
- * __page_refp --
+ * __ref_index_slot --
* Return the page's index and slot for a reference.
*/
static inline void
-__page_refp(WT_SESSION_IMPL *session,
+__ref_index_slot(WT_SESSION_IMPL *session,
WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_PAGE_INDEX *pindex;
@@ -32,37 +32,36 @@ retry: WT_INTL_INDEX_GET(session, ref->home, pindex);
* loop is from the hint to the end of the list, and the second loop
* is from the start of the list to the end of the list. (The second
* loop overlaps the first, but that only happen in cases where we've
- * deepened the tree and aren't going to find our slot at all, that's
- * not worth optimizing.)
+ * split the tree and aren't going to find our slot at all, that's not
+ * worth optimizing.)
*
* It's not an error for the reference hint to be wrong, it just means
* the first retrieval (which sets the hint for subsequent retrievals),
* is slower.
*/
i = ref->pindex_hint;
- if (i < pindex->entries && pindex->index[i]->page == ref->page) {
+ if (i < pindex->entries && pindex->index[i] == ref) {
*pindexp = pindex;
*slotp = i;
return;
}
while (++i < pindex->entries)
- if (pindex->index[i]->page == ref->page) {
+ if (pindex->index[i] == ref) {
*pindexp = pindex;
*slotp = ref->pindex_hint = i;
return;
}
for (i = 0; i < pindex->entries; ++i)
- if (pindex->index[i]->page == ref->page) {
+ if (pindex->index[i] == ref) {
*pindexp = pindex;
*slotp = ref->pindex_hint = i;
return;
}
/*
- * If we don't find our reference, the page split into a new level and
- * our home pointer references the wrong page. After internal pages
- * deepen, their reference structure home value are updated; yield and
- * wait for that to happen.
+ * If we don't find our reference, the page split and our home pointer
+ * references the wrong page. When internal pages split, their WT_REF
+ * structure home values are updated; yield and wait for that to happen.
*/
__wt_yield();
goto retry;
@@ -116,13 +115,45 @@ __page_ascend(WT_SESSION_IMPL *session,
parent_ref = ref->home->pg_intl_parent_ref;
if (__wt_ref_is_root(parent_ref))
break;
- __page_refp(session, parent_ref, pindexp, slotp);
+ __ref_index_slot(session, parent_ref, pindexp, slotp);
/*
- * When internal pages split, the WT_REF structures being moved
- * are updated first. If the WT_REF we started with references
- * the same page as we found on our search of the parent, there
- * is a consistent view.
+ * There's a split race when a cursor moving forwards through
+ * the tree ascends the tree. If we're splitting an internal
+ * page into its parent, we move the WT_REF structures and
+ * then update the parent's page index before updating the split
+ * page's page index, and it's not an atomic update. A thread
+ * can read the split page's original page index and then read
+ * the parent page's replacement index.
+ *
+ * This can create a race for next-cursor movements.
+ *
+ * For example, imagine an internal page with 3 child pages,
+ * with the namespaces a-f, g-h and i-j; the first child page
+ * splits. The parent starts out with the following page-index:
+ *
+ * | ... | a | g | i | ... |
+ *
+ * which changes to this:
+ *
+ * | ... | a | c | e | g | i | ... |
+ *
+ * The split page starts out with the following page-index:
+ *
+ * | a | b | c | d | e | f |
+ *
+ * Imagine a cursor finishing the 'f' part of the namespace that
+ * starts its ascent to the parent's 'a' slot. Then the page
+ * splits and the parent page's page index is replaced. If the
+ * cursor then searches the parent's replacement page index for
+ * the 'a' slot, it finds it and then increments to the slot
+ * after the 'a' slot, the 'c' slot, and then it incorrectly
+ * repeats its traversal of part of the namespace.
+ *
+ * This function takes a WT_REF argument which is the page from
+ * which we start our ascent. If the parent's slot we find in
+ * our search doesn't point to the same page as that initial
+ * WT_REF, there's a race and we start over again.
*/
if (ref->home == parent_ref->page)
break;
@@ -132,6 +163,91 @@ __page_ascend(WT_SESSION_IMPL *session,
}
/*
+ * __page_descend --
+ * Descend the tree one level: return the internal page's current page
+ * index in pindexp and the slot at which to start in slotp, the last
+ * slot for a previous-cursor movement (prev is true), otherwise slot 0.
+ * Yields and re-reads the page index until the WT_REF in the chosen
+ * slot has this page as its home.
+ */
+static void
+__page_descend(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * Page is the internal page into which we're descending (the caller
+ * passes the page of the WT_REF it is descending into), and on which
+ * we have a hazard pointer.
+ */
+ for (;; __wt_yield()) {
+ WT_INTL_INDEX_GET(session, page, pindex);
+ *slotp = prev ? pindex->entries - 1 : 0;
+
+ /*
+ * There's a split race when a cursor moving backwards through
+ * the tree descends the tree. If we're splitting an internal
+ * page into its parent, we move the WT_REF structures and
+ * update the parent's page index before updating the split
+ * page's page index, and it's not an atomic update. A thread
+ * can read the parent page's replacement page index and then
+ * read the split page's original index.
+ *
+ * This can create a race for previous-cursor movements.
+ *
+ * For example, imagine an internal page with 3 child pages,
+ * with the namespaces a-f, g-h and i-j; the first child page
+ * splits. The parent starts out with the following page-index:
+ *
+ * | ... | a | g | i | ... |
+ *
+ * The split page starts out with the following page-index:
+ *
+ * | a | b | c | d | e | f |
+ *
+ * The first step is to move the c-f ranges into a new subtree,
+ * so, for example we might have two new internal pages 'c' and
+ * 'e', where the new 'c' page references the c-d namespace and
+ * the new 'e' page references the e-f namespace. The top of the
+ * subtree references the parent page, but until the parent's
+ * page index is updated, any threads in the subtree won't be
+ * able to ascend out of the subtree. However, once the parent
+ * page's page index is updated to this:
+ *
+ * | ... | a | c | e | g | i | ... |
+ *
+ * threads in the subtree can ascend into the parent. Imagine a
+ * cursor in the c-d part of the namespace that ascends to the
+ * parent's 'c' slot. It would then decrement to the slot before
+ * the 'c' slot, the 'a' slot.
+ *
+ * The previous-cursor movement selects the last slot in the 'a'
+ * page; if the split page's page-index hasn't been updated yet,
+ * it will select the 'f' slot, which is incorrect. Once the
+ * split page's page index is updated to this:
+ *
+ * | a | b |
+ *
+ * the previous-cursor movement will select the 'b' slot, which
+ * is correct.
+ *
+ * This function takes an argument which is the internal page
+ * from which we're descending. If the last slot on the page no
+ * longer points to the current page as its "home", the page is
+ * being split and part of its namespace moved. We have the
+ * correct page and we don't have to move, all we have to do is
+ * wait until the split page's page index is updated, which is
+ * why the loop yields and then re-reads the page index.
+ *
+ * No test is necessary for a next-cursor movement because we
+ * do right-hand splits on internal pages and the initial part
+ * of the page's namespace won't change as part of a split.
+ * Instead of testing the direction boolean, do the test the
+ * previous cursor movement requires in all cases, even though
+ * it will always succeed for a next-cursor movement.
+ */
+ if (pindex->index[*slotp]->home == page)
+ break;
+ }
+ *pindexp = pindex;
+}
+
+/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
*/
@@ -225,7 +341,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
}
/* Figure out the current slot in the WT_REF array. */
- __page_refp(session, ref, &pindex, &slot);
+ __ref_index_slot(session, ref, &pindex, &slot);
for (;;) {
/*
@@ -270,12 +386,8 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
* the parent can't have been evicted.
*/
if (!LF_ISSET(WT_READ_SKIP_INTL)) {
- if ((ret = __wt_page_swap(
- session, couple, ref, flags)) != 0) {
- WT_TRET(__wt_page_release(
- session, couple, flags));
- WT_ERR(ret);
- }
+ WT_ERR(__wt_page_swap(
+ session, couple, ref, flags));
*refp = ref;
goto done;
}
@@ -389,7 +501,8 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
}
}
- ret = __wt_page_swap(session, couple, ref, flags);
+ ret = __wt_page_swap(session, couple, ref,
+ WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
/*
* Not-found is an expected return when only walking
@@ -434,7 +547,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
couple == couple_orig ||
WT_PAGE_IS_INTERNAL(couple->page));
ref = couple;
- __page_refp(session, ref, &pindex, &slot);
+ __ref_index_slot(session, ref, &pindex, &slot);
if (couple == couple_orig)
break;
}
@@ -446,9 +559,10 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
*/
if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend: couple = ref;
- WT_INTL_INDEX_GET(session, ref->page, pindex);
- slot = prev ? pindex->entries - 1 : 0;
empty_internal = true;
+
+ __page_descend(
+ session, ref->page, &pindex, &slot, prev);
} else {
/*
* Optionally skip leaf pages, the second half.
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index bb2de3f444b..645d98d9c9b 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index e9fa570f97b..cb5a227495f 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,12 +9,60 @@
#include "wt_internal.h"
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range. Sets the
+ * cursor's compare field to 1 if the record number is smaller than
+ * the leaf's starting record number, to -1 if it is greater than or
+ * equal to the starting record number of the next entry in the
+ * parent's page index, and to 0 otherwise (including when the upper
+ * bound can't be checked). Always returns 0.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work. A compare value of 0
+ * is the "no answer" default; it is only changed below when a check
+ * shows the record number is outside this page's range.
+ */
+ cbt->compare = 0;
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ */
+ if (recno < leaf->key.recno) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ *
+ * !!!
+ * Check that "indx + 1" is a valid page-index entry first, because it
+ * also checks that "indx" is a valid page-index entry, and we have to
+ * do that latter check before looking at the indx slot of the array
+ * for a match to leaf (in other words, our page hint might be wrong).
+ * If the hint doesn't identify the leaf, or the leaf is the last entry
+ * in the page index, the upper bound isn't checked and compare stays 0.
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
+ if (recno >= pindex->index[indx + 1]->key.recno) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+
+ return (0);
+}
+
+/*
* __wt_col_search --
* Search a column-store tree for a specific record-based key.
*/
int
__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+ uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
+ uint64_t recno;
uint32_t base, indx, limit;
int depth;
@@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session,
__cursor_pos_clear(cbt);
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * When appending a new record, the search record number will be an
+ * out-of-band value, search for the largest key in the table instead.
+ */
+ if ((recno = search_recno) == WT_RECNO_OOB)
+ recno = UINT64_MAX;
+
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ WT_ASSERT(session, search_recno != WT_RECNO_OOB);
+
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, recno, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -103,7 +182,8 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ if ((ret = __wt_page_swap(
+ session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
@@ -120,7 +200,17 @@ leaf_only:
page = current->page;
cbt->ref = current;
cbt->recno = recno;
- cbt->compare = 0;
+
+ /*
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the record number and position when we
+ * do the work.
+ */
+ if (search_recno == WT_RECNO_OOB) {
+ cbt->compare = -1;
+ return (0);
+ }
/*
* Set the on-page slot to an impossible value larger than any possible
@@ -142,6 +232,7 @@ leaf_only:
* that's impossibly large for the page. We do have additional setup to
* do in that case, the record may be appended to the page.
*/
+ cbt->compare = 0;
if (page->type == WT_PAGE_COL_FIX) {
if (recno < page->pg_fix_recno) {
cbt->compare = 1;
@@ -190,18 +281,10 @@ past_end:
* This is a rarely used path: we normally find exact matches, because
* column-store files are dense, but in this case the caller searched
* past the end of the table.
- *
- * Don't bother searching if the caller is appending a new record where
- * we'll allocate the record number; we're not going to find a match by
- * definition, and we figure out the position when we do the work.
*/
cbt->ins_head = WT_COL_APPEND(page);
- if (recno == UINT64_MAX)
- cbt->ins = NULL;
- else
- cbt->ins = __col_insert_search(
- cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
- if (cbt->ins == NULL)
+ if ((cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
cbt->compare = -1;
else {
cbt->recno = WT_INSERT_RECNO(cbt->ins);
@@ -212,14 +295,5 @@ past_end:
else
cbt->compare = -1;
}
-
- /*
- * Note if the record is past the maximum record in the tree, the cursor
- * search functions need to know for fixed-length column-stores because
- * appended records implicitly create any skipped records, and cursor
- * search functions have to handle that case.
- */
- if (cbt->compare == -1)
- F_SET(cbt, WT_CBT_MAX_RECORD);
return (0);
}
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 6d24708e59c..8b9e858ec18 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 0fc02948dd3..176016bb340 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 079f9d3bad1..c06274cdb17 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -132,6 +132,76 @@ __wt_search_insert(
}
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range. Sets the
+ * cursor's compare field to 1 if the search key sorts before the
+ * leaf's key in its parent, to -1 if it sorts at or after the key of
+ * the next entry in the parent's page index, and to 0 otherwise
+ * (including when the leaf's page-index hint doesn't identify the
+ * leaf's slot). The cursor's tmp item is used as scratch to reference
+ * the parent's keys. Returns 0 unless a key comparison fails, in
+ * which case WT_RET presumably propagates that error to the caller.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_ITEM *item;
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+ int cmp;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ item = cbt->tmp;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work. A compare value of 0
+ * is the "no answer" default; it is only changed below when a check
+ * shows the search key is outside this page's range.
+ */
+ cbt->compare = 0;
+
+ /*
+ * First, confirm we have the right parent page-index slot, and quit if
+ * we don't. We don't search for the correct slot, that would make this
+ * cheap test expensive.
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx >= pindex->entries || pindex->index[indx] != leaf)
+ return (0);
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ *
+ * We can't compare against slot 0 on a row-store internal page because
+ * reconciliation doesn't build it, it may not be a valid key. In that
+ * case the lower bound is simply not checked.
+ */
+ if (indx != 0) {
+ __wt_ref_key(leaf->home, leaf, &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp < 0) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page. If the leaf is the last entry in the page
+ * index there is no next key, and the upper bound is not checked.
+ */
+ ++indx;
+ if (indx < pindex->entries) {
+ __wt_ref_key(
+ leaf->home, pindex->index[indx], &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp >= 0) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
* __wt_row_search --
* Search a row-store tree for a specific key.
*/
@@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session,
append_check = insert && cbt->append_tree;
descend_right = true;
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, srch_key, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -196,15 +287,6 @@ restart_page: page = current->page;
WT_INTL_INDEX_GET(session, page, pindex);
- /*
- * Fast-path internal pages with one child, a common case for
- * the root page in new trees.
- */
- if (pindex->entries == 1) {
- descent = pindex->index[0];
- goto descend;
- }
-
/* Fast-path appends. */
if (append_check) {
descent = pindex->index[pindex->entries - 1];
@@ -345,7 +427,8 @@ descend: /*
* page; otherwise return on error, the swap call ensures we're
* holding nothing on failure.
*/
- if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ if ((ret = __wt_page_swap(
+ session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
@@ -542,12 +625,18 @@ err: /*
int
__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
- WT_INSERT *p, *t;
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
- uint32_t cnt;
+ uint32_t choice, entries, i;
+ int level;
page = cbt->ref->page;
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ /* If the page has disk-based entries, select from them. */
if (page->pg_row_entries != 0) {
cbt->compare = 0;
cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
@@ -562,24 +651,115 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* If the tree is new (and not empty), it might have a large insert
- * list. Count how many records are in the list.
+ * list.
*/
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
return (WT_NOTFOUND);
- for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
- if ((p = WT_SKIP_NEXT(p)) == NULL)
- break;
/*
- * Select a random number from 0 to (N - 1), return that record.
+ * Walk down the list until we find a level with at least 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
*/
- cnt = __wt_random(&session->rnd) % cnt;
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
- if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
+ for (ins_head = cbt->ins_head,
+ level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ */
+ while (level > 0) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
cbt->compare = 0;
- cbt->ins = t;
return (0);
}
@@ -617,7 +797,8 @@ restart_root:
* Swap the parent page for the child page; return on error,
* the swap function ensures we're holding nothing on failure.
*/
- if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ if ((ret = __wt_page_swap(
+ session, current, descent, WT_READ_RESTART_OK)) == 0) {
current = descent;
continue;
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index d3a0265c13a..1ef8dd32bb4 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS **cstats;
WT_DSRC_STATS **dstats;
+ int64_t v;
conn = S2C(session);
@@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
dstats = ((WT_CURSOR_BTREE *)
conn->las_session->las_cursor)->btree->dhandle->stats;
- WT_STAT_SET(session, cstats,
- cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
- WT_STAT_SET(session, cstats,
- cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+ v = WT_STAT_READ(dstats, cursor_insert);
+ WT_STAT_SET(session, cstats, cache_lookaside_insert, v);
+ v = WT_STAT_READ(dstats, cursor_remove);
+ WT_STAT_SET(session, cstats, cache_lookaside_remove, v);
}
/*
@@ -139,18 +140,27 @@ __wt_las_is_written(WT_SESSION_IMPL *session)
}
/*
- * __wt_las_cursor_create --
+ * __wt_las_cursor_open --
* Open a new lookaside table cursor.
*/
int
-__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+__wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
{
WT_BTREE *btree;
+ WT_DECL_RET;
const char *open_cursor_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
- WT_RET(__wt_open_cursor(
+ WT_WITHOUT_DHANDLE(session, ret = __wt_open_cursor(
session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
+ WT_RET(ret);
+
+ /*
+ * Retrieve the btree from the cursor, rather than the session because
+ * we don't always switch the LAS handle in to the session before
+ * entering this function.
+ */
+ btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree;
/*
* Set special flags for the lookaside table: the lookaside flag (used,
@@ -161,7 +171,6 @@ __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
* opens (the first update is safe because it's single-threaded from
* wiredtiger_open).
*/
- btree = S2BT(session);
if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
F_SET(btree, WT_BTREE_LOOKASIDE);
if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
diff --git a/src/config/config.c b/src/config/config.c
index 505b843aa86..f480ab83dbd 100644
--- a/src/config/config.c
+++ b/src/config/config.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -365,6 +365,9 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
conf, "Unexpected character", EINVAL));
case A_DOWN:
+ if (conf->top == -1)
+ return (__config_err(
+ conf, "Unbalanced brackets", EINVAL));
--conf->depth;
CAP(0);
break;
@@ -471,8 +474,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
if (conf->depth == 0)
return (WT_NOTFOUND);
- return (__config_err(conf,
- "Closing brackets missing from config string", EINVAL));
+ return (__config_err(conf, "Unbalanced brackets", EINVAL));
}
/*
diff --git a/src/config/config_api.c b/src/config/config_api.c
index 2aba80ebcdd..b5228c4329c 100644
--- a/src/config/config_api.c
+++ b/src/config/config_api.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/config/config_check.c b/src/config/config_check.c
index 6b9d6c563ad..c29013483f6 100644
--- a/src/config/config_check.c
+++ b/src/config/config_check.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c
index c997ac3a324..27bd6255a0a 100644
--- a/src/config/config_collapse.c
+++ b/src/config/config_collapse.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 9d12e953498..879de670695 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -151,9 +151,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -291,6 +291,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = {
{ "force", "boolean", NULL, NULL, NULL, 0 },
+ { "lock_wait", "boolean", NULL, NULL, NULL, 0 },
{ "remove_files", "boolean", NULL, NULL, NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
@@ -563,9 +564,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -643,9 +644,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -718,9 +719,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "version", "string", NULL, NULL, NULL, 0 },
{ "write_through", "list",
@@ -793,9 +794,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\","
"\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\","
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\","
- "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\","
- "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\","
- "\"write\"]",
+ "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\","
+ "\"shared_cache\",\"split\",\"temporary\",\"transaction\","
+ "\"verify\",\"version\",\"write\"]",
NULL, 0 },
{ "write_through", "list",
NULL, "choices=[\"data\",\"log\"]",
@@ -904,8 +905,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_SESSION_create, 40
},
{ "WT_SESSION.drop",
- "force=0,remove_files=",
- confchk_WT_SESSION_drop, 2
+ "force=0,lock_wait=,remove_files=",
+ confchk_WT_SESSION_drop, 3
},
{ "WT_SESSION.join",
"bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=,"
@@ -926,6 +927,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"skip_sort_check=0,statistics=,target=",
confchk_WT_SESSION_open_cursor, 12
},
+ { "WT_SESSION.rebalance",
+ "",
+ NULL, 0
+ },
{ "WT_SESSION.reconfigure",
"isolation=read-committed",
confchk_WT_SESSION_reconfigure, 1
diff --git a/src/config/config_ext.c b/src/config/config_ext.c
index 5102f354b02..56c0018f8c3 100644
--- a/src/config/config_ext.c
+++ b/src/config/config_ext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/config/config_upgrade.c b/src/config/config_upgrade.c
index 0bca1392b51..e9ba38c6693 100644
--- a/src/config/config_upgrade.c
+++ b/src/config/config_upgrade.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/api_version.c b/src/conn/api_version.c
index 6293d221417..a36cdb8d8eb 100644
--- a/src/conn/api_version.c
+++ b/src/conn/api_version.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index bd14e1bf4fd..2f62950a36e 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -1605,6 +1605,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
{ "mutex", WT_VERB_MUTEX },
{ "overflow", WT_VERB_OVERFLOW },
{ "read", WT_VERB_READ },
+ { "rebalance", WT_VERB_REBALANCE },
{ "reconcile", WT_VERB_RECONCILE },
{ "recovery", WT_VERB_RECOVERY },
{ "salvage", WT_VERB_SALVAGE },
@@ -1749,7 +1750,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR_NOTFOUND_OK(ret);
/* Flush the handle and rename the file into place. */
- ret = __wt_sync_and_rename_fp(
+ ret = __wt_sync_fp_and_rename(
session, &fp, WT_BASECONFIG_SET, WT_BASECONFIG);
if (0) {
@@ -2003,6 +2004,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_sweep_config(session, cfg));
WT_ERR(__wt_verbose_config(session, cfg));
+ /* Initialize the OS page size for mmap */
+ conn->page_size = __wt_get_vm_pagesize();
+
/* Now that we know if verbose is configured, output the version. */
WT_ERR(__wt_verbose(
session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
@@ -2061,7 +2065,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
* DATABASE HOME, IT'S WHAT WE USE TO DECIDE IF WE'RE CREATING OR NOT.
*/
WT_ERR(__wt_turtle_init(session));
- WT_ERR(__wt_metadata_open(session));
+ WT_ERR(__wt_metadata_cursor(session, NULL));
/* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index a1d509e75bd..1831aad5895 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 8d16f94c092..72f23b015b7 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c
index b47e2550b23..a23350a5e46 100644
--- a/src/conn/conn_ckpt.c
+++ b/src/conn/conn_ckpt.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index c6d5b535b86..dedafc2b102 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -119,46 +119,29 @@ __wt_conn_dhandle_find(
}
/*
- * __conn_dhandle_mark_dead --
- * Mark a data handle dead.
- */
-static int
-__conn_dhandle_mark_dead(WT_SESSION_IMPL *session)
-{
- bool evict_reset;
-
- /*
- * Handle forced discard (e.g., when dropping a file).
- *
- * We need exclusive access to the file -- disable ordinary
- * eviction and drain any blocks already queued.
- */
- WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
- F_SET(session->dhandle, WT_DHANDLE_DEAD);
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
- return (0);
-}
-
-/*
* __wt_conn_btree_sync_and_close --
* Sync and close the underlying btree handle.
*/
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
{
+ WT_BM *bm;
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- bool marked_dead, no_schema_lock;
+ bool evict_reset, marked_dead, no_schema_lock;
btree = S2BT(session);
+ bm = btree->bm;
dhandle = session->dhandle;
marked_dead = false;
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
return (0);
+ /* Ensure that we aren't racing with the eviction server */
+ WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
+
/*
* If we don't already have the schema lock, make it an error to try
* to acquire it. The problem is that we are holding an exclusive
@@ -191,8 +174,16 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
*/
if (!F_ISSET(btree,
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
- if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
- WT_ERR(__conn_dhandle_mark_dead(session));
+ if (force && (bm == NULL || !bm->is_mapped(bm, session))) {
+ F_SET(session->dhandle, WT_DHANDLE_DEAD);
+
+ /*
+ * Reset the tree's eviction priority, and the tree is
+ * evictable by definition.
+ */
+ __wt_evict_priority_clear(session);
+ F_CLR(S2BT(session), WT_BTREE_NO_EVICTION);
+
marked_dead = true;
}
if (!marked_dead || final)
@@ -215,6 +206,9 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
err: __wt_spin_unlock(session, &dhandle->close_lock);
+ if (evict_reset)
+ __wt_evict_file_exclusive_off(session);
+
if (no_schema_lock)
F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);
@@ -650,8 +644,9 @@ __wt_conn_dhandle_discard_single(
F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS);
/* Try to remove the handle, protected by the data handle lock. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- WT_TRET(__conn_dhandle_remove(session, final)));
+ WT_WITH_HANDLE_LIST_LOCK(session, tret,
+ tret = __conn_dhandle_remove(session, final));
+ WT_TRET(tret);
/*
* After successfully removing the handle, clean it up.
@@ -709,6 +704,15 @@ restart:
__wt_session_close_cache(session);
F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+ /*
+ * The connection may have an open metadata cursor handle. We cannot
+ * close it before now because it's potentially used when discarding
+ * other open data handles. Close it before discarding the underlying
+ * metadata handle.
+ */
+ if (session->meta_cursor != NULL)
+ WT_TRET(session->meta_cursor->close(session->meta_cursor));
+
/* Close the metadata file handle. */
while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL)
WT_WITH_DHANDLE(session, dhandle,
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index cc4e3ae2681..12b4e87e921 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 1d44d816467..ed226393fb0 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -511,7 +511,7 @@ typedef struct {
* write_lsn in LSN order after the buffer is written to the log file.
*/
int
-__wt_log_wrlsn(WT_SESSION_IMPL *session)
+__wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -550,6 +550,8 @@ restart:
* based on the release LSN, and then look for them in order.
*/
if (written_i > 0) {
+ if (yield != NULL)
+ *yield = 0;
WT_INSERTION_SORT(written, written_i,
WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
/*
@@ -660,22 +662,31 @@ __log_wrlsn_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int yield;
session = arg;
conn = S2C(session);
+ yield = 0;
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
* Write out any log record buffers.
*/
- WT_ERR(__wt_log_wrlsn(session));
- WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000));
+ WT_ERR(__wt_log_wrlsn(session, &yield));
+ /*
+ * If __wt_log_wrlsn did work we want to yield instead of sleep.
+ */
+ if (yield++ < WT_THOUSAND)
+ __wt_yield();
+ else
+ WT_ERR(__wt_cond_wait(
+ session, conn->log_wrlsn_cond, 10000));
}
/*
* On close we need to do this one more time because there could
* be straggling log writes that need to be written.
*/
WT_ERR(__wt_log_force_write(session, 1));
- WT_ERR(__wt_log_wrlsn(session));
+ WT_ERR(__wt_log_wrlsn(session, NULL));
if (0) {
err: __wt_err(session, ret, "log wrlsn server error");
}
@@ -694,12 +705,12 @@ __log_server(void *arg)
WT_LOG *log;
WT_SESSION_IMPL *session;
int freq_per_sec;
- bool signalled;
+ bool locked, signalled;
session = arg;
conn = S2C(session);
log = conn->log;
- signalled = false;
+ locked = signalled = false;
/*
* Set this to the number of times per second we want to force out the
@@ -740,8 +751,22 @@ __log_server(void *arg)
/*
* Perform log pre-allocation.
*/
- if (conn->log_prealloc > 0)
- WT_ERR(__log_prealloc_once(session));
+ if (conn->log_prealloc > 0) {
+ /*
+ * Log file pre-allocation is disabled when a
+ * hot backup cursor is open because we have
+ * agreed not to rename or remove any files in
+ * the database directory.
+ */
+ WT_ERR(__wt_readlock(
+ session, conn->hot_backup_lock));
+ locked = true;
+ if (!conn->hot_backup)
+ WT_ERR(__log_prealloc_once(session));
+ WT_ERR(__wt_readunlock(
+ session, conn->hot_backup_lock));
+ locked = false;
+ }
/*
* Perform the archive.
@@ -768,6 +793,9 @@ __log_server(void *arg)
if (0) {
err: __wt_err(session, ret, "log server error");
+ if (locked)
+ WT_TRET(__wt_readunlock(
+ session, conn->hot_backup_lock));
}
return (WT_THREAD_RET_VALUE);
}
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 4fe1db1c524..58577b4587d 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 31438e10606..9edc6091b10 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -340,8 +340,8 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
* any that match the list of object sources.
*/
if (conn->stat_sources != NULL) {
- WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __wt_conn_btree_apply(
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
+ ret = __wt_conn_btree_apply(
session, false, NULL, __statlog_apply, NULL));
WT_RET(ret);
}
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index b9b46f3211c..a15aabdd6fe 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -64,11 +64,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- bool evict_reset;
btree = S2BT(session);
dhandle = session->dhandle;
- evict_reset = false;
/*
* Acquire an exclusive lock on the handle and mark it dead.
@@ -92,9 +90,6 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
!__wt_txn_visible_all(session, btree->rec_max_txn))
goto err;
- /* Ensure that we aren't racing with the eviction server */
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
-
/*
* Mark the handle as dead and close the underlying file
* handle. Closing the handle decrements the open file count,
@@ -102,9 +97,6 @@ __sweep_expire_one(WT_SESSION_IMPL *session)
*/
ret = __wt_conn_btree_sync_and_close(session, false, true);
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
return (ret);
@@ -243,7 +235,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __sweep_remove_one(session, dhandle));
if (ret == 0)
WT_STAT_FAST_CONN_INCR(session, dh_sweep_remove);
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 62ac2203b97..6d5d68000ee 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -80,13 +80,14 @@ __curbackup_close(WT_CURSOR *cursor)
int tret;
cb = (WT_CURSOR_BACKUP *)cursor;
+
CURSOR_API_CALL(cursor, session, close, NULL);
WT_TRET(__backup_cleanup_handles(session, cb));
WT_TRET(__wt_cursor_close(cursor));
session->bkp_cursor = NULL;
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, tret,
tret = __backup_stop(session)); /* Stop the backup. */
WT_TRET(tret);
@@ -139,7 +140,8 @@ __wt_curbackup_open(WT_SESSION_IMPL *session,
* Start the backup and fill in the cursor's list. Acquire the schema
* lock, we need a consistent view when creating a copy.
*/
- WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg));
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __backup_start(session, cb, cfg));
WT_ERR(ret);
/* __wt_cursor_init is last so we don't have to clean up on error. */
@@ -339,11 +341,8 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
cursor = NULL;
- /*
- * Open a cursor on the metadata file and copy all of the entries to
- * the hot backup file.
- */
- WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ /* Copy all of the metadata entries to the hot backup file. */
+ WT_RET(__wt_metadata_cursor(session, &cursor));
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_key(cursor, &key));
WT_ERR(cursor->get_value(cursor, &value));
@@ -375,13 +374,13 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
}
WT_ERR_NOTFOUND_OK(ret);
+ WT_ERR(__wt_metadata_cursor_release(session, &cursor));
+
/* Build a list of the file objects that need to be copied. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_meta_btree_apply(
- session, __backup_list_all_append, NULL));
+ WT_WITH_HANDLE_LIST_LOCK(session, ret, ret =
+ __wt_meta_btree_apply(session, __backup_list_all_append, NULL));
-err: if (cursor != NULL)
- WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
diff --git a/src/cursor/cur_bulk.c b/src/cursor/cur_bulk.c
index b996b934464..c013383fa61 100644
--- a/src/cursor/cur_bulk.c
+++ b/src/cursor/cur_bulk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,6 +9,25 @@
#include "wt_internal.h"
/*
+ * __bulk_col_keycmp_err --
+ * Error routine when column-store keys inserted out-of-order.
+ */
+static int
+__bulk_col_keycmp_err(WT_CURSOR_BULK *cbulk)
+{
+ WT_CURSOR *cursor;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ cursor = &cbulk->cbt.iface;
+
+ WT_RET_MSG(session, EINVAL,
+ "bulk-load presented with out-of-order keys: %" PRIu64 " is less "
+ "than previously inserted key %" PRIu64,
+ cursor->recno, cbulk->recno);
+}
+
+/*
* __curbulk_insert_fix --
* Fixed-length column-store bulk cursor insert.
*/
@@ -19,6 +38,7 @@ __curbulk_insert_fix(WT_CURSOR *cursor)
WT_CURSOR_BULK *cbulk;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ uint64_t recno;
cbulk = (WT_CURSOR_BULK *)cursor;
btree = cbulk->cbt.btree;
@@ -29,13 +49,63 @@ __curbulk_insert_fix(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
- WT_CURSOR_NEEDVALUE(cursor);
+ /*
+ * If the "append" flag was configured, the application doesn't have to
+ * supply a key, else require a key.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ recno = cbulk->recno + 1;
+ else {
+ WT_CURSOR_CHECKKEY(cursor);
+ if ((recno = cursor->recno) <= cbulk->recno)
+ WT_ERR(__bulk_col_keycmp_err(cbulk));
+ }
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /*
+ * Insert any skipped records as deleted records, update the current
+ * record count.
+ */
+ for (; recno != cbulk->recno + 1; ++cbulk->recno)
+ WT_ERR(__wt_bulk_insert_fix(session, cbulk, true));
+ cbulk->recno = recno;
+
+ /* Insert the current record. */
+ ret = __wt_bulk_insert_fix(session, cbulk, false);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_fix_bitmap --
+ * Fixed-length column-store bulk cursor insert for bitmaps.
+ */
+static int
+__curbulk_insert_fix_bitmap(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
- WT_ERR(__wt_bulk_insert_fix(session, cbulk));
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /* Insert the current record. */
+ ret = __wt_bulk_insert_fix_bitmap(session, cbulk);
+
err: API_END_RET(session, ret);
}
@@ -50,7 +120,7 @@ __curbulk_insert_var(WT_CURSOR *cursor)
WT_CURSOR_BULK *cbulk;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- bool duplicate;
+ uint64_t recno;
cbulk = (WT_CURSOR_BULK *)cursor;
btree = cbulk->cbt.btree;
@@ -61,45 +131,63 @@ __curbulk_insert_var(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
-
- WT_CURSOR_NEEDVALUE(cursor);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
/*
- * If this isn't the first value inserted, compare it against the last
- * value and increment the RLE count.
- *
- * Instead of a "first time" variable, I'm using the RLE count, because
- * it is only zero before the first row is inserted.
+ * If the "append" flag was configured, the application doesn't have to
+ * supply a key, else require a key.
*/
- duplicate = false;
- if (cbulk->rle != 0) {
- if (cbulk->last.size == cursor->value.size &&
- memcmp(cbulk->last.data, cursor->value.data,
- cursor->value.size) == 0) {
- ++cbulk->rle;
- duplicate = true;
- } else
- WT_ERR(__wt_bulk_insert_var(session, cbulk));
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ recno = cbulk->recno + 1;
+ else {
+ WT_CURSOR_CHECKKEY(cursor);
+ if ((recno = cursor->recno) <= cbulk->recno)
+ WT_ERR(__bulk_col_keycmp_err(cbulk));
}
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ if (!cbulk->first_insert) {
+ /*
+ * If not the first insert and the key space is sequential,
+ * compare the current value against the last value; if the
+ * same, just increment the RLE count.
+ */
+ if (recno == cbulk->recno + 1 &&
+ cbulk->last.size == cursor->value.size &&
+ memcmp(cbulk->last.data,
+ cursor->value.data, cursor->value.size) == 0) {
+ ++cbulk->rle;
+ ++cbulk->recno;
+ goto duplicate;
+ }
+
+ /* Insert the previous key/value pair. */
+ WT_ERR(__wt_bulk_insert_var(session, cbulk, false));
+ } else
+ cbulk->first_insert = false;
/*
- * Save a copy of the value for the next comparison and reset the RLE
- * counter.
+ * Insert any skipped records as deleted records, update the current
+ * record count and RLE counter.
*/
- if (!duplicate) {
- WT_ERR(__wt_buf_set(session,
- &cbulk->last, cursor->value.data, cursor->value.size));
- cbulk->rle = 1;
+ if (recno != cbulk->recno + 1) {
+ cbulk->rle = (recno - cbulk->recno) - 1;
+ WT_ERR(__wt_bulk_insert_var(session, cbulk, true));
}
+ cbulk->rle = 1;
+ cbulk->recno = recno;
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ /* Save a copy of the value for the next comparison. */
+ ret = __wt_buf_set(session,
+ &cbulk->last, cursor->value.data, cursor->value.size);
+duplicate:
err: API_END_RET(session, ret);
}
/*
* __bulk_row_keycmp_err --
- * Error routine when keys inserted out-of-order.
+ * Error routine when row-store keys inserted out-of-order.
*/
static int
__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
@@ -116,16 +204,13 @@ __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
WT_ERR(__wt_scr_alloc(session, 512, &a));
WT_ERR(__wt_scr_alloc(session, 512, &b));
- WT_ERR(__wt_buf_set_printable(
- session, a, cursor->key.data, cursor->key.size));
- WT_ERR(__wt_buf_set_printable(
- session, b, cbulk->last.data, cbulk->last.size));
-
WT_ERR_MSG(session, EINVAL,
- "bulk-load presented with out-of-order keys: %.*s compares smaller "
- "than previously inserted key %.*s",
- (int)a->size, (const char *)a->data,
- (int)b->size, (const char *)b->data);
+ "bulk-load presented with out-of-order keys: %s compares smaller "
+ "than previously inserted key %s",
+ __wt_buf_set_printable(
+ session, cursor->key.data, cursor->key.size, a),
+ __wt_buf_set_printable(
+ session, cbulk->last.data, cbulk->last.size, b));
err: __wt_scr_free(session, &a);
__wt_scr_free(session, &b);
@@ -154,6 +239,7 @@ __curbulk_insert_row(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
WT_CURSOR_CHECKKEY(cursor);
WT_CURSOR_CHECKVALUE(cursor);
@@ -161,28 +247,20 @@ __curbulk_insert_row(WT_CURSOR *cursor)
/*
* If this isn't the first key inserted, compare it against the last key
* to ensure the application doesn't accidentally corrupt the table.
- *
- * Instead of a "first time" variable, I'm using the RLE count, because
- * it is only zero before the first row is inserted.
*/
- if (cbulk->rle != 0) {
+ if (!cbulk->first_insert) {
WT_ERR(__wt_compare(session,
btree->collator, &cursor->key, &cbulk->last, &cmp));
if (cmp <= 0)
WT_ERR(__bulk_row_keycmp_err(cbulk));
- }
+ } else
+ cbulk->first_insert = false;
- /*
- * Save a copy of the key for the next comparison and set the RLE
- * counter.
- */
+ /* Save a copy of the key for the next comparison. */
WT_ERR(__wt_buf_set(session,
&cbulk->last, cursor->key.data, cursor->key.size));
- cbulk->rle = 1;
-
- WT_ERR(__wt_bulk_insert_row(session, cbulk));
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ ret = __wt_bulk_insert_row(session, cbulk);
err: API_END_RET(session, ret);
}
@@ -208,13 +286,12 @@ __curbulk_insert_row_skip_check(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
- WT_CURSOR_NEEDKEY(cursor);
- WT_CURSOR_NEEDVALUE(cursor);
-
- WT_ERR(__wt_bulk_insert_row(session, cbulk));
+ WT_CURSOR_CHECKKEY(cursor);
+ WT_CURSOR_CHECKVALUE(cursor);
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ ret = __wt_bulk_insert_row(session, cbulk);
err: API_END_RET(session, ret);
}
@@ -237,18 +314,25 @@ __wt_curbulk_init(WT_SESSION_IMPL *session,
__wt_cursor_set_notsup(c);
switch (cbt->btree->type) {
case BTREE_COL_FIX:
- c->insert = __curbulk_insert_fix;
+ c->insert = bitmap ?
+ __curbulk_insert_fix_bitmap : __curbulk_insert_fix;
break;
case BTREE_COL_VAR:
c->insert = __curbulk_insert_var;
break;
case BTREE_ROW:
+ /*
+ * Row-store order comparisons are expensive, so we optionally
+ * skip them when we know the input is correct.
+ */
c->insert = skip_sort_check ?
__curbulk_insert_row_skip_check : __curbulk_insert_row;
break;
WT_ILLEGAL_VALUE(session);
}
+ cbulk->first_insert = true;
+ cbulk->recno = 0;
cbulk->bitmap = bitmap;
if (bitmap)
F_SET(c, WT_CURSTD_RAW);
diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c
index 348cfbab1dd..1b2fec0eb89 100644
--- a/src/cursor/cur_config.c
+++ b/src/cursor/cur_config.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index ccc19717612..2a598c99523 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c
index e5799fbad05..3324efd96cc 100644
--- a/src/cursor/cur_dump.c
+++ b/src/cursor/cur_dump.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index b955b292292..8bbe1cc8eda 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -545,8 +545,8 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
* failing with EBUSY due to a database-wide checkpoint.
*/
if (LF_ISSET(WT_DHANDLE_EXCLUSIVE))
- WT_WITH_CHECKPOINT_LOCK(session, ret =
- __wt_session_get_btree_ckpt(
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ ret = __wt_session_get_btree_ckpt(
session, uri, cfg, flags));
else
ret = __wt_session_get_btree_ckpt(
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index a909eaece99..6822055131a 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index 395da22a80c..2cbefa68c5e 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -383,17 +383,14 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session,
allocbuf = NULL;
if ((cursor = endpoint->cursor) != NULL) {
if (entry->index != NULL) {
+ /* Extract and save the index's logical key. */
cindex = (WT_CURSOR_INDEX *)endpoint->cursor;
- if (cindex->index->extractor == NULL) {
- WT_ERR(__wt_struct_repack(session,
- cindex->child->key_format,
- entry->main->value_format,
- &cindex->child->key, &endpoint->key,
- &allocbuf));
- if (allocbuf != NULL)
- F_SET(endpoint, WT_CURJOIN_END_OWN_KEY);
- } else
- endpoint->key = cindex->child->key;
+ WT_ERR(__wt_struct_repack(session,
+ cindex->child->key_format,
+ cindex->iface.key_format,
+ &cindex->child->key, &endpoint->key, &allocbuf));
+ if (allocbuf != NULL)
+ F_SET(endpoint, WT_CURJOIN_END_OWN_KEY);
} else {
k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key;
if (WT_CURSOR_RECNO(cursor)) {
diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c
index 8f858a5012f..fcb66d3e8b3 100644
--- a/src/cursor/cur_json.c
+++ b/src/cursor/cur_json.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -313,7 +313,6 @@ size_t
__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
{
char abbrev;
- u_char h;
if (!force_unicode) {
if (isprint(ch) && ch != '\\' && ch != '"') {
@@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
*buf++ = 'u';
*buf++ = '0';
*buf++ = '0';
- h = (((u_char)ch) >> 4) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
- h = ((u_char)ch) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
+ *buf++ = __wt_hex[(ch & 0xf0) >> 4];
+ *buf++ = __wt_hex[ch & 0x0f];
}
return (6);
}
diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c
index ade9fd18962..35a2d00e6ec 100644
--- a/src/cursor/cur_log.c
+++ b/src/cursor/cur_log.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c
index 55da93859a6..df66ef34ddd 100644
--- a/src/cursor/cur_metadata.c
+++ b/src/cursor/cur_metadata.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -477,8 +477,12 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session,
cursor->key_format = "S";
cursor->value_format = "S";
- /* Open the file cursor for operations on the regular metadata */
- WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));
+ /*
+ * Open the file cursor for operations on the regular metadata; don't
+ * use the existing, cached session metadata cursor, the configuration
+ * may not be the same.
+ */
+ WT_ERR(__wt_metadata_cursor_open(session, cfg[1], &mdc->file_cursor));
WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index e1d5b8eb91a..00a6ade21c6 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ wt_off_t size;
const char *filename;
/*
@@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session,
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
__wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
- WT_RET(__wt_block_manager_size(
- session, filename, &cst->u.dsrc_stats));
+ WT_RET(__wt_block_manager_named_size(session, filename, &size));
+ cst->u.dsrc_stats.block_size = size;
__wt_curstat_dsrc_final(cst);
return (0);
}
@@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
/*
* We return the statistics field's offset as the key, and a string
- * description, a string value, and a uint64_t value as the value
+ * description, a string value, and a uint64_t value as the value
* columns.
*/
cursor->key_format = "i";
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index da38988b6c2..051f36c8854 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index dca72a16ee5..d986577f640 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -758,6 +758,7 @@ err: API_END_RET(session, ret);
static int
__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
{
+ WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_TABLE *table;
WT_CURSOR **cp;
@@ -776,8 +777,10 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
/* If the table is incomplete, wait on the table lock and recheck. */
complete = table->cg_complete;
- if (!complete)
- WT_WITH_TABLE_LOCK(session, complete = table->cg_complete);
+ if (!complete) {
+ WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete);
+ WT_RET(ret);
+ }
if (!complete)
WT_RET_MSG(session, EINVAL,
"Can't use '%s' until all column groups are created",
@@ -968,8 +971,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1]));
if (0) {
-err: WT_TRET(__curtable_close(cursor));
- *cursorp = NULL;
+err: if (*cursorp != NULL) {
+ WT_TRET(__wt_cursor_close(*cursorp));
+ *cursorp = NULL;
+ }
+ WT_TRET(__curtable_close(cursor));
}
__wt_scr_free(session, &tmp);
diff --git a/src/docs/build-javadoc.sh b/src/docs/build-javadoc.sh
index 39c9d989b6c..be886937070 100755
--- a/src/docs/build-javadoc.sh
+++ b/src/docs/build-javadoc.sh
@@ -8,5 +8,5 @@ CLASSPATH=$THRIFT_HOME/libthrift.jar:$SLF4J_JAR javadoc -public -d $DOCS/java \
-stylesheetfile $DOCS/style/javadoc.css \
-use -link http://java.sun.com/j2se/1.5.0/docs/api/ \
-header '<b>WiredTiger API</b><br><font size="-1"> version '$WT_VERSION'</font>' \
- -windowtitle 'WiredTiger Java API' -bottom '<font size=1>Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved.</font>' \
+ -windowtitle 'WiredTiger Java API' -bottom '<font size=1>Copyright (c) 2008-2016 MongoDB, Inc. All rights reserved.</font>' \
com.wiredtiger com.wiredtiger.util
diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox
index 745c5051be3..e2b376d5e3f 100644
--- a/src/docs/command-line.dox
+++ b/src/docs/command-line.dox
@@ -32,7 +32,7 @@ on success and non-zero on error.
The \c wt tool supports several commands. If configured in the underlying
database, some commands will run recovery when opening the database. If
-the user wants to force recovery on any command, use the \c -r option.
+the user wants to force recovery on any command, use the \c -R option.
In general, commands that modify the database or tables will run recovery
by default and commands that only read data will not run recovery.
@@ -46,7 +46,7 @@ opened as a WiredTiger database. See @ref backup for more information,
and @ref file_permissions for specifics on the copied file permissions.
@subsection util_backup_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code>
@subsection util_backup_options Options
The following are command-specific options for the \c backup command:
@@ -64,7 +64,7 @@ The \c compact command attempts to rewrite the specified table or file
to consume less disk space.
@subsection util_compact_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] compact uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code>
@subsection util_compact_options Options
The \c compact command has no command-specific options.
@@ -78,7 +78,7 @@ configuration. It is equivalent to a call to WT_SESSION::create with
the specified string arguments.
@subsection util_create_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code>
@subsection util_create_options Options
The following are command-specific options for the \c create command:
@@ -94,7 +94,7 @@ The \c drop command drops the specified \c uri. It is equivalent to a
call to WT_SESSION::drop with the "force" configuration argument.
@subsection util_drop_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] drop uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] drop uri</code>
@subsection util_drop_options Options
The \c drop command has no command-specific options.
@@ -109,7 +109,7 @@ which can be re-loaded into a new table using the \c load command.
See @subpage dump_formats for details of the dump file formats.
@subsection util_dump_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code>
@subsection util_dump_options Options
The following are command-specific options for the \c dump command:
@@ -143,7 +143,7 @@ the database. If a URI is specified as an argument, only information about
that data source is printed.
@subsection util_list_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code>
@subsection util_list_options Options
The following are command-specific options for the \c list command:
@@ -170,7 +170,7 @@ table will be overwritten by the new data (use the \c -n option to
make an attempt to overwrite existing data return an error).
@subsection util_load_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code>
@subsection util_load_options Options
The following are command-specific options for the \c load command:
@@ -244,7 +244,7 @@ row-store table or file already exists, data in the table or file will
be overwritten by the new data.
@subsection util_loadtext_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code>
@subsection util_loadtext_options Options
The following are command-specific options for the \c loadtext command:
@@ -260,7 +260,7 @@ Display the database log.
The \c printlog command outputs the database log.
@subsection util_printlog_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] printlog [-p] [-f output]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code>
@subsection util_printlog_options Options
The following are command-specific options for the \c printlog command:
@@ -269,8 +269,9 @@ The following are command-specific options for the \c printlog command:
By default, the \c printlog command output is written to the standard
output; the \c -f option re-directs the output to the specified file.
-@par <code>-p</code>
-Display the log in a printable format.
+@par <code>-x</code>
+Keys and value items in the log are printed in hex format in addition
+to the default string format.
<hr>
@section util_read wt read
@@ -283,7 +284,7 @@ with string or record number keys and string values.
The \c read command exits non-zero if a specified record is not found.
@subsection util_read_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code>
@subsection util_read_options Options
The \c read command has no command-specific options.
@@ -295,7 +296,7 @@ Rename a table or file.
The \c rename command renames the specified table or file.
@subsection util_rename_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code>
@subsection util_rename_options Options
The \c rename command has no command-specific options.
@@ -309,7 +310,7 @@ data that cannot be recovered. Underlying files are re-written in
place, overwriting the original file contents.
@subsection util_salvage_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code>
@subsection util_salvage_options Options
The following are command-specific options for the \c salvage command:
@@ -327,7 +328,7 @@ The \c stat command outputs run-time statistics for the WiredTiger
engine, or, if specified, for the URI on the command-line.
@subsection util_stat_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code>
@subsection util_stat_options Options
The following are command-specific options for the \c stat command:
@@ -345,7 +346,7 @@ success if the data source is up-to-date, and failure if the data source
cannot be upgraded.
@subsection util_upgrade_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code>
@subsection util_upgrade_options Options
The \c upgrade command has no command-specific options.
@@ -359,7 +360,7 @@ success if the data source is correct, and failure if the data source is
corrupted.
@subsection util_verify_synopsis Synopsis
-<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] verify uri</code>
+<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code>
@subsection util_verify_options Options
The \c verify command has no command-specific options.
@@ -381,9 +382,9 @@ Attempting to overwrite an already existing record will fail.
@subsection util_write_synopsis Synopsis
<code>
-wt [-rVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ...
+wt [-RVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ...
<br>
-wt [-rVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ...
+wt [-RVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ...
</code>
@subsection util_write_options Options
diff --git a/src/docs/license.dox b/src/docs/license.dox
index f34ebad19a7..febced2c6af 100644
--- a/src/docs/license.dox
+++ b/src/docs/license.dox
@@ -13,6 +13,19 @@ WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the
<b>GNU General Public License</b></a> for details.
+Additionally, portions of the WiredTiger distribution are distributed
+under the terms of the
+<a href="http://www.opensource.org/licenses/BSD-3-Clause">
+BSD-3-Clause License</a>. These files have
+<a href="http://www.opensource.org/licenses/BSD-3-Clause">
+BSD-3-Clause License</a>
+copyright notices, and may be freely used and redistributed under the
+terms of that notice.
+
+Additionally, portions of the WiredTiger distribution are public domain
+software. Public domain files have notices releasing the software into
+the public domain and may be freely used and redistributed.
+
For a license to use the WiredTiger software under conditions other than
those described above, or for technical support for this software, please
contact MongoDB, Inc. at
@@ -28,7 +41,7 @@ of the WiredTiger library should comply with these copyrights.
@hrow{Distribution Files, Copyright Holder, License}
@row{\c src/include/bitstring.i, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
@row{\c src/include/queue.h, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
-@row{\c src/os_posix/getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
+@row{\c src/os_posix/os_getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>}
@row{\c src/support/hash_city.c, Google\, Inc., <a href="http://www.opensource.org/licenses/MIT">The MIT License</a>}
@row{\c src/support/hash_fnv.c, Authors, Public Domain}
</table>
@@ -63,10 +76,4 @@ selected portions of the WiredTiger sources, please review the copyright
notices and LICENSE files included in the WiredTiger distribution for
the terms and conditions of such redistribution.
-@section license_public_domain Public domain software
-
-Many portions of the WiredTiger distribution are public domain software.
-Public domain files have notices releasing the software into the public
-domain and may be freely used and redistributed.
-
*/
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index f005f6d3e2d..5d79edd660b 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -40,11 +40,12 @@ each of which is ordered by one or more columns.
- @subpage compact
- @subpage checkpoint
- @subpage durability
+- @subpage cursor_join
+- @subpage cursor_log
- @ref transaction_named_snapshots
+- @subpage rebalance
- @subpage shared_cache
- @subpage statistics
-- @subpage cursor_join
-- @subpage cursor_log
- @subpage_single upgrade
@m_if{c}
diff --git a/src/docs/rebalance.dox b/src/docs/rebalance.dox
new file mode 100644
index 00000000000..a6acfe07ef5
--- /dev/null
+++ b/src/docs/rebalance.dox
@@ -0,0 +1,14 @@
+/*! @m_page{{c,java},rebalance,Rebalance}
+
+The WT_SESSION::rebalance method can be used to rebalance data sources'
+underlying btrees. If a tree has become unbalanced (that is, one part of
+the tree is excessively deep), WT_SESSION::rebalance rewrites the tree
+as a balanced tree.
+
+The data source must be quiescent.
+
+The WT_SESSION::rebalance method should never be needed, as WiredTiger
+btrees are maintained as balanced trees. It is only provided as a tool
+to handle the unexpected.
+
+ */
diff --git a/src/docs/schema.dox b/src/docs/schema.dox
index 66f8046965e..65ad7f6919c 100644
--- a/src/docs/schema.dox
+++ b/src/docs/schema.dox
@@ -89,6 +89,10 @@ struct module to describe the types of columns in a table:
The \c 'r' type is used for record number keys in column stores. It is
otherwise identical to the \c 'Q' type.
+The \c 's' type is used for fixed-length strings. If it is preceded by
+a size, that indicates the number of bytes to store; the default is a
+length of 1 byte.
+
The \c 'S' type is encoded as a C language string terminated by a
NUL character.
@m_if{java}
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 86af82d8fd2..80597302cbb 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -66,6 +66,7 @@ NoSQL
OPTYPE
PRELOAD
README
+Rebalance
RedHat
RepMgr
Riak
@@ -120,6 +121,7 @@ boolean
booleans
br
btree
+btrees
bufs
builtin
builtins
@@ -378,6 +380,7 @@ readlock
realclean
realloc
realloc'd
+rebalance
recno
recnoN
recnum
diff --git a/src/docs/style/footer.html b/src/docs/style/footer.html
index 83f1254fa42..e5a7b30eef5 100644
--- a/src/docs/style/footer.html
+++ b/src/docs/style/footer.html
@@ -3,13 +3,13 @@
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
$navpath
- <li class="footer">Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.</li>
+ <li class="footer">Copyright (c) 2008-2016 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.</li>
</ul>
</div>
<!--END GENERATE_TREEVIEW-->
<!--BEGIN !GENERATE_TREEVIEW-->
<hr class="footer"/><address class="footer"><small>
-Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.
+Copyright (c) 2008-2016 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.
</small></address>
<!--END !GENERATE_TREEVIEW-->
</body>
diff --git a/src/docs/tools/doxfilter.py b/src/docs/tools/doxfilter.py
index 8ca68c0a1fe..b2d5f857df1 100755
--- a/src/docs/tools/doxfilter.py
+++ b/src/docs/tools/doxfilter.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
-# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2014-2016 MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
diff --git a/src/docs/tools/fixlinks.py b/src/docs/tools/fixlinks.py
index 84f56d219f8..7163246e3bd 100755
--- a/src/docs/tools/fixlinks.py
+++ b/src/docs/tools/fixlinks.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
-# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2014-2016 MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox
index ef2f5bf15a0..7e670541e7d 100644
--- a/src/docs/top/main.dox
+++ b/src/docs/top/main.dox
@@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
@section releases Releases
<table>
-@row{<b>WiredTiger 2.6.1</b> (current),
+@row{<b>WiredTiger 2.7.0</b> (current),
+ <a href="releases/wiredtiger-2.7.0.tar.bz2"><b>[Release package]</b></a>,
+ <a href="2.7.0/index.html"><b>[Documentation]</b></a>}
+@row{<b>WiredTiger 2.6.1</b> (previous),
<a href="releases/wiredtiger-2.6.1.tar.bz2"><b>[Release package]</b></a>,
<a href="2.6.1/index.html"><b>[Documentation]</b></a>}
-@row{<b>WiredTiger 2.5.3</b> (previous),
- <a href="releases/wiredtiger-2.5.3.tar.bz2"><b>[Release package]</b></a>,
- <a href="2.5.3/index.html"><b>[Documentation]</b></a>}
@row{<b>Development branch</b>,
<a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>,
<a href="develop/index.html"><b>[Documentation]</b></a>}
diff --git a/src/docs/tune-bulk-load.dox b/src/docs/tune-bulk-load.dox
index 8ee1061c76c..f5d28436dca 100644
--- a/src/docs/tune-bulk-load.dox
+++ b/src/docs/tune-bulk-load.dox
@@ -15,8 +15,12 @@ WT_CURSOR::close methods. Bulk load inserts are non-transactional: they
cannot be rolled back and ignore the transactional state of the WT_SESSION
in which they are opened.
-When bulk-loading row-store objects, keys must be loaded in sorted
-order.
+When doing a bulk-load insert, keys must be inserted in sorted order.
+When doing a bulk-load insert into a column-store object, any skipped
+records will be created as already-deleted rows. If a column-store
+bulk-load cursor is configured with \c append, the cursor key will be
+ignored and each inserted row will be assigned the next sequential
+record number.
When using the \c sort utility on a Linux or other POSIX-like system to
pre-sort keys, the locale specified by the environment affects the sort
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 34f391b27f1..e0239919f0b 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -1,5 +1,33 @@
/*! @page upgrading Upgrading WiredTiger applications
+@section version_271 Upgrading to Version 2.7.1
+<dl>
+<dt>Column-store bulk-load cursors</dt>
+<dd>
+Historically, bulk-load of a column-store object ignored any key set in
+the cursor and automatically assigned each inserted row the next
+sequential record number for its key. In the 2.7.1 release, column-store
+objects match row-store behavior and require the cursor key be set
+before an insert. (This also allows sparse tables to be created
+in column-store objects; any skipped records are created as
+already-deleted rows.) To match the previous behavior, specify the
+\c append configuration string when opening the column-store bulk-load
+cursor; this causes the cursor's key to be ignored and each inserted row
+will be assigned the next record number.
+</dd>
+
+<dt>Change to WT_SESSION::truncate with URI</dt>
+<dd>
+If using the WT_SESSION::truncate API with a file: URI for a full table
+truncate, underlying algorithmic changes result in some visible differences.
+This call can now return WT_ROLLBACK. Applications should be prepared to
+handle this error. This method no longer requires exclusive access to the
+table. Also the underlying disk space may not be immediately
+reclaimed when the call returns. The performance of this API may differ
+from earlier releases.
+</dd>
+
+</dl><hr>
@section version_270 Upgrading to Version 2.7.0
<dl>
diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox
index fb46a91a62c..64e25978dd8 100644
--- a/src/docs/wtperf.dox
+++ b/src/docs/wtperf.dox
@@ -173,6 +173,10 @@ taken to do the drop.
@par icount (unsigned int, default=5000)
number of records to initially populate. If multiple tables are
configured the count is spread evenly across all tables.
+@par idle_table_cycle (unsigned int, default=0)
+Enable regular create and drop of idle tables, value is the maximum
+number of seconds a create or drop is allowed before flagging an
+error. The default of 0 means disabled.
@par index (boolean, default=false)
Whether to create an index on the value field.
@par insert_rmw (boolean, default=false)
@@ -182,11 +186,17 @@ key size
@par log_partial (boolean, default=false)
perform partial logging on first table only.
@par min_throughput (unsigned int, default=0)
-abort if any throughput measured is less than this amount. Requires
+notify if any throughput measured is less than this amount. Aborts or
+prints warning based on min_throughput_fatal setting. Requires
sample_interval to be configured
+@par min_throughput_fatal (boolean, default=false)
+print warning (false) or abort (true) on min_throughput failure.
@par max_latency (unsigned int, default=0)
-abort if any latency measured exceeds this number of
-milliseconds.Requires sample_interval to be configured
+notify if any latency measured exceeds this number of
+milliseconds. Aborts or prints warning based on max_latency_fatal
+setting. Requires sample_interval to be configured
+@par max_latency_fatal (boolean, default=false)
+print warning (false) or abort (true) on max_latency failure.
@par pareto (unsigned int, default=0)
use pareto distribution for random numbers. Zero to disable, otherwise
a percentage indicating how aggressive the distribution should be.
@@ -200,6 +210,8 @@ if non zero choose a value from within this range as the key for
insert operations
@par random_value (boolean, default=false)
generate random content for the value
+@par read_range (unsigned int, default=0)
+scan a range of keys after each search
@par reopen_connection (boolean, default=true)
close and reopen the connection between populate and workload phases
@par report_interval (unsigned int, default=2)
@@ -230,8 +242,9 @@ threads, and the 'insert', 'read' and 'update' entries are the ratios
of insert, read and update operations done by each worker thread; If a
throttle value is provided each thread will do a maximum of that
number of operations per second; multiple workload configurations may
-be specified; for example, a more complex threads configuration might
-be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))'
+be specified per threads configuration; for example, a more complex
+threads configuration might be
+'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))'
which would create 2 threads doing nothing but reads and 8 threads
each doing 50% inserts and 25% reads and updates. Allowed
configuration values are 'count', 'throttle', 'reads', 'inserts',
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index c5f6ae3d4d1..641864a8baa 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -85,7 +85,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_ASSERT(session,
F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
__wt_page_can_evict(session, ref, NULL));
- __wt_evict_page_clean_update(session, ref, true);
+ __wt_ref_out(session, ref);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index ac481581c23..0536a06bc22 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -27,9 +27,12 @@ static int __evict_server_work(WT_SESSION_IMPL *);
static inline uint64_t
__evict_read_gen(const WT_EVICT_ENTRY *entry)
{
+ WT_BTREE *btree;
WT_PAGE *page;
uint64_t read_gen;
+ btree = entry->btree;
+
/* Never prioritize empty slots. */
if (entry->ref == NULL)
return (UINT64_MAX);
@@ -40,15 +43,23 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry)
if (page->read_gen == WT_READGEN_OLDEST)
return (WT_READGEN_OLDEST);
+ /*
+ * Any leaf page from a dead tree is a great choice (not internal pages,
+ * they may have children and are not yet evictable).
+ */
+ if (!WT_PAGE_IS_INTERNAL(page) &&
+ F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
+ return (WT_READGEN_OLDEST);
+
/* Any empty page (leaf or internal), is a good choice. */
if (__wt_page_is_empty(page))
return (WT_READGEN_OLDEST);
/*
- * Skew the read generation for internal pages, we prefer to evict leaf
- * pages.
+ * The base read-generation is skewed by the eviction priority.
+ * Internal pages are also adjusted, we prefer to evict leaf pages.
*/
- read_gen = page->read_gen + entry->btree->evict_priority;
+ read_gen = page->read_gen + btree->evict_priority;
if (WT_PAGE_IS_INTERNAL(page))
read_gen += WT_EVICT_INT_SKEW;
@@ -727,6 +738,10 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
F_CLR(session, WT_SESSION_CLEAR_EVICT_WALK);
+ /* An error is unexpected - flag the failure. */
+ if (ret != 0)
+ __wt_err(session, ret, "Failed to clear eviction walk point");
+
return (ret);
}
@@ -760,20 +775,18 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_DECL_RET;
WT_EVICT_ENTRY *evict;
u_int i, elem;
+ *evict_resetp = false;
+
btree = S2BT(session);
cache = S2C(session)->cache;
- /*
- * If the file isn't evictable, there's no work to do.
- */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
- *evict_resetp = false;
+ /* If the file wasn't evictable, there's no work to do. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
return (0);
- }
- *evict_resetp = true;
/*
* Hold the walk lock to set the "no eviction" flag: no new pages from
@@ -784,7 +797,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
__wt_spin_unlock(session, &cache->evict_walk_lock);
/* Clear any existing LRU eviction walk for the file. */
- WT_RET(__evict_request_walk_clear(session));
+ WT_ERR(__evict_request_walk_clear(session));
/* Hold the evict lock to remove any queued pages from this file. */
__wt_spin_lock(session, &cache->evict_lock);
@@ -806,7 +819,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
while (btree->evict_busy > 0)
__wt_yield();
+ *evict_resetp = true;
return (0);
+
+err: F_CLR(btree, WT_BTREE_NO_EVICTION);
+ return (ret);
}
/*
@@ -852,9 +869,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_DECL_RET;
- WT_EVICT_ENTRY *evict;
uint64_t cutoff;
- uint32_t candidates, entries, i;
+ uint32_t candidates, entries;
cache = S2C(session)->cache;
@@ -872,6 +888,14 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL)
--entries;
+ /*
+ * If we have more entries than the maximum tracked between walks,
+ * clear them. Do this before figuring out how many of the entries are
+ * candidates so we never end up with more candidates than entries.
+ */
+ while (entries > WT_EVICT_WALK_BASE)
+ __evict_list_clear(session, &cache->evict_queue[--entries]);
+
cache->evict_entries = entries;
if (entries == 0) {
@@ -916,15 +940,6 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
cache->evict_candidates = candidates;
}
- /* If we have more than the minimum number of entries, clear them. */
- if (cache->evict_entries > WT_EVICT_WALK_BASE) {
- for (i = WT_EVICT_WALK_BASE, evict = cache->evict_queue + i;
- i < cache->evict_entries;
- i++, evict++)
- __evict_list_clear(session, evict);
- cache->evict_entries = WT_EVICT_WALK_BASE;
- }
-
cache->evict_current = cache->evict_queue;
__wt_spin_unlock(session, &cache->evict_lock);
@@ -982,6 +997,7 @@ __evict_walk(WT_SESSION_IMPL *session)
conn = S2C(session);
cache = S2C(session)->cache;
+ btree = NULL;
dhandle = NULL;
dhandle_locked = incr = false;
retries = 0;
@@ -1041,6 +1057,7 @@ retry: while (slot < max_entries && ret == 0) {
(void)__wt_atomic_subi32(
&dhandle->session_inuse, 1);
incr = false;
+ cache->evict_file_next = NULL;
}
dhandle = TAILQ_NEXT(dhandle, q);
}
@@ -1096,6 +1113,9 @@ retry: while (slot < max_entries && ret == 0) {
* exclusive access when a handle is being closed.
*/
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ /* Remember the file to visit first, next loop. */
+ cache->evict_file_next = dhandle;
+
WT_WITH_DHANDLE(session, dhandle,
ret = __evict_walk_file(session, &slot));
WT_ASSERT(session, session->split_gen == 0);
@@ -1115,9 +1135,6 @@ retry: while (slot < max_entries && ret == 0) {
}
if (incr) {
- /* Remember the file we should visit first, next loop. */
- cache->evict_file_next = dhandle;
-
WT_ASSERT(session, dhandle->session_inuse > 0);
(void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
incr = false;
@@ -1170,7 +1187,7 @@ __evict_init_candidate(
evict->ref = ref;
evict->btree = S2BT(session);
- /* Mark the page on the list */
+ /* Mark the page on the list; set last to flush the other updates. */
F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU);
}
@@ -1197,15 +1214,17 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
conn = S2C(session);
btree = S2BT(session);
cache = conn->cache;
- start = cache->evict_queue + *slotp;
- end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
- cache->evict_queue + cache->evict_slots);
internal_pages = restarts = 0;
enough = false;
- walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
- WT_READ_NO_GEN | WT_READ_NO_WAIT;
+ start = cache->evict_queue + *slotp;
+ end = start + WT_EVICT_WALK_PER_FILE;
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
+ end > cache->evict_queue + cache->evict_slots)
+ end = cache->evict_queue + cache->evict_slots;
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
if (F_ISSET(cache, WT_CACHE_WALK_REVERSE))
walk_flags |= WT_READ_PREV;
@@ -1247,7 +1266,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
continue;
/* Pages we no longer need (clean or dirty), are found money. */
- if (__wt_page_is_empty(page))
+ if (__wt_page_is_empty(page) ||
+ F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
goto fast;
/* Skip clean pages if appropriate. */
@@ -1508,8 +1528,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
if (txn_busy && pct_full < 100)
return (0);
- if (busy == 1)
- txn_busy = 1;
+ if (busy)
+ txn_busy = true;
/* Wake the eviction server if we need to do work. */
WT_RET(__wt_evict_server_wake(session));
@@ -1570,6 +1590,26 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
/* NOTREACHED */
}
+/*
+ * __wt_evict_priority_set --
+ * Set a tree's eviction priority.
+ */
+void
+__wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v)
+{
+ S2BT(session)->evict_priority = v;
+}
+
+/*
+ * __wt_evict_priority_clear --
+ * Clear a tree's eviction priority.
+ */
+void
+__wt_evict_priority_clear(WT_SESSION_IMPL *session)
+{
+ S2BT(session)->evict_priority = 0;
+}
+
#ifdef HAVE_DIAGNOSTIC
/*
* __wt_cache_dump --
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 1cdf07a9a55..72c07eaa05d 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -8,8 +8,9 @@
#include "wt_internal.h"
-static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool);
-static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool *, bool);
+static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool);
+static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool);
+static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool *, bool);
/*
* __evict_exclusive_clear --
@@ -117,7 +118,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Pages that belong to dead trees never write back to disk
* and can't support page splits.
*/
- WT_ERR(__wt_evict_page_clean_update(
+ WT_ERR(__evict_page_clean_update(
session, ref, tree_dead || closing));
else
WT_ERR(__evict_page_dirty_update(session, ref, closing));
@@ -200,12 +201,11 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
}
/*
- * __wt_evict_page_clean_update --
+ * __evict_page_clean_update --
* Update a clean page's reference on eviction.
*/
-int
-__wt_evict_page_clean_update(
- WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+static int
+__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_DECL_RET;
diff --git a/src/include/api.h b/src/include/api.h
index 4821b450f9e..c6a5af40698 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/async.h b/src/include/async.h
index fb9a64e774d..7a415a4a17a 100644
--- a/src/include/async.h
+++ b/src/include/async.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/bitstring.i b/src/include/bitstring.i
index 5449ffe6209..0d30e55d1ef 100644
--- a/src/include/bitstring.i
+++ b/src/include/bitstring.i
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/include/block.h b/src/include/block.h
index 4bff6c82783..27a140b73a4 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -173,6 +173,7 @@ struct __wt_bm {
int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*read)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
@@ -182,6 +183,7 @@ struct __wt_bm {
int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_valid)
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
+ int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
@@ -244,7 +246,10 @@ struct __wt_block {
bool ckpt_inprogress;/* Live checkpoint in progress */
/* Compaction support */
- int compact_pct_tenths; /* Percent to compact */
+ int compact_pct_tenths; /* Percent to compact */
+ uint64_t compact_pages_reviewed;/* Pages reviewed */
+ uint64_t compact_pages_skipped; /* Pages skipped */
+ uint64_t compact_pages_written; /* Pages rewritten */
/* Salvage support */
wt_off_t slvg_off; /* Salvage file offset */
diff --git a/src/include/bloom.h b/src/include/bloom.h
index a673ee9add2..ddc2d64a118 100644
--- a/src/include/bloom.h
+++ b/src/include/bloom.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 6ee74c61a38..cfbd87f0cae 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -305,7 +305,7 @@ struct __wt_page_modify {
struct {
/*
* Appended items to column-stores: there is only a single one
- * of these per column-store tree.
+ * of these active at a time per column-store tree.
*/
WT_INSERT_HEAD **append;
@@ -319,9 +319,18 @@ struct __wt_page_modify {
* huge.
*/
WT_INSERT_HEAD **update;
+
+ /*
+ * Split-saved last column-store page record. If a column-store
+ * page is split, we save the first record number moved so that
+ * during reconciliation we know the page's last record and can
+ * write any implicitly created deleted records for the page.
+ */
+ uint64_t split_recno;
} leaf;
#define mod_append u2.leaf.append
#define mod_update u2.leaf.update
+#define mod_split_recno u2.leaf.split_recno
} u2;
/*
@@ -478,7 +487,7 @@ struct __wt_page {
#define pg_row_ins u.row.ins
#undef pg_row_upd
#define pg_row_upd u.row.upd
-#define pg_row_entries u.row.entries
+#undef pg_row_entries
#define pg_row_entries u.row.entries
/* Fixed-length column-store leaf page. */
@@ -544,8 +553,8 @@ struct __wt_page {
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
#define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */
-#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */
+#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */
+#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
@@ -1049,7 +1058,7 @@ struct __wt_insert_head {
uint64_t __prev_split_gen = (session)->split_gen; \
if (__prev_split_gen == 0) \
do { \
- WT_PUBLISH((session)->split_gen, \
+ WT_PUBLISH((session)->split_gen, \
S2C(session)->split_gen); \
} while ((session)->split_gen != S2C(session)->split_gen)
diff --git a/src/include/btree.h b/src/include/btree.h
index a1d8e395cfc..703de0f2fc6 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -153,16 +153,18 @@ struct __wt_btree {
#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */
#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */
#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */
-#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */
+#define WT_BTREE_REBALANCE 0x04000 /* Handle is for rebalance */
+#define WT_BTREE_SALVAGE 0x08000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x10000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x20000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x40000 /* Handle is for verify */
uint32_t flags;
};
/* Flags that make a btree handle special (not for normal use). */
#define WT_BTREE_SPECIAL_FLAGS \
- (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+ (WT_BTREE_BULK | WT_BTREE_REBALANCE | \
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
/*
* WT_SALVAGE_COOKIE --
diff --git a/src/include/btree.i b/src/include/btree.i
index 23e0dfea2cd..94111397abd 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -1046,15 +1046,16 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* do it without making the appending threads wait. See if it's worth
* doing a split to let the threads continue before doing eviction.
*
- * Ignore anything other than large, dirty row-store leaf pages. The
- * split code only supports row-store pages, and we depend on the page
- * being dirty for correctness (the page must be reconciled again
+ * Ignore anything other than large, dirty leaf pages. We depend on the
+ * page being dirty for correctness (the page must be reconciled again
* before being evicted after the split, information from a previous
* reconciliation will be wrong, so we can't evict immediately).
*/
- if (page->type != WT_PAGE_ROW_LEAF ||
- page->memory_footprint < btree->splitmempage ||
- !__wt_page_is_modified(page))
+ if (page->memory_footprint < btree->splitmempage)
+ return (false);
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (false);
+ if (!__wt_page_is_modified(page))
return (false);
/*
@@ -1071,9 +1072,11 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
#define WT_MIN_SPLIT_COUNT 30
#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */
- ins_head = page->pg_row_entries == 0 ?
+ ins_head = page->type == WT_PAGE_ROW_LEAF ?
+ (page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
- WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) :
+ WT_COL_APPEND(page);
if (ins_head == NULL)
return (false);
for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
@@ -1280,8 +1283,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
* coupling up/down the tree.
*/
static inline int
-__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
- WT_REF *want, uint32_t flags
+__wt_page_swap_func(
+ WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
@@ -1310,20 +1313,40 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
#endif
);
- /* Expected failures: page not found or restart. */
- if (ret == WT_NOTFOUND || ret == WT_RESTART)
- return (ret);
+ /*
+ * Expected failures: page not found or restart. Our callers list the
+ * errors they're expecting to handle.
+ */
+ if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
+ return (WT_NOTFOUND);
+ if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
+ return (WT_RESTART);
- /* Discard the original held page. */
+ /* Discard the original held page on either success or error. */
acquired = ret == 0;
WT_TRET(__wt_page_release(session, held, flags));
+ /* Fast-path expected success. */
+ if (ret == 0)
+ return (0);
+
/*
- * If there was an error discarding the original held page, discard
- * the acquired page too, keeping it is never useful.
+ * If there was an error at any point that our caller isn't prepared to
+ * handle, discard any page we acquired.
*/
- if (acquired && ret != 0)
+ if (acquired)
WT_TRET(__wt_page_release(session, want, flags));
+
+ /*
+ * If we're returning an error, don't let it be one our caller expects
+ * to handle as returned by page-in: the expectation includes the held
+ * page not having been released, and that's not the case.
+ */
+ if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
+ return (EINVAL);
+ if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
+ return (EINVAL);
+
return (ret);
}
@@ -1437,17 +1460,54 @@ __wt_split_intl_race(
*
* There's a page-split race when we walk the tree: if we're splitting
* an internal page into its parent, we update the parent's page index
- * and then update the page being split, and it's not an atomic update.
- * A thread could read the parent page's original page index, and then
- * read the page's replacement index. Because internal page splits work
- * by replacing the original page with the initial part of the original
- * page, the result of this race is we will have a key that's past the
- * end of the current page, and the parent's page index will have moved.
+ * before updating the split page's page index, and it's not an atomic
+ * update. A thread can read the parent page's original page index and
+ * then read the split page's replacement index.
+ *
+ * Because internal page splits work by truncating the original page to
+ * its initial part, the result of this race is that we will have a
+ * search key that points past the end of the current page. This is
+ * only an issue when we search past the end of the page: if we find a
+ * WT_REF in the page with the namespace we're searching for, we don't
+ * care whether the WT_REF moved while we were searching because we
+ * have the correct page.
+ *
+ * For example, imagine an internal page with 3 child pages, with the
+ * namespaces a-f, g-h and i-j; the first child page splits. The parent
+ * starts out with the following page-index:
+ *
+ * | ... | a | g | i | ... |
+ *
+ * which changes to this:
+ *
+ * | ... | a | c | e | g | i | ... |
+ *
+ * The child starts out with the following page-index:
+ *
+ * | a | b | c | d | e | f |
+ *
+ * which changes to this:
+ *
+ * | a | b |
+ *
+ * A thread searching the original parent page index for the key "cat"
+ * couples to the "a" child page; if it then uses the replacement child
+ * page index, it will search past the end of the page and couple to the
+ * "b" page, which is wrong.
+ *
+ * To detect the problem, we remember the parent page's page index used
+ * to descend the tree. Whenever we search past the end of a page, we
+ * check to see if the parent's page index has changed since our use of
+ * it during descent. As the problem only appears if we read the split
+ * page's replacement index, the parent page's index must already have
+ * changed, ensuring we detect the problem.
*
- * It's also possible a thread could read the parent page's replacement
- * page index, and then read the page's original index. Because internal
- * splits work by truncating the original page, the original page's old
- * content is compatible, this isn't a problem and we ignore this race.
+ * It's possible for the opposite race to happen (a thread could read
+ * the parent page's replacement page index and then read the split
+ * page's original index). This isn't a problem because internal splits
+ * work by truncating the split page, so the split page search is for
+ * content the split page retains after the split, and we ignore this
+ * race.
*/
WT_INTL_INDEX_GET(session, parent, pindex);
return (pindex != saved_pindex);
diff --git a/src/include/btree_cmp.i b/src/include/btree_cmp.i
index 8a7fe19a22f..1993c1be293 100644
--- a/src/include/btree_cmp.i
+++ b/src/include/btree_cmp.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/buf.i b/src/include/buf.i
index b8849396f01..95d945ec6d3 100644
--- a/src/include/buf.i
+++ b/src/include/buf.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -92,18 +92,6 @@ __wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s)
}
/*
- * __wt_buf_set_printable --
- * Set the contents of the buffer to a printable representation of a
- * byte string.
- */
-static inline int
-__wt_buf_set_printable(
- WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size)
-{
- return (__wt_raw_to_esc_hex(session, from_arg, size, buf));
-}
-
-/*
* __wt_buf_free --
* Free a buffer.
*/
diff --git a/src/include/cache.h b/src/include/cache.h
index a0440f23a00..a3961d6043e 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/cache.i b/src/include/cache.i
index 7cbd72853c3..ee13eee84c5 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/cell.i b/src/include/cell.i
index 1410b30bb82..481d2a29764 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/column.i b/src/include/column.i
index fc1f372b2a9..9f3e2101f6f 100644
--- a/src/include/column.i
+++ b/src/include/column.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead,
continue;
}
+ /*
+ * When no exact match is found, the search returns the smallest
+ * key larger than the searched-for key, or the largest key
+ * smaller than the searched-for key, if there is no larger key.
+ * Our callers depend on that: specifically, the fixed-length
+ * column store cursor code interprets returning a key smaller
+ * than the searched-for key to mean the searched-for key is
+ * larger than any key on the page. Don't change that behavior or
+ * things will break.
+ */
ins_recno = WT_INSERT_RECNO(ret_ins);
cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
@@ -204,9 +214,9 @@ __col_var_last_recno(WT_PAGE *page)
WT_COL_RLE *repeat;
/*
- * If there's an append list (the last page), then there may be more
- * records on the page. This function ignores those records, so our
- * callers have to handle that explicitly, if they care.
+ * If there's an append list, there may be more records on the page.
+ * This function ignores those records; callers that care about them
+ * must handle them explicitly.
*/
if (page->pg_var_nrepeats == 0)
return (page->pg_var_entries == 0 ? 0 :
@@ -225,9 +235,9 @@ static inline uint64_t
__col_fix_last_recno(WT_PAGE *page)
{
/*
- * If there's an append list (the last page), then there may be more
- * records on the page. This function ignores those records, so our
- * callers have to handle that explicitly, if they care.
+ * If there's an append list, there may be more records on the page.
+ * This function ignores those records; callers that care about them
+ * must handle them explicitly.
*/
return (page->pg_fix_entries == 0 ? 0 :
page->pg_fix_recno + (page->pg_fix_entries - 1));
@@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop)
start_recno = repeat->recno + repeat->rle;
}
- if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ /*
+ * !!!
+ * The test could be written more simply as:
+ *
+ * (recno >= start_recno + (page->pg_var_entries - start_indx))
+ *
+ * It's split into two parts because the simpler test will overflow if
+ * searching for large record numbers.
+ */
+ if (recno >= start_recno &&
+ recno - start_recno >= page->pg_var_entries - start_indx)
return (NULL);
return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
diff --git a/src/include/compact.h b/src/include/compact.h
index 0698bf7b1a4..2bba52e7173 100644
--- a/src/include/compact.h
+++ b/src/include/compact.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/config.h b/src/include/config.h
index e836abaccba..e63db0e76cf 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -72,25 +72,26 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20
#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21
#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 23
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 24
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 25
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 26
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 27
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 28
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 29
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 30
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 31
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 32
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 33
-#define WT_CONFIG_ENTRY_colgroup_meta 34
-#define WT_CONFIG_ENTRY_file_meta 35
-#define WT_CONFIG_ENTRY_index_meta 36
-#define WT_CONFIG_ENTRY_table_meta 37
-#define WT_CONFIG_ENTRY_wiredtiger_open 38
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 39
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 40
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 41
+#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 23
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 24
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 25
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 26
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 27
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 28
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 29
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 30
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 31
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 32
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 34
+#define WT_CONFIG_ENTRY_colgroup_meta 35
+#define WT_CONFIG_ENTRY_file_meta 36
+#define WT_CONFIG_ENTRY_index_meta 37
+#define WT_CONFIG_ENTRY_table_meta 38
+#define WT_CONFIG_ENTRY_wiredtiger_open 39
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 40
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 41
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 42
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/connection.h b/src/include/connection.h
index 2367f5a0035..5d61f9456b3 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -415,6 +415,7 @@ struct __wt_connection_impl {
uint32_t direct_io;
uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */
bool mmap; /* mmap configuration */
+ int page_size; /* OS page size for mmap alignment */
uint32_t verbose;
uint32_t flags;
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 275e2f2db46..7f7b5dceb79 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -200,18 +200,23 @@ struct __wt_cursor_btree {
uint8_t append_tree; /* Cursor appended to the tree */
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that cursor next/prev never returns keys out-of-order. */
+ WT_ITEM *lastkey, _lastkey;
+ uint64_t lastrecno;
+#endif
+
#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
-#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
-#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor
+#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor
(e.g. on a checkpoint) */
-#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
- WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)
+ WT_CBT_SEARCH_SMALLEST)
uint8_t flags;
};
@@ -219,33 +224,32 @@ struct __wt_cursor_btree {
struct __wt_cursor_bulk {
WT_CURSOR_BTREE cbt;
- WT_REF *ref; /* The leaf page */
- WT_PAGE *leaf;
-
/*
* Variable-length column store compares values during bulk load as
* part of RLE compression, row-store compares keys during bulk load
* to avoid corruption.
*/
- WT_ITEM last; /* Last key/value seen */
+ bool first_insert; /* First insert */
+ WT_ITEM last; /* Last key/value inserted */
/*
- * Variable-length column-store RLE counter (also overloaded to mean
- * the first time through the bulk-load insert routine, when set to 0).
+ * Additional column-store bulk load support.
*/
- uint64_t rle;
+ uint64_t recno; /* Record number */
+ uint64_t rle; /* Variable-length RLE counter */
/*
- * Fixed-length column-store current entry in memory chunk count, and
- * the maximum number of records per chunk.
+ * Additional fixed-length column store bitmap bulk load support:
+ * current entry in memory chunk count, and the maximum number of
+ * records per chunk.
*/
+ bool bitmap; /* Bitmap bulk load */
uint32_t entry; /* Entry count */
uint32_t nrecs; /* Max records per chunk */
- /* Special bitmap bulk load for fixed-length column stores. */
- bool bitmap;
-
- void *reconcile; /* Reconciliation information */
+ void *reconcile; /* Reconciliation support */
+ WT_REF *ref; /* The leaf page */
+ WT_PAGE *leaf;
};
struct __wt_cursor_config {
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 2e382591313..8ab96c0a69d 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -246,8 +246,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
session = (WT_SESSION_IMPL *)cbt->iface.session;
- if (reenter)
+ if (reenter) {
+#ifdef HAVE_DIAGNOSTIC
+ __wt_cursor_key_order_reset(cbt);
+#endif
WT_RET(__curfile_leave(cbt));
+ }
/*
* Any old insert position is now invalid. We rely on this being
diff --git a/src/include/dhandle.h b/src/include/dhandle.h
index 9a54b4ddb66..8b313428d06 100644
--- a/src/include/dhandle.h
+++ b/src/include/dhandle.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -33,6 +33,10 @@
(F_ISSET(dhandle, WT_DHANDLE_DEAD) || \
!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN))
+/* The metadata cursor's data handle. */
+#define WT_SESSION_META_DHANDLE(s) \
+ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle)
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
diff --git a/src/include/dlh.h b/src/include/dlh.h
index c374ec36fb0..9e49c2ff3cb 100644
--- a/src/include/dlh.h
+++ b/src/include/dlh.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/error.h b/src/include/error.h
index e721855ce7c..5f24d205af9 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/extern.h b/src/include/extern.h
index d84403cc16d..b71f4b12486 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -14,6 +14,7 @@ extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t
extern int __wt_block_addr_invalid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live);
extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci);
extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci);
extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name);
extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint);
@@ -43,14 +44,15 @@ extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie);
extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie);
extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp);
-extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on);
extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp);
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
-extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats);
+extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep);
+extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep);
extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
@@ -89,6 +91,9 @@ extern int __wt_bloom_close(WT_BLOOM *bloom);
extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
+extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next);
+extern int __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt);
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating);
@@ -129,7 +134,7 @@ extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
-extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern int __wt_btree_new_leaf_page( WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep);
extern void __wt_btree_evictable(WT_SESSION_IMPL *session, bool on);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
@@ -139,6 +144,7 @@ extern const char *__wt_page_type_string(u_int type);
extern const char *__wt_cell_type_string(uint8_t type);
extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
+extern const char *__wt_buf_set_printable( WT_SESSION_IMPL *session, const void *p, size_t size, WT_ITEM *buf);
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
@@ -151,6 +157,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
, const char *file, int line
#endif
);
+extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
@@ -170,7 +177,7 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flag
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags);
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove);
-extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate);
@@ -192,7 +199,7 @@ extern int __wt_las_create(WT_SESSION_IMPL *session);
extern int __wt_las_destroy(WT_SESSION_IMPL *session);
extern void __wt_las_set_written(WT_SESSION_IMPL *session);
extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
-extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
+extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
extern int __wt_las_sweep(WT_SESSION_IMPL *session);
@@ -255,7 +262,7 @@ extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]);
-extern int __wt_log_wrlsn(WT_SESSION_IMPL *session);
+extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield);
extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_logmgr_open(WT_SESSION_IMPL *session);
extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
@@ -333,9 +340,10 @@ extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp);
extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full);
+extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v);
+extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session);
extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
-extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start);
extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
@@ -362,23 +370,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const
extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
-extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
-extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
-extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
-extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
-extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
-extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
-extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced);
extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
@@ -447,12 +455,13 @@ extern int __wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_ses
extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep);
extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase);
-extern int __wt_metadata_open(WT_SESSION_IMPL *session);
-extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp);
+extern int __wt_metadata_cursor_open( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp);
+extern int __wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
+extern int __wt_metadata_cursor_release(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value);
extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value);
extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
-extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, char **valuep);
+extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep);
extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
extern int __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll);
@@ -468,7 +477,7 @@ extern int __wt_meta_track_init(WT_SESSION_IMPL *session);
extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session);
extern int __wt_turtle_init(WT_SESSION_IMPL *session);
extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep);
-extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value);
extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
@@ -488,7 +497,7 @@ extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep);
extern int __wt_filesize_name(WT_SESSION_IMPL *session, const char *filename, bool silent, wt_off_t *sizep);
extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, bool lock);
extern int __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh);
-extern int __wt_directory_sync(WT_SESSION_IMPL *session, char *path);
+extern int __wt_directory_sync(WT_SESSION_IMPL *session, const char *path);
extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh);
extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh);
extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len);
@@ -514,6 +523,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
+extern int __wt_get_vm_pagesize(void);
extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern bool __wt_has_priv(void);
@@ -558,8 +568,9 @@ extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
-extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
-extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
+extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret);
extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
@@ -606,6 +617,7 @@ extern int __wt_session_release_resources(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_session_range_truncate(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *start, WT_CURSOR *stop);
extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp);
@@ -639,8 +651,9 @@ extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri);
extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name);
-extern int __wt_sync_and_rename_fh( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to);
-extern int __wt_sync_and_rename_fp( WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to);
+extern int __wt_rename_and_sync_directory( WT_SESSION_IMPL *session, const char *from, const char *to);
+extern int __wt_fh_sync_and_rename( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to);
+extern int __wt_sync_fp_and_rename( WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to);
extern int __wt_library_init(void);
extern int __wt_breakpoint(void);
extern void __wt_attach(WT_SESSION_IMPL *session);
@@ -654,6 +667,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
);
extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp);
extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_hex2byte(const u_char *from, u_char *to);
@@ -671,6 +685,7 @@ extern uint32_t __wt_log2_int(uint32_t n);
extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
+extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4)));
@@ -732,7 +747,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t
extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp);
extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
-extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags);
extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval);
diff --git a/src/include/flags.h b/src/include/flags.h
index bafff92fbc0..24fae4abccd 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -39,15 +39,17 @@
#define WT_LOG_SYNC_ENABLED 0x00000010
#define WT_READ_CACHE 0x00000001
#define WT_READ_COMPACT 0x00000002
-#define WT_READ_NO_EMPTY 0x00000004
-#define WT_READ_NO_EVICT 0x00000008
-#define WT_READ_NO_GEN 0x00000010
-#define WT_READ_NO_WAIT 0x00000020
-#define WT_READ_PREV 0x00000040
-#define WT_READ_SKIP_INTL 0x00000080
-#define WT_READ_SKIP_LEAF 0x00000100
-#define WT_READ_TRUNCATE 0x00000200
-#define WT_READ_WONT_NEED 0x00000400
+#define WT_READ_NOTFOUND_OK 0x00000004
+#define WT_READ_NO_EMPTY 0x00000008
+#define WT_READ_NO_EVICT 0x00000010
+#define WT_READ_NO_GEN 0x00000020
+#define WT_READ_NO_WAIT 0x00000040
+#define WT_READ_PREV 0x00000080
+#define WT_READ_RESTART_OK 0x00000100
+#define WT_READ_SKIP_INTL 0x00000200
+#define WT_READ_SKIP_LEAF 0x00000400
+#define WT_READ_TRUNCATE 0x00000800
+#define WT_READ_WONT_NEED 0x00001000
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
#define WT_SESSION_INTERNAL 0x00000004
@@ -57,15 +59,16 @@
#define WT_SESSION_LOCKED_SLOT 0x00000040
#define WT_SESSION_LOCKED_TABLE 0x00000080
#define WT_SESSION_LOCKED_TURTLE 0x00000100
-#define WT_SESSION_LOGGING_INMEM 0x00000200
-#define WT_SESSION_LOOKASIDE_CURSOR 0x00000400
-#define WT_SESSION_NO_CACHE 0x00000800
-#define WT_SESSION_NO_DATA_HANDLES 0x00001000
-#define WT_SESSION_NO_EVICTION 0x00002000
-#define WT_SESSION_NO_LOGGING 0x00004000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00008000
-#define WT_SESSION_QUIET_CORRUPT_FILE 0x00010000
-#define WT_SESSION_SERVER_ASYNC 0x00020000
+#define WT_SESSION_LOCK_NO_WAIT 0x00000200
+#define WT_SESSION_LOGGING_INMEM 0x00000400
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800
+#define WT_SESSION_NO_CACHE 0x00001000
+#define WT_SESSION_NO_DATA_HANDLES 0x00002000
+#define WT_SESSION_NO_EVICTION 0x00004000
+#define WT_SESSION_NO_LOGGING 0x00008000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000
+#define WT_SESSION_SERVER_ASYNC 0x00040000
#define WT_TXN_LOG_CKPT_CLEANUP 0x00000001
#define WT_TXN_LOG_CKPT_PREPARE 0x00000002
#define WT_TXN_LOG_CKPT_START 0x00000004
@@ -85,16 +88,17 @@
#define WT_VERB_MUTEX 0x00000800
#define WT_VERB_OVERFLOW 0x00001000
#define WT_VERB_READ 0x00002000
-#define WT_VERB_RECONCILE 0x00004000
-#define WT_VERB_RECOVERY 0x00008000
-#define WT_VERB_SALVAGE 0x00010000
-#define WT_VERB_SHARED_CACHE 0x00020000
-#define WT_VERB_SPLIT 0x00040000
-#define WT_VERB_TEMPORARY 0x00080000
-#define WT_VERB_TRANSACTION 0x00100000
-#define WT_VERB_VERIFY 0x00200000
-#define WT_VERB_VERSION 0x00400000
-#define WT_VERB_WRITE 0x00800000
+#define WT_VERB_REBALANCE 0x00004000
+#define WT_VERB_RECONCILE 0x00008000
+#define WT_VERB_RECOVERY 0x00010000
+#define WT_VERB_SALVAGE 0x00020000
+#define WT_VERB_SHARED_CACHE 0x00040000
+#define WT_VERB_SPLIT 0x00080000
+#define WT_VERB_TEMPORARY 0x00100000
+#define WT_VERB_TRANSACTION 0x00200000
+#define WT_VERB_VERIFY 0x00400000
+#define WT_VERB_VERSION 0x00800000
+#define WT_VERB_WRITE 0x01000000
#define WT_VISIBILITY_ERR 0x00000010
/*
* flags section: END
diff --git a/src/include/gcc.h b/src/include/gcc.h
index 01e33792d73..6ccc0de3c03 100644
--- a/src/include/gcc.h
+++ b/src/include/gcc.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#if defined(x86_64) || defined(__x86_64__)
/* Pause instruction to prevent excess processor bus usage */
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
-
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("mfence" ::: "memory"); \
} while (0)
@@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
} while (0)
#elif defined(i386) || defined(__i386__)
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
} while (0)
@@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
#elif defined(__PPC64__) || defined(PPC64)
+/* ori 0,0,0 is the PPC64 noop instruction */
#define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory")
-#define WT_FULL_BARRIER() do {
+#define WT_FULL_BARRIER() do { \
__asm__ volatile ("sync" ::: "memory"); \
} while (0)
-#define WT_READ_BARRIER() WT_FULL_BARRIER()
-#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+/* TODO: ISA 2.07 Elemental Memory Barriers would be better,
+ specifically mbll, and mbss, but they are not supported by POWER 8 */
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
#elif defined(__aarch64__)
#define WT_PAUSE() __asm__ volatile("yield" ::: "memory")
#define WT_FULL_BARRIER() do { \
- __asm__ volatile ("dsb sy" ::: "memory"); \
+ __asm__ volatile ("dsb sy" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("dsb ld" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("dsb st" ::: "memory"); \
+} while (0)
+
+#elif defined(__s390x__)
+#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory")
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("bcr 15,0\n" ::: "memory"); \
} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#elif defined(__sparc__)
+#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory")
+
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("membar #StoreLoad" ::: "memory"); \
+} while (0)
+
+/*
+ * On UltraSparc machines, TSO is used, and so there is no need for membar.
+ * READ_BARRIER = #LoadLoad, and WRITE_BARRIER = #StoreStore are noop.
+ */
#define WT_READ_BARRIER() do { \
- __asm__ volatile ("dsb ld" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
+
#define WT_WRITE_BARRIER() do { \
- __asm__ volatile ("dsb st" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
#else
diff --git a/src/include/hardware.h b/src/include/hardware.h
index 1ab2c3d39c4..93ed8a868b6 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/intpack.i b/src/include/intpack.i
index a13ad05451d..b27afd24e6c 100644
--- a/src/include/intpack.i
+++ b/src/include/intpack.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/lint.h b/src/include/lint.h
index f288fb98683..f8b17022968 100644
--- a/src/include/lint.h
+++ b/src/include/lint.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/log.h b/src/include/log.h
index 521de567fc0..577f6a888a3 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -160,9 +160,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
#define WT_SLOT_INIT_FLAGS 0
-#define WT_WITH_SLOT_LOCK(session, log, op) do { \
+#define WT_WITH_SLOT_LOCK(session, log, ret, op) do { \
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \
- WT_WITH_LOCK(session, \
+ WT_WITH_LOCK(session, ret, \
&log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \
} while (0)
@@ -267,6 +267,11 @@ struct __wt_log_desc {
};
/*
+ * Flags for __wt_txn_op_printlog.
+ */
+#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */
+
+/*
* WT_LOG_REC_DESC --
* A descriptor for a log record type.
*/
diff --git a/src/include/log.i b/src/include/log.i
index ff309c31265..fcdbc72c388 100644
--- a/src/include/log.i
+++ b/src/include/log.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/lsm.h b/src/include/lsm.h
index d15dab3aa45..7cb3ccc895d 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/meta.h b/src/include/meta.h
index 938101e9caa..e29ec4202dc 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -43,9 +43,9 @@
* WT_WITH_TURTLE_LOCK --
* Acquire the turtle file lock, perform an operation, drop the lock.
*/
-#define WT_WITH_TURTLE_LOCK(session, op) do { \
+#define WT_WITH_TURTLE_LOCK(session, ret, op) do { \
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_TURTLE));\
- WT_WITH_LOCK(session, \
+ WT_WITH_LOCK(session, ret, \
&S2C(session)->turtle_lock, WT_SESSION_LOCKED_TURTLE, op); \
} while (0)
diff --git a/src/include/misc.h b/src/include/misc.h
index e542baec642..78997661851 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -268,3 +268,6 @@ union __wt_rand_state {
uint32_t w, z;
} x;
};
+
+/* Shared array for converting to hex */
+extern const u_char __wt_hex[];
diff --git a/src/include/misc.i b/src/include/misc.i
index 75068706b70..04376441340 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/msvc.h b/src/include/msvc.h
index 8f5aa9abde8..99260a44875 100644
--- a/src/include/msvc.h
+++ b/src/include/msvc.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/mutex.h b/src/include/mutex.h
index b67e5e610e8..f798bfb3ece 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 7eb042dd79f..52250f84ab3 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/os.h b/src/include/os.h
index d135fd9eb1f..fbba7f05f88 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/os_windows.h b/src/include/os_windows.h
index de97143335f..65938ac9f17 100644
--- a/src/include/os_windows.h
+++ b/src/include/os_windows.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/packing.i b/src/include/packing.i
index 9be38251703..784a55ef2ae 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -25,7 +25,8 @@ typedef struct {
char type;
} WT_PACK_VALUE;
-#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 }
+/* Default to size = 1 if there is no size prefix. */
+#define WT_PACK_VALUE_INIT { { 0 }, 1, 0, 0 }
#define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT
typedef struct {
@@ -151,7 +152,14 @@ next: if (pack->cur == pack->end)
switch (pv->type) {
case 'S':
+ return (0);
case 's':
+ if (pv->size < 1)
+ WT_RET_MSG(pack->session, EINVAL,
+ "Fixed length strings must be at least 1 byte "
+ "in format '%.*s'",
+ (int)(pack->end - pack->orig), pack->orig);
+ return (0);
case 'x':
return (0);
case 't':
@@ -266,9 +274,10 @@ __pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
return (s);
case 's':
case 'S':
- if (pv->type == 's' || pv->havesize)
+ if (pv->type == 's' || pv->havesize) {
s = pv->size;
- else
+ WT_ASSERT(session, s != 0);
+ } else
s = strlen(pv->u.s) + 1;
return (s);
case 'U':
@@ -460,9 +469,10 @@ __unpack_read(WT_SESSION_IMPL *session,
break;
case 's':
case 'S':
- if (pv->type == 's' || pv->havesize)
+ if (pv->type == 's' || pv->havesize) {
s = pv->size;
- else
+ WT_ASSERT(session, s != 0);
+ } else
s = strlen((const char *)*pp) + 1;
if (s > 0)
pv->u.s = (const char *)*pp;
@@ -667,7 +677,6 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session,
if (fmt[0] != '\0' && fmt[1] == '\0') {
pv.type = fmt[0];
- pv.size = 1;
if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
WT_UNPACK_PUT(session, pv, ap);
return (0);
diff --git a/src/include/posix.h b/src/include/posix.h
index 1aa629c98e7..2593c7b6797 100644
--- a/src/include/posix.h
+++ b/src/include/posix.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/schema.h b/src/include/schema.h
index 023fd398f1c..88a3a39f8b3 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -82,9 +82,17 @@ struct __wt_table {
* WT_WITH_LOCK --
* Acquire a lock, perform an operation, drop the lock.
*/
-#define WT_WITH_LOCK(session, lock, flag, op) do { \
+#define WT_WITH_LOCK(session, ret, lock, flag, op) do { \
+ ret = 0; \
if (F_ISSET(session, (flag))) { \
op; \
+ } else if (F_ISSET(session, WT_SESSION_LOCK_NO_WAIT)) { \
+ if ((ret = __wt_spin_trylock(session, (lock))) == 0) { \
+ F_SET(session, (flag)); \
+ op; \
+ F_CLR(session, (flag)); \
+ __wt_spin_unlock(session, (lock)); \
+ } \
} else { \
__wt_spin_lock(session, (lock)); \
F_SET(session, (flag)); \
@@ -98,16 +106,16 @@ struct __wt_table {
* WT_WITH_CHECKPOINT_LOCK --
* Acquire the checkpoint lock, perform an operation, drop the lock.
*/
-#define WT_WITH_CHECKPOINT_LOCK(session, op) \
- WT_WITH_LOCK(session, \
+#define WT_WITH_CHECKPOINT_LOCK(session, ret, op) \
+ WT_WITH_LOCK(session, ret, \
&S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op)
/*
* WT_WITH_HANDLE_LIST_LOCK --
* Acquire the data handle list lock, perform an operation, drop the lock.
*/
-#define WT_WITH_HANDLE_LIST_LOCK(session, op) \
- WT_WITH_LOCK(session, \
+#define WT_WITH_HANDLE_LIST_LOCK(session, ret, op) \
+ WT_WITH_LOCK(session, ret, \
&S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op)
/*
* WT_WITH_SCHEMA_LOCK --
@@ -115,12 +123,12 @@ struct __wt_table {
* Check that we are not already holding some other lock: the schema lock
* must be taken first.
*/
-#define WT_WITH_SCHEMA_LOCK(session, op) do { \
+#define WT_WITH_SCHEMA_LOCK(session, ret, op) do { \
WT_ASSERT(session, \
F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || \
!F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST | \
WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_LOCKED_TABLE)); \
- WT_WITH_LOCK(session, \
+ WT_WITH_LOCK(session, ret, \
&S2C(session)->schema_lock, WT_SESSION_LOCKED_SCHEMA, op); \
} while (0)
@@ -128,11 +136,11 @@ struct __wt_table {
* WT_WITH_TABLE_LOCK --
* Acquire the table lock, perform an operation, drop the lock.
*/
-#define WT_WITH_TABLE_LOCK(session, op) do { \
+#define WT_WITH_TABLE_LOCK(session, ret, op) do { \
WT_ASSERT(session, \
F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \
!F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \
- WT_WITH_LOCK(session, \
+ WT_WITH_LOCK(session, ret, \
&S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \
} while (0)
diff --git a/src/include/serial.i b/src/include/serial.i
index ca22ce12d81..fa920de7e37 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/session.h b/src/include/session.h
index 5c3bcfb8ed0..5c3291230b4 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -74,19 +74,22 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
TAILQ_HEAD(__cursors, __wt_cursor) cursors;
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
- WT_COMPACT *compact; /* Compact state */
+
+ WT_COMPACT *compact; /* Compaction information */
+ enum { WT_COMPACT_NONE=0,
+ WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
/*
* Lookaside table cursor, sweep and eviction worker threads only.
*/
WT_CURSOR *las_cursor; /* Lookaside table cursor */
- WT_DATA_HANDLE *meta_dhandle; /* Metadata file */
- void *meta_track; /* Metadata operation tracking */
- void *meta_track_next; /* Current position */
- void *meta_track_sub; /* Child transaction / save point */
- size_t meta_track_alloc; /* Currently allocated */
- int meta_track_nest; /* Nesting level of meta transaction */
+ WT_CURSOR *meta_cursor; /* Metadata file */
+ void *meta_track; /* Metadata operation tracking */
+ void *meta_track_next; /* Current position */
+ void *meta_track_sub; /* Child transaction / save point */
+ size_t meta_track_alloc; /* Currently allocated */
+ int meta_track_nest; /* Nesting level of meta transaction */
#define WT_META_TRACKING(session) (session->meta_track_next != NULL)
/*
@@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
void *reconcile; /* Reconciliation support */
int (*reconcile_cleanup)(WT_SESSION_IMPL *);
- bool compaction; /* Compaction did some work */
-
uint32_t flags;
/*
diff --git a/src/include/stat.h b/src/include/stat.h
index dfe7ee5c6cd..51d2fa332e7 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot)
*/
#define WT_STAT_READ(stats, fld) \
__wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
-#define WT_STAT_WRITE(session, stats, fld) \
- ((stats)[WT_STATS_SLOT_ID(session)]->fld);
+#define WT_STAT_WRITE(stats, fld, v) \
+ (stats)->fld = (int64_t)(v)
#define WT_STAT_DECRV(session, stats, fld, value) \
(stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
diff --git a/src/include/txn.h b/src/include/txn.h
index 936164fa9a7..1e82e2d982a 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/txn.i b/src/include/txn.i
index 1005d4a395d..46f2ff3e5f1 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -185,9 +185,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
if (id == WT_TXN_ABORTED)
return (false);
- /*
- * Read-uncommitted transactions see all other changes.
- */
+ /* Read-uncommitted transactions see all other changes. */
if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
return (true);
diff --git a/src/include/verify_build.h b/src/include/verify_build.h
index 6a97def12be..477b9b7c134 100644
--- a/src/include/verify_build.h
+++ b/src/include/verify_build.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index bdd8bb65910..676f95d9b05 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -566,20 +566,21 @@ struct __wt_cursor {
*/
const char *internal_uri;
-#define WT_CURSTD_APPEND 0x0001
-#define WT_CURSTD_BULK 0x0002
-#define WT_CURSTD_DUMP_HEX 0x0004
-#define WT_CURSTD_DUMP_JSON 0x0008
-#define WT_CURSTD_DUMP_PRINT 0x0010
-#define WT_CURSTD_KEY_EXT 0x0020 /* Key points out of the tree. */
-#define WT_CURSTD_KEY_INT 0x0040 /* Key points into the tree. */
+#define WT_CURSTD_APPEND 0x00001
+#define WT_CURSTD_BULK 0x00002
+#define WT_CURSTD_DUMP_HEX 0x00004
+#define WT_CURSTD_DUMP_JSON 0x00008
+#define WT_CURSTD_DUMP_PRINT 0x00010
+#define WT_CURSTD_JOINED 0x00020
+#define WT_CURSTD_KEY_EXT 0x00040 /* Key points out of the tree. */
+#define WT_CURSTD_KEY_INT 0x00080 /* Key points into the tree. */
#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
-#define WT_CURSTD_JOINED 0x0080
-#define WT_CURSTD_OPEN 0x0100
-#define WT_CURSTD_OVERWRITE 0x0200
-#define WT_CURSTD_RAW 0x0400
-#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */
-#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */
+#define WT_CURSTD_META_INUSE 0x00100
+#define WT_CURSTD_OPEN 0x00200
+#define WT_CURSTD_OVERWRITE 0x00400
+#define WT_CURSTD_RAW 0x00800
+#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */
#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
uint32_t flags;
#endif
@@ -1236,6 +1237,9 @@ struct __wt_session {
* @configstart{WT_SESSION.drop, see dist/api_data.py}
* @config{force, return success if the object does not exist., a
* boolean flag; default \c false.}
+ * @config{lock_wait, wait for locks\, if \c lock_wait=false\, fail if
+ * any required locks are not available immediately., a boolean flag;
+ * default \c true.}
* @config{remove_files, should the underlying files be removed?., a
* boolean flag; default \c true.}
* @configend
@@ -1329,6 +1333,19 @@ struct __wt_session {
int __F(log_printf)(WT_SESSION *session, const char *fmt, ...);
/*!
+ * Rebalance a table, see @ref rebalance.
+ *
+ * @snippet ex_all.c Rebalance a table
+ *
+ * @param session the session handle
+ * @param uri the current URI of the object, such as \c "table:mytable"
+ * @configempty{WT_SESSION.rebalance, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(rebalance)(
+ WT_SESSION *session, const char *uri, const char *config);
+
+ /*!
* Rename an object.
*
* @snippet ex_all.c Rename a table
@@ -1920,9 +1937,10 @@ struct __wt_connection {
* "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
* "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c
* "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
- * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c
- * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default
+ * empty.}
* @configend
* @errors
*/
@@ -2405,9 +2423,9 @@ struct __wt_connection {
* values chosen from the following options: \c "api"\, \c "block"\, \c
* "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
* \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c
- * "overflow"\, \c "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c
- * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c
- * "verify"\, \c "version"\, \c "write"; default empty.}
+ * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c
+ * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c
+ * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.}
* @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to
* files. Ignored on non-Windows systems. Options are given as a list\, such
* as <code>"write_through=[data]"</code>. Configuring \c write_through requires
diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h
index 28fd8e18329..0db876b56f3 100644
--- a/src/include/wiredtiger_ext.h
+++ b/src/include/wiredtiger_ext.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 0a1e143ce70..54b5dfd19f4 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/log/log.c b/src/log/log.c
index 118e081c3ec..3bf04d025d8 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -47,7 +47,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
conn = S2C(session);
log = conn->log;
WT_RET(__wt_log_force_write(session, 1));
- WT_RET(__wt_log_wrlsn(session));
+ WT_RET(__wt_log_wrlsn(session, NULL));
if (start)
*lsn = log->write_start_lsn;
else
@@ -669,8 +669,7 @@ __log_openfile(WT_SESSION_IMPL *session,
* check that the magic number and versions are correct.
*/
if (!ok_create) {
- __wt_scr_free(session, &buf);
- WT_ERR(__wt_scr_alloc(session, allocsize, &buf));
+ WT_ERR(__wt_buf_grow(session, buf, allocsize));
memset(buf->mem, 0, allocsize);
WT_ERR(__wt_read(session, *fh, 0, allocsize, buf->mem));
logrec = (WT_LOG_RECORD *)buf->mem;
@@ -771,7 +770,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
while (log->log_close_fh != NULL) {
WT_STAT_FAST_CONN_INCR(session, log_close_yields);
- WT_RET(__wt_log_wrlsn(session));
+ WT_RET(__wt_log_wrlsn(session, NULL));
if (++yield_cnt > 10000)
return (EBUSY);
__wt_yield();
@@ -791,9 +790,10 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
WT_FULL_BARRIER();
/*
* If we're pre-allocating log files, look for one. If there aren't any
- * or we're not pre-allocating, then create one.
+ * or we're not pre-allocating, or a backup cursor is open, then
+ * create one.
*/
- if (conn->log_prealloc > 0) {
+ if (conn->log_prealloc > 0 && !conn->hot_backup) {
ret = __log_alloc_prealloc(session, log->fileid);
/*
* If ret is 0 it means we found a pre-allocated file.
@@ -1120,7 +1120,7 @@ __wt_log_open(WT_SESSION_IMPL *session)
* Start logging at the beginning of the next log file, no matter
* where the previous log file ends.
*/
- WT_WITH_SLOT_LOCK(session, log,
+ WT_WITH_SLOT_LOCK(session, log, ret,
ret = __log_newfile(session, true, NULL));
WT_ERR(ret);
@@ -1970,6 +1970,14 @@ err:
myslot.slot != NULL)
ret = myslot.slot->slot_error;
+ /*
+ * If one of the sync flags is set, assert the proper LSN has moved to
+ * match.
+ */
+ WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) ||
+ __wt_log_cmp(&log->write_lsn, &lsn) >= 0);
+ WT_ASSERT(session,
+ !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
return (ret);
}
diff --git a/src/log/log_auto.c b/src/log/log_auto.c
index 5a1d03b1976..54df01d01ab 100644
--- a/src/log/log_auto.c
+++ b/src/log/log_auto.c
@@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src,
}
static int
-__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
{
size_t needed;
@@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
return (0);
}
+static int
+__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+{
+ size_t needed;
+
+ needed = item->size * 2 + 1;
+ WT_RET(__wt_realloc(session, NULL, needed, destp));
+ __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL);
+ return (0);
+}
+
int
__wt_logop_col_put_pack(
WT_SESSION_IMPL *session, WT_ITEM *logrec,
@@ -121,7 +132,8 @@ __wt_logop_col_put_unpack(
int
__wt_logop_col_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -138,9 +150,14 @@ __wt_logop_col_put_print(
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
WT_ERR(__wt_fprintf(out,
" \"recno\": \"%" PRIu64 "\",\n", recno));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack(
int
__wt_logop_col_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t recno;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_remove_unpack(
session, pp, end, &fileid, &recno));
@@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack(
int
__wt_logop_col_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t start;
uint64_t stop;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_truncate_unpack(
session, pp, end, &fileid, &start, &stop));
@@ -307,7 +328,8 @@ __wt_logop_row_put_unpack(
int
__wt_logop_row_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -322,12 +344,22 @@ __wt_logop_row_put_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ " \"key-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -375,7 +407,8 @@ __wt_logop_row_remove_unpack(
int
__wt_logop_row_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -389,9 +422,14 @@ __wt_logop_row_remove_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"key-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack(
int
__wt_logop_row_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -455,12 +494,22 @@ __wt_logop_row_truncate_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &start));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &start));
WT_ERR(__wt_fprintf(out,
" \"start\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &stop));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &start));
+ WT_ERR(__wt_fprintf(out,
+ " \"start-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &stop));
WT_ERR(__wt_fprintf(out,
" \"stop\": \"%s\",\n", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &stop));
+ WT_ERR(__wt_fprintf(out,
+ " \"stop-hex\": \"%s\",\n", escaped));
+ }
WT_ERR(__wt_fprintf(out,
" \"mode\": \"%" PRIu32 "\"", mode));
@@ -470,7 +519,8 @@ err: __wt_free(session, escaped);
int
__wt_txn_op_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t optype, opsize;
@@ -480,27 +530,33 @@ __wt_txn_op_printlog(
switch (optype) {
case WT_LOGOP_COL_PUT:
- WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_REMOVE:
- WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_TRUNCATE:
- WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_PUT:
- WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_REMOVE:
- WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_TRUNCATE:
- WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out,
+ flags));
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 8155397d823..760e8888de6 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -187,7 +187,7 @@ __wt_log_slot_switch(
* because we are responsible for setting up the new slot.
*/
do {
- WT_WITH_SLOT_LOCK(session, log,
+ WT_WITH_SLOT_LOCK(session, log, ret,
ret = __log_slot_switch_internal(session, myslot, forced));
if (ret == EBUSY) {
WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy);
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 953698476ef..1bb9a7238fe 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -254,7 +254,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
(!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
break;
-open: WT_WITH_SCHEMA_LOCK(session,
+open: WT_WITH_SCHEMA_LOCK(session, ret,
ret = __clsm_open_cursors(clsm, update, 0, 0));
WT_RET(ret);
}
@@ -710,7 +710,7 @@ __wt_clsm_init_merge(
F_SET(clsm, WT_CLSM_MINOR_MERGE);
clsm->nchunks = nchunks;
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __clsm_open_cursors(clsm, false, start_chunk, start_id));
return (ret);
}
@@ -1543,7 +1543,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
bulk = cval.val != 0;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree));
/*
* Check whether the exclusive open for a bulk load succeeded, and
diff --git a/src/lsm/lsm_cursor_bulk.c b/src/lsm/lsm_cursor_bulk.c
index 65e8fe1e9a7..607ca0c9705 100644
--- a/src/lsm/lsm_cursor_bulk.c
+++ b/src/lsm/lsm_cursor_bulk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -114,7 +114,7 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
* switch inline, since switch needs a schema lock and online index
* creation opens a bulk cursor while holding the schema lock.
*/
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_lsm_tree_switch(session, lsm_tree));
WT_RET(ret);
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index d8cf36f2cc1..dac8d987328 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 1a2608803e4..29325066da7 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -437,7 +437,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
F_SET(src, WT_CURSTD_RAW);
WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
WT_ERR(ret);
if (create_bloom) {
@@ -607,12 +607,13 @@ err: if (locked)
if (ret != 0 && created_chunk) {
/* Drop the newly-created files on error. */
if (chunk->uri != NULL) {
- WT_WITH_SCHEMA_LOCK(session, tret =
- __wt_schema_drop(session, chunk->uri, drop_cfg));
+ WT_WITH_SCHEMA_LOCK(session, tret,
+ tret = __wt_schema_drop(
+ session, chunk->uri, drop_cfg));
WT_TRET(tret);
}
if (create_bloom && chunk->bloom_uri != NULL) {
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, tret,
tret = __wt_schema_drop(
session, chunk->bloom_uri, drop_cfg));
WT_TRET(tret);
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index 64ca283e2c8..d76b2a48aa7 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index c1eb7a2a389..c147cf5774a 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -33,7 +33,7 @@ __curstat_lsm_init(
"checkpoint=" WT_CHECKPOINT, NULL, NULL };
locked = false;
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
WT_RET(ret);
WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
@@ -91,7 +91,7 @@ __curstat_lsm_init(
* top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->lsm_generation_max = chunk->generation;
+ WT_STAT_WRITE(new, lsm_generation_max, chunk->generation);
/* Aggregate statistics from each new chunk. */
__wt_stat_dsrc_aggregate_single(new, stats);
@@ -115,37 +115,40 @@ __curstat_lsm_init(
* into the top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->bloom_size =
- (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8);
- new->bloom_page_evict =
- new->cache_eviction_clean + new->cache_eviction_dirty;
- new->bloom_page_read = new->cache_read;
+ WT_STAT_WRITE(new, bloom_size,
+ (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8));
+ WT_STAT_WRITE(new, bloom_page_evict,
+ new->cache_eviction_clean + new->cache_eviction_dirty);
+ WT_STAT_WRITE(new, bloom_page_read, new->cache_read);
__wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
/* Set statistics that aren't aggregated directly into the cursor */
- stats->bloom_count = bloom_count;
- stats->lsm_chunk_count = lsm_tree->nchunks;
+ WT_STAT_WRITE(stats, bloom_count, bloom_count);
+ WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks);
/* Include, and optionally clear, LSM-level specific information. */
- stats->bloom_miss = lsm_tree->bloom_miss;
+ WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_miss = 0;
- stats->bloom_hit = lsm_tree->bloom_hit;
+ WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_hit = 0;
- stats->bloom_false_positive = lsm_tree->bloom_false_positive;
+ WT_STAT_WRITE(
+ stats, bloom_false_positive, lsm_tree->bloom_false_positive);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_false_positive = 0;
- stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom;
+ WT_STAT_WRITE(
+ stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_lookup_no_bloom = 0;
- stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle;
+ WT_STAT_WRITE(
+ stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_checkpoint_throttle = 0;
- stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle;
+ WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_merge_throttle = 0;
@@ -173,7 +176,7 @@ __wt_curstat_lsm_init(
* Grab the schema lock because we will be locking the LSM tree and we
* may need to open some files.
*/
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __curstat_lsm_init(session, uri, cst));
return (ret);
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 5d819607413..ff6e66fd1a1 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -243,7 +243,7 @@ __lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri)
WT_RET(__wt_exist(session, uri + strlen("file:"), &exists));
if (exists)
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_drop(session, uri, cfg));
return (ret);
}
@@ -315,7 +315,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
char *tmpconfig;
/* If the tree is open, it already exists. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree));
if (ret == 0) {
__wt_lsm_tree_release(session, lsm_tree);
@@ -447,7 +447,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
* tracking macros handle cleaning up on failure.
*/
if (ret == 0)
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __lsm_tree_open(session, uri, true, &lsm_tree));
if (ret == 0)
__wt_lsm_tree_release(session, lsm_tree);
@@ -954,13 +954,14 @@ __wt_lsm_tree_drop(
WT_DECL_RET;
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ int tret;
u_int i;
bool locked;
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
WT_RET(ret);
@@ -996,8 +997,9 @@ __wt_lsm_tree_drop(
err: if (locked)
WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
- WT_WITH_HANDLE_LIST_LOCK(session,
- WT_TRET(__lsm_tree_discard(session, lsm_tree, false)));
+ WT_WITH_HANDLE_LIST_LOCK(session, tret,
+ tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
return (ret);
}
@@ -1013,6 +1015,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
const char *old;
+ int tret;
u_int i;
bool locked;
@@ -1020,7 +1023,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session,
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree));
WT_RET(ret);
@@ -1070,8 +1073,9 @@ err: if (locked)
* Discard this LSM tree structure. The first operation on the renamed
* tree will create a new one.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
- WT_TRET(__lsm_tree_discard(session, lsm_tree, false)));
+ WT_WITH_HANDLE_LIST_LOCK(session, tret,
+ tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
return (ret);
}
@@ -1086,6 +1090,7 @@ __wt_lsm_tree_truncate(
WT_DECL_RET;
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ int tret;
bool locked;
WT_UNUSED(cfg);
@@ -1093,7 +1098,7 @@ __wt_lsm_tree_truncate(
locked = false;
/* Get the LSM tree. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, name, true, &lsm_tree));
WT_RET(ret);
@@ -1132,8 +1137,9 @@ err: if (locked)
* the last good version of the metadata will be used, resulting
* in a valid (not truncated) tree.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
- WT_TRET(__lsm_tree_discard(session, lsm_tree, false)));
+ WT_WITH_HANDLE_LIST_LOCK(session, tret,
+ tret = __lsm_tree_discard(session, lsm_tree, false));
+ WT_TRET(tret);
}
return (ret);
}
@@ -1231,7 +1237,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
/* Tell __wt_schema_worker not to look inside the LSM tree. */
*skipp = true;
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, name, false, &lsm_tree));
WT_RET(ret);
@@ -1429,7 +1435,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
locked = false;
exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE);
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
WT_RET(ret);
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 4741cf52608..4faa25967ad 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -168,7 +168,7 @@ __wt_lsm_work_switch(
*entryp = NULL;
if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_lsm_tree_switch(session, entry->lsm_tree));
/* Failing to complete the switch is fine */
if (ret == EBUSY) {
@@ -272,7 +272,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
!F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
!chunk->evicted) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __lsm_discard_handle(session, chunk->uri, NULL));
if (ret == 0)
chunk->evicted = 1;
@@ -336,7 +336,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* necessary handle locks.
*/
WT_ERR(__wt_meta_track_on(session));
- WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __wt_schema_worker(
session, chunk->uri, __wt_checkpoint, NULL, NULL, 0));
WT_TRET(__wt_meta_track_off(session, false, ret != 0));
if (ret != 0)
@@ -505,7 +506,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
*
* This will fail with EBUSY if the file is still in use.
*/
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT));
WT_RET(ret);
@@ -514,7 +515,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
* results in the hot backup lock being taken when it updates the
* metadata (which would be too late to prevent our drop).
*/
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_drop(session, uri, drop_cfg));
if (ret == 0)
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index 625783ac16c..7562cb1cae3 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c
index 95c5b9807ca..92766213b33 100644
--- a/src/meta/meta_apply.c
+++ b/src/meta/meta_apply.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,27 +9,23 @@
#include "wt_internal.h"
/*
- * __wt_meta_btree_apply --
+ * __meta_btree_apply --
* Apply a function to all files listed in the metadata, apart from the
* metadata file.
*/
-int
-__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+static inline int
+__meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
- WT_CURSOR *cursor;
- WT_DATA_HANDLE *saved_dhandle;
WT_DECL_RET;
const char *uri;
- int cmp, tret;
+ int cmp;
- saved_dhandle = session->dhandle;
- WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
cursor->set_key(cursor, "file:");
- if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
- tret = cursor->next(cursor);
- for (; tret == 0; tret = cursor->next(cursor)) {
- WT_ERR(cursor->get_key(cursor, &uri));
+ if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_RET(cursor->get_key(cursor, &uri));
if (!WT_PREFIX_MATCH(uri, "file:"))
break;
if (strcmp(uri, WT_METAFILE_URI) == 0)
@@ -43,8 +39,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session,
*/
ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
if (ret == 0) {
- WT_SAVE_DHANDLE(session,
- ret = func(session, cfg));
+ WT_SAVE_DHANDLE(session, ret = func(session, cfg));
if (WT_META_TRACKING(session))
WT_TRET(__wt_meta_track_handle_lock(
session, false));
@@ -53,12 +48,29 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session,
} else if (ret == EBUSY)
ret = __wt_conn_btree_apply_single(
session, uri, NULL, func, cfg);
- WT_ERR(ret);
+ WT_RET(ret);
}
+ WT_RET_NOTFOUND_OK(ret);
+
+ return (0);
+}
+
+/*
+ * __wt_meta_btree_apply --
+ * Apply a function to all files listed in the metadata, apart from the
+ * metadata file.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_metadata_cursor(session, &cursor));
+ WT_SAVE_DHANDLE(session,
+ ret = __meta_btree_apply(session, cursor, func, cfg));
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
- if (tret != WT_NOTFOUND)
- WT_TRET(tret);
-err: WT_TRET(cursor->close(cursor));
- session->dhandle = saved_dhandle;
return (ret);
}
diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c
index 70c9bf8dfcd..f7da8525639 100644
--- a/src/meta/meta_ckpt.c
+++ b/src/meta/meta_ckpt.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/meta/meta_ext.c b/src/meta/meta_ext.c
index 423b7d2e76b..b48f7205807 100644
--- a/src/meta/meta_ext.c
+++ b/src/meta/meta_ext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
index e7074a9c1b5..9938cb07a5c 100644
--- a/src/meta/meta_table.c
+++ b/src/meta/meta_table.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -31,21 +31,28 @@ __metadata_turtle(const char *key)
}
/*
- * __wt_metadata_open --
- * Opens the metadata file, sets session->meta_dhandle.
+ * __wt_metadata_cursor_open --
+ * Opens a cursor on the metadata.
*/
int
-__wt_metadata_open(WT_SESSION_IMPL *session)
+__wt_metadata_cursor_open(
+ WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
{
WT_BTREE *btree;
+ WT_DECL_RET;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
- if (session->meta_dhandle != NULL)
- return (0);
-
- WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));
+ WT_WITHOUT_DHANDLE(session, ret = __wt_open_cursor(
+ session, WT_METAFILE_URI, NULL, open_cursor_cfg, cursorp));
+ WT_RET(ret);
- session->meta_dhandle = session->dhandle;
- WT_ASSERT(session, session->meta_dhandle != NULL);
+ /*
+ * Retrieve the btree from the cursor, rather than the session because
+ * we don't always switch the metadata handle in to the session before
+ * entering this function.
+ */
+ btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree;
/*
* Set special flags for the metadata file: eviction (the metadata file
@@ -56,7 +63,6 @@ __wt_metadata_open(WT_SESSION_IMPL *session)
* opens (the first update is safe because it's single-threaded from
* wiredtiger_open).
*/
- btree = S2BT(session);
if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
F_SET(btree, WT_BTREE_IN_MEMORY);
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
@@ -64,44 +70,81 @@ __wt_metadata_open(WT_SESSION_IMPL *session)
if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
F_CLR(btree, WT_BTREE_NO_LOGGING);
- /* The metadata handle doesn't need to stay locked -- release it. */
- return (__wt_session_release_btree(session));
+ return (0);
}
/*
* __wt_metadata_cursor --
- * Opens a cursor on the metadata.
+ * Returns the session's cached metadata cursor, unless it's in use, in
+ * which case it opens and returns another metadata cursor.
*/
int
-__wt_metadata_cursor(
- WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
+__wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
{
- WT_DATA_HANDLE *saved_dhandle;
- WT_DECL_RET;
- bool is_dead;
- const char *cfg[] =
- { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
+ WT_CURSOR *cursor;
- saved_dhandle = session->dhandle;
- WT_ERR(__wt_metadata_open(session));
+ /*
+ * If we don't have a cached metadata cursor, or it's already in use,
+ * we'll need to open a new one.
+ */
+ cursor = NULL;
+ if (session->meta_cursor == NULL ||
+ F_ISSET(session->meta_cursor, WT_CURSTD_META_INUSE)) {
+ WT_RET(__wt_metadata_cursor_open(session, NULL, &cursor));
+ if (session->meta_cursor == NULL) {
+ session->meta_cursor = cursor;
+ cursor = NULL;
+ }
+ }
- session->dhandle = session->meta_dhandle;
+ /*
+ * If there's no cursor return, we're done, our caller should have just
+ * been triggering the creation of the session's cached cursor. There
+ * should not be an open local cursor in that case, but caution doesn't
+ * cost anything.
+ */
+ if (cursorp == NULL)
+ return (cursor == NULL ? 0 : cursor->close(cursor));
- /*
- * We use the metadata a lot, so we have a handle cached; lock it and
- * increment the in-use counter once the cursor is open.
+ /*
+ * If the cached cursor is in use, return the newly opened cursor, else
+ * mark the cached cursor in use and return it.
*/
- WT_ERR(__wt_session_lock_dhandle(session, 0, &is_dead));
+ if (F_ISSET(session->meta_cursor, WT_CURSTD_META_INUSE))
+ *cursorp = cursor;
+ else {
+ *cursorp = session->meta_cursor;
+ F_SET(session->meta_cursor, WT_CURSTD_META_INUSE);
+ }
+ return (0);
+}
- /* The metadata should never be closed. */
- WT_ASSERT(session, !is_dead);
+/*
+ * __wt_metadata_cursor_release --
+ * Release a metadata cursor.
+ */
+int
+__wt_metadata_cursor_release(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+{
+ WT_CURSOR *cursor;
- WT_ERR(__wt_curfile_create(session, NULL, cfg, false, false, cursorp));
- __wt_cursor_dhandle_incr_use(session);
+ WT_UNUSED(session);
- /* Restore the caller's btree. */
-err: session->dhandle = saved_dhandle;
- return (ret);
+ if ((cursor = *cursorp) == NULL)
+ return (0);
+ *cursorp = NULL;
+
+ /*
+ * If using the session's cached metadata cursor, clear the in-use flag
+ * and reset it, otherwise, discard the cursor.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_META_INUSE)) {
+ WT_ASSERT(session, cursor == session->meta_cursor);
+
+ F_CLR(cursor, WT_CURSTD_META_INUSE);
+ return (cursor->reset(cursor));
+ }
+ return (cursor->close(cursor));
}
/*
@@ -124,14 +167,13 @@ __wt_metadata_insert(
WT_RET_MSG(session, EINVAL,
"%s: insert not supported on the turtle file", key);
- WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
WT_ERR(cursor->insert(cursor));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_insert(session, key));
-
-err: WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
@@ -152,7 +194,7 @@ __wt_metadata_update(
__metadata_turtle(key) ? "" : "not "));
if (__metadata_turtle(key)) {
- WT_WITH_TURTLE_LOCK(session,
+ WT_WITH_TURTLE_LOCK(session, ret,
ret = __wt_turtle_update(session, key, value));
return (ret);
}
@@ -160,12 +202,14 @@ __wt_metadata_update(
if (WT_META_TRACKING(session))
WT_RET(__wt_meta_track_update(session, key));
- WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
+ /* This cursor needs to have overwrite semantics. */
+ WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_OVERWRITE));
+
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
WT_ERR(cursor->insert(cursor));
-
-err: WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
@@ -188,14 +232,13 @@ __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
WT_RET_MSG(session, EINVAL,
"%s: remove not supported on the turtle file", key);
- WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, key);
WT_ERR(cursor->search(cursor));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_update(session, key));
WT_ERR(cursor->remove(cursor));
-
-err: WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
@@ -205,8 +248,7 @@ err: WT_TRET(cursor->close(cursor));
* The caller is responsible for freeing the allocated memory.
*/
int
-__wt_metadata_search(
- WT_SESSION_IMPL *session, const char *key, char **valuep)
+__wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -230,7 +272,7 @@ __wt_metadata_search(
* Metadata updates use non-transactional techniques (such as the
* schema and metadata locks) to protect access to in-flight updates.
*/
- WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, key);
WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
ret = cursor->search(cursor));
@@ -238,7 +280,6 @@ __wt_metadata_search(
WT_ERR(cursor->get_value(cursor, &value));
WT_ERR(__wt_strdup(session, value, valuep));
-
-err: WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c
index ea1757129c5..1baab2deae1 100644
--- a/src/meta/meta_track.c
+++ b/src/meta/meta_track.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -141,7 +141,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
ret = bm->checkpoint_resolve(bm, session));
break;
case WT_ST_DROP_COMMIT:
- if ((ret = __wt_remove_if_exists(session, trk->a)) != 0)
+ if ((ret = __wt_block_manager_drop(session, trk->a)) != 0)
__wt_err(session, ret,
"metadata remove dropped file %s", trk->a);
break;
@@ -189,7 +189,7 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
* For removes, b is NULL.
*/
if (trk->a != NULL && trk->b != NULL &&
- (ret = __wt_rename(session,
+ (ret = __wt_rename_and_sync_directory(session,
trk->b + strlen("file:"), trk->a + strlen("file:"))) != 0)
__wt_err(session, ret,
"metadata unroll rename %s to %s", trk->b, trk->a);
@@ -262,16 +262,17 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
}
/*
- * If we don't have the metadata handle (e.g, we're in the process of
+ * If we don't have the metadata cursor (e.g, we're in the process of
* creating the metadata), we can't sync it.
*/
- if (!need_sync || session->meta_dhandle == NULL ||
+ if (!need_sync || session->meta_cursor == NULL ||
F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
goto done;
/* If we're logging, make sure the metadata update was flushed. */
if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) {
- WT_WITH_DHANDLE(session, session->meta_dhandle,
+ WT_WITH_DHANDLE(session,
+ WT_SESSION_META_DHANDLE(session),
ret = __wt_txn_checkpoint_log(
session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
WT_RET(ret);
@@ -284,12 +285,14 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
*/
ckpt_session->txn.id = session->txn.id;
F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA);
- WT_WITH_DHANDLE(ckpt_session, session->meta_dhandle, ret =
- __wt_checkpoint(ckpt_session, NULL));
+ WT_WITH_DHANDLE(ckpt_session,
+ WT_SESSION_META_DHANDLE(session),
+ ret = __wt_checkpoint(ckpt_session, NULL));
F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA);
ckpt_session->txn.id = WT_TXN_NONE;
WT_RET(ret);
- WT_WITH_DHANDLE(session, session->meta_dhandle,
+ WT_WITH_DHANDLE(session,
+ WT_SESSION_META_DHANDLE(session),
ret = __wt_checkpoint_sync(session, NULL));
WT_RET(ret);
}
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index 13e8b31916f..7182bb0fe5f 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -55,7 +55,7 @@ __metadata_init(WT_SESSION_IMPL *session)
* We're single-threaded, but acquire the schema lock regardless: the
* lower level code checks that it is appropriately synchronized.
*/
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_create(session, WT_METAFILE_URI, NULL));
return (ret);
@@ -120,7 +120,7 @@ __metadata_load_bulk(WT_SESSION_IMPL *session)
* If a file was being bulk-loaded during the hot backup, it will appear
* in the metadata file, but the file won't exist. Create on demand.
*/
- WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_key(cursor, &key));
if (!WT_PREFIX_SKIP(key, "file:"))
@@ -141,9 +141,7 @@ __metadata_load_bulk(WT_SESSION_IMPL *session)
}
WT_ERR_NOTFOUND_OK(ret);
-err: if (cursor != NULL)
- WT_TRET(cursor->close(cursor));
-
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
@@ -202,7 +200,8 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
/* Create the turtle file. */
WT_RET(__metadata_config(session, &metaconf));
- WT_WITH_TURTLE_LOCK(session, ret = __wt_turtle_update(
+ WT_WITH_TURTLE_LOCK(session, ret,
+ ret = __wt_turtle_update(
session, WT_METAFILE_URI, metaconf));
WT_ERR(ret);
}
@@ -271,8 +270,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ));
* Update the turtle file.
*/
int
-__wt_turtle_update(
- WT_SESSION_IMPL *session, const char *key, const char *value)
+__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
{
WT_FH *fh;
WT_DECL_ITEM(buf);
@@ -299,7 +297,7 @@ __wt_turtle_update(
WT_ERR(__wt_write(session, fh, 0, buf->size, buf->data));
/* Flush the handle and rename the file into place. */
- ret = __wt_sync_and_rename_fh(
+ ret = __wt_fh_sync_and_rename(
session, &fh, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE);
/* Close any file handle left open, remove any temporary file. */
diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c
index 5bb6aeb6e16..034eedcfbf8 100644
--- a/src/os_posix/os_abort.c
+++ b/src/os_posix/os_abort.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index eb2482723ec..3876f9a1afe 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c
index 9eba641ca51..83e77aa5312 100644
--- a/src/os_posix/os_dir.c
+++ b/src/os_posix/os_dir.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c
index 2b5fa249163..9a74eb4813d 100644
--- a/src/os_posix/os_dlopen.c
+++ b/src/os_posix/os_dlopen.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_errno.c b/src/os_posix/os_errno.c
index 229b68e0008..a58ae88447e 100644
--- a/src/os_posix/os_errno.c
+++ b/src/os_posix/os_errno.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_exist.c b/src/os_posix/os_exist.c
index 644a27dca9a..87f0e219d2e 100644
--- a/src/os_posix/os_exist.c
+++ b/src/os_posix/os_exist.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_fallocate.c b/src/os_posix/os_fallocate.c
index 6280e334afb..9d160afd179 100644
--- a/src/os_posix/os_fallocate.c
+++ b/src/os_posix/os_fallocate.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c
index c58f73b0665..72242e351bf 100644
--- a/src/os_posix/os_filesize.c
+++ b/src/os_posix/os_filesize.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_flock.c b/src/os_posix/os_flock.c
index 07393481e7d..e2056f7636c 100644
--- a/src/os_posix/os_flock.c
+++ b/src/os_posix/os_flock.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c
index b0c04e98258..f5afddc557b 100644
--- a/src/os_posix/os_fsync.c
+++ b/src/os_posix/os_fsync.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -75,12 +75,13 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh)
* Flush a directory to ensure a file creation is durable.
*/
int
-__wt_directory_sync(WT_SESSION_IMPL *session, char *path)
+__wt_directory_sync(WT_SESSION_IMPL *session, const char *path)
{
#ifdef __linux__
WT_DECL_RET;
int fd, tret;
- char *dir;
+ const char *dir;
+ char *copy;
/*
* POSIX 1003.1 does not require that fsync of a file handle ensures the
@@ -88,15 +89,22 @@ __wt_directory_sync(WT_SESSION_IMPL *session, char *path)
* there are historic Linux filesystems requiring this), do an explicit
* fsync on a file descriptor for the directory to be sure.
*/
- if (path == NULL || (dir = strrchr(path, '/')) == NULL) {
- dir = NULL;
- path = (char *)S2C(session)->home;
- } else
- *dir = '\0';
+ copy = NULL;
+ if (path == NULL || (dir = strrchr(path, '/')) == NULL)
+ path = S2C(session)->home;
+ else {
+ /*
+ * Copy the directory name, leaving the trailing slash in place,
+ * so a path of "/foo" doesn't result in an empty string.
+ */
+ WT_RET(__wt_strndup(
+ session, path, (size_t)(dir - path) + 1, &copy));
+ path = copy;
+ }
+
WT_SYSCALL_RETRY(((fd =
open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret);
- if (dir != NULL)
- *dir = '/';
+ __wt_free(session, copy);
if (ret != 0)
WT_RET_MSG(session, ret, "%s: open", path);
diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c
index 696d8da54f4..2af90512f26 100644
--- a/src/os_posix/os_ftruncate.c
+++ b/src/os_posix/os_ftruncate.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_getenv.c b/src/os_posix/os_getenv.c
index e1e0051a120..7a086145cee 100644
--- a/src/os_posix/os_getenv.c
+++ b/src/os_posix/os_getenv.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_getline.c b/src/os_posix/os_getline.c
index 7c4ee8d1746..c0ca96852de 100644
--- a/src/os_posix/os_getline.c
+++ b/src/os_posix/os_getline.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_getopt.c b/src/os_posix/os_getopt.c
index 486d85286bc..0306ad1d79d 100644
--- a/src/os_posix/os_getopt.c
+++ b/src/os_posix/os_getopt.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c
index e95ccb0ade2..42aeeac4a5e 100644
--- a/src/os_posix/os_map.c
+++ b/src/os_posix/os_map.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_VM_PAGESIZE 4096
-
/*
* __wt_mmap_preload --
* Cause a section of a memory map to be faulted in.
@@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_BM *bm = S2BT(session)->bm;
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
@@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
* Manual pages aren't clear on whether alignment is required for the
* size, so we will be conservative.
*/
- size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+ size &= ~(size_t)(conn->page_size - 1);
- if (size > WT_VM_PAGESIZE &&
+ if (size > (size_t)conn->page_size &&
(ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
WT_RET_MSG(session, ret, "posix_madvise will need");
#else
@@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0)
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index d5fc86b648b..5f4e9a7cf2b 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c
index 46f134feabb..b6876cdfbdc 100644
--- a/src/os_posix/os_mtx_rw.c
+++ b/src/os_posix/os_mtx_rw.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/os_posix/os_once.c b/src/os_posix/os_once.c
index bfe0b9819ac..8d900042330 100644
--- a/src/os_posix/os_once.c
+++ b/src/os_posix/os_once.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index a87272db391..b085676c53b 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_pagesize.c b/src/os_posix/os_pagesize.c
new file mode 100644
index 00000000000..4a7e7084cc6
--- /dev/null
+++ b/src/os_posix/os_pagesize.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ return (getpagesize());
+}
diff --git a/src/os_posix/os_path.c b/src/os_posix/os_path.c
index af28e1b3b56..6dc54675eb8 100644
--- a/src/os_posix/os_path.c
+++ b/src/os_posix/os_path.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_priv.c b/src/os_posix/os_priv.c
index a8479668d67..5ffbbf7a1f2 100644
--- a/src/os_posix/os_priv.c
+++ b/src/os_posix/os_priv.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c
index 96bbba9bab2..bc244c12e46 100644
--- a/src/os_posix/os_remove.c
+++ b/src/os_posix/os_remove.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_rename.c b/src/os_posix/os_rename.c
index 811604e7f0f..301190305c4 100644
--- a/src/os_posix/os_rename.c
+++ b/src/os_posix/os_rename.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
index 24d6d1aa879..8733bfe0f53 100644
--- a/src/os_posix/os_rw.c
+++ b/src/os_posix/os_rw.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c
index 4e90edabc53..8633b8d1ec0 100644
--- a/src/os_posix/os_sleep.c
+++ b/src/os_posix/os_sleep.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_stdio.c b/src/os_posix/os_stdio.c
index da880f5521e..7ab107eda1e 100644
--- a/src/os_posix/os_stdio.c
+++ b/src/os_posix/os_stdio.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_strtouq.c b/src/os_posix/os_strtouq.c
index 0b7a540959c..0ae604fc761 100644
--- a/src/os_posix/os_strtouq.c
+++ b/src/os_posix/os_strtouq.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
index c7222aac6c4..35a23622ddc 100644
--- a/src/os_posix/os_thread.c
+++ b/src/os_posix/os_thread.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c
index c3052df62e7..0e5a1cdadfb 100644
--- a/src/os_posix/os_time.c
+++ b/src/os_posix/os_time.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c
index 297ec7deaee..052a46940b7 100644
--- a/src/os_posix/os_yield.c
+++ b/src/os_posix/os_yield.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_dir.c b/src/os_win/os_dir.c
index aff916c25f5..00ec4f252e4 100644
--- a/src/os_win/os_dir.c
+++ b/src/os_win/os_dir.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_dlopen.c b/src/os_win/os_dlopen.c
index 1c57d5f8073..0bad39d681d 100644
--- a/src/os_win/os_dlopen.c
+++ b/src/os_win/os_dlopen.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c
index a9d3d521052..6a9daf8443f 100644
--- a/src/os_win/os_errno.c
+++ b/src/os_win/os_errno.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_exist.c b/src/os_win/os_exist.c
index 4a727801569..ec1369cc727 100644
--- a/src/os_win/os_exist.c
+++ b/src/os_win/os_exist.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_fallocate.c b/src/os_win/os_fallocate.c
index 030c2e4c6c7..cdc7a1c46ee 100644
--- a/src/os_win/os_fallocate.c
+++ b/src/os_win/os_fallocate.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_filesize.c b/src/os_win/os_filesize.c
index 7f231b5ba9a..c9925fb18a8 100644
--- a/src/os_win/os_filesize.c
+++ b/src/os_win/os_filesize.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_flock.c b/src/os_win/os_flock.c
index 947d7bdcde7..60a981499a5 100644
--- a/src/os_win/os_flock.c
+++ b/src/os_win/os_flock.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_fsync.c b/src/os_win/os_fsync.c
index 7a01b5cd61d..913b7ca5a4e 100644
--- a/src/os_win/os_fsync.c
+++ b/src/os_win/os_fsync.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -25,7 +25,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh)
* Flush a directory to ensure a file creation is durable.
*/
int
-__wt_directory_sync(WT_SESSION_IMPL *session, char *path)
+__wt_directory_sync(WT_SESSION_IMPL *session, const char *path)
{
WT_UNUSED(session);
WT_UNUSED(path);
diff --git a/src/os_win/os_ftruncate.c b/src/os_win/os_ftruncate.c
index cc635306a71..0c11b5509b7 100644
--- a/src/os_win/os_ftruncate.c
+++ b/src/os_win/os_ftruncate.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_getenv.c b/src/os_win/os_getenv.c
index 9b3a20abad7..c9084769cd5 100644
--- a/src/os_win/os_getenv.c
+++ b/src/os_win/os_getenv.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_map.c b/src/os_win/os_map.c
index 3c4edb59ea8..dc040b4fa54 100644
--- a/src/os_win/os_map.c
+++ b/src/os_win/os_map.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index b909afa9ba6..14bac2a99d9 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_once.c b/src/os_win/os_once.c
index bb5e059452e..9ea3fe044eb 100644
--- a/src/os_win/os_once.c
+++ b/src/os_win/os_once.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c
index c3106763452..3ec53daf001 100644
--- a/src/os_win/os_open.c
+++ b/src/os_win/os_open.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_pagesize.c b/src/os_win/os_pagesize.c
new file mode 100644
index 00000000000..648105c0e7c
--- /dev/null
+++ b/src/os_win/os_pagesize.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+
+ return (system_info.dwPageSize);
+}
diff --git a/src/os_win/os_path.c b/src/os_win/os_path.c
index 9d001e50571..e9532de2b38 100644
--- a/src/os_win/os_path.c
+++ b/src/os_win/os_path.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_priv.c b/src/os_win/os_priv.c
index 5c32d6b5999..8c1f3893920 100644
--- a/src/os_win/os_priv.c
+++ b/src/os_win/os_priv.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c
index 55b50030064..5682a25d7f2 100644
--- a/src/os_win/os_remove.c
+++ b/src/os_win/os_remove.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_rename.c b/src/os_win/os_rename.c
index a0f33843218..829ab1d16e9 100644
--- a/src/os_win/os_rename.c
+++ b/src/os_win/os_rename.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_rw.c b/src/os_win/os_rw.c
index bafefcfba24..49f011001a4 100644
--- a/src/os_win/os_rw.c
+++ b/src/os_win/os_rw.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_sleep.c b/src/os_win/os_sleep.c
index 33e04c1d8a9..1d4b316488a 100644
--- a/src/os_win/os_sleep.c
+++ b/src/os_win/os_sleep.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_snprintf.c b/src/os_win/os_snprintf.c
index ebb14fd32e8..a6056ff9342 100644
--- a/src/os_win/os_snprintf.c
+++ b/src/os_win/os_snprintf.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c
index b5f13aea4e9..3be0ccb9393 100644
--- a/src/os_win/os_thread.c
+++ b/src/os_win/os_thread.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_time.c b/src/os_win/os_time.c
index 2292c317a64..e784b5d8a36 100644
--- a/src/os_win/os_time.c
+++ b/src/os_win/os_time.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_vsnprintf.c b/src/os_win/os_vsnprintf.c
index 205b63751c7..63f96e79d5b 100644
--- a/src/os_win/os_vsnprintf.c
+++ b/src/os_win/os_vsnprintf.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/os_win/os_yield.c b/src/os_win/os_yield.c
index dd3eb67de8b..aab1559e072 100644
--- a/src/os_win/os_yield.c
+++ b/src/os_win/os_yield.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/packing/pack_api.c b/src/packing/pack_api.c
index efe999505bf..4c65406cd64 100644
--- a/src/packing/pack_api.c
+++ b/src/packing/pack_api.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/packing/pack_impl.c b/src/packing/pack_impl.c
index 30d28dfb63c..0e3ed44ba6a 100644
--- a/src/packing/pack_impl.c
+++ b/src/packing/pack_impl.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -176,6 +176,8 @@ __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt,
/* Outfmt should complete before infmt */
while ((ret = __pack_next(&packout, &pvout)) == 0) {
+ if (p >= end)
+ WT_ERR(EINVAL);
WT_ERR(__pack_next(&packin, &pvin));
before = p;
WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p)));
diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c
index 1f3449d79d3..98da5b405c3 100644
--- a/src/packing/pack_stream.c
+++ b/src/packing/pack_stream.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c
index 18ed5c6b551..4a3a8a7e988 100644
--- a/src/reconcile/rec_track.c
+++ b/src/reconcile/rec_track.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 21cc68ed119..332449027a9 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -630,12 +630,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
*/
switch (page->type) {
case WT_PAGE_COL_INT:
- WT_RET(__wt_page_alloc(session,
- WT_PAGE_COL_INT, 1, mod->mod_multi_entries, false, &next));
+ WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT,
+ 1, mod->mod_multi_entries, false, &next));
break;
case WT_PAGE_ROW_INT:
- WT_RET(__wt_page_alloc(session,
- WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, false, &next));
+ WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT,
+ WT_RECNO_OOB, mod->mod_multi_entries, false, &next));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
upd->next = append;
+ __wt_cache_page_inmem_incr(
+ session, page, WT_UPDATE_MEMSIZE(append));
}
/*
@@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
* Figure out the maximum leaf page size for the reconciliation.
*/
static inline uint32_t
-__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -3263,7 +3265,14 @@ supd_check_complete:
memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
bnd->cksum = __wt_cksum(buf->data, buf->size);
- if (mod->rec_result == WT_PM_REC_MULTIBLOCK &&
+ /*
+ * One last check: don't reuse blocks if compacting, the reason
+ * for compaction is to move blocks to different locations. We
+ * do this check after calculating the checksums, hopefully the
+ * next write can be skipped.
+ */
+ if (session->compact_state == WT_COMPACT_NONE &&
+ mod->rec_result == WT_PM_REC_MULTIBLOCK &&
mod->mod_multi_entries > bnd_slot) {
multi = &mod->mod_multi[bnd_slot];
if (multi->size == bnd->size &&
@@ -3502,7 +3511,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
break;
case BTREE_COL_VAR:
if (cbulk->rle != 0)
- WT_RET(__wt_bulk_insert_var(session, cbulk));
+ WT_RET(__wt_bulk_insert_var(session, cbulk, false));
break;
case BTREE_ROW:
break;
@@ -3625,43 +3634,20 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
* Fixed-length column-store bulk insert.
*/
int
-__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+__wt_bulk_insert_fix(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
{
WT_BTREE *btree;
WT_CURSOR *cursor;
WT_RECONCILE *r;
- uint32_t entries, offset, page_entries, page_size;
- const uint8_t *data;
r = cbulk->reconcile;
btree = S2BT(session);
cursor = &cbulk->cbt.iface;
- if (cbulk->bitmap) {
- if (((r->recno - 1) * btree->bitcnt) & 0x7)
- WT_RET_MSG(session, EINVAL,
- "Bulk bitmap load not aligned on a byte boundary");
- for (data = cursor->value.data,
- entries = (uint32_t)cursor->value.size;
- entries > 0;
- entries -= page_entries, data += page_size) {
- WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
-
- page_entries =
- WT_MIN(entries, cbulk->nrecs - cbulk->entry);
- page_size = __bitstr_size(page_entries * btree->bitcnt);
- offset = __bitstr_size(cbulk->entry * btree->bitcnt);
- memcpy(r->first_free + offset, data, page_size);
- cbulk->entry += page_entries;
- r->recno += page_entries;
- }
- return (0);
- }
-
WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
-
- __bit_setv(r->first_free,
- cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ __bit_setv(r->first_free, cbulk->entry,
+ btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]);
++cbulk->entry;
++r->recno;
@@ -3669,11 +3655,48 @@ __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
}
/*
+ * __wt_bulk_insert_fix_bitmap --
+ * Fixed-length column-store bulk insert of a bitmap.
+ */
+int
+__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+}
+
+/*
* __wt_bulk_insert_var --
* Variable-length column-store bulk insert.
*/
int
-__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+__wt_bulk_insert_var(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
{
WT_BTREE *btree;
WT_KV *val;
@@ -3682,14 +3705,20 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
r = cbulk->reconcile;
btree = S2BT(session);
- /*
- * Store the bulk cursor's last buffer, not the current value, we're
- * creating a duplicate count, which means we want the previous value
- * seen, not the current value.
- */
val = &r->v;
- WT_RET(__rec_cell_build_val(
- session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else
+ /*
+ * Store the bulk cursor's last buffer, not the current value,
+ * we're tracking duplicates, which means we want the previous
+ * value seen, not the current value.
+ */
+ WT_RET(__rec_cell_build_val(session,
+ r, cbulk->last.data, cbulk->last.size, cbulk->rle));
/* Boundary: split or write the page. */
if (val->len > r->space_avail)
@@ -3923,16 +3952,49 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->recno += entry;
/* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
- WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
- if (upd == NULL)
- continue;
+ for (ins =
+ WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
+ if (ins == NULL) {
+ /*
+ * If the page split, instantiate any missing records in
+ * the page's name space. (Imagine record 98 is
+ * transactionally visible, 99 wasn't created or is not
+ * yet visible, 100 is visible. Then the page splits and
+ * record 100 moves to another page. When we reconcile
+ * the original page, we write record 98, then we don't
+ * see record 99 for whatever reason. If we've moved
+ * record 100, we don't know to write a deleted record
+ * 99 on the page.)
+ *
+ * The record number recorded during the split is the
+ * first key on the split page, that is, one larger than
+ * the last key on this page, we have to decrement it.
+ */
+ if ((recno =
+ page->modify->mod_split_recno) == WT_RECNO_OOB)
+ break;
+ recno -= 1;
+
+ /*
+ * The loop below assumes there are records to write,
+ * but the last record may already have been written.
+ */
+ if (r->recno > recno)
+ break;
+ upd = NULL;
+ } else {
+ WT_RET(
+ __rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ recno = WT_INSERT_RECNO(ins);
+ }
for (;;) {
/*
* The application may have inserted records which left
* gaps in the name space.
*/
- for (recno = WT_INSERT_RECNO(ins);
+ for (;
nrecs > 0 && r->recno < recno;
--nrecs, ++entry, ++r->recno)
__bit_setv(
@@ -3940,6 +4002,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (nrecs > 0) {
__bit_setv(r->first_free, entry, btree->bitcnt,
+ upd == NULL ? 0 :
((uint8_t *)WT_UPDATE_DATA(upd))[0]);
--nrecs;
++entry;
@@ -3961,6 +4024,13 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
entry = 0;
nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
}
+
+ /*
+ * Execute this loop once without an insert item to catch any
+ * missing records due to a split, then quit.
+ */
+ if (ins == NULL)
+ break;
}
/* Update the counters. */
@@ -4441,11 +4511,36 @@ compare: /*
}
/* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
- WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
- if (upd == NULL)
- continue;
- for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ for (ins =
+ WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
+ if (ins == NULL) {
+ /*
+ * If the page split, instantiate any missing records in
+ * the page's name space. (Imagine record 98 is
+ * transactionally visible, 99 wasn't created or is not
+ * yet visible, 100 is visible. Then the page splits and
+ * record 100 moves to another page. When we reconcile
+ * the original page, we write record 98, then we don't
+ * see record 99 for whatever reason. If we've moved
+ * record 100, we don't know to write a deleted record
+ * 99 on the page.)
+ *
+ * The record number recorded during the split is the
+ * first key on the split page, that is, one larger than
+ * the last key on this page, we have to decrement it.
+ */
+ if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB)
+ break;
+ n -= 1;
+ upd = NULL;
+ } else {
+ WT_ERR(
+ __rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ n = WT_INSERT_RECNO(ins);
+ }
+ while (src_recno <= n) {
/*
* The application may have inserted records which left
* gaps in the name space, and these gaps can be huge.
@@ -4468,7 +4563,8 @@ compare: /*
src_recno += skip;
}
} else {
- deleted = WT_UPDATE_DELETED_ISSET(upd);
+ deleted = upd == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd);
if (!deleted) {
data = WT_UPDATE_DATA(upd);
size = upd->size;
@@ -4485,7 +4581,7 @@ compare: /*
last->size == size &&
memcmp(last->data, data, size) == 0)) {
++rle;
- continue;
+ goto next;
}
WT_ERR(__rec_col_var_helper(session, r,
salvage, last, last_deleted, 0, rle));
@@ -4504,7 +4600,23 @@ compare: /*
}
last_deleted = deleted;
rle = 1;
+
+ /*
+ * Move to the next record. It's not a simple increment
+ * because if it's the maximum record, incrementing it
+ * wraps to 0 and this turns into an infinite loop.
+ */
+next: if (src_recno == UINT64_MAX)
+ break;
+ ++src_recno;
}
+
+ /*
+ * Execute this loop once without an insert item to catch any
+ * missing records due to a split, then quit.
+ */
+ if (ins == NULL)
+ break;
}
/* If we were tracking a record, write it. */
@@ -5343,11 +5455,10 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r)
switch (page->type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- WT_ERR(__wt_buf_set_printable(
- session, tkey, bnd->key.data, bnd->key.size));
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "starting key %.*s",
- (int)tkey->size, (const char *)tkey->data));
+ "starting key %s",
+ __wt_buf_set_printable(
+ session, bnd->key.data, bnd->key.size, tkey)));
break;
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index db4658cbd0e..8cdcbbcad54 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -74,11 +74,11 @@ __create_file(WT_SESSION_IMPL *session,
{
WT_DECL_ITEM(val);
WT_DECL_RET;
- uint32_t allocsize;
- bool is_metadata;
const char *filename, **p, *filecfg[] =
{ WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
char *fileconf;
+ uint32_t allocsize;
+ bool is_metadata;
fileconf = NULL;
@@ -97,7 +97,7 @@ __create_file(WT_SESSION_IMPL *session,
}
/* Sanity check the allocation size. */
- WT_RET(__wt_direct_io_size_check(
+ WT_ERR(__wt_direct_io_size_check(
session, filecfg, "allocation_size", &allocsize));
/* Create the file. */
@@ -197,13 +197,15 @@ __create_colgroup(WT_SESSION_IMPL *session,
{ WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
const char *sourcecfg[] = { config, NULL, NULL };
const char *cgname, *source, *sourceconf, *tablename;
- char *cgconf, *oldconf;
+ char *cgconf, *origconf;
+ bool exists;
sourceconf = NULL;
- cgconf = oldconf = NULL;
+ cgconf = origconf = NULL;
WT_CLEAR(fmt);
WT_CLEAR(confbuf);
WT_CLEAR(namebuf);
+ exists = false;
tablename = name;
if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
@@ -228,6 +230,14 @@ __create_colgroup(WT_SESSION_IMPL *session,
"Column group '%s' not found in table '%.*s'",
cgname, (int)tlen, tablename);
+ /* Check if the column group already exists. */
+ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) {
+ if (exclusive)
+ WT_ERR(EEXIST);
+ exists = true;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
/* Find the first NULL entry in the cfg stack. */
for (cfgp = &cfg[1]; *cfgp; cfgp++)
;
@@ -262,25 +272,22 @@ __create_colgroup(WT_SESSION_IMPL *session,
}
sourcecfg[1] = fmt.data;
WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));
-
WT_ERR(__wt_schema_create(session, source, sourceconf));
WT_ERR(__wt_config_collapse(session, cfg, &cgconf));
- if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
- /*
- * If the entry already exists in the metadata, we're done.
- * This is an error for exclusive creates but okay otherwise.
- */
- if (ret == WT_DUPLICATE_KEY)
- ret = exclusive ? EEXIST : 0;
+ if (exists) {
+ if (strcmp(cgconf, origconf) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "%s: does not match existing configuration", name);
goto err;
}
+ WT_ERR(__wt_metadata_insert(session, name, cgconf));
WT_ERR(__wt_schema_open_colgroups(session, table));
err: __wt_free(session, cgconf);
__wt_free(session, sourceconf);
- __wt_free(session, oldconf);
+ __wt_free(session, origconf);
__wt_buf_free(session, &confbuf);
__wt_buf_free(session, &fmt);
__wt_buf_free(session, &namebuf);
@@ -382,18 +389,18 @@ __create_index(WT_SESSION_IMPL *session,
{ WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
const char *sourcecfg[] = { config, NULL, NULL };
const char *source, *sourceconf, *idxname, *tablename;
- char *idxconf;
+ char *idxconf, *origconf;
size_t tlen;
- bool have_extractor;
+ bool exists, have_extractor;
u_int i, npublic_cols;
sourceconf = NULL;
- idxconf = NULL;
+ idxconf = origconf = NULL;
WT_CLEAR(confbuf);
WT_CLEAR(fmt);
WT_CLEAR(extra_cols);
WT_CLEAR(namebuf);
- have_extractor = false;
+ exists = have_extractor = false;
tablename = name;
if (!WT_PREFIX_SKIP(tablename, "index:"))
@@ -411,9 +418,17 @@ __create_index(WT_SESSION_IMPL *session,
(int)tlen, tablename);
if (table->is_simple)
- WT_RET_MSG(session, EINVAL,
+ WT_ERR_MSG(session, EINVAL,
"%s requires a table with named columns", name);
+ /* Check if the index already exists. */
+ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) {
+ if (exclusive)
+ WT_ERR(EEXIST);
+ exists = true;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
if (__wt_config_getones(session, config, "source", &cval) == 0) {
WT_ERR(__wt_buf_fmt(session, &namebuf,
"%.*s", (int)cval.len, cval.str));
@@ -488,8 +503,7 @@ __create_index(WT_SESSION_IMPL *session,
WT_ERR(__wt_buf_catfmt(
session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
}
- if (ret != 0 && ret != WT_NOTFOUND)
- goto err;
+ WT_ERR_NOTFOUND_OK(ret);
/* Index values are empty: all columns are packed into the index key. */
WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));
@@ -525,23 +539,22 @@ __create_index(WT_SESSION_IMPL *session,
cfg[1] = sourceconf;
cfg[2] = confbuf.data;
WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
- if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
- /*
- * If the entry already exists in the metadata, we're done.
- * This is an error for exclusive creates but okay otherwise.
- */
- if (ret == WT_DUPLICATE_KEY)
- ret = exclusive ? EEXIST : 0;
+ if (exists) {
+ if (strcmp(idxconf, origconf) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "%s: does not match existing configuration", name);
goto err;
}
+ WT_ERR(__wt_metadata_insert(session, name, idxconf));
/* Make sure that the configuration is valid. */
WT_ERR(__wt_schema_open_index(
session, table, idxname, strlen(idxname), &idx));
-
- WT_ERR(__fill_index(session, table, idx));
+ if (!exists)
+ WT_ERR(__fill_index(session, table, idx));
err: __wt_free(session, idxconf);
+ __wt_free(session, origconf);
__wt_free(session, sourceconf);
__wt_buf_free(session, &confbuf);
__wt_buf_free(session, &extra_cols);
@@ -570,10 +583,12 @@ __create_table(WT_SESSION_IMPL *session,
char *tableconf, *cgname;
size_t cgsize;
int ncolgroups;
+ bool exists;
cgname = NULL;
table = NULL;
tableconf = NULL;
+ exists = false;
tablename = name;
if (!WT_PREFIX_SKIP(tablename, "table:"))
@@ -581,8 +596,9 @@ __create_table(WT_SESSION_IMPL *session,
if ((ret = __wt_schema_get_table(session,
tablename, strlen(tablename), false, &table)) == 0) {
- __wt_schema_release_table(session, table);
- return (exclusive ? EEXIST : 0);
+ if (exclusive)
+ WT_ERR(EEXIST);
+ exists = true;
}
WT_RET_NOTFOUND_OK(ret);
@@ -595,15 +611,13 @@ __create_table(WT_SESSION_IMPL *session,
WT_ERR_NOTFOUND_OK(ret);
WT_ERR(__wt_config_collapse(session, cfg, &tableconf));
- if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
- /*
- * If the entry already exists in the metadata, we're done.
- * This is an error for exclusive creates but okay otherwise.
- */
- if (ret == WT_DUPLICATE_KEY)
- ret = exclusive ? EEXIST : 0;
+ if (exists) {
+ if (strcmp(tableconf, table->config) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "%s: does not match existing configuration", name);
goto err;
}
+ WT_ERR(__wt_metadata_insert(session, name, tableconf));
/* Attempt to open the table now to catch any errors. */
WT_ERR(__wt_schema_get_table(
diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c
index 9b9f3a23961..6ac76930c9a 100644
--- a/src/schema/schema_drop.c
+++ b/src/schema/schema_drop.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -29,7 +29,7 @@ __drop_file(
return (EINVAL);
/* Close all btree handles associated with this file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_conn_dhandle_close_all(session, uri, force));
WT_RET(ret);
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index da5f033ad40..5e9caf94b7a 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -21,9 +21,9 @@ __schema_add_table(WT_SESSION_IMPL *session,
uint64_t bucket;
/* Make sure the metadata is open before getting other locks. */
- WT_RET(__wt_metadata_open(session));
+ WT_RET(__wt_metadata_cursor(session, NULL));
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_TABLE_LOCK(session, ret,
ret = __wt_schema_open_table(
session, name, namelen, ok_incomplete, &table));
WT_RET(ret);
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index ba8664f2e39..49318f80959 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -291,7 +291,7 @@ __schema_open_index(WT_SESSION_IMPL *session,
WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));
/* Find matching indices. */
- WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_ERR(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, tmp->data);
if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
ret = cursor->next(cursor);
@@ -379,10 +379,10 @@ __schema_open_index(WT_SESSION_IMPL *session,
table->idx_complete = true;
}
-err: __wt_scr_free(session, &tmp);
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
WT_TRET(__wt_schema_destroy_index(session, &idx));
- if (cursor != NULL)
- WT_TRET(cursor->close(cursor));
+
+ __wt_scr_free(session, &tmp);
return (ret);
}
@@ -438,7 +438,7 @@ __schema_open_table(WT_SESSION_IMPL *session,
WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));
- WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_ERR(__wt_metadata_cursor(session, &cursor));
cursor->set_key(cursor, tablename);
WT_ERR(cursor->search(cursor));
WT_ERR(cursor->get_value(cursor, &tconfig));
@@ -508,8 +508,7 @@ __schema_open_table(WT_SESSION_IMPL *session,
if (0) {
err: WT_TRET(__wt_schema_destroy_table(session, &table));
}
- if (cursor != NULL)
- WT_TRET(cursor->close(cursor));
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
__wt_free(session, tablename);
__wt_scr_free(session, &buf);
diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c
index 066e666190b..612a2d2d192 100644
--- a/src/schema/schema_plan.c
+++ b/src/schema/schema_plan.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/schema/schema_project.c b/src/schema/schema_project.c
index be5f73b48ed..4d29b2baa13 100644
--- a/src/schema/schema_project.c
+++ b/src/schema/schema_project.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index 3f368417d40..4ec126394dd 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -30,7 +30,7 @@ __rename_file(
return (EINVAL);
/* Close any btree handles in the file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_conn_dhandle_close_all(session, uri, false));
WT_ERR(ret);
diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c
index 82c2e2a15dc..d3d0605c60a 100644
--- a/src/schema/schema_stat.c
+++ b/src/schema/schema_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c
index c39bba4753c..e7752b60ca4 100644
--- a/src/schema/schema_truncate.c
+++ b/src/schema/schema_truncate.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -9,43 +9,6 @@
#include "wt_internal.h"
/*
- * __truncate_file --
- * WT_SESSION::truncate for a file.
- */
-static int
-__truncate_file(WT_SESSION_IMPL *session, const char *uri)
-{
- WT_DECL_RET;
- const char *filename;
- uint32_t allocsize;
-
- filename = uri;
- if (!WT_PREFIX_SKIP(filename, "file:"))
- return (EINVAL);
-
- /* Open and lock the file. */
- WT_RET(__wt_session_get_btree(
- session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
- WT_STAT_FAST_DATA_INCR(session, cursor_truncate);
-
- /* Get the allocation size. */
- allocsize = S2BT(session)->allocsize;
-
- WT_RET(__wt_session_release_btree(session));
-
- /* Close any btree handles in the file. */
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __wt_conn_dhandle_close_all(session, uri, false));
- WT_RET(ret);
-
- /* Delete the root address and truncate the file. */
- WT_RET(__wt_meta_checkpoint_clear(session, uri));
- WT_RET(__wt_block_manager_truncate(session, filename, allocsize));
-
- return (0);
-}
-
-/*
* __truncate_table --
* WT_SESSION::truncate for a table.
*/
@@ -112,9 +75,12 @@ __wt_schema_truncate(
tablename = uri;
- if (WT_PREFIX_MATCH(uri, "file:")) {
- ret = __truncate_file(session, uri);
- } else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ /*
+ * File truncate translates into a range truncate.
+ */
+ ret = __wt_session_range_truncate(session, uri, NULL, NULL);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
ret = __wt_lsm_tree_truncate(session, uri, cfg);
else if (WT_PREFIX_SKIP(tablename, "table:"))
ret = __truncate_table(session, tablename, cfg);
diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c
index 1e810e8adc9..d1c84dc8d85 100644
--- a/src/schema/schema_util.c
+++ b/src/schema/schema_util.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index 64218923173..a2fe5244c4d 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
* any open file handles, including checkpoints.
*/
if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) {
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_conn_dhandle_close_all(
session, uri, false));
WT_ERR(ret);
@@ -63,7 +63,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
} else if (ret == EBUSY) {
WT_ASSERT(session, !FLD_ISSET(
open_flags, WT_DHANDLE_EXCLUSIVE));
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_conn_btree_apply_single_ckpt(
session, uri, file_func, cfg));
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 053f69ee7f8..c03b5fdc044 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config)
* via the registered close callback.
*/
if (session->event_handler->handle_close != NULL &&
- !WT_STREQ(cursor->uri, WT_LAS_URI))
+ !WT_STREQ(cursor->internal_uri, WT_LAS_URI))
WT_TRET(session->event_handler->handle_close(
session->event_handler, wt_session, cursor));
WT_TRET(cursor->close(cursor));
@@ -442,8 +442,8 @@ __wt_session_create(
{
WT_DECL_RET;
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ WT_WITH_TABLE_LOCK(session, ret,
ret = __wt_schema_create(session, uri, config)));
return (ret);
}
@@ -554,6 +554,32 @@ err: API_END_RET(session, ret);
}
/*
+ * __session_rebalance --
+ * WT_SESSION->rebalance method.
+ */
+static int
+__session_rebalance(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, rebalance, config, cfg);
+
+ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ WT_ERR(ENOTSUP);
+
+ /* Block out checkpoints to avoid spurious EBUSY errors. */
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __wt_schema_worker(session, uri, __wt_bt_rebalance,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_REBALANCE)));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
* __session_rename --
* WT_SESSION->rename method.
*/
@@ -571,8 +597,8 @@ __session_rename(WT_SESSION *wt_session,
WT_ERR(__wt_str_name_check(session, uri));
WT_ERR(__wt_str_name_check(session, newuri));
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ WT_WITH_TABLE_LOCK(session, ret,
ret = __wt_schema_rename(session, uri, newuri, cfg)));
err: API_END_RET_NOTFOUND_MAP(session, ret);
@@ -611,10 +637,22 @@ int
__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
WT_DECL_RET;
+ WT_CONFIG_ITEM cval;
+ bool lock_wait;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "lock_wait", 1, &cval));
+ lock_wait = cval.val != 0 || F_ISSET(session, WT_SESSION_LOCK_NO_WAIT);
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
+ if (!lock_wait)
+ F_SET(session, WT_SESSION_LOCK_NO_WAIT);
+
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ WT_WITH_TABLE_LOCK(session, ret,
ret = __wt_schema_drop(session, uri, cfg)));
+
+ if (!lock_wait)
+ F_CLR(session, WT_SESSION_LOCK_NO_WAIT);
+
return (ret);
}
@@ -648,6 +686,7 @@ static int
__session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
WT_CURSOR *ref_cursor, const char *config)
{
+ WT_CURSOR *firstcg;
WT_CONFIG_ITEM cval;
WT_CURSOR_INDEX *cindex;
WT_CURSOR_JOIN *cjoin;
@@ -661,6 +700,7 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
uint8_t flags, range;
count = 0;
+ firstcg = NULL;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, join, config, cfg);
table = NULL;
@@ -672,15 +712,18 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
cindex = (WT_CURSOR_INDEX *)ref_cursor;
idx = cindex->index;
table = cindex->table;
- WT_CURSOR_CHECKKEY(ref_cursor);
+ firstcg = cindex->cg_cursors[0];
} else if (WT_PREFIX_MATCH(ref_cursor->uri, "table:")) {
idx = NULL;
ctable = (WT_CURSOR_TABLE *)ref_cursor;
table = ctable->table;
- WT_CURSOR_CHECKKEY(ctable->cg_cursors[0]);
+ firstcg = ctable->cg_cursors[0];
} else
WT_ERR_MSG(session, EINVAL, "not an index or table cursor");
+ if (!F_ISSET(firstcg, WT_CURSTD_KEY_SET))
+ WT_ERR_MSG(session, EINVAL,
+ "requires reference cursor be positioned");
cjoin = (WT_CURSOR_JOIN *)join_cursor;
if (cjoin->table != table)
WT_ERR_MSG(session, EINVAL,
@@ -771,71 +814,48 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
WT_ERR(ENOTSUP);
/* Block out checkpoints to avoid spurious EBUSY errors. */
- WT_WITH_CHECKPOINT_LOCK(session,
- WT_WITH_SCHEMA_LOCK(session, ret =
- __wt_schema_worker(session, uri, __wt_salvage,
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __wt_schema_worker(session, uri, __wt_salvage,
NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE)));
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
- * __session_truncate --
- * WT_SESSION->truncate method.
+ * __wt_session_range_truncate --
+ * Session handling of a range truncate.
*/
-static int
-__session_truncate(WT_SESSION *wt_session,
- const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
+int
+__wt_session_range_truncate(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *start, WT_CURSOR *stop)
{
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
WT_CURSOR *cursor;
+ WT_DECL_RET;
int cmp;
bool local_start;
local_start = false;
-
- session = (WT_SESSION_IMPL *)wt_session;
- SESSION_TXN_API_CALL(session, truncate, config, cfg);
- WT_STAT_FAST_CONN_INCR(session, cursor_truncate);
-
- /*
- * If the URI is specified, we don't need a start/stop, if start/stop
- * is specified, we don't need a URI. One exception is the log URI
- * which may truncate (archive) log files for a backup cursor.
- *
- * If no URI is specified, and both cursors are specified, start/stop
- * must reference the same object.
- *
- * Any specified cursor must have been initialized.
- */
- if ((uri == NULL && start == NULL && stop == NULL) ||
- (uri != NULL && !WT_PREFIX_MATCH(uri, "log:") &&
- (start != NULL || stop != NULL)))
- WT_ERR_MSG(session, EINVAL,
- "the truncate method should be passed either a URI or "
- "start/stop cursors, but not both");
-
if (uri != NULL) {
- /* Disallow objects in the WiredTiger name space. */
- WT_ERR(__wt_str_name_check(session, uri));
-
- if (WT_PREFIX_MATCH(uri, "log:")) {
+ WT_ASSERT(session, WT_PREFIX_MATCH(uri, "file:"));
+ /*
+ * A URI file truncate becomes a range truncate where we
+ * set a start cursor at the beginning. We already
+ * know the NULL stop goes to the end of the range.
+ */
+ WT_ERR(__session_open_cursor(
+ (WT_SESSION *)session, uri, NULL, NULL, &start));
+ local_start = true;
+ ret = start->next(start);
+ if (ret == WT_NOTFOUND) {
/*
- * Verify the user only gave the URI prefix and not
- * a specific target name after that.
+ * If there are no elements, there is nothing
+ * to do.
*/
- if (!WT_STREQ(uri, "log:"))
- WT_ERR_MSG(session, EINVAL,
- "the truncate method should not specify any"
- "target after the log: URI prefix.");
- ret = __wt_log_truncate_files(session, start, cfg);
- } else
- /* Wait for checkpoints to avoid EBUSY errors. */
- WT_WITH_CHECKPOINT_LOCK(session,
- WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_truncate(session, uri, cfg)));
- goto done;
+ ret = 0;
+ goto done;
+ }
+ WT_ERR(ret);
}
/*
@@ -893,7 +913,7 @@ __session_truncate(WT_SESSION *wt_session,
*/
if (start == NULL) {
WT_ERR(__session_open_cursor(
- wt_session, stop->uri, NULL, NULL, &start));
+ (WT_SESSION *)session, stop->uri, NULL, NULL, &start));
local_start = true;
WT_ERR(start->next(start));
}
@@ -910,13 +930,72 @@ __session_truncate(WT_SESSION *wt_session,
WT_ERR(__wt_schema_range_truncate(session, start, stop));
done:
-err: TXN_API_END_RETRY(session, ret, 0);
-
- /*
+err: /*
* Close any locally-opened start cursor.
*/
if (local_start)
WT_TRET(start->close(start));
+ return (ret);
+}
+
+/*
+ * __session_truncate --
+ * WT_SESSION->truncate method.
+ */
+static int
+__session_truncate(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_TXN_API_CALL(session, truncate, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, cursor_truncate);
+
+ /*
+ * If the URI is specified, we don't need a start/stop, if start/stop
+ * is specified, we don't need a URI. One exception is the log URI
+ * which may truncate (archive) log files for a backup cursor.
+ *
+ * If no URI is specified, and both cursors are specified, start/stop
+ * must reference the same object.
+ *
+ * Any specified cursor must have been initialized.
+ */
+ if ((uri == NULL && start == NULL && stop == NULL) ||
+ (uri != NULL && !WT_PREFIX_MATCH(uri, "log:") &&
+ (start != NULL || stop != NULL)))
+ WT_ERR_MSG(session, EINVAL,
+ "the truncate method should be passed either a URI or "
+ "start/stop cursors, but not both");
+
+ if (uri != NULL) {
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ if (WT_PREFIX_MATCH(uri, "log:")) {
+ /*
+ * Verify the user only gave the URI prefix and not
+ * a specific target name after that.
+ */
+ if (!WT_STREQ(uri, "log:"))
+ WT_ERR_MSG(session, EINVAL,
+ "the truncate method should not specify any"
+ "target after the log: URI prefix.");
+ WT_ERR(__wt_log_truncate_files(session, start, cfg));
+ } else if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_ERR(__wt_session_range_truncate(
+ session, uri, start, stop));
+ else
+ /* Wait for checkpoints to avoid EBUSY errors. */
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __wt_schema_truncate(session, uri, cfg)));
+ } else
+ WT_ERR(__wt_session_range_truncate(session, uri, start, stop));
+
+err: TXN_API_END_RETRY(session, ret, 0);
/*
* Only map WT_NOTFOUND to ENOENT if a URI was specified.
@@ -938,8 +1017,8 @@ __session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
SESSION_API_CALL(session, upgrade, config, cfg);
/* Block out checkpoints to avoid spurious EBUSY errors. */
- WT_WITH_CHECKPOINT_LOCK(session,
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_worker(session, uri, __wt_upgrade,
NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE)));
@@ -964,8 +1043,8 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
WT_ERR(ENOTSUP);
/* Block out checkpoints to avoid spurious EBUSY errors. */
- WT_WITH_CHECKPOINT_LOCK(session,
- WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_worker(session, uri, __wt_verify,
NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY)));
@@ -1287,6 +1366,7 @@ __open_session(WT_CONNECTION_IMPL *conn,
__session_join,
__session_log_flush,
__session_log_printf,
+ __session_rebalance,
__session_rename,
__session_reset,
__session_salvage,
@@ -1443,7 +1523,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
*/
if (open_metadata) {
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
- if ((ret = __wt_metadata_open(session)) != 0) {
+ if ((ret = __wt_metadata_cursor(session, NULL)) != 0) {
wt_session = &session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
return (ret);
@@ -1486,14 +1566,11 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
* deadlocked getting the cursor late in the process. Be defensive,
* get it now.
*/
- if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
- WT_WITHOUT_DHANDLE(session, ret =
- __wt_las_cursor_create(session, &session->las_cursor));
- if (ret != 0) {
- wt_session = &session->iface;
- WT_TRET(wt_session->close(wt_session, NULL));
- return (ret);
- }
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR) &&
+ (ret = __wt_las_cursor_open(session, &session->las_cursor)) != 0) {
+ wt_session = &session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ return (ret);
}
*sessionp = session;
diff --git a/src/session/session_compact.c b/src/session/session_compact.c
index 456fcd3ce03..5abccbd1366 100644
--- a/src/session/session_compact.c
+++ b/src/session/session_compact.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
for (i = 0; i < 100; ++i) {
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
- session->compaction = false;
- WT_WITH_SCHEMA_LOCK(session,
+ session->compact_state = WT_COMPACT_RUNNING;
+ WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_worker(
session, uri, __wt_compact, NULL, cfg, 0));
WT_ERR(ret);
- if (!session->compaction)
+ if (session->compact_state != WT_COMPACT_SUCCESS)
break;
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
@@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
WT_ERR(__session_compact_check_timeout(session, start_time));
}
-err: __wt_scr_free(session, &t);
+err: session->compact_state = WT_COMPACT_NONE;
+
+ __wt_scr_free(session, &t);
return (ret);
}
@@ -226,7 +228,8 @@ __wt_session_compact(
session->compact->max_time = (uint64_t)cval.val;
/* Find the types of data sources are being compacted. */
- WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ ret = __wt_schema_worker(
session, uri, NULL, __wt_compact_uri_analyze, cfg, 0));
WT_ERR(ret);
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index dd5094fb480..1ac758c0cee 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -453,8 +453,8 @@ __session_get_dhandle(
* We didn't find a match in the session cache, search the shared
* handle list and cache the handle we find.
*/
- WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __session_find_shared_dhandle(session, uri, checkpoint));
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
+ ret = __session_find_shared_dhandle(session, uri, checkpoint));
if (ret == 0)
ret = __session_add_dhandle(session, NULL);
@@ -509,9 +509,9 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
WT_RET(__wt_writeunlock(session, dhandle->rwlock));
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __wt_session_get_btree(
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
+ ret = __wt_session_get_btree(
session, uri, checkpoint, cfg, flags)));
return (ret);
diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c
index 07f68e4c194..983b28dd8ea 100644
--- a/src/session/session_salvage.c
+++ b/src/session/session_salvage.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/support/cksum.c b/src/support/cksum.c
index a8b5823100d..c2982c40015 100644
--- a/src/support/cksum.c
+++ b/src/support/cksum.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/support/crypto.c b/src/support/crypto.c
index b1102163e7b..1049621fb44 100644
--- a/src/support/crypto.c
+++ b/src/support/crypto.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/support/err.c b/src/support/err.c
index de518cbf08b..875bd3efcf3 100644
--- a/src/support/err.c
+++ b/src/support/err.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/support/filename.c b/src/support/filename.c
index 02a83803e25..215f5b47997 100644
--- a/src/support/filename.c
+++ b/src/support/filename.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -65,11 +65,49 @@ __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name)
}
/*
- * __wt_sync_and_rename_fh --
+ * __wt_rename_and_sync_directory --
+ * Rename a file and sync the enclosing directory.
+ */
+int
+__wt_rename_and_sync_directory(
+ WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ const char *fp, *tp;
+ bool same_directory;
+
+ /* Rename the source file to the target. */
+ WT_RET(__wt_rename(session, from, to));
+
+ /*
+ * Flush the backing directory to guarantee the rename. My reading of
+ * POSIX 1003.1 is there's no guarantee flushing only one of the from
+ * or to directories, or flushing a common parent, is sufficient, and
+ * even if POSIX were to make that guarantee, existing filesystems are
+ * known to not provide the guarantee or only provide the guarantee
+ * with specific mount options. Flush both of the from/to directories
+ * until it's a performance problem.
+ */
+ WT_RET(__wt_directory_sync(session, from));
+
+ /*
+ * In almost all cases, we're going to be renaming files in the same
+ * directory, we can at least fast-path that.
+ */
+ fp = strrchr(from, '/');
+ tp = strrchr(to, '/');
+ same_directory = (fp == NULL && tp == NULL) ||
+ (fp != NULL && tp != NULL &&
+ fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
+
+ return (same_directory ? 0 : __wt_directory_sync(session, to));
+}
+
+/*
+ * __wt_fh_sync_and_rename --
* Sync and close a file, and swap it into place.
*/
int
-__wt_sync_and_rename_fh(
+__wt_fh_sync_and_rename(
WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to)
{
WT_DECL_RET;
@@ -83,19 +121,15 @@ __wt_sync_and_rename_fh(
WT_TRET(__wt_close(session, &fh));
WT_RET(ret);
- /* Rename the source file to the target. */
- WT_RET(__wt_rename(session, from, to));
-
- /* Flush the backing directory to guarantee the rename. */
- return (__wt_directory_sync(session, NULL));
+ return (__wt_rename_and_sync_directory(session, from, to));
}
/*
- * __wt_sync_and_rename_fp --
+ * __wt_sync_fp_and_rename --
* Sync and close a file, and swap it into place.
*/
int
-__wt_sync_and_rename_fp(
+__wt_sync_fp_and_rename(
WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to)
{
FILE *fp;
@@ -106,9 +140,5 @@ __wt_sync_and_rename_fp(
/* Flush to disk and close the handle. */
WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE));
- /* Rename the source file to the target. */
- WT_RET(__wt_rename(session, from, to));
-
- /* Flush the backing directory to guarantee the rename. */
- return (__wt_directory_sync(session, NULL));
+ return (__wt_rename_and_sync_directory(session, from, to));
}
diff --git a/src/support/global.c b/src/support/global.c
index 1e32f5b4453..0234455b6ce 100644
--- a/src/support/global.c
+++ b/src/support/global.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -12,28 +12,6 @@ WT_PROCESS __wt_process; /* Per-process structure */
static int __wt_pthread_once_failed; /* If initialization failed */
/*
- * __system_is_little_endian --
- * Check if the system is little endian.
- */
-static int
-__system_is_little_endian(void)
-{
- uint64_t v;
- bool little;
-
- v = 1;
- little = *((uint8_t *)&v) != 0;
-
- if (little)
- return (0);
-
- fprintf(stderr,
- "This release of the WiredTiger data engine does not support "
- "big-endian systems; contact WiredTiger for more information.\n");
- return (EINVAL);
-}
-
-/*
* __wt_global_once --
* Global initialization, run once.
*/
@@ -42,11 +20,6 @@ __wt_global_once(void)
{
WT_DECL_RET;
- if ((ret = __system_is_little_endian()) != 0) {
- __wt_pthread_once_failed = ret;
- return;
- }
-
if ((ret =
__wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
__wt_pthread_once_failed = ret;
@@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session)
/* Sleep forever, the debugger will interrupt us when it attaches. */
for (;;)
- __wt_sleep(100, 0);
+ __wt_sleep(10, 0);
#else
WT_UNUSED(session);
#endif
diff --git a/src/support/hash_city.c b/src/support/hash_city.c
index 9a4a6464f40..5780cd7b459 100644
--- a/src/support/hash_city.c
+++ b/src/support/hash_city.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
@@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) {
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
+#elif defined(__sun)
+
+#include <sys/byteorder.h>
+#define bswap_32 BSWAP_32
+#define bswap_64 BSWAP_64
+
#else
#include <byteswap.h>
#endif
diff --git a/src/support/hash_fnv.c b/src/support/hash_fnv.c
index e780931454d..35e7e5f3a73 100644
--- a/src/support/hash_fnv.c
+++ b/src/support/hash_fnv.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/support/hazard.c b/src/support/hazard.c
index 0fc7051fb90..13e0eb3b9ac 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/support/hex.c b/src/support/hex.c
index eb9f420911a..d42a84154ca 100644
--- a/src/support/hex.c
+++ b/src/support/hex.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static const u_char hex[] = "0123456789abcdef";
+const u_char __wt_hex[] = "0123456789abcdef";
/*
* __fill_hex --
@@ -25,8 +25,8 @@ __fill_hex(const uint8_t *src, size_t src_max,
--dest_max;
for (; src_max > 0 && dest_max > 1;
src_max -= 1, dest_max -= 2, ++src) {
- *dest++ = hex[(*src & 0xf0) >> 4];
- *dest++ = hex[*src & 0x0f];
+ *dest++ = __wt_hex[(*src & 0xf0) >> 4];
+ *dest++ = __wt_hex[*src & 0x0f];
}
*dest++ = '\0';
if (lenp != NULL)
@@ -34,6 +34,17 @@ __fill_hex(const uint8_t *src, size_t src_max,
}
/*
+ * __wt_fill_hex --
+ * In-memory conversion of raw bytes to a hexadecimal representation.
+ */
+void
+__wt_fill_hex(const uint8_t *src, size_t src_max,
+ uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+ __fill_hex(src, src_max, dest, dest_max, lenp);
+}
+
+/*
* __wt_raw_to_hex --
* Convert a chunk of data to a nul-terminated printable hex string.
*/
@@ -72,10 +83,6 @@ __wt_raw_to_esc_hex(
*/
WT_RET(__wt_buf_init(session, to, size * 3 + 1));
- /*
- * In the worst case, every character takes up 3 spaces, plus a
- * trailing nul byte.
- */
for (p = from, t = to->mem, i = size; i > 0; --i, ++p)
if (isprint((int)*p)) {
if (*p == '\\')
@@ -83,8 +90,8 @@ __wt_raw_to_esc_hex(
*t++ = *p;
} else {
*t++ = '\\';
- *t++ = hex[(*p & 0xf0) >> 4];
- *t++ = hex[*p & 0x0f];
+ *t++ = __wt_hex[(*p & 0xf0) >> 4];
+ *t++ = __wt_hex[*p & 0x0f];
}
*t++ = '\0';
to->size = WT_PTRDIFF(t, to->mem);
diff --git a/src/support/huffman.c b/src/support/huffman.c
index 4bda365cb10..edd0bc9f648 100644
--- a/src/support/huffman.c
+++ b/src/support/huffman.c
@@ -1,9 +1,31 @@
-/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+/*
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
- * See the file LICENSE for redistribution information.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name MongoDB or the name WiredTiger
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include "wt_internal.h"
diff --git a/src/support/pow.c b/src/support/pow.c
index 0f50bfe56a1..028263581d3 100644
--- a/src/support/pow.c
+++ b/src/support/pow.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
diff --git a/src/support/rand.c b/src/support/rand.c
index f5ecb12633e..d2e4cd27aab 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -1,5 +1,5 @@
/*-
- * Public Domain 2014-2015 MongoDB, Inc.
+ * Public Domain 2014-2016 MongoDB, Inc.
* Public Domain 2008-2014 WiredTiger, Inc.
*
* This is free and unencumbered software released into the public domain.
@@ -60,6 +60,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state)
}
/*
+ * __wt_random_init_seed --
+ *	Seed the state of a 32-bit pseudo-random number generator from the
+ *	current time. Meant as an alternative to __wt_random_init for
+ *	multi-threaded callers where each thread wants to initialize its own
+ *	state from a different seed.
+ */
+int
+__wt_random_init_seed(
+    WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state)
+{
+	struct timespec now;
+	WT_RAND_STATE seed;
+
+	/* Read the clock; the caller's state is not written if this fails. */
+	WT_RET(__wt_epoch(session, &now));
+
+	/* Both halves come from the nanosecond field, offset differently. */
+	M_Z(seed) = (uint32_t)(now.tv_nsec + 362436069);
+	M_W(seed) = (uint32_t)(now.tv_nsec + 521288629);
+
+	/* Hand the finished state back with one structure assignment. */
+	*rnd_state = seed;
+	return (0);
+}
+
+/*
* __wt_random --
* Return a 32-bit pseudo-random number.
*/
diff --git a/src/support/scratch.c b/src/support/scratch.c
index f0c403c9ec8..94020ba2621 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/support/stat.c b/src/support/stat.c
index 4d7cd65fd18..7a615131628 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single(
to->block_alloc += from->block_alloc;
to->block_free += from->block_free;
to->block_checkpoint_size += from->block_checkpoint_size;
- to->allocation_size = from->allocation_size;
+ if (from->allocation_size > to->allocation_size)
+ to->allocation_size = from->allocation_size;
to->block_reuse_bytes += from->block_reuse_bytes;
- to->block_magic = from->block_magic;
- to->block_major = from->block_major;
+ if (from->block_magic > to->block_magic)
+ to->block_magic = from->block_magic;
+ if (from->block_major > to->block_major)
+ to->block_major = from->block_major;
to->block_size += from->block_size;
- to->block_minor = from->block_minor;
+ if (from->block_minor > to->block_minor)
+ to->block_minor = from->block_minor;
to->btree_checkpoint_generation += from->btree_checkpoint_generation;
to->btree_column_fix += from->btree_column_fix;
to->btree_column_internal += from->btree_column_internal;
to->btree_column_deleted += from->btree_column_deleted;
to->btree_column_variable += from->btree_column_variable;
to->btree_column_rle += from->btree_column_rle;
- to->btree_fixed_len = from->btree_fixed_len;
+ if (from->btree_fixed_len > to->btree_fixed_len)
+ to->btree_fixed_len = from->btree_fixed_len;
if (from->btree_maxintlkey > to->btree_maxintlkey)
to->btree_maxintlkey = from->btree_maxintlkey;
if (from->btree_maxintlpage > to->btree_maxintlpage)
@@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate(
to->block_free += WT_STAT_READ(from, block_free);
to->block_checkpoint_size +=
WT_STAT_READ(from, block_checkpoint_size);
- to->allocation_size = from[0]->allocation_size;
+ if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size)
+ to->allocation_size = v;
to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes);
- to->block_magic = from[0]->block_magic;
- to->block_major = from[0]->block_major;
+ if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic)
+ to->block_magic = v;
+ if ((v = WT_STAT_READ(from, block_major)) > to->block_major)
+ to->block_major = v;
to->block_size += WT_STAT_READ(from, block_size);
- to->block_minor = from[0]->block_minor;
+ if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor)
+ to->block_minor = v;
to->btree_checkpoint_generation +=
WT_STAT_READ(from, btree_checkpoint_generation);
to->btree_column_fix += WT_STAT_READ(from, btree_column_fix);
@@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate(
to->btree_column_variable +=
WT_STAT_READ(from, btree_column_variable);
to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
- to->btree_fixed_len = from[0]->btree_fixed_len;
- if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
- to->btree_maxintlkey)
+ if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len)
+ to->btree_fixed_len = v;
+ if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey)
to->btree_maxintlkey = v;
if ((v = WT_STAT_READ(from, btree_maxintlpage)) >
to->btree_maxintlpage)
to->btree_maxintlpage = v;
- if ((v = WT_STAT_READ(from, btree_maxleafkey)) >
- to->btree_maxleafkey)
+ if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey)
to->btree_maxleafkey = v;
if ((v = WT_STAT_READ(from, btree_maxleafpage)) >
to->btree_maxleafpage)
diff --git a/src/txn/txn.c b/src/txn/txn.c
index f835fea8f67..e8fd8c0c119 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
conn = S2C(session);
txn_global = &conn->txn_global;
+retry:
current_id = last_running = txn_global->current;
oldest_session = NULL;
prev_oldest_id = txn_global->oldest_id;
@@ -287,43 +288,60 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
WT_TXNID_LT(txn_global->last_running, last_running);
/* Update the oldest ID. */
- if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
- __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, last_running))
- last_running = id;
- if ((id = s->snap_min) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
- }
-
- if (WT_TXNID_LT(last_running, oldest_id))
- oldest_id = last_running;
-
-#ifdef HAVE_DIAGNOSTIC
+ if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
/*
- * Make sure the ID doesn't move past any named snapshots.
- *
- * Don't include the read/assignment in the assert statement.
- * Coverity complains if there are assignments only done in
- * diagnostic builds, and when the read is from a volatile.
+ * We know we want to update. Check if we're racing.
*/
- id = txn_global->nsnap_oldest_id;
- WT_ASSERT(session,
- id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states;
+ i < session_cnt; i++, s++) {
+ if ((id = s->id) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ }
+
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Make sure the ID doesn't move past any named
+ * snapshots.
+ *
+ * Don't include the read/assignment in the assert
+ * statement. Coverity complains if there are
+ * assignments only done in diagnostic builds, and
+ * when the read is from a volatile.
+ */
+ id = txn_global->nsnap_oldest_id;
+ WT_ASSERT(session,
+ id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
- if (WT_TXNID_LT(txn_global->last_running, last_running))
- txn_global->last_running = last_running;
- if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
- txn_global->oldest_id = oldest_id;
- WT_ASSERT(session, txn_global->scan_count == -1);
- txn_global->scan_count = 0;
+ if (WT_TXNID_LT(txn_global->last_running, last_running))
+ txn_global->last_running = last_running;
+ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ WT_ASSERT(session, txn_global->scan_count == -1);
+ txn_global->scan_count = 0;
+ } else {
+ /*
+ * We wanted to update the oldest ID but we're racing
+ * another thread. Retry if this is a forced update.
+ */
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
+ if (force) {
+ __wt_yield();
+ goto retry;
+ }
+ }
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 && last_running_moved &&
- oldest_session != NULL) {
+ current_id - oldest_id > 10000 && oldest_session != NULL) {
(void)__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %d [%s]"
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index bc1537ca878..7d4d4d5c27c 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -56,7 +56,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
* confirm the metadata file contains no non-file objects.
*/
if (uri == NULL) {
- WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ WT_RET(__wt_metadata_cursor(session, &cursor));
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_key(cursor, &uri));
if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
@@ -79,8 +79,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
WT_ERR_MSG(session, EINVAL,
"%s object does not support named checkpoints", fail);
-err: if (cursor != NULL)
- WT_TRET(cursor->close(cursor));
+err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
return (ret);
}
@@ -185,7 +184,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
session->ckpt_handle[i].dhandle,
ret = (*op)(session, cfg));
else
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __wt_conn_btree_apply_single(session,
session->ckpt_handle[i].name, NULL, op, cfg));
WT_RET(ret);
@@ -371,7 +370,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
full = idle = logging = tracking = false;
/* Ensure the metadata table is open before taking any locks. */
- WT_RET(__wt_metadata_open(session));
+ WT_RET(__wt_metadata_cursor(session, NULL));
/*
* Do a pass over the configuration arguments and figure out what kind
@@ -386,9 +385,9 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* Get a list of handles we want to flush; this may pull closed objects
* into the session cache, but we're going to do that eventually anyway.
*/
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- WT_WITH_HANDLE_LIST_LOCK(session,
+ WT_WITH_SCHEMA_LOCK(session, ret,
+ WT_WITH_TABLE_LOCK(session, ret,
+ WT_WITH_HANDLE_LIST_LOCK(session, ret,
ret = __checkpoint_apply_all(
session, cfg, __wt_checkpoint_list, NULL))));
WT_ERR(ret);
@@ -551,14 +550,16 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
saved_meta_next = session->meta_track_next;
session->meta_track_next = NULL;
WT_WITH_DHANDLE(session,
- session->meta_dhandle, ret = __wt_checkpoint(session, cfg));
+ WT_SESSION_META_DHANDLE(session),
+ ret = __wt_checkpoint(session, cfg));
session->meta_track_next = saved_meta_next;
WT_ERR(ret);
WT_ERR(__checkpoint_verbose_track(session,
"metadata sync completed", &verb_timer));
} else
- WT_WITH_DHANDLE(session, session->meta_dhandle,
+ WT_WITH_DHANDLE(session,
+ WT_SESSION_META_DHANDLE(session),
ret = __wt_txn_checkpoint_log(
session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
@@ -601,8 +602,8 @@ err: /*
*/
if (full && logging) {
if (ret == 0 &&
- F_ISSET((WT_BTREE *)session->meta_dhandle->handle,
- WT_BTREE_SKIP_CKPT))
+ F_ISSET(((WT_CURSOR_BTREE *)
+ session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT))
idle = true;
WT_TRET(__wt_txn_checkpoint_log(session, full,
(ret == 0 && !idle) ?
@@ -665,7 +666,8 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
- WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint(session, cfg));
+ WT_WITH_CHECKPOINT_LOCK(session, ret,
+ ret = __txn_checkpoint(session, cfg));
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
@@ -1037,12 +1039,13 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
"for a bulk-loaded file");
fake_ckpt = true;
goto fake;
+ case WT_BTREE_REBALANCE:
case WT_BTREE_SALVAGE:
case WT_BTREE_UPGRADE:
case WT_BTREE_VERIFY:
WT_ERR_MSG(session, EINVAL,
- "checkpoints are blocked during salvage, upgrade "
- "or verify operations");
+ "checkpoints are blocked during rebalance, "
+ "salvage, upgrade or verify operations");
}
/*
diff --git a/src/txn/txn_ext.c b/src/txn/txn_ext.c
index 36d42a8996f..9ea1af6c4f8 100644
--- a/src/txn/txn_ext.c
+++ b/src/txn/txn_ext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index c5fa52dea6a..4c4a7fb3132 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -8,6 +8,12 @@
#include "wt_internal.h"
+/*
+ * Cookie passed to __txn_printlog: bundles the output stream with the
+ * caller's print flags so both travel through the single cookie argument
+ * handed to the log-scan callback.
+ */
+typedef struct {
+ FILE *out; /* stream the log records are printed to */
+ uint32_t flags; /* forwarded to __txn_commit_printlog */
+} WT_TXN_PRINTLOG_ARGS;
+
/*
* __txn_op_log --
* Log an operation for the current transaction.
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key);
*/
static int
__txn_commit_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out,
+ uint32_t flags)
{
bool firstrecord;
@@ -79,7 +86,7 @@ __txn_commit_printlog(
firstrecord = false;
- WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags));
WT_RET(__wt_fprintf(out, "\n }"));
}
@@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session,
FILE *out;
WT_LOG_RECORD *logrec;
WT_LSN ckpt_lsn;
+ WT_TXN_PRINTLOG_ARGS *args;
const uint8_t *end, *p;
const char *msg;
uint64_t txnid;
@@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
bool compressed;
WT_UNUSED(next_lsnp);
- out = cookie;
+ args = cookie;
+ out = args->out;
p = WT_LOG_SKIP_HEADER(rawrec->data);
end = (const uint8_t *)rawrec->data + rawrec->size;
@@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n"));
WT_RET(__wt_fprintf(out,
" \"txnid\" : %" PRIu64 ",\n", txnid));
- WT_RET(__txn_commit_printlog(session, &p, end, out));
+ WT_RET(__txn_commit_printlog(session, &p, end, out,
+ args->flags));
break;
case WT_LOGREC_FILE_SYNC:
@@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session,
* Print the log in a human-readable format.
*/
int
-__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags)
{
WT_SESSION_IMPL *session;
+ WT_TXN_PRINTLOG_ARGS args;
session = (WT_SESSION_IMPL *)wt_session;
+ args.out = out;
+ args.flags = flags;
WT_RET(__wt_fprintf(out, "[\n"));
WT_RET(__wt_log_scan(
- session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args));
WT_RET(__wt_fprintf(out, "\n]\n"));
return (0);
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index 169929a46de..eddcca9248f 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index d0b3b909f09..8051d059d7e 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -427,7 +427,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
- WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
+ WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
metafile = &r.files[WT_METAFILE_ID];
metafile->c = metac;
diff --git a/src/utilities/util.h b/src/utilities/util.h
index 08d0537956f..3882d814e3a 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -42,6 +42,7 @@ char *util_name(WT_SESSION *, const char *, const char *);
int util_printlog(WT_SESSION *, int, char *[]);
int util_read(WT_SESSION *, int, char *[]);
int util_read_line(WT_SESSION *, ULINE *, bool, bool *);
+int util_rebalance(WT_SESSION *, int, char *[]);
int util_rename(WT_SESSION *, int, char *[]);
int util_salvage(WT_SESSION *, int, char *[]);
int util_stat(WT_SESSION *, int, char *[]);
diff --git a/src/utilities/util_backup.c b/src/utilities/util_backup.c
index d07c99afc19..b3afc78e9e8 100644
--- a/src/utilities/util_backup.c
+++ b/src/utilities/util_backup.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c
index 153d2d11a6d..c114eb207fa 100644
--- a/src/utilities/util_compact.c
+++ b/src/utilities/util_compact.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_cpyright.c b/src/utilities/util_cpyright.c
index df135b68d2c..7de0eab6dc6 100644
--- a/src/utilities/util_cpyright.c
+++ b/src/utilities/util_cpyright.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -11,7 +11,7 @@
void
util_copyright(void)
{
- printf("%s\n", "Copyright (c) 2008-2015 MongoDB, Inc.");
+ printf("%s\n", "Copyright (c) 2008-2016 MongoDB, Inc.");
printf("%s\n\n", "All rights reserved.");
printf("%s\n\n",
diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c
index 06ea5edd8cc..4e609736f2d 100644
--- a/src/utilities/util_create.c
+++ b/src/utilities/util_create.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c
index 9717b102857..ba41445dfb6 100644
--- a/src/utilities/util_drop.c
+++ b/src/utilities/util_drop.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index 7dfac50b724..ca761a52d8a 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index 99a1455a74e..c7afea04b1c 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -8,6 +8,7 @@
#include "util.h"
+static int list_get_allocsize(WT_SESSION *, const char *, size_t *);
static int list_print(WT_SESSION *, const char *, bool, bool);
static int list_print_checkpoint(WT_SESSION *, const char *);
static int usage(void);
@@ -56,6 +57,48 @@ util_list(WT_SESSION *session, int argc, char *argv[])
}
/*
+ * list_get_allocsize --
+ *	Get the allocation size for this file from the metadata.
+ *
+ *	Looks up the metadata value for key, parses it as a configuration
+ *	string and reads its "allocation_size" entry. Returns 0 and stores the
+ *	size in *allocsize on success. On any failure the error is returned and
+ *	*allocsize is left untouched; a WT_NOTFOUND from the "allocation_size"
+ *	lookup is returned without printing a message so the caller can decide
+ *	how to handle a missing entry.
+ */
+static int
+list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize)
+{
+	WT_CONFIG_ITEM szvalue;
+	WT_CONFIG_PARSER *parser;
+	WT_DECL_RET;
+	WT_EXTENSION_API *wt_api;
+	char *config;
+
+	/* Start from NULL so the shared cleanup is safe on every path. */
+	config = NULL;
+	parser = NULL;
+
+	wt_api = session->connection->get_extension_api(session->connection);
+	if ((ret =
+	    wt_api->metadata_search(wt_api, session, key, &config)) != 0) {
+		fprintf(stderr, "%s: %s: extension_api.metadata_search: %s\n",
+		    progname, key, session->strerror(session, ret));
+		goto err;
+	}
+	if ((ret = wt_api->config_parser_open(wt_api, session, config,
+	    strlen(config), &parser)) != 0) {
+		fprintf(stderr, "%s: extension_api.config_parser_open: %s\n",
+		    progname, session->strerror(session, ret));
+		/* Don't trust the out-parameter after a failed open. */
+		parser = NULL;
+		goto err;
+	}
+	if ((ret = parser->get(parser, "allocation_size", &szvalue)) != 0) {
+		if (ret != WT_NOTFOUND)
+			fprintf(stderr, "%s: config_parser.get: %s\n",
+			    progname, session->strerror(session, ret));
+		goto err;
+	}
+
+	/* Close explicitly on the success path so a close failure is reported. */
+	ret = parser->close(parser);
+	parser = NULL;
+	if (ret != 0) {
+		fprintf(stderr, "%s: config_parser.close: %s\n",
+		    progname, session->strerror(session, ret));
+		goto err;
+	}
+	*allocsize = (size_t)szvalue.val;
+
+err:	/* An error path still holding the parser closes it best-effort. */
+	if (parser != NULL)
+		(void)parser->close(parser);
+
+	/*
+	 * The metadata value is a heap copy owned by the caller; release it on
+	 * every path (the original code leaked it).
+	 */
+	free(config);
+	return (ret);
+}
+
+/*
* list_print --
* List the high-level objects in the database.
*/
@@ -137,9 +180,10 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
static int
list_print_checkpoint(WT_SESSION *session, const char *key)
{
+ WT_BLOCK_CKPT ci;
WT_DECL_RET;
WT_CKPT *ckpt, *ckptbase;
- size_t len;
+ size_t allocsize, len;
time_t t;
uint64_t v;
@@ -151,6 +195,14 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
return (ret == WT_NOTFOUND ? 0 : ret);
+ /* We need the allocation size for decoding the checkpoint addr */
+ if ((ret = list_get_allocsize(session, key, &allocsize)) != 0) {
+ if (ret == WT_NOTFOUND)
+ allocsize = 0;
+ else
+ return (ret);
+ }
+
/* Find the longest name, so we can pretty-print. */
len = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -158,7 +210,15 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
len = strlen(ckpt->name);
++len;
+ memset(&ci, 0, sizeof(ci));
WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (allocsize != 0 && (ret = __wt_block_ckpt_decode(
+ session, allocsize, ckpt->raw.data, &ci)) != 0) {
+ fprintf(stderr, "%s: __wt_block_buffer_to_ckpt: %s\n",
+ progname, session->strerror(session, ret));
+ /* continue if damaged */
+ ci.root_size = 0;
+ }
/*
* Call ctime, not ctime_r; ctime_r has portability problems,
* the Solaris version is different from the POSIX standard.
@@ -179,6 +239,17 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
else
printf(" (%" PRIu64 " B)\n", v);
+ if (ci.root_size != 0) {
+ printf("\t\t" "root offset: %" PRIuMAX
+ " (0x%" PRIxMAX ")\n",
+ (intmax_t)ci.root_offset, (intmax_t)ci.root_offset);
+ printf("\t\t" "root size: %" PRIu32
+ " (0x%" PRIx32 ")\n",
+ ci.root_size, ci.root_size);
+ printf("\t\t" "root checksum: %" PRIu32
+ " (0x%" PRIx32 ")\n",
+ ci.root_cksum, ci.root_cksum);
+ }
}
__wt_metadata_free_ckptlist(session, ckptbase);
diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c
index a40fa60361f..696dc68630a 100644
--- a/src/utilities/util_load.c
+++ b/src/utilities/util_load.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_load.h b/src/utilities/util_load.h
index ca359ce662b..710b18bfe83 100644
--- a/src/utilities/util_load.h
+++ b/src/utilities/util_load.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c
index c7d4893ae20..9349d39bb1e 100644
--- a/src/utilities/util_load_json.c
+++ b/src/utilities/util_load_json.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c
index c6cd264c423..f9c5b6e9a1f 100644
--- a/src/utilities/util_loadtext.c
+++ b/src/utilities/util_loadtext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index 9cbda08690e..e18d8d7d1f5 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -159,6 +159,8 @@ main(int argc, char *argv[])
case 'r':
if (strcmp(command, "read") == 0)
func = util_read;
+ else if (strcmp(command, "rebalance") == 0)
+ func = util_rebalance;
else if (strcmp(command, "rename") == 0)
func = util_rename;
break;
@@ -226,7 +228,6 @@ main(int argc, char *argv[])
ret = func(session, argc, argv);
/* Close the database. */
-
err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
ret = tret;
@@ -260,9 +261,10 @@ usage(void)
"\t" "dump\t dump an object\n"
"\t" "list\t list database objects\n"
"\t" "load\t load an object\n"
- "\t" "loadtext\t load an object from a text file\n"
+ "\t" "loadtext load an object from a text file\n"
"\t" "printlog display the database log\n"
"\t" "read\t read values from an object\n"
+ "\t" "rebalance rebalance an object\n"
"\t" "rename\t rename an object\n"
"\t" "salvage\t salvage a file\n"
"\t" "stat\t display statistics for an object\n"
diff --git a/src/utilities/util_misc.c b/src/utilities/util_misc.c
index 76cb37b30dc..f45f6b339f2 100644
--- a/src/utilities/util_misc.c
+++ b/src/utilities/util_misc.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c
index d202b09b228..9a2bdc8a9ba 100644
--- a/src/utilities/util_printlog.c
+++ b/src/utilities/util_printlog.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- bool printable;
+ uint32_t flags;
- printable = false;
- while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ flags = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF)
switch (ch) {
case 'f': /* output file */
if (freopen(__wt_optarg, "w", stdout) == NULL) {
@@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
return (1);
}
break;
- case 'p':
- printable = true;
+ case 'x': /* hex output */
+ LF_SET(WT_TXN_PRINTLOG_HEX);
break;
case '?':
default:
@@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- WT_UNUSED(printable);
- ret = __wt_txn_printlog(session, stdout);
+ ret = __wt_txn_printlog(session, stdout, flags);
if (ret != 0) {
fprintf(stderr, "%s: printlog failed: %s\n",
@@ -61,7 +60,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "printlog [-p] [-f output-file]\n",
+ "printlog [-x] [-f output-file]\n",
progname, usage_prefix);
return (1);
}
diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c
index a2fcc330c7d..2e766377aa9 100644
--- a/src/utilities/util_read.c
+++ b/src/utilities/util_read.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c
new file mode 100644
index 00000000000..45f161487e5
--- /dev/null
+++ b/src/utilities/util_rebalance.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_rebalance --
+ *	The "rebalance" command: rebalance the single object named on the
+ *	command line. Returns 0 on success and 1 on failure.
+ */
+int
+util_rebalance(WT_SESSION *session, int argc, char *argv[])
+{
+	WT_DECL_RET;
+	char *name;
+
+	/* The command accepts no options: anything getopt reports is an error. */
+	if (__wt_getopt(progname, argc, argv, "") != EOF)
+		return (usage());
+	argc -= __wt_optind;
+	argv += __wt_optind;
+
+	/* The remaining argument is the table name. */
+	if (argc != 1)
+		return (usage());
+	name = util_name(session, *argv, "table");
+	if (name == NULL)
+		return (1);
+
+	ret = session->rebalance(session, name, NULL);
+	if (ret != 0) {
+		/* Report the underlying error, then collapse it to exit code 1. */
+		fprintf(stderr, "%s: rebalance(%s): %s\n",
+		    progname, name, session->strerror(session, ret));
+		ret = 1;
+	} else if (verbose) {
+		/* Verbose configures a progress counter, move to the next line. */
+		printf("\n");
+	}
+
+	free(name);
+	return (ret);
+}
+
+/*
+ * usage --
+ *	Print the rebalance command's usage message to stderr; always
+ *	returns 1 so callers can return its result directly.
+ */
+static int
+usage(void)
+{
+	(void)fprintf(stderr,
+	    "usage: %s %s rebalance uri\n", progname, usage_prefix);
+	return (1);
+}
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index 29347690ccc..aee299c6e63 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c
index e791d2f1dda..679d1074457 100644
--- a/src/utilities/util_salvage.c
+++ b/src/utilities/util_salvage.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c
index b7558ee3be0..e511ca4f7e8 100644
--- a/src/utilities/util_stat.c
+++ b/src/utilities/util_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 0f2e655d1dd..63b23f28c16 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_verbose.c b/src/utilities/util_verbose.c
index 084cce3f610..e568ec0a414 100644
--- a/src/utilities/util_verbose.c
+++ b/src/utilities/util_verbose.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index 12f76e9d4ed..2df4fa65f43 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c
index 7871040411b..7d9bce02b36 100644
--- a/src/utilities/util_write.c
+++ b/src/utilities/util_write.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*