summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@wiredtiger.com> 2012-10-18 15:14:54 +1100
committer Michael Cahill <michael.cahill@wiredtiger.com> 2012-10-18 15:14:54 +1100
commit a54f05cdc958498d0f11e8b4d2fce31a87de9376 (patch)
tree 6dac7d153567915283f36141e5bc5086a2e7ada1
parent 141ea122ec37a1ad8b0573dacf055ecab2360ce3 (diff)
parent 8a006f4828974ae0c391b7b26c345d1c86a476ff (diff)
download mongo-a54f05cdc958498d0f11e8b4d2fce31a87de9376.tar.gz
Merge branch 'develop' into lsm-schema
Conflicts: dist/api_data.py
-rw-r--r-- build_posix/Make.base 2
-rw-r--r-- build_posix/configure.ac.in 2
-rw-r--r-- dist/api_data.py 22
-rw-r--r-- dist/filelist 2
-rw-r--r-- dist/s_string.ok 1
-rw-r--r-- dist/stat_data.py 1
-rw-r--r-- examples/c/ex_all.c 10
-rw-r--r-- examples/c/ex_test_perf.c 2
-rw-r--r-- src/block/block_ckpt.c 96
-rw-r--r-- src/block/block_compact.c 98
-rw-r--r-- src/block/block_mgr.c 32
-rw-r--r-- src/bloom/bloom.c 2
-rw-r--r-- src/btree/bt_cache.c 10
-rw-r--r-- src/btree/bt_compact.c 171
-rw-r--r-- src/btree/bt_evict.c 84
-rw-r--r-- src/btree/bt_sync.c 2
-rw-r--r-- src/btree/bt_upgrade.c 2
-rw-r--r-- src/btree/bt_vrfy.c 105
-rw-r--r-- src/btree/bt_walk.c 120
-rw-r--r-- src/btree/rec_evict.c 2
-rw-r--r-- src/btree/rec_write.c 37
-rw-r--r-- src/config/config_def.c 26
-rw-r--r-- src/cursor/cur_index.c 4
-rw-r--r-- src/cursor/cur_table.c 2
-rw-r--r-- src/docs/command-line.dox 32
-rw-r--r-- src/include/api.h 10
-rw-r--r-- src/include/btmem.h 5
-rw-r--r-- src/include/cache.h 6
-rw-r--r-- src/include/cache.i 2
-rw-r--r-- src/include/extern.h 32
-rw-r--r-- src/include/misc.h 7
-rw-r--r-- src/include/stat.h 1
-rw-r--r-- src/include/txn.i 14
-rw-r--r-- src/include/wiredtiger.in 72
-rw-r--r-- src/lsm/lsm_merge.c 36
-rw-r--r-- src/lsm/lsm_stat.c 13
-rw-r--r-- src/lsm/lsm_tree.c 19
-rw-r--r-- src/lsm/lsm_worker.c 34
-rw-r--r-- src/os_posix/os_mtx.c 14
-rw-r--r-- src/os_posix/os_open.c 20
-rw-r--r-- src/schema/schema_list.c 1
-rw-r--r-- src/schema/schema_worker.c 13
-rw-r--r-- src/session/session_api.c 103
-rw-r--r-- src/support/err.c 1
-rw-r--r-- src/support/hazard.c 24
-rw-r--r-- src/support/stat.c 2
-rw-r--r-- src/txn/txn_ckpt.c 35
-rw-r--r-- src/utilities/util.h 2
-rw-r--r-- src/utilities/util_compact.c 61
-rw-r--r-- src/utilities/util_dumpfile.c 68
-rw-r--r-- src/utilities/util_main.c 90
-rw-r--r-- src/utilities/util_rename.c 2
-rw-r--r-- src/utilities/util_salvage.c 4
-rw-r--r-- src/utilities/util_upgrade.c 4
-rw-r--r-- src/utilities/util_verify.c 32
-rw-r--r-- test/bloom/test_bloom.c 2
-rw-r--r-- test/fops/t.c 2
-rw-r--r-- test/suite/test_bug004.py 2
-rw-r--r-- test/suite/test_compact.py 94
-rw-r--r-- test/suite/test_util10.py 93
-rw-r--r-- test/thread/t.c 2
61 files changed, 1199 insertions, 590 deletions
diff --git a/build_posix/Make.base b/build_posix/Make.base
index dc52d1ac247..2a7985f57b0 100644
--- a/build_posix/Make.base
+++ b/build_posix/Make.base
@@ -10,10 +10,10 @@ bin_PROGRAMS = wt
wt_SOURCES =\
src/utilities/util_backup.c \
src/utilities/util_cpyright.c \
+ src/utilities/util_compact.c \
src/utilities/util_create.c \
src/utilities/util_drop.c \
src/utilities/util_dump.c \
- src/utilities/util_dumpfile.c \
src/utilities/util_getopt.c \
src/utilities/util_list.c \
src/utilities/util_load.c \
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 77d90e00241..caf2287988a 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -70,7 +70,7 @@ AC_PROG_INSTALL
AC_CHECK_LIB(pthread, pthread_create)
AC_CHECK_LIB(dl, dlopen)
AC_CHECK_LIB(rt, sched_yield)
-AC_CHECK_FUNCS([clock_gettime gettimeofday fcntl posix_memalign])
+AC_CHECK_FUNCS([clock_gettime gettimeofday fcntl posix_fadvise posix_memalign])
AC_SYS_LARGEFILE
AC_C_BIGENDIAN
diff --git a/dist/api_data.py b/dist/api_data.py
index 3f89dac26a1..c0e6b21ec5c 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -277,6 +277,8 @@ methods = {
'session.close' : Method([]),
+'session.compact' : Method([]),
+
'session.create' : Method(table_meta + file_config + source_meta + [
Config('exclusive', 'false', r'''
fail if the object exists. When false (the default), if the
@@ -291,7 +293,6 @@ methods = {
type='boolean'),
]),
-'session.dumpfile' : Method([]),
'session.log_printf' : Method([]),
'session.open_cursor' : Method([
@@ -368,7 +369,20 @@ methods = {
]),
'session.truncate' : Method([]),
'session.upgrade' : Method([]),
-'session.verify' : Method([]),
+'session.verify' : Method([
+ Config('dump_address', 'false', r'''
+ Display addresses and page types as pages are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
+ Config('dump_blocks', 'false', r'''
+ Display the contents of on-disk blocks as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean'),
+ Config('dump_pages', 'false', r'''
+ Display the contents of in-memory pages as they are verified, using
+ the application's message handler, intended for debugging''',
+ type='boolean')
+]),
'session.begin_transaction' : Method([
Config('isolation', '', r'''
@@ -400,6 +414,10 @@ methods = {
including the named checkpoint. Checkpoints cannot be
dropped while a hot backup is in progress or if open in
a cursor''', type='list'),
+ Config('force', 'false', r'''
+ checkpoints may be skipped if the underlying object has not
+ been modified, this option forces the checkpoint''',
+ type='boolean'),
Config('name', '', r'''
if non-empty, specify a name for the checkpoint'''),
Config('target', '', r'''
diff --git a/dist/filelist b/dist/filelist
index 9674aaedcf5..bbee6912f01 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -6,6 +6,7 @@ src/api/api_version.c
src/block/block_addr.c
src/block/block_ckpt.c
src/block/block_cksum.c
+src/block/block_compact.c
src/block/block_ext.c
src/block/block_mgr.c
src/block/block_open.c
@@ -16,6 +17,7 @@ src/block/block_write.c
src/bloom/bloom.c
src/btree/bt_bulk.c
src/btree/bt_cache.c
+src/btree/bt_compact.c
src/btree/bt_curnext.c
src/btree/bt_curprev.c
src/btree/bt_cursor.c
diff --git a/dist/s_string.ok b/dist/s_string.ok
index b3fc3df2b65..9abf8bf4d2b 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -201,7 +201,6 @@ Vv
VxWorks
WIREDTIGER
WeakHashLen
-WildTiger
WinNT
WiredTiger
WiredTiger's
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 1fcdde0e930..f7663bd3c42 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -64,6 +64,7 @@ btree_stats = [
Stat('file_col_fix_pages', 'column-store fixed-size leaf pages'),
Stat('file_col_int_pages', 'column-store internal pages'),
Stat('file_col_var_pages', 'column-store variable-size leaf pages'),
+ Stat('file_compact_rewrite', 'pages rewritten by compaction'),
Stat('file_entries', 'total entries'),
Stat('file_fixed_len', 'fixed-record size'),
Stat('file_magic', 'magic number'),
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 09a95d3aaee..99b2392a86e 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -400,14 +400,14 @@ session_ops(WT_SESSION *session)
"table:mytable", "key_format=r,value_format=S,cache_resident=true");
/*! [Create a cache-resident object] */
+ /*! [Compact a table] */
+ ret = session->compact(session, "table:mytable", NULL);
+ /*! [Compact a table] */
+
/*! [Drop a table] */
ret = session->drop(session, "table:mytable", NULL);
/*! [Drop a table] */
- /*! [Dump a file] */
- ret = session->dumpfile(session, "file:myfile", NULL);
- /*! [Dump a file] */
-
/*! [Print to the message stream] */
ret = session->msg_printf(
session, "process ID %" PRIuMAX, (uintmax_t)getpid());
@@ -895,6 +895,8 @@ main(void)
const char *home = "WT_TEST";
ret = wiredtiger_open(home, NULL, "create,transactional", &conn);
/*! [Open a connection] */
+
+ (void)conn->close(conn, NULL);
}
/*! [Get the WiredTiger library version #1] */
diff --git a/examples/c/ex_test_perf.c b/examples/c/ex_test_perf.c
index e5d9c0b1541..74de4ef39b3 100644
--- a/examples/c/ex_test_perf.c
+++ b/examples/c/ex_test_perf.c
@@ -192,7 +192,9 @@ read_thread(void *arg)
cursor->set_key(cursor, key_buf);
cursor->search(cursor);
}
+
session->close(session, NULL);
+ free(key_buf);
return (arg);
}
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index fc9aa37a31e..a581945aab0 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -231,6 +231,38 @@ __wt_block_checkpoint(WT_SESSION_IMPL *session,
}
/*
+ * __ckpt_extlist_read --
+ * Read a checkpoint's extent lists.
+ */
+static int
+__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci;
+
+ /*
+ * Allocate a checkpoint structure, crack the cookie and read the
+ * checkpoint's extent lists.
+ *
+ * Ignore the avail list: checkpoint avail lists are only useful if we
+ * are rolling forward from the particular checkpoint and they represent
+ * our best understanding of what blocks can be allocated. If we are
+ * not operating on the live checkpoint, subsequent checkpoints might
+ * have allocated those blocks, and the avail list is useless. We don't
+ * discard it, because it is useful as part of verification, but we
+ * don't re-write it either.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+ ci = ckpt->bpriv;
+ WT_RET(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_RET(__wt_block_extlist_read(session, block, &ci->alloc));
+ WT_RET(__wt_block_extlist_read(session, block, &ci->discard));
+
+ return (0);
+}
+
+/*
* __ckpt_extlist_fblocks --
* If a checkpoint's extent list is going away, free its blocks.
*/
@@ -261,7 +293,7 @@ __ckpt_process(
WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
WT_BLOCK_CKPT *a, *b, *ci;
- WT_CKPT *ckpt, *last_ckpt, *next_ckpt;
+ WT_CKPT *ckpt, *next_ckpt;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
uint64_t ckpt_size;
@@ -297,55 +329,36 @@ __ckpt_process(
session, &ci->ckpt_avail, "live", "ckpt_avail"));
/*
- * To delete a checkpoint, we'll need extent list for it, and we have to
- * read that from the disk.
+ * To delete a checkpoint, we'll need checkpoint information for it and
+ * the subsequent checkpoint into which it gets rolled; read them from
+ * disk before we lock things down.
*/
- last_ckpt = NULL;
deleting = 0;
WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (F_ISSET(ckpt, WT_CKPT_FAKE))
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
continue;
+ deleting = 1;
/*
- * To delete a checkpoint, we'll need checkpoint information for
- * it and the subsequent checkpoint. The test is tricky, load
- * the current checkpoint's information if it's marked for
- * deletion or if it follows a checkpoint marked for deletion,
- * where the boundary cases are the first checkpoint in the list
- * and the last checkpoint in the list: if we're deleting the
- * last checkpoint in the list, there's no next checkpoint, the
- * checkpoint will be merged into the live tree.
+ * Read the checkpoint and next checkpoint extent lists if we
+ * haven't already read them (we may have already read these
+ * extent blocks if there is more than one deleted checkpoint).
*/
- if (!F_ISSET(ckpt, WT_CKPT_DELETE) &&
- (F_ISSET(ckpt, WT_CKPT_ADD) ||
- last_ckpt == NULL || !F_ISSET(last_ckpt, WT_CKPT_DELETE))) {
- last_ckpt = ckpt;
- continue;
- }
- last_ckpt = ckpt;
- deleting = 1;
+ if (ckpt->bpriv == NULL)
+ WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
/*
- * Allocate a checkpoint structure, crack the cookie and read
- * the checkpoint's extent lists.
- *
- * Ignore the avail list: checkpoint avail lists are only useful
- * if we are rolling forward from the particular checkpoint and
- * they represent our best understanding of what blocks can be
- * allocated. If we are not operating on the live checkpoint,
- * subsequent checkpoints might have allocated those blocks, and
- * the avail list is useless. We don't discard it, because it
- * is useful as part of verification, but we don't re-write it
- * either.
+ * The "next" checkpoint may be the live tree which has no
+ * extent blocks to read.
*/
- WT_ERR(__wt_calloc(
- session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
- ci = ckpt->bpriv;
- WT_ERR(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0));
- WT_ERR(__wt_block_buffer_to_ckpt(
- session, block, ckpt->raw.data, ci));
- WT_ERR(__wt_block_extlist_read(session, block, &ci->alloc));
- WT_ERR(__wt_block_extlist_read(session, block, &ci->discard));
+ if (next_ckpt->bpriv == NULL &&
+ !F_ISSET(next_ckpt, WT_CKPT_ADD))
+ WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
}
/*
@@ -369,7 +382,8 @@ __ckpt_process(
* when writing the live extent lists.
*/
WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
continue;
if (WT_VERBOSE_ISSET(session, ckpt)) {
diff --git a/src/block/block_compact.c b/src/block/block_compact.c
new file mode 100644
index 00000000000..7ddc03b4b6a
--- /dev/null
+++ b/src/block/block_compact.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_compact_skip --
+ * Return if compaction will shrink the file.
+ */
+int
+__wt_block_compact_skip(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
+{
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ off_t avail, half;
+ int pct;
+
+ fh = block->fh;
+ *skipp = 1;
+
+ /*
+ * We do compaction by copying blocks from the end of the file to the
+ * beginning of the file, and we need some metrics to decide if it's
+ * worth doing. Ignore small files, and files where less than 30% of
+ * the file appears in the available list, and in the first half of
+ * the file. In other words, don't bother with compaction unless we
+ * have a reasonable expectation of moving 30% of the file from the
+ * last half of the file to the first half of the file.
+ */
+#define WT_COMPACT_TRIGGER 30
+ if (fh->file_size <= 10 * 1024)
+ return (0);
+
+ __wt_spin_lock(session, &block->live_lock);
+
+ avail = 0;
+ half = fh->file_size / 2;
+
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < half)
+ avail += ext->size;
+ pct = (int)((avail * 100) / fh->file_size);
+
+ __wt_spin_unlock(session, &block->live_lock);
+
+ if (pct >= WT_COMPACT_TRIGGER)
+ *skipp = 0;
+
+ WT_VERBOSE_RET(session, block,
+ "compaction %s" "useful, %d%% of the free space in the available "
+ "list appears in the first half of the file",
+ pct < WT_COMPACT_TRIGGER ? "not " : "", pct);
+
+ return (0);
+}
+
+/*
+ * __wt_block_compact_page_skip --
+ * Return if writing a particular page will shrink the file.
+ */
+int
+__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size, int *skipp)
+{
+ WT_FH *fh;
+ off_t offset;
+ uint32_t size, cksum;
+
+ WT_UNUSED(addr_size);
+ *skipp = 0; /* Paranoia: skip on error. */
+
+ fh = block->fh;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /*
+ * If this block appears in the last half of the file, rewrite it.
+ *
+ * It's unclear we need to lock: the chances of a smashed read are close
+ * to non-existent and the worst thing that can happen is we rewrite a
+ * block we didn't want to rewrite. On the other hand, compaction is
+ * not expected to be a common operation in WiredTiger, we shouldn't be
+ * here a lot.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ *skipp = offset > fh->file_size / 2 ? 0 : 1;
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (0);
+}
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 01941619942..d7f9ad8cb92 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -194,6 +194,38 @@ __wt_bm_checkpoint_unload(WT_SESSION_IMPL *session)
}
/*
+ * __wt_bm_compact_skip --
+ * Return if a file can be compacted.
+ */
+int
+__wt_bm_compact_skip(WT_SESSION_IMPL *session, int *skipp)
+{
+ WT_BLOCK *block;
+
+ if ((block = session->btree->block) == NULL)
+ return (__bm_invalid(session));
+
+ return (__wt_block_compact_skip(session, block, skipp));
+}
+
+/*
+ * __wt_bm_compact_page_skip --
+ * Return if a page is useful for compaction.
+ */
+int
+__wt_bm_compact_page_skip(WT_SESSION_IMPL *session,
+ const uint8_t *addr, uint32_t addr_size, int *skipp)
+{
+ WT_BLOCK *block;
+
+ if ((block = session->btree->block) == NULL)
+ return (__bm_invalid(session));
+
+ return (__wt_block_compact_page_skip(
+ session, block, addr, addr_size, skipp));
+}
+
+/*
* __wt_bm_truncate --
* Truncate a file.
*/
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index a6fc72cdf49..7d9f687c384 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -46,7 +46,7 @@ __bloom_init(WT_SESSION_IMPL *session,
err: if (bloom->uri != NULL)
__wt_free(session, bloom->uri);
if (bloom->config != NULL)
- __wt_free(session, bloom->uri);
+ __wt_free(session, bloom->config);
if (bloom->bitstring != NULL)
__wt_free(session, bloom->bitstring);
if (bloom != NULL)
diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c
index 5aecf065976..ae952d768d7 100644
--- a/src/btree/bt_cache.c
+++ b/src/btree/bt_cache.c
@@ -71,14 +71,6 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[])
__wt_spin_init(session, &cache->evict_lock);
/*
- * Allocate the forced page eviction request array. We size it to
- * allow one eviction page request per session.
- */
- cache->max_evict_request = conn->session_size;
- WT_ERR(__wt_calloc_def(
- session, cache->max_evict_request, &cache->evict_request));
-
- /*
* We pull some values from the cache statistics (rather than have two
* copies). Set them.
*/
@@ -128,7 +120,5 @@ __wt_cache_destroy(WT_CONNECTION_IMPL *conn)
(void)__wt_cond_destroy(session, cache->evict_cond);
__wt_spin_destroy(session, &cache->evict_lock);
- __wt_free(session, cache->evict_request);
-
__wt_free(session, conn->cache);
}
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
new file mode 100644
index 00000000000..2b57a5a321e
--- /dev/null
+++ b/src/btree/bt_compact.c
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_compact --
+ * Compact a file.
+ */
+int
+__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int skip;
+
+ WT_UNUSED(cfg);
+
+ /* Check if compaction might be useful. */
+ WT_RET(__wt_bm_compact_skip(session, &skip));
+ if (skip)
+ return (0);
+
+ /*
+ * Invoke the eviction server to review in-memory pages to see if they
+ * need to be re-written (we must use the eviction server because it's
+ * the only thread that can safely look at page reconciliation values).
+ */
+ WT_RET(__wt_sync_file_serial(session, WT_SYNC_COMPACT));
+ __wt_evict_server_wake(session);
+ __wt_cond_wait(session, session->cond, 0);
+ WT_RET(session->syncop_ret);
+
+ /*
+ * Walk the tree reviewing all of the on-disk pages to see if they
+ * need to be re-written.
+ */
+ for (page = NULL;;) {
+ WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
+ if (page == NULL)
+ break;
+
+ /* Mark the page and tree dirty, we want to write this page. */
+ if ((ret = __wt_page_modify_init(session, page)) != 0) {
+ __wt_stack_release(session, page);
+ WT_RET(ret);
+ }
+ __wt_page_and_tree_modify_set(session, page);
+
+ WT_BSTAT_INCR(session, file_compact_rewrite);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_compact_page_skip --
+ * Return if the block-manager wants us to re-write this page.
+ */
+int
+__wt_compact_page_skip(
+ WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp)
+{
+ uint32_t addr_size;
+ const uint8_t *addr;
+
+ /*
+ * There's one compaction test we do before we read the page, to see
+ * if the block-manager thinks it useful to rewrite the page. If a
+ * rewrite won't help, we don't want to do I/O for nothing. For that
+ * reason, this check is done in a call from inside the tree-walking
+ * routine.
+ *
+ * Ignore everything but on-disk pages, the eviction server has already
+ * done a pass over the in-memory pages.
+ */
+ if (ref->state != WT_REF_DISK) {
+ *skipp = 1;
+ return (0);
+ }
+
+ __wt_get_addr(parent, ref, &addr, &addr_size);
+ if (addr == NULL) {
+ *skipp = 1;
+ return (0);
+ }
+
+ return (__wt_bm_compact_page_skip(session, addr, addr_size, skipp));
+}
+
+/*
+ * __wt_compact_evict --
+ * Helper routine for the eviction thread to decide if a file's size would
+ * benefit from re-writing this page.
+ */
+int
+__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_MODIFY *mod;
+ int skip;
+ uint32_t addr_size;
+ const uint8_t *addr;
+
+ mod = page->modify;
+
+ /*
+ * We're using the eviction thread in compaction because it can safely
+ * look at page reconciliation information, no pages are being evicted
+ * if the eviction thread is busy here. That's not good for performance and
+ * implies compaction will impact performance, but right now it's the
+ * only way to safely look at reconciliation information.
+ *
+ * The reason we need to look at reconciliation information is that an
+ * in-memory page's original disk addresses might have been fine for
+ * compaction, but its replacement addresses might be a problem.
+ *
+ * Ignore the root: it may not have a replacement address, and besides,
+ * if anything else gets written, so will it.
+ */
+ if (WT_PAGE_IS_ROOT(page))
+ return (0);
+
+ /*
+ * If the page is already dirty, skip some work, it will be written in
+ * any case.
+ */
+ if (__wt_page_is_modified(page))
+ return (0);
+
+ /*
+ * If the page is clean, test the original addresses.
+ * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * If the page is a split, ignore it, it will be merged into the parent.
+ */
+ if (mod == NULL)
+ goto disk;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case 0:
+disk: __wt_get_addr(page->parent, page->ref, &addr, &addr_size);
+ if (addr == NULL)
+ return (0);
+ WT_RET(
+ __wt_bm_compact_page_skip(session, addr, addr_size, &skip));
+ if (skip)
+ return (0);
+ break;
+ case WT_PM_REC_EMPTY:
+ return (0);
+ case WT_PM_REC_REPLACE:
+ WT_RET(__wt_bm_compact_page_skip(
+ session, mod->u.replace.addr, mod->u.replace.size, &skip));
+ if (skip)
+ return (0);
+ break;
+ case WT_PM_REC_SPLIT:
+ case WT_PM_REC_SPLIT_MERGE:
+ return (0);
+ }
+
+ /* Mark the page and tree dirty, we want to write this page. */
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_and_tree_modify_set(session, page);
+
+ WT_BSTAT_INCR(session, file_compact_rewrite);
+ return (0);
+}
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index e5bdf9eacd7..dec1651dd36 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -21,8 +21,9 @@ static int __evict_worker(WT_SESSION_IMPL *);
* number of pages from each file's in-memory tree for each page we evict.
*/
#define WT_EVICT_GROUP 30 /* Consider N pages as LRU candidates */
-#define WT_EVICT_WALK_PER_TABLE 35 /* Pages to visit per file */
+#define WT_EVICT_WALK_PER_FILE 5 /* Pages to visit per file */
#define WT_EVICT_WALK_BASE 50 /* Pages tracked across file visits */
+#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
/*
* __evict_list_clr --
@@ -156,30 +157,22 @@ __wt_cache_evict_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- int read_lockout;
session = arg;
conn = S2C(session);
cache = conn->cache;
while (F_ISSET(conn, WT_SERVER_RUN)) {
- /*
- * Use the same logic as application threads to decide whether
- * there is work to do.
- */
- __wt_eviction_check(session, &read_lockout, 0);
-
- if (!read_lockout) {
- WT_VERBOSE_ERR(session, evictserver, "sleeping");
- __wt_cond_wait(session, cache->evict_cond);
- }
+ /* Evict pages from the cache as needed. */
+ WT_ERR(__evict_worker(session));
if (!F_ISSET(conn, WT_SERVER_RUN))
break;
- WT_VERBOSE_ERR(session, evictserver, "waking");
- /* Evict pages from the cache as needed. */
- WT_ERR(__evict_worker(session));
+ WT_VERBOSE_ERR(session, evictserver, "sleeping");
+ /* Don't rely on signals: check periodically. */
+ __wt_cond_wait(session, cache->evict_cond, 100000);
+ WT_VERBOSE_ERR(session, evictserver, "waking");
}
WT_VERBOSE_ERR(session, evictserver, "exiting");
@@ -213,7 +206,7 @@ __evict_worker(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- uint64_t bytes_start, bytes_inuse, bytes_max;
+ uint64_t bytes_inuse, bytes_max;
int loop;
conn = S2C(session);
@@ -237,7 +230,7 @@ __evict_worker(WT_SESSION_IMPL *session)
*/
bytes_inuse = __wt_cache_bytes_inuse(cache);
bytes_max = conn->cache_size;
- if (bytes_inuse < cache->eviction_target * (bytes_max / 100))
+ if (bytes_inuse < (cache->eviction_target * bytes_max) / 100)
break;
WT_RET(__evict_lru(session));
@@ -247,9 +240,7 @@ __evict_worker(WT_SESSION_IMPL *session)
* any progress at all, go back to sleep, it's not something
* we can fix.
*/
- bytes_start = bytes_inuse;
- bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_start == bytes_inuse) {
+ if (__wt_cache_bytes_inuse(cache) >= bytes_inuse) {
if (loop == 10) {
WT_STAT_INCR(conn->stats, cache_evict_slow);
WT_VERBOSE_RET(session, evictserver,
@@ -392,9 +383,9 @@ __evict_file_request_walk(WT_SESSION_IMPL *session)
__evict_list_clr_all(session, 0);
/*
- * Wait for LRU eviction activity to drain. It is much easier
- * to reason about sync or forced eviction if we know there are
- * no other threads evicting in the tree.
+ * Wait for LRU eviction activity to drain. It is much easier to
+ * reason about checkpoints if we know there are no other threads
+ * evicting in the tree.
*/
while (request_session->btree->lru_count > 0) {
__wt_spin_unlock(session, &cache->evict_lock);
@@ -437,27 +428,21 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop)
break;
WT_ERR(__wt_tree_walk(session, &next_page, WT_TREE_EVICT));
- /* Write dirty pages for sync, and sync with discard. */
switch (syncop) {
+ case WT_SYNC_COMPACT:
+ WT_ERR(__wt_compact_evict(session, page));
+ break;
case WT_SYNC:
case WT_SYNC_DISCARD:
+ /* Write dirty pages for sync and sync with discard. */
if (__wt_page_is_modified(page))
WT_ERR(__wt_rec_write(
session, page, NULL, WT_REC_SINGLE));
- break;
- case WT_SYNC_DISCARD_NOWRITE:
- break;
- }
+ if (syncop == WT_SYNC)
+ break;
- /*
- * Evict the page for sync with discard, simply discard the page
- * for discard alone.
- */
- switch (syncop) {
- case WT_SYNC:
- break;
- case WT_SYNC_DISCARD:
/*
+ * Evict the page for sync with discard.
* Do not attempt to evict pages expected to be merged
* into their parents, with the single exception that
* the root page can't be merged into anything, it must
@@ -471,10 +456,11 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop)
break;
case WT_SYNC_DISCARD_NOWRITE:
/*
- * When we discard the root page, clear the reference
- * from the btree handle. It is important to do this
- * here, so that future eviction doesn't see root_page
- * pointing to freed memory.
+ * Simply discard the page for discard alone. When we
+ * discard the root page, clear the reference from the
+ * btree handle. It is important to do this here, so
+ * that future eviction doesn't see root_page pointing
+ * to freed memory.
*/
if (WT_PAGE_IS_ROOT(page))
session->btree->root_page = NULL;
@@ -532,17 +518,18 @@ __evict_walk(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- u_int elem, file_count, i;
+ u_int elem, file_count, i, retries;
conn = S2C(session);
cache = S2C(session)->cache;
+ retries = 0;
/*
* Resize the array in which we're tracking pages, as necessary, then
* get some pages from each underlying file. In practice, a realloc
* is rarely needed, so it is worth avoiding the LRU lock.
*/
- elem = WT_EVICT_WALK_BASE + 2 * WT_EVICT_GROUP;
+ elem = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
if (elem > cache->evict_entries) {
__wt_spin_lock(session, &cache->evict_lock);
/* Save the offset of the eviction point. */
@@ -561,7 +548,7 @@ __evict_walk(WT_SESSION_IMPL *session)
* servicing eviction requests.
*/
i = WT_EVICT_WALK_BASE;
- file_count = 0;
+retry: file_count = 0;
TAILQ_FOREACH(btree, &conn->btqh, q) {
if (file_count++ < cache->evict_file_next)
continue;
@@ -591,6 +578,11 @@ __evict_walk(WT_SESSION_IMPL *session)
}
cache->evict_file_next = (btree == NULL) ? 0 : file_count;
+ /* In the extreme case, all of the pages have to come from one file. */
+ if (ret == 0 && i < cache->evict_entries &&
+ retries++ < WT_EVICT_WALK_INCR / WT_EVICT_WALK_PER_FILE)
+ goto retry;
+
if (0) {
err: __wt_spin_unlock(session, &cache->evict_lock);
}
@@ -614,7 +606,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
btree = session->btree;
cache = S2C(session)->cache;
start = cache->evict + *slotp;
- end = start + WT_EVICT_WALK_PER_TABLE;
+ end = start + WT_EVICT_WALK_PER_FILE;
if (end > cache->evict + cache->evict_entries)
end = cache->evict + cache->evict_entries;
@@ -729,9 +721,7 @@ __evict_get_page(
/*
* Lock the page while holding the eviction mutex to prevent
* multiple attempts to evict it. For pages that are already
- * being evicted, including pages on the request queue for
- * forced eviction, this operation will fail and we will move
- * on.
+ * being evicted, this operation will fail and we will move on.
*/
ref = evict->page->ref;
WT_ASSERT(session, evict->page == ref->page);
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 1a28f5f9e7b..bb05f966ee4 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -77,7 +77,7 @@ __wt_bt_cache_flush(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
*/
WT_ERR(__wt_sync_file_serial(session, op));
__wt_evict_server_wake(session);
- __wt_cond_wait(session, session->cond);
+ __wt_cond_wait(session, session->cond, 0);
ret = session->syncop_ret;
switch (op) {
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
index 574a2335796..fb675123d13 100644
--- a/src/btree/bt_upgrade.c
+++ b/src/btree/bt_upgrade.c
@@ -14,9 +14,9 @@
int
__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_UNUSED(session);
WT_UNUSED(cfg);
/* There's nothing to upgrade, yet. */
+ WT_RET(__wt_progress(session, NULL, 1));
return (0);
}
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index ed4309c1bad..84dc9e263e7 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -19,14 +19,16 @@ typedef struct {
uint64_t fcnt; /* Progress counter */
- int dumpfile; /* Dump file stream */
+ int dump_address;
+ int dump_pages;
+ int dump_blocks;
WT_ITEM *tmp1; /* Temporary buffer */
WT_ITEM *tmp2; /* Temporary buffer */
} WT_VSTUFF;
static void __verify_checkpoint_reset(WT_VSTUFF *);
-static int __verify_int(WT_SESSION_IMPL *, int);
+static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *);
static int __verify_overflow(
WT_SESSION_IMPL *, const uint8_t *, uint32_t, WT_VSTUFF *);
static int __verify_overflow_cell(
@@ -44,41 +46,6 @@ static int __verify_tree(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *);
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_UNUSED(cfg);
-
- return (__verify_int(session, 0));
-}
-
-/*
- * __wt_dumpfile --
- * Dump a file in debugging mode.
- */
-int
-__wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[])
-{
- WT_UNUSED(cfg);
-
-#ifdef HAVE_DIAGNOSTIC
- /*
- * We use the verification code to do debugging dumps because if we're
- * dumping in debugging mode, we want to confirm the page is OK before
- * walking it.
- */
- return (__verify_int(session, 1));
-#else
- WT_RET_MSG(session, ENOTSUP,
- "the WiredTiger library was not built in diagnostic mode");
-#endif
-}
-
-/*
- * __verify_int --
- * Internal version of verify: verify a Btree, optionally dumping each
- * page in debugging mode.
- */
-static int
-__verify_int(WT_SESSION_IMPL *session, int dumpfile)
-{
WT_BTREE *btree;
WT_CKPT *ckptbase, *ckpt;
WT_DECL_RET;
@@ -90,12 +57,14 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile)
WT_CLEAR(_vstuff);
vs = &_vstuff;
- vs->dumpfile = dumpfile;
WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
+ /* Check configuration strings. */
+ WT_ERR(__verify_config(session, cfg, vs));
+
/* Get a list of the checkpoints for this file. */
WT_ERR(__wt_meta_ckptlist_get(session, btree->name, &ckptbase));
@@ -106,6 +75,11 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile)
WT_CKPT_FOREACH(ckptbase, ckpt) {
WT_VERBOSE_ERR(session, verify,
"%s: checkpoint %s", btree->name, ckpt->name);
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session,
+ "%s: checkpoint %s", btree->name, ckpt->name));
+#endif
/* Fake checkpoints require no work. */
if (F_ISSET(ckpt, WT_CKPT_FAKE))
@@ -157,6 +131,48 @@ err: __wt_meta_ckptlist_free(session, ckptbase);
}
/*
+ * __verify_config --
+ * Verification supports dumping pages in various formats.
+ *
+ * Looks up the "dump_address", "dump_blocks" and "dump_pages" keys in the
+ * cfg stack and sets the matching WT_VSTUFF field to 1 when the key is
+ * found with a non-zero value. A WT_NOTFOUND lookup leaves the field
+ * untouched; any other lookup error is handed to WT_RET (presumably
+ * returning it to the caller). Returns 0 once all three keys have been
+ * processed, except as described for non-diagnostic builds below.
+ */
+static int
+__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+
+ ret = __wt_config_gets(session, cfg, "dump_address", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_RET(ret);
+ if (ret == 0 && cval.val != 0)
+ vs->dump_address = 1;
+
+ ret = __wt_config_gets(session, cfg, "dump_blocks", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_RET(ret);
+ if (ret == 0 && cval.val != 0)
+ vs->dump_blocks = 1;
+
+ ret = __wt_config_gets(session, cfg, "dump_pages", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_RET(ret);
+ if (ret == 0 && cval.val != 0)
+ vs->dump_pages = 1;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We use the verification code to do debugging dumps because if we're
+ * dumping in debugging mode, we want to confirm the page is OK before
+ * walking it.
+ *
+ * Nothing further to check in this build. When HAVE_DIAGNOSTIC is not
+ * defined, the other branch rejects any dump request with ENOTSUP
+ * rather than silently ignoring it.
+ */
+#else
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_RET_MSG(session, ENOTSUP,
+ "the WiredTiger library was not built in diagnostic mode");
+#endif
+ return (0);
+}
+
+/*
* __verify_checkpoint_reset --
* Reset anything needing to be reset for each new checkpoint verification.
*/
@@ -197,6 +213,12 @@ __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs)
WT_VERBOSE_RET(session, verify, "%s %s",
__wt_page_addr_string(session, vs->tmp1, page),
__wt_page_type_string(page->type));
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address)
+ WT_RET(__wt_msg(session, "%s %s",
+ __wt_page_addr_string(session, vs->tmp1, page),
+ __wt_page_type_string(page->type)));
+#endif
/*
* The page's physical structure was verified when it was read into
@@ -226,11 +248,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs)
#ifdef HAVE_DIAGNOSTIC
/* Optionally dump the page in debugging mode. */
- if (vs->dumpfile) {
+ if (vs->dump_blocks && page->dsk != NULL)
+ WT_RET(__wt_debug_disk(session, page->dsk, NULL));
+ if (vs->dump_pages)
WT_RET(__wt_debug_page(session, page, NULL));
- if (page->dsk != NULL)
- WT_RET(__wt_debug_disk(session, page->dsk, NULL));
- }
#endif
/*
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index ac3adbe0e75..7fcedbdd1e2 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -58,24 +58,10 @@ __tree_walk_delete(
/*
* If the page is already instantiated in-memory, other threads may be
- * using it: no fast delete.
- */
- if (ref->state != WT_REF_DISK)
- return (0);
-
- /*
- * If the page references overflow items, we have to clean it up during
- * reconciliation, no fast delete.
- */
- if (!__wt_off_page(page, ref->addr)) {
- __wt_cell_unpack(ref->addr, &unpack);
- if (unpack.raw != WT_CELL_ADDR_LNO)
- return (0);
- }
-
- /*
- * Atomically switch the page's state to delete it. If the page state
- * changed underneath us, no fast delete.
+ * using it, no fast delete.
+ *
+ * Atomically switch the page's state to lock it. If the page state
+ * changes underneath us, no fast delete.
*
* Possible optimization: if the page is already deleted and the delete
* is visible to us (the delete has been committed), we could skip the
@@ -83,29 +69,47 @@ __tree_walk_delete(
* in the page. While that's a huge amount of work to no purpose, it's
* unclear optimizing for overlapping range deletes is worth the effort.
*/
- if (!WT_ATOMIC_CAS(ref->state, WT_REF_DISK, WT_REF_READING))
+ if (ref->state != WT_REF_DISK ||
+ !WT_ATOMIC_CAS(ref->state, WT_REF_DISK, WT_REF_READING))
return (0);
/*
- * We have the reference "locked":
+ * If the page references overflow items, we have to clean it up during
+ * reconciliation, no fast delete. Check this after we have the page
+ * locked down, instantiating the page in memory and modifying it could
+ * theoretically point the address somewhere away from the on-page cell.
+ */
+ __wt_cell_unpack(ref->addr, &unpack);
+ if (unpack.raw != WT_CELL_ADDR_LNO)
+ goto err;
+
+ /*
* Record the change in the transaction structure and set the change's
* transaction ID.
*/
WT_ERR(__wt_txn_modify_ref(session, ref));
/*
- * This action dirties the page: mark it dirty now, because there's no
+ * This action dirties the parent page: mark it dirty now, there's no
* future reconciliation of the child leaf page that will dirty it as
- * we flush the tree.
+ * we write the tree.
*/
WT_ERR(__wt_page_modify_init(session, page));
__wt_page_modify_set(page);
*skipp = 1;
- /* Release the page. */
-err: WT_PUBLISH(ref->state, WT_REF_DELETED);
+ /* Delete the page. */
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+err: /*
+ * Restore the page to on-disk status, we'll have to instantiate it.
+ * We don't have to back out adding this node to the transaction
+ * modify list, that's OK because the rollback function ignores nodes
+ * that aren't set to WT_REF_DELETED.
+ */
+ WT_PUBLISH(ref->state, WT_REF_DISK);
return (ret);
}
@@ -151,12 +155,14 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags)
WT_PAGE *page, *t;
WT_REF *ref;
uint32_t slot;
- int discard, eviction, prev, skip;
+ int compact, discard, eviction, prev, set_read_gen, skip;
btree = session->btree;
- /* We can currently only do fast-discard on row-store trees. */
+ /* Fast-discard currently only works on row-store trees. */
discard = LF_ISSET(WT_TREE_DISCARD) && btree->type == BTREE_ROW ? 1 : 0;
+
+ compact = LF_ISSET(WT_TREE_COMPACT) ? 1 : 0;
eviction = LF_ISSET(WT_TREE_EVICT) ? 1 : 0;
prev = LF_ISSET(WT_TREE_PREV) ? 1 : 0;
@@ -229,43 +235,55 @@ descend: for (;;) {
* the state to WT_REF_EVICT_WALK temporarily to avoid
* the page being evicted by another thread while it is
* being evaluated.
- *
- * We also return pages in the "evict-force" state,
- * which indicates they are waiting on the eviction
- * server getting to a request. A sync call in the
- * meantime must write such a page to ensure all
- * modifications are written. Since this is happening
- * inside the eviction server, and an LRU walk will
- * check the state before adding the page to the LRU
- * queue, there is no way for an evict-force page to
- * disappear from under us.
*/
+ set_read_gen = 0;
if (eviction) {
if (!WT_ATOMIC_CAS(ref->state,
WT_REF_MEM, WT_REF_EVICT_WALK))
break;
+ } else if (discard) {
+ /*
+ * If deleting a range, try to delete the page
+ * without instantiating it.
+ */
+ WT_RET(__tree_walk_delete(
+ session, page, ref, &skip));
+ if (skip)
+ break;
+ WT_RET(__wt_page_in(session, page, ref));
} else {
- if (discard) {
- /*
- * If deleting a range, try to delete
- * the page without instantiating it.
- */
- WT_RET(__tree_walk_delete(
+ /*
+ * If iterating a cursor (or doing compaction),
+ * skip deleted pages that are visible to us.
+ */
+ WT_RET(__tree_walk_read(session, ref, &skip));
+ if (skip)
+ break;
+
+ /*
+ * Test if the page is useful for compaction:
+ * we don't want to read it if it won't help.
+ *
+ * Pages read for compaction aren't "useful";
+ * reset the page generation to 0 so the page
+ * is quickly chosen for eviction. (This can
+ * race of course, but it's unlikely and will
+ * only result in an incorrectly low page read
+ * generation.)
+ */
+ set_read_gen = 0;
+ if (compact) {
+ WT_RET(__wt_compact_page_skip(
session, page, ref, &skip));
if (skip)
break;
- } else {
- /*
- * If iterating a cursor, skip deleted
- * pages that are visible to us.
- */
- WT_RET(__tree_walk_read(
- session, ref, &skip));
- if (skip)
- break;
+ set_read_gen =
+ ref->state == WT_REF_DISK ? 1 : 0;
}
WT_RET(__wt_page_in(session, page, ref));
+ if (set_read_gen)
+ page->read_gen = 0;
}
page = ref->page;
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index 2ec317407d2..aba4edbd787 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -309,7 +309,7 @@ __rec_review(WT_SESSION_IMPL *session,
* to evict split-merge pages, which means the only interesting case
* is an empty page. If the eviction thread picked an "empty" page
* for eviction, it must have had reason, probably the empty page got
- * really, really full and is being forced out of the cache.
+ * really, really full.
*/
mod = page->modify;
if (!top && (mod == NULL || !F_ISSET(mod,
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 9356125aa8b..15b9b2da76f 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -239,28 +239,27 @@ static void __rec_dictionary_reset(WT_RECONCILE *);
*
* The reconciliation code is used in the following situations:
*
- * (1) by the eviction server during sync;
- * (2) by the eviction server during forced eviction of a page; and
- * (3) by any thread during LRU eviction.
+ * (1) by the eviction server during sync; and
+ * (2) by any thread during LRU eviction.
*
* The complexity is checking the page state of child pages when looking for
* pages to merge.
*
* We clearly want to consider all normal, in-memory pages (WT_REF_MEM).
*
- * During LRU eviction in case (3), the eviction code has already locked the
+ * During LRU eviction in case (2), the eviction code has already locked the
* subtree, so locked pages should be included in the merge (WT_REF_LOCKED).
*
* To make this tractable, the eviction server guarantees that no thread is
- * doing LRU eviction in the tree when cases (1) and (2) occur. That is, the
- * only state change that can occur during a sync or forced eviction is for a
- * reference to a page on disk to cause a page to be read (WT_REF_READING).
- * In the case of a read, we could safely ignore those pages because they are
- * unmodified by definition -- they are being read from disk, however, in the
- * current system, that state also includes fast-delete pages that are being
- * instantiated. Those pages cannot be ignored, as they have been modified.
- * For this reason, we have to wait for the WT_REF_READING state to be resolved
- * to another state before we proceed.
+ * doing LRU eviction in the tree when case (1) occurs. That is, the only
+ * state change that can occur during a sync is for a reference to a page on
+ * disk to cause a page to be read (WT_REF_READING). In the case of a read, we
+ * could safely ignore those pages because they are unmodified by definition --
+ * they are being read from disk, however, in the current system, that state
+ * also includes fast-delete pages that are being instantiated. Those pages
+ * cannot be ignored, as they have been modified. For this reason, we have to
+ * wait for the WT_REF_READING state to be resolved to another state before we
+ * proceed.
*/
static int
__rec_page_modified(WT_SESSION_IMPL *session,
@@ -1480,6 +1479,8 @@ __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk)
return (0);
}
+#define WT_FIX_ENTRIES(btree, bytes) (((bytes) * 8) / (btree)->bitcnt)
+
/*
* __wt_rec_col_fix_bulk_insert --
* Fixed-length column-store bulk insert.
@@ -1504,7 +1505,7 @@ __wt_rec_col_fix_bulk_insert(WT_CURSOR_BULK *cbulk)
entries > 0;
entries -= page_entries, data += page_size) {
page_entries = WT_MIN(entries,
- r->space_avail * 8 / btree->bitcnt);
+ WT_FIX_ENTRIES(btree, r->space_avail));
page_size = __bitstr_size(page_entries * btree->bitcnt);
memcpy(r->first_free, data, page_size);
@@ -1533,7 +1534,7 @@ __wt_rec_col_fix_bulk_insert(WT_CURSOR_BULK *cbulk)
WT_RET(__rec_split(session, r));
}
cbulk->entry = 0;
- cbulk->nrecs = r->space_avail * 8 / btree->bitcnt;
+ cbulk->nrecs = WT_FIX_ENTRIES(btree, r->space_avail);
}
__bit_setv(r->first_free,
@@ -1733,7 +1734,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Calculate the number of entries per page remainder. */
entry = page->entries;
- nrecs = (r->space_avail * 8 / btree->bitcnt) - page->entries;
+ nrecs = WT_FIX_ENTRIES(btree, r->space_avail) - page->entries;
r->recno += entry;
/* Walk any append list. */
@@ -1774,7 +1775,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Calculate the number of entries per page. */
entry = 0;
- nrecs = r->space_avail * 8 / btree->bitcnt;
+ nrecs = WT_FIX_ENTRIES(btree, r->space_avail);
}
}
@@ -1820,7 +1821,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session,
for (;;) {
/* Calculate the number of entries per page. */
entry = 0;
- nrecs = r->space_avail * 8 / btree->bitcnt;
+ nrecs = WT_FIX_ENTRIES(btree, r->space_avail);
for (; nrecs > 0 && salvage->missing > 0;
--nrecs, --salvage->missing, ++entry)
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 19bfe49a7fe..c35e7da4a82 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -184,11 +184,12 @@ __wt_confchk_session_begin_transaction[] = {
const char *
__wt_confdfl_session_checkpoint =
- "drop=,name=,target=";
+ "drop=,force=0,name=,target=";
WT_CONFIG_CHECK
__wt_confchk_session_checkpoint[] = {
{ "drop", "list", NULL },
+ { "force", "boolean", NULL },
{ "name", "string", NULL },
{ "target", "list", NULL },
{ NULL, NULL, NULL }
@@ -213,6 +214,15 @@ __wt_confchk_session_commit_transaction[] = {
};
const char *
+__wt_confdfl_session_compact =
+ "";
+
+WT_CONFIG_CHECK
+__wt_confchk_session_compact[] = {
+ { NULL, NULL, NULL }
+};
+
+const char *
__wt_confdfl_session_create =
"allocation_size=512B,block_compressor=,cache_resident=0,checksum=,"
"colgroups=,collator=,columns=,columns=,dictionary=0,exclusive=0,"
@@ -274,15 +284,6 @@ __wt_confchk_session_drop[] = {
};
const char *
-__wt_confdfl_session_dumpfile =
- "";
-
-WT_CONFIG_CHECK
-__wt_confchk_session_dumpfile[] = {
- { NULL, NULL, NULL }
-};
-
-const char *
__wt_confdfl_session_log_printf =
"";
@@ -373,10 +374,13 @@ __wt_confchk_session_upgrade[] = {
const char *
__wt_confdfl_session_verify =
- "";
+ "dump_address=0,dump_blocks=0,dump_pages=0";
WT_CONFIG_CHECK
__wt_confchk_session_verify[] = {
+ { "dump_address", "boolean", NULL },
+ { "dump_blocks", "boolean", NULL },
+ { "dump_pages", "boolean", NULL },
{ NULL, NULL, NULL }
};
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 39966177fda..1c4fd99e521 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -289,9 +289,9 @@ __curindex_close(WT_CURSOR *cursor)
__wt_free(session, cindex->cg_cursors);
if (cindex->key_plan != idx->key_plan)
__wt_free(session, cindex->key_plan);
- if (cindex->value_plan != idx->value_plan)
- __wt_free(session, cindex->value_plan);
if (cursor->value_format != cindex->table->value_format)
+ __wt_free(session, cursor->value_format);
+ if (cindex->value_plan != idx->value_plan)
__wt_free(session, cindex->value_plan);
WT_TRET(__wt_btcur_close(&cindex->cbt));
diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c
index 22510ff44da..a1b72243e4e 100644
--- a/src/cursor/cur_table.c
+++ b/src/cursor/cur_table.c
@@ -615,6 +615,8 @@ __curtable_close(WT_CURSOR *cursor)
if (ctable->plan != ctable->table->plan)
__wt_free(session, ctable->plan);
+ if (cursor->value_format != ctable->table->value_format)
+ __wt_free(session, cursor->value_format);
__wt_free(session, ctable->cg_cursors);
__wt_free(session, ctable->idx_cursors);
/* The URI is owned by the table. */
diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox
index 34b703246a2..d20fe51b72c 100644
--- a/src/docs/command-line.dox
+++ b/src/docs/command-line.dox
@@ -48,6 +48,19 @@ database; the \c -t option changes the \c backup command to do a hot
backup of only the named objects.
<hr>
+@section util_compact wt compact
+Compact a table or file.
+
+The \c compact command attempts to rewrite the specified table or file
+to consume less disk space.
+
+@subsection util_compact_synopsis Synopsis
+<code>wt [-Vv] [-C config] [-h directory] compact uri</code>
+
+@subsection util_compact_options Options
+The \c compact command has no command-specific options.
+
+<hr>
@section util_create wt create
Create a table or file.
@@ -108,25 +121,6 @@ Dump all characters in a hexadecimal encoding (the default is to leave
printable characters unencoded).
<hr>
-@section util_dumpfile wt dumpfile
-Dump a file in a debugging format.
-
-The \c dumpfile command dumps the specified physical file in a non-portable,
-debugging format, exiting success if the file is correct, and failure if the
-file is corrupted.
-
-@subsection util_dumpfile_synopsis Synopsis
-<code>wt [-Vv] [-C config] [-h directory] dumpfile [-f output] file</code>
-
-@subsection util_dumpfile_options Options
-The following are command-specific options for the \c dumpfile command:
-
-@par <code>-f</code>
-By default, the \c dumpfile command output is written to the standard
-output; the \c -f option re-directs the output to the specified
-file.
-
-<hr>
@section util_list wt list
List the tables and files in the database.
diff --git a/src/include/api.h b/src/include/api.h
index f5aa7bd9050..677742c4c6e 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -111,8 +111,9 @@ struct __wt_session_impl {
size_t excl_allocated; /* Bytes allocated */
#define WT_SYNC 1 /* Sync the file */
-#define WT_SYNC_DISCARD 2 /* Sync the file, discard pages */
-#define WT_SYNC_DISCARD_NOWRITE 3 /* Discard the file */
+#define WT_SYNC_COMPACT 2 /* Compact the file */
+#define WT_SYNC_DISCARD 3 /* Sync the file, discard pages */
+#define WT_SYNC_DISCARD_NOWRITE 4 /* Discard the file */
int syncop; /* File operation */
int syncop_ret; /* Return value */
@@ -127,8 +128,9 @@ struct __wt_session_impl {
* easily call a function to clear memory up to, but not including, the
* hazard reference.
*/
- uint32_t hazard_size; /* Count of used hazard references */
- u_int nhazard;
+ uint32_t hazard_size; /* Allocated slots in hazard array. */
+ uint32_t nhazard; /* Count of active hazard references */
+
#define WT_SESSION_CLEAR(s) memset(s, 0, WT_PTRDIFF(&(s)->hazard, s))
WT_HAZARD *hazard; /* Hazard reference array */
};
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 796101f3737..fd79028dac7 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -279,10 +279,7 @@ struct __wt_page {
*/
uint32_t entries;
- /*
- * Memory attached to the page (although not exact or complete), used
- * to force eviction of a page tying too much memory down.
- */
+ /* Memory attached to the page. */
uint32_t memory_footprint;
#define WT_PAGE_INVALID 0 /* Invalid page */
diff --git a/src/include/cache.h b/src/include/cache.h
index 73dd5ab1c31..33fb370b413 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -55,12 +55,6 @@ struct __wt_cache {
u_int evict_file_next; /* LRU: next file to search */
/*
- * Forced-page eviction request information.
- */
- WT_EVICT_ENTRY *evict_request; /* Forced page eviction request list */
- uint32_t max_evict_request; /* Size of the eviction request array */
-
- /*
* Sync/flush request information.
*/
volatile uint64_t sync_request; /* File sync requests */
diff --git a/src/include/cache.i b/src/include/cache.i
index 702234c426c..6c3c796673d 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -31,7 +31,7 @@ __wt_eviction_check(WT_SESSION_IMPL *session, int *read_lockoutp, int wake)
*read_lockoutp = (bytes_inuse > bytes_max);
/* Wake eviction when we're over the trigger cache size. */
- if (wake && bytes_inuse > cache->eviction_trigger * (bytes_max / 100))
+ if (wake && bytes_inuse >= (cache->eviction_trigger * bytes_max) / 100)
__wt_evict_server_wake(session);
}
diff --git a/src/include/extern.h b/src/include/extern.h
index 3827035f791..6fc339b3d39 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -49,6 +49,14 @@ extern int __wt_block_checkpoint(WT_SESSION_IMPL *session,
extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session,
WT_BLOCK *block);
extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern int __wt_block_compact_skip( WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ int *skipp);
+extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+ WT_BLOCK *block,
+ const uint8_t *addr,
+ uint32_t addr_size,
+ int *skipp);
extern int __wt_block_off_match(WT_EXTLIST *el, off_t off, off_t size);
extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session,
WT_EXTLIST *el,
@@ -132,6 +140,11 @@ extern int __wt_bm_checkpoint_load(WT_SESSION_IMPL *session,
uint32_t addr_size,
int readonly);
extern int __wt_bm_checkpoint_unload(WT_SESSION_IMPL *session);
+extern int __wt_bm_compact_skip(WT_SESSION_IMPL *session, int *skipp);
+extern int __wt_bm_compact_page_skip(WT_SESSION_IMPL *session,
+ const uint8_t *addr,
+ uint32_t addr_size,
+ int *skipp);
extern int __wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_bm_free(WT_SESSION_IMPL *session,
const uint8_t *addr,
@@ -252,6 +265,12 @@ extern int __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]);
extern int __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]);
extern void __wt_cache_stats_update(WT_CONNECTION_IMPL *conn);
extern void __wt_cache_destroy(WT_CONNECTION_IMPL *conn);
+extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_compact_page_skip( WT_SESSION_IMPL *session,
+ WT_PAGE *parent,
+ WT_REF *ref,
+ int *skipp);
+extern int __wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard);
extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
@@ -362,7 +381,6 @@ extern int __wt_bt_cache_flush(WT_SESSION_IMPL *session,
int op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
-extern int __wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify_dsk(WT_SESSION_IMPL *session,
const char *addr,
WT_ITEM *buf);
@@ -553,12 +571,12 @@ extern const char *__wt_confdfl_session_close;
extern WT_CONFIG_CHECK __wt_confchk_session_close[];
extern const char *__wt_confdfl_session_commit_transaction;
extern WT_CONFIG_CHECK __wt_confchk_session_commit_transaction[];
+extern const char *__wt_confdfl_session_compact;
+extern WT_CONFIG_CHECK __wt_confchk_session_compact[];
extern const char *__wt_confdfl_session_create;
extern WT_CONFIG_CHECK __wt_confchk_session_create[];
extern const char *__wt_confdfl_session_drop;
extern WT_CONFIG_CHECK __wt_confchk_session_drop[];
-extern const char *__wt_confdfl_session_dumpfile;
-extern WT_CONFIG_CHECK __wt_confchk_session_dumpfile[];
extern const char *__wt_confdfl_session_log_printf;
extern WT_CONFIG_CHECK __wt_confchk_session_log_printf[];
extern const char *__wt_confdfl_session_open_cursor;
@@ -696,7 +714,9 @@ extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
int start_chunk,
int nchunks,
WT_LSM_CHUNK *chunk);
-extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_merge(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree,
+ int stalls);
extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
extern int __wt_lsm_stat_init( WT_SESSION_IMPL *session,
@@ -848,7 +868,9 @@ extern int __wt_cond_alloc(WT_SESSION_IMPL *session,
const char *name,
int is_locked,
WT_CONDVAR **condp);
-extern void __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
+extern void __wt_cond_wait(WT_SESSION_IMPL *session,
+ WT_CONDVAR *cond,
+ uint64_t usecs);
extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session,
diff --git a/src/include/misc.h b/src/include/misc.h
index 6b82155bb08..476ef39084f 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -174,9 +174,10 @@
#define WT_DECL_RET int ret = 0
/* Flags for the tree-walk function. */
-#define WT_TREE_DISCARD 0x01 /* Discarding */
-#define WT_TREE_EVICT 0x02 /* Eviction */
-#define WT_TREE_PREV 0x04 /* Backward walk */
+#define WT_TREE_COMPACT 0x01 /* Compaction */
+#define WT_TREE_DISCARD 0x02 /* Discarding */
+#define WT_TREE_EVICT 0x04 /* Eviction */
+#define WT_TREE_PREV 0x08 /* Backward walk */
/*
* In diagnostic mode we track the locations from which hazard references and
diff --git a/src/include/stat.h b/src/include/stat.h
index 098886904bb..ea29a8c7b1e 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -86,6 +86,7 @@ struct __wt_btree_stats {
WT_STATS file_minor;
WT_STATS file_overflow;
WT_STATS file_allocsize;
+ WT_STATS file_compact_rewrite;
WT_STATS rec_page_merge;
WT_STATS rec_dictionary;
WT_STATS rec_split_intl;
diff --git a/src/include/txn.i b/src/include/txn.i
index ccf04831f14..0f0425a5961 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -100,8 +100,18 @@ __wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id)
if (id == txn->id)
return (1);
- /* Read-uncommitted transactions see all other changes. */
- if (txn->isolation == TXN_ISO_READ_UNCOMMITTED)
+ /*
+ * Read-uncommitted transactions see all other changes.
+ *
+ * All metadata reads are at read-uncommitted isolation. That's
+ * because once a schema-level operation completes, subsequent
+ * operations must see the current version of checkpoint metadata, or
+ * they may try to read blocks that may have been freed from a file.
+ * Metadata updates use non-transactional techniques (such as the
+ * schema and metadata locks) to protect access to in-flight updates.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED ||
+ session->btree == session->metafile)
return (1);
/*
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index a7ae4c669e8..21b972ab9f5 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -706,6 +706,18 @@ struct __wt_session {
int __F(create)(WT_SESSION *session,
const char *name, const char *config);
+ /*! Compact an object.
+ *
+ * @snippet ex_all.c Compact a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to compact, such as \c "table:stock"
+ * @configempty{session.compact, see dist/api_data.py}
+ * @errors
+ */
+ int __F(compact)(WT_SESSION *session,
+ const char *name, const char *config);
+
/*! Drop (delete) an object.
*
* @snippet ex_all.c Drop a table
@@ -812,7 +824,17 @@ struct __wt_session {
*
* @param session the session handle
* @param name the URI of the file or table to verify
- * @configempty{session.verify, see dist/api_data.py}
+ * @configstart{session.verify, see dist/api_data.py}
+ * @config{dump_address, Display addresses and page types as pages are
+ * verified\, using the application's message handler\, intended for
+ * debugging.,a boolean flag; default \c false.}
+ * @config{dump_blocks, Display the contents of on-disk blocks as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging.,a boolean flag; default \c false.}
+ * @config{dump_pages, Display the contents of in-memory pages as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging.,a boolean flag; default \c false.}
+ * @configend
* @errors
*/
int __F(verify)(WT_SESSION *session,
@@ -904,6 +926,9 @@ struct __wt_session {
* drop all checkpoints before and including the named checkpoint.
* Checkpoints cannot be dropped while a hot backup is in progress or if
* open in a cursor.,a list of strings; default empty.}
+ * @config{force, checkpoints may be skipped if the underlying object
+ * has not been modified\, this option forces the checkpoint.,a boolean
+ * flag; default \c false.}
* @config{name, if non-empty\, specify a name for the checkpoint.,a
* string; default empty.}
* @config{target, if non-empty\, checkpoint the list of objects.,a list
@@ -918,21 +943,6 @@ struct __wt_session {
* @{
*/
- /*! Dump a physical file in debugging mode.
- *
- * The specified file is displayed in a non-portable debugging mode to
- * the application's standard output.
- *
- * @snippet ex_all.c Dump a file
- *
- * @param session the session handle
- * @param name the URI of the file to dump
- * @configempty{session.dumpfile, see dist/api_data.py}
- * @errors
- */
- int __F(dumpfile)(WT_SESSION *session,
- const char *name, const char *config);
-
/*! Send a string to the message handler for debugging.
*
* @snippet ex_all.c Print to the message stream
@@ -1774,34 +1784,36 @@ extern int wiredtiger_extension_init(WT_SESSION *session,
#define WT_STAT_file_overflow 31
/*! page size allocation unit */
#define WT_STAT_file_allocsize 32
+/*! pages rewritten by compaction */
+#define WT_STAT_file_compact_rewrite 33
/*! reconcile: deleted or temporary pages merged */
-#define WT_STAT_rec_page_merge 33
+#define WT_STAT_rec_page_merge 34
/*! reconcile: dictionary match */
-#define WT_STAT_rec_dictionary 34
+#define WT_STAT_rec_dictionary 35
/*! reconcile: internal pages split */
-#define WT_STAT_rec_split_intl 35
+#define WT_STAT_rec_split_intl 36
/*! reconcile: leaf pages split */
-#define WT_STAT_rec_split_leaf 36
+#define WT_STAT_rec_split_leaf 37
/*! reconcile: overflow key */
-#define WT_STAT_rec_ovfl_key 37
+#define WT_STAT_rec_ovfl_key 38
/*! reconcile: overflow value */
-#define WT_STAT_rec_ovfl_value 38
+#define WT_STAT_rec_ovfl_value 39
/*! reconcile: pages deleted */
-#define WT_STAT_rec_page_delete 39
+#define WT_STAT_rec_page_delete 40
/*! reconcile: pages written */
-#define WT_STAT_rec_written 40
+#define WT_STAT_rec_written 41
/*! reconcile: unable to acquire hazard reference */
-#define WT_STAT_rec_hazard 41
+#define WT_STAT_rec_hazard 42
/*! row-store internal pages */
-#define WT_STAT_file_row_int_pages 42
+#define WT_STAT_file_row_int_pages 43
/*! row-store leaf pages */
-#define WT_STAT_file_row_leaf_pages 43
+#define WT_STAT_file_row_leaf_pages 44
/*! total entries */
-#define WT_STAT_file_entries 44
+#define WT_STAT_file_entries 45
/*! update conflicts */
-#define WT_STAT_update_conflict 45
+#define WT_STAT_update_conflict 46
/*! write generation conflicts */
-#define WT_STAT_file_write_conflicts 46
+#define WT_STAT_file_write_conflicts 47
/*! @} */
/*!
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index c09267ba79c..44957c6b34f 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -61,7 +61,7 @@ __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
* Merge a set of chunks of an LSM tree.
*/
int
-__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int stalls)
{
WT_BLOOM *bloom;
WT_CURSOR *src, *dest;
@@ -85,7 +85,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* read. We need a copy, since other threads may alter the chunk count
* while we are doing a merge.
*/
- nchunks = lsm_tree->nchunks - 1;
+ nchunks = lsm_tree->nchunks;
/*
* If there aren't any chunks to merge, or some of the chunks aren't
@@ -106,8 +106,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
/* Only include chunks that are stable on disk. */
end_chunk = nchunks - 1;
while (end_chunk > 0 &&
- (!F_ISSET(lsm_tree->chunk[end_chunk], WT_LSM_CHUNK_ONDISK) ||
- lsm_tree->chunk[end_chunk]->ncursor > 0))
+ !F_ISSET(lsm_tree->chunk[end_chunk], WT_LSM_CHUNK_ONDISK))
--end_chunk;
/*
@@ -136,6 +135,11 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
if (nchunks > 2 && chunk->count > 2 * record_count / nchunks)
break;
+ /* Don't do any big merges until we have waited for 10s. */
+ if (nchunks > 0 && stalls < 10 &&
+ chunk->count > lsm_tree->chunk[end_chunk]->count * 2)
+ break;
+
record_count += chunk->count;
--start_chunk;
@@ -146,7 +150,8 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ASSERT(session, nchunks <= max_chunks);
- if (nchunks <= 1)
+ /* Don't do small merges unless we have waited for 2s. */
+ if (nchunks <= 1 || (stalls < 2 && nchunks < max_chunks / 2))
return (WT_NOTFOUND);
/* Allocate an ID for the merge. */
@@ -192,6 +197,11 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
wt_session, chunk->uri, NULL, "raw,bulk", &dest));
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ /*
+ * Poll for shutdown once every 1000 records rather than on
+ * (almost) every record: "insert_count % 1000" alone is true
+ * for every count that is not a multiple of 1000.
+ */
+ if (insert_count % 1000 == 0 &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
+ ret = EINTR;
+ goto err;
+ }
WT_ERR(src->get_key(src, &key));
dest->set_key(dest, &key);
WT_ERR(src->get_value(src, &value));
@@ -237,11 +247,23 @@ err: if (src != NULL)
WT_TRET(__wt_bloom_close(bloom));
__wt_scr_free(&bbuf);
if (ret != 0) {
+ /*
+ * Ideally we would drop the new chunk on error, but that
+ * introduces potential deadlock problems. It is relatively
+ * harmless to leave the file - it does not interfere
+ * with later re-use.
+ WT_WITH_SCHEMA_LOCK(session,
+ (void)wt_session->drop(wt_session, chunk->uri, NULL));
+ */
__wt_free(session, chunk->bloom_uri);
__wt_free(session, chunk->uri);
__wt_free(session, chunk);
- WT_VERBOSE_VOID(session, lsm,
- "Merge failed with %s\n", wiredtiger_strerror(ret));
+ if (ret == EINTR)
+ WT_VERBOSE_VOID(session, lsm,
+ "Merge aborted due to close");
+ else
+ WT_VERBOSE_VOID(session, lsm,
+ "Merge failed with %s", wiredtiger_strerror(ret));
}
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index 87a151c5ada..6f47692a26e 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -67,9 +67,18 @@ __wt_lsm_stat_init(
*/
WT_ERR(__wt_buf_fmt(
session, uribuf, "statistics:%s", chunk->uri));
- WT_ERR(__wt_curstat_open(session, uribuf->data,
+ ret = __wt_curstat_open(session, uribuf->data,
F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
- &stat_cursor));
+ &stat_cursor);
+ /*
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ ret = __wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor);
+ WT_ERR(ret);
stat_cursor->set_key(stat_cursor, WT_STAT_page_evict_fail);
WT_ERR(stat_cursor->search(stat_cursor));
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 2be721349ea..84dac6b71d1 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -24,9 +24,18 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN))
TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+ __wt_free(session, lsm_tree->name);
+ __wt_free(session, lsm_tree->config);
+ __wt_free(session, lsm_tree->key_format);
+ __wt_free(session, lsm_tree->value_format);
+ __wt_free(session, lsm_tree->file_config);
+
+ if (lsm_tree->rwlock != NULL)
+ __wt_rwlock_destroy(session, &lsm_tree->rwlock);
+
+ __wt_free(session, lsm_tree->stats);
__wt_spin_destroy(session, &lsm_tree->lock);
- __wt_free(session, lsm_tree->name);
for (i = 0; i < lsm_tree->nchunks; i++) {
if ((chunk = lsm_tree->chunk[i]) == NULL)
continue;
@@ -46,8 +55,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
__wt_free(session, chunk);
}
__wt_free(session, lsm_tree->old_chunks);
- __wt_free(session, lsm_tree->stats);
-
__wt_free(session, lsm_tree);
}
@@ -160,9 +167,15 @@ __wt_lsm_tree_setup_chunk(WT_SESSION_IMPL *session,
WT_DECL_ITEM(buf);
WT_DECL_ITEM(bbuf);
WT_DECL_RET;
+ const char *cfg[] = API_CONF_DEFAULTS(session, drop, "force");
WT_RET(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf));
+ /*
+ * Drop the chunk first - there may be some content hanging over
+ * from an aborted merge.
+ */
+ WT_ERR(__wt_schema_drop(session, buf->data, cfg));
WT_ERR(__wt_schema_create(session, buf->data, lsm_tree->file_config));
chunk->uri = __wt_buf_steal(session, buf, NULL);
if (create_bloom) {
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index d7b395bcf8b..3e9951dbafb 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -21,10 +21,11 @@ __wt_lsm_worker(void *arg)
{
WT_LSM_TREE *lsm_tree;
WT_SESSION_IMPL *session;
- int progress;
+ int progress, stalls;
lsm_tree = arg;
session = lsm_tree->worker_session;
+ stalls = 0;
while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
progress = 0;
@@ -32,7 +33,8 @@ __wt_lsm_worker(void *arg)
/* Clear any state from previous worker thread iterations. */
session->btree = NULL;
- if (__wt_lsm_merge(session, lsm_tree) == 0)
+ /* Report stalls to merge in seconds. */
+ if (__wt_lsm_merge(session, lsm_tree, stalls / 1000) == 0)
progress = 1;
/* Clear any state from previous worker thread iterations. */
@@ -42,8 +44,12 @@ __wt_lsm_worker(void *arg)
__lsm_free_chunks(session, lsm_tree) == 0)
progress = 1;
- if (!progress)
- __wt_sleep(0, 10);
+ if (progress)
+ stalls = 0;
+ else {
+ __wt_sleep(0, 1000);
+ ++stalls;
+ }
}
return (NULL);
@@ -213,14 +219,14 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_DECL_RET;
WT_LSM_CHUNK *chunk;
const char *drop_cfg[] = { NULL };
- int found, i;
+ int locked, progress, i;
- found = 0;
+ locked = progress = 0;
for (i = 0; i < lsm_tree->nold_chunks; i++) {
if ((chunk = lsm_tree->old_chunks[i]) == NULL)
continue;
- if (!found) {
- found = 1;
+ if (!locked) {
+ locked = 1;
/* TODO: Do we need the lsm_tree lock for all drops? */
__wt_spin_lock(session, &lsm_tree->lock);
}
@@ -232,6 +238,7 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* be positioned on this old chunk.
*/
if (ret == 0) {
+ progress = 1;
F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
__wt_free(session, chunk->bloom_uri);
chunk->bloom_uri = NULL;
@@ -250,6 +257,7 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* be positioned on this old chunk.
*/
if (ret == 0) {
+ progress = 1;
__wt_free(session, chunk->uri);
chunk->uri = NULL;
} else if (ret != EBUSY)
@@ -262,10 +270,14 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
++lsm_tree->old_avail;
}
}
- if (found) {
-err: ret = __wt_lsm_meta_write(session, lsm_tree);
+ if (locked) {
+err: WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
__wt_spin_unlock(session, &lsm_tree->lock);
}
+
/* Returning non-zero means there is no work to do. */
- return (found ? 0 : WT_NOTFOUND);
+ if (!progress)
+ WT_TRET(WT_NOTFOUND);
+
+ return (ret);
}
diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c
index 4e7a1e2cc6e..805a8ce4e0f 100644
--- a/src/os_posix/os_mtx.c
+++ b/src/os_posix/os_mtx.c
@@ -46,7 +46,7 @@ err: __wt_free(session, cond);
* Lock a mutex.
*/
void
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
{
WT_DECL_RET;
@@ -65,7 +65,17 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
* it's known to return these errors on some systems.
*/
while (cond->locked) {
- ret = pthread_cond_wait(&cond->cond, &cond->mtx);
+ if (usecs > 0) {
+ struct timeval tv;
+ struct timespec ts;
+
+ gettimeofday(&tv, NULL);
+ ts.tv_sec = tv.tv_sec + (tv.tv_usec + usecs) / 1000000;
+ ts.tv_nsec = 1000L * ((tv.tv_usec + usecs) % 1000000);
+ ret = pthread_cond_timedwait(
+ &cond->cond, &cond->mtx, &ts);
+ } else
+ ret = pthread_cond_wait(&cond->cond, &cond->mtx);
if (ret != 0 &&
ret != EINTR &&
#ifdef ETIME
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index 055eb84efba..cedd3cf2bbd 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -90,6 +90,15 @@ __wt_open(WT_SESSION_IMPL *session,
/* Windows clones: we always want to treat the file as a binary. */
f |= O_BINARY;
#endif
+#ifdef O_CLOEXEC
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ */
+ f |= O_CLOEXEC;
+#endif
+
if (ok_create) {
f |= O_CREAT;
if (exclusive)
@@ -109,17 +118,24 @@ __wt_open(WT_SESSION_IMPL *session,
if (ret != 0)
WT_ERR_MSG(session, ret, "%s", name);
-#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC)
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
/*
* Security:
* The application may spawn a new process, and we don't want another
* process to have access to our file handles. There's an obvious
- * race here...
+ * race here, so we prefer the flag to open if available.
*/
if ((f = fcntl(fd, F_GETFD)) == -1 ||
fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
#endif
+
+#if defined(HAVE_POSIX_FADVISE)
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (is_tree)
+ WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
+#endif
+
WT_ERR(__open_directory_sync(session));
WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index 704ec92f3a2..4fc82e72480 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -96,6 +96,7 @@ __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx)
__wt_free(session, idx->name);
__wt_free(session, idx->source);
__wt_free(session, idx->config);
+ __wt_free(session, idx->key_format);
__wt_free(session, idx->key_plan);
__wt_free(session, idx->value_plan);
__wt_free(session, idx->idxkey_format);
diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c
index 8bb62999d7b..4e35dcd0497 100644
--- a/src/schema/schema_worker.c
+++ b/src/schema/schema_worker.c
@@ -54,6 +54,19 @@ __wt_schema_worker(WT_SESSION_IMPL *session,
WT_TRET(__wt_session_release_btree(session));
WT_RET(ret);
}
+
+ /* Compaction and checkpoint apply to index files as well. */
+ if (func == __wt_compact || func == __wt_checkpoint) {
+ WT_RET(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ WT_RET(__wt_session_get_btree_ckpt(
+ session, table->indices[i]->source,
+ cfg, open_flags));
+ ret = func(session, cfg);
+ WT_TRET(__wt_session_release_btree(session));
+ WT_RET(ret);
+ }
+ }
} else
return (__wt_bad_object_type(session, uri));
diff --git a/src/session/session_api.c b/src/session/session_api.c
index c515d2e885d..5c7f0acf2dd 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -7,6 +7,7 @@
#include "wt_internal.h"
+static int __session_checkpoint(WT_SESSION *, const char *);
static int __session_rollback_transaction(WT_SESSION *, const char *);
/*
@@ -280,42 +281,114 @@ err: API_END_NOTFOUND_MAP(session, ret);
}
/*
- * __session_drop --
- * WT_SESSION->drop method.
+ * __session_compact_worker --
+ * Worker function to do the actual compaction call.
+ * Runs __wt_schema_worker with __wt_compact on the URI while
+ * holding the schema lock; the result is returned through
+ * API_END_NOTFOUND_MAP.
*/
static int
-__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
+__session_compact_worker(
+ WT_SESSION *wt_session, const char *uri, const char *config)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
- SESSION_API_CALL(session, drop, config, cfg);
+ SESSION_API_CALL(session, compact, config, cfg);
WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_drop(session, uri, cfg));
+ ret = __wt_schema_worker(session, uri, __wt_compact, cfg, 0));
-err: /* Note: drop operations cannot be unrolled (yet?). */
- API_END_NOTFOUND_MAP(session, ret);
+err: API_END_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_compact --
+ * WT_SESSION.compact method.
+ * Returns 0 without doing any work for "lsm:" URIs and rejects URIs
+ * that are not colgroup:, file:, index: or table:. Otherwise it
+ * checkpoints the target, compacts it, then checkpoints it twice
+ * more; the first failing step's error is returned.
+ */
+static int
+__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_ITEM *t;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /* Compaction makes no sense for LSM objects, ignore requests. */
+ if (WT_PREFIX_MATCH(uri, "lsm:"))
+ return (0);
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_bad_object_type(session, uri));
+
+ /*
+ * Compaction requires 2, and possibly 3 checkpoints, how many is block
+ * manager specific: all block managers will need the first checkpoint,
+ * but may or may not need the last two.
+ *
+ * The first checkpoint frees emptied pages to the underlying block
+ * manager (when rows are deleted, underlying blocks aren't freed until
+ * the page is reconciled, and checkpoint makes that happen). Because
+ * compaction is based on having available blocks in the block manager,
+ * compaction could do no work without the first checkpoint.
+ *
+ * After the first checkpoint, we compact the tree.
+ *
+ * The second and third checkpoints are done because the default block
+ * manager does checkpoints in two steps: blocks made available for
+ * re-use during a checkpoint are put on a special checkpoint-available
+ * list and only moved onto the real available list once the metadata
+ * has been updated with the newly written checkpoint information. This
+ * means blocks allocated by the checkpoint itself cannot be taken from
+ * the blocks made available by the checkpoint.
+ *
+ * In other words, the second checkpoint puts the blocks from the end of
+ * the file that were freed by compaction onto the checkpoint-available
+ * list, but then potentially writes checkpoint blocks at the end of the
+ * file, which would prevent any file truncation. When the second
+ * checkpoint resolves, those blocks become available for the third
+ * checkpoint, so it's able to write its blocks toward the beginning of
+ * the file, and then the file can be truncated.
+ *
+ * We do the work here so applications don't get confused why compaction
+ * isn't helping until after multiple, subsequent checkpoint calls.
+ *
+ * Force the checkpoint: we don't want to skip it because the work we
+ * need to have done is done in the underlying block manager.
+ *
+ * NOTE(review): only the two checkpoints after compaction are
+ * configured with force=1; the first checkpoint's configuration
+ * string has no force setting -- confirm that is intended.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &t));
+ WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\")", uri));
+ WT_ERR(__session_checkpoint(wt_session, t->data));
+
+ WT_ERR(__session_compact_worker(wt_session, uri, config));
+
+ WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));
+ WT_ERR(__session_checkpoint(wt_session, t->data));
+ WT_ERR(__session_checkpoint(wt_session, t->data));
+
+err: __wt_scr_free(&t); /* Reached on success as well as on error. */
+ return (ret);
}
/*
- * __session_dumpfile --
- * WT_SESSION->dumpfile method.
+ * __session_drop --
+ * WT_SESSION->drop method.
+ * Calls __wt_schema_drop on the URI while holding the schema lock.
*/
static int
-__session_dumpfile(WT_SESSION *wt_session, const char *uri, const char *config)
+__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)wt_session;
- SESSION_API_CALL(session, dumpfile, config, cfg);
+ SESSION_API_CALL(session, drop, config, cfg);
+
WT_WITH_SCHEMA_LOCK(session,
- ret = __wt_schema_worker(session, uri,
- __wt_dumpfile, cfg, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY));
+ ret = __wt_schema_drop(session, uri, cfg));
-err: API_END_NOTFOUND_MAP(session, ret);
+err: /* Note: drop operations cannot be unrolled (yet?). */
+ API_END_NOTFOUND_MAP(session, ret);
}
/*
@@ -641,6 +714,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal,
__session_reconfigure,
__session_open_cursor,
__session_create,
+ __session_compact,
__session_drop,
__session_rename,
__session_salvage,
@@ -651,7 +725,6 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal,
__session_commit_transaction,
__session_rollback_transaction,
__session_checkpoint,
- __session_dumpfile,
__session_msg_printf
};
WT_DECL_RET;
diff --git a/src/support/err.c b/src/support/err.c
index 55f21a5bcd5..c332d18478e 100644
--- a/src/support/err.c
+++ b/src/support/err.c
@@ -382,6 +382,7 @@ __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
WT_PREFIX_MATCH(uri, "config:") ||
WT_PREFIX_MATCH(uri, "file:") ||
WT_PREFIX_MATCH(uri, "index:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
WT_PREFIX_MATCH(uri, "statistics:") ||
WT_PREFIX_MATCH(uri, "table:"))
WT_RET_MSG(session, ENOTSUP,
diff --git a/src/support/hazard.c b/src/support/hazard.c
index df362278d25..78dc1b5a1d8 100644
--- a/src/support/hazard.c
+++ b/src/support/hazard.c
@@ -24,6 +24,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
{
WT_BTREE *btree;
WT_HAZARD *hp;
+ int restarts = 0;
btree = session->btree;
*busyp = 0;
@@ -45,12 +46,23 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
* reference before it discards the page (the eviction server sets the
* state to WT_REF_LOCKED, then flushes memory and checks the hazard
* references).
+ *
+ * For sessions with many active hazard references, skip most of the
+ * active slots: there may be a free slot in there, but checking is
+ * expensive. Most hazard references are released quickly: optimize
+ * for that case.
*/
- for (hp = session->hazard; ; ++hp) {
+ for (hp = session->hazard + session->nhazard;; ++hp) {
/* Expand the number of hazard references if available.*/
if (hp >= session->hazard + session->hazard_size) {
if (session->hazard_size >= S2C(session)->hazard_max)
break;
+ /*
+ * Restart the search from the start of the array, at most
+ * once and only if fewer hazard references are active than
+ * there are slots: the loop begins at session->hazard +
+ * session->nhazard, so earlier slots were not examined.
+ *
+ * NOTE(review): "continue" runs the for loop's ++hp, so the
+ * search resumes at slot 1 and slot 0 is not re-checked.
+ * This test also sits after the hazard_max "break" above,
+ * so no restart happens once the array has reached its
+ * maximum size -- confirm both are intended.
+ */
+ if (session->nhazard < session->hazard_size &&
+ restarts++ == 0) {
+ hp = session->hazard;
+ continue;
+ }
WT_PUBLISH(session->hazard_size,
WT_MIN(session->hazard_size + WT_HAZARD_INCR,
S2C(session)->hazard_max));
@@ -134,9 +146,13 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
WT_ASSERT(session, page != NULL);
- /* Clear the caller's hazard pointer. */
- for (hp = session->hazard;
- hp < session->hazard + session->hazard_size; ++hp)
+ /*
+ * Clear the caller's hazard pointer.
+ * The common pattern is LIFO, so do a reverse search.
+ */
+ for (hp = session->hazard + session->hazard_size - 1;
+ hp >= session->hazard;
+ --hp)
if (hp->page == page) {
/*
* We don't publish the hazard reference clear in the
diff --git a/src/support/stat.c b/src/support/stat.c
index c8f67d0ac12..23065e706a5 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -26,6 +26,7 @@ __wt_stat_alloc_btree_stats(WT_SESSION_IMPL *session, WT_BTREE_STATS **statsp)
stats->file_col_int_pages.desc = "column-store internal pages";
stats->file_col_var_pages.desc =
"column-store variable-size leaf pages";
+ stats->file_compact_rewrite.desc = "pages rewritten by compaction";
stats->file_entries.desc = "total entries";
stats->file_fixed_len.desc = "fixed-record size";
stats->file_magic.desc = "magic number";
@@ -88,6 +89,7 @@ __wt_stat_clear_btree_stats(WT_STATS *stats_arg)
stats->file_col_fix_pages.v = 0;
stats->file_col_int_pages.v = 0;
stats->file_col_var_pages.v = 0;
+ stats->file_compact_rewrite.v = 0;
stats->file_entries.v = 0;
stats->file_fixed_len.v = 0;
stats->file_magic.v = 0;
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 44998fbbf9a..34eaf68ffd4 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -158,7 +158,7 @@ __ckpt_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len)
*/
if (len < strlen(WT_CHECKPOINT))
return (0);
- if (strncmp(name, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0)
+ if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
return (0);
WT_RET_MSG(session, EINVAL,
@@ -183,8 +183,7 @@ __drop(WT_CKPT *ckptbase, const char *name, size_t len)
*/
if (strncmp(WT_CHECKPOINT, name, len) == 0) {
WT_CKPT_FOREACH(ckptbase, ckpt)
- if (strncmp(ckpt->name,
- WT_CHECKPOINT, strlen(WT_CHECKPOINT)) == 0)
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
F_SET(ckpt, WT_CKPT_DELETE);
} else
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -273,7 +272,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_ISOLATION saved_isolation;
const char *name;
- int deleted, is_checkpoint, track_ckpt;
+ int deleted, force, is_checkpoint, track_ckpt;
char *name_alloc;
conn = S2C(session);
@@ -392,7 +391,15 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* to open the checkpoint in a cursor after taking any checkpoint, which
* means it must exist.
*/
+ force = 0;
if (!btree->modified) {
+ ret = __wt_config_gets(session, cfg, "force", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0 && cval.val != 0)
+ force = 1;
+ }
+ if (!btree->modified && !force) {
if (!is_checkpoint)
goto skip;
@@ -400,9 +407,18 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_CKPT_FOREACH(ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_DELETE))
++deleted;
+ /*
+ * Complicated test: if we only deleted a single checkpoint, and
+ * it was the last checkpoint in the object, and it has the same
+ * name as the checkpoint we're taking (correcting for internal
+ * checkpoint names with their generational suffix numbers), we
+ * can skip the checkpoint, there's nothing to do.
+ */
if (deleted == 1 &&
F_ISSET(ckpt - 1, WT_CKPT_DELETE) &&
- strcmp(name, (ckpt - 1)->name) == 0)
+ (strcmp(name, (ckpt - 1)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))))
goto skip;
}
@@ -433,9 +449,8 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* delete flag, and otherwise fail.
*/
if (conn->ckpt_backup) {
- if (strncmp(ckpt->name,
- WT_CHECKPOINT,
- strlen(WT_CHECKPOINT)) == 0) {
+ if (WT_PREFIX_MATCH(
+ ckpt->name, WT_CHECKPOINT)) {
F_CLR(ckpt, WT_CKPT_DELETE);
continue;
}
@@ -454,8 +469,8 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
__wt_session_lock_checkpoint(session, ckpt->name);
if (ret == 0)
continue;
- if (ret == EBUSY && strncmp(ckpt->name,
- WT_CHECKPOINT, strlen(WT_CHECKPOINT)) == 0) {
+ if (ret == EBUSY &&
+ WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
F_CLR(ckpt, WT_CKPT_DELETE);
continue;
}
diff --git a/src/utilities/util.h b/src/utilities/util.h
index a736fdfcf85..b2dd594c757 100644
--- a/src/utilities/util.h
+++ b/src/utilities/util.h
@@ -42,11 +42,11 @@ extern char *util_optarg; /* argument associated with option */
int util_backup(WT_SESSION *, int, char *[]);
int util_cerr(const char *, const char *, int);
+int util_compact(WT_SESSION *, int, char *[]);
void util_copyright(void);
int util_create(WT_SESSION *, int, char *[]);
int util_drop(WT_SESSION *, int, char *[]);
int util_dump(WT_SESSION *, int, char *[]);
-int util_dumpfile(WT_SESSION *, int, char *[]);
int util_err(int, const char *, ...);
int util_flush(WT_SESSION *, const char *);
int util_getopt(int, char * const *, const char *);
diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c
new file mode 100644
index 00000000000..11ee6ddaf14
--- /dev/null
+++ b/src/utilities/util_compact.c
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2008-2012 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+/*
+ * util_compact --
+ * The "compact" command: call the session's compact method on a
+ * single object named on the command line. Returns 0 on success
+ * and 1 on a usage error, when util_name returns NULL, or when
+ * the compact call fails.
+ */
+int
+util_compact(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri;
+
+ uri = NULL;
+ while ((ch = util_getopt(argc, argv, "")) != EOF) /* empty option string */
+ switch (ch) { /* any option character returned leads to usage() */
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= util_optind;
+ argv += util_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table",
+ UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK |
+ UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
+ return (1);
+
+ if ((ret = session->compact(session, uri, NULL)) != 0) {
+ fprintf(stderr, "%s: compact(%s): %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) { /* entered only through the err label: map any failure to 1 */
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+/*
+ * usage --
+ * Print the compact command's usage message to stderr and return 1.
+ */
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "compact uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/utilities/util_dumpfile.c b/src/utilities/util_dumpfile.c
deleted file mode 100644
index 2edbbf88322..00000000000
--- a/src/utilities/util_dumpfile.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*-
- * Copyright (c) 2008-2012 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "util.h"
-
-static int usage(void);
-
-int
-util_dumpfile(WT_SESSION *session, int argc, char *argv[])
-{
- WT_DECL_RET;
- int ch;
- char *name;
-
- name = NULL;
- while ((ch = util_getopt(argc, argv, "f:")) != EOF)
- switch (ch) {
- case 'f': /* output file */
- if (freopen(util_optarg, "w", stdout) == NULL) {
- fprintf(stderr, "%s: %s: %s\n",
- progname, util_optarg, strerror(errno));
- return (1);
- }
- break;
- case '?':
- default:
- return (usage());
- }
- argc -= util_optind;
- argv += util_optind;
-
- /* The remaining argument is the file name. */
- if (argc != 1)
- return (usage());
- if ((name = util_name(*argv, "file", UTIL_FILE_OK)) == NULL)
- return (1);
-
- if ((ret = session->dumpfile(session, name, NULL)) != 0) {
- fprintf(stderr, "%s: dumpfile(%s): %s\n",
- progname, name, wiredtiger_strerror(ret));
- goto err;
- }
- if (verbose)
- printf("\n");
-
- if (0) {
-err: ret = 1;
- }
-
- if (name != NULL)
- free(name);
-
- return (ret);
-}
-
-static int
-usage(void)
-{
- (void)fprintf(stderr,
- "usage: %s %s "
- "dumpfile [-f output-file] file\n",
- progname, usage_prefix);
- return (1);
-}
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index d4be5a45500..6b8b91fa2db 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -111,6 +111,8 @@ main(int argc, char *argv[])
case 'c':
if (strcmp(command, "create") == 0)
ret = util_create(session, argc, argv);
+ else if (strcmp(command, "compact") == 0)
+ ret = util_compact(session, argc, argv);
else
ret = usage();
break;
@@ -119,8 +121,6 @@ main(int argc, char *argv[])
ret = util_drop(session, argc, argv);
else if (strcmp(command, "dump") == 0)
ret = util_dump(session, argc, argv);
- else if (strcmp(command, "dumpfile") == 0)
- ret = util_dumpfile(session, argc, argv);
else
ret = usage();
break;
@@ -193,28 +193,28 @@ usage(void)
WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
fprintf(stderr,
"global options:\n"
- "\t-C\twiredtiger_open configuration\n"
- "\t-h\tdatabase directory\n"
- "\t-V\tdisplay library version and exit\n"
- "\t-v\tverbose\n");
+ "\t" "-C\twiredtiger_open configuration\n"
+ "\t" "-h\tdatabase directory\n"
+ "\t" "-V\tdisplay library version and exit\n"
+ "\t" "-v\tverbose\n");
fprintf(stderr,
"commands:\n"
- "\tbackup database backup\n"
- "\tcopyright copyright information\n"
- "\tcreate\t create an object\n"
- "\tdrop\t drop an object\n"
- "\tdump\t dump an object\n"
- "\tdumpfile dump a physical file in debugging format\n"
- "\tlist\t list database objects\n"
- "\tload\t load an object\n"
- "\tprintlog display the database log\n"
- "\tread\t read values from an object\n"
- "\trename\t rename an object\n"
- "\tsalvage\t salvage a file\n"
- "\tstat\t display statistics for an object\n"
- "\tupgrade\t upgrade an object\n"
- "\tverify\t verify an object\n"
- "\twrite\t write values to an object\n");
+ "\t" "backup\t database backup\n"
+ "\t" "compact\t compact an object\n"
+ "\t" "copyright copyright information\n"
+ "\t" "create\t create an object\n"
+ "\t" "drop\t drop an object\n"
+ "\t" "dump\t dump an object\n"
+ "\t" "list\t list database objects\n"
+ "\t" "load\t load an object\n"
+ "\t" "printlog display the database log\n"
+ "\t" "read\t read values from an object\n"
+ "\t" "rename\t rename an object\n"
+ "\t" "salvage\t salvage a file\n"
+ "\t" "stat\t display statistics for an object\n"
+ "\t" "upgrade\t upgrade an object\n"
+ "\t" "verify\t verify an object\n"
+ "\t" "write\t write values to an object\n");
return (EXIT_FAILURE);
}
@@ -231,43 +231,33 @@ util_name(const char *s, const char *type, u_int flags)
char *name;
copy = 0;
- if (WT_PREFIX_MATCH(s, "colgroup:")) {
- if (!(flags & UTIL_COLGROUP_OK)) {
- fprintf(stderr,
- "%s: %s: \"colgroup\" type not supported\n",
- progname, command);
- return (NULL);
- }
+ if (WT_PREFIX_MATCH(s, "backup:")) {
+ goto type_err;
+ } else if (WT_PREFIX_MATCH(s, "colgroup:")) {
+ if (!(flags & UTIL_COLGROUP_OK))
+ goto type_err;
copy = 1;
+ } else if (WT_PREFIX_MATCH(s, "config:")) {
+ goto type_err;
} else if (WT_PREFIX_MATCH(s, "file:")) {
- if (!(flags & UTIL_FILE_OK)) {
- fprintf(stderr,
- "%s: %s: \"file\" type not supported\n",
- progname, command);
- return (NULL);
- }
+ if (!(flags & UTIL_FILE_OK))
+ goto type_err;
copy = 1;
} else if (WT_PREFIX_MATCH(s, "index:")) {
- if (!(flags & UTIL_INDEX_OK)) {
- fprintf(stderr,
- "%s: %s: \"index\" type not supported\n",
- progname, command);
- return (NULL);
- }
+ if (!(flags & UTIL_INDEX_OK))
+ goto type_err;
copy = 1;
} else if (WT_PREFIX_MATCH(s, "lsm:")) {
- if (!(flags & UTIL_LSM_OK)) {
- fprintf(stderr,
- "%s: %s: \"lsm\" type not supported\n",
- progname, command);
- return (NULL);
- }
+ if (!(flags & UTIL_LSM_OK))
+ goto type_err;
copy = 1;
+ } else if (WT_PREFIX_MATCH(s, "statistics:")) {
+ goto type_err;
} else if (WT_PREFIX_MATCH(s, "table:")) {
if (!(flags & UTIL_TABLE_OK)) {
- fprintf(stderr,
- "%s: %s: \"table\" type not supported\n",
- progname, command);
+type_err: fprintf(stderr,
+ "%s: %s: unsupported object type: %s\n",
+ progname, command, s);
return (NULL);
}
copy = 1;
diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c
index 55a90f96b55..c2ddb2998ad 100644
--- a/src/utilities/util_rename.c
+++ b/src/utilities/util_rename.c
@@ -39,8 +39,6 @@ util_rename(WT_SESSION *session, int argc, char *argv[])
progname, uri, newname, wiredtiger_strerror(ret));
goto err;
}
- if (verbose)
- printf("\n");
if (0) {
err: ret = 1;
diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c
index 8821e673df1..82c247fe0ac 100644
--- a/src/utilities/util_salvage.c
+++ b/src/utilities/util_salvage.c
@@ -42,6 +42,8 @@ util_salvage(WT_SESSION *session, int argc, char *argv[])
progname, name, wiredtiger_strerror(ret));
goto err;
}
+
+ /* Verbose configures a progress counter, move to the next line. */
if (verbose)
printf("\n");
@@ -60,7 +62,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "salvage [-F] file\n",
+ "salvage [-F] uri\n",
progname, usage_prefix);
return (1);
}
diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c
index 7c4f2035fa2..89fca16e548 100644
--- a/src/utilities/util_upgrade.c
+++ b/src/utilities/util_upgrade.c
@@ -38,6 +38,8 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[])
progname, name, wiredtiger_strerror(ret));
goto err;
}
+
+ /* Verbose configures a progress counter, move to the next line. */
if (verbose)
printf("\n");
@@ -56,7 +58,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "upgrade file\n",
+ "upgrade uri\n",
progname, usage_prefix);
return (1);
}
diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c
index fda2d413346..3ae94594737 100644
--- a/src/utilities/util_verify.c
+++ b/src/utilities/util_verify.c
@@ -13,12 +13,23 @@ int
util_verify(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
- int ch;
- char *name;
+ int ch, dump_address, dump_blocks, dump_pages;
+ char *name, config[128];
name = NULL;
- while ((ch = util_getopt(argc, argv, "")) != EOF)
+ dump_address = dump_blocks = dump_pages = 0;
+ while ((ch = util_getopt(argc, argv, "d:")) != EOF)
switch (ch) {
+ case 'd':
+ if (strcmp(util_optarg, "dump_address") == 0)
+ dump_address = 1;
+ else if (strcmp(util_optarg, "dump_blocks") == 0)
+ dump_blocks = 1;
+ else if (strcmp(util_optarg, "dump_pages") == 0)
+ dump_pages = 1;
+ else
+ return (usage());
+ break;
case '?':
default:
return (usage());
@@ -33,11 +44,22 @@ util_verify(WT_SESSION *session, int argc, char *argv[])
"table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL)
return (1);
- if ((ret = session->verify(session, name, NULL)) != 0) {
+ /* Build the configuration string as necessary. */
+ config[0] = '\0';
+ if (dump_address)
+ (void)strcat(config, "dump_address,");
+ if (dump_blocks)
+ (void)strcat(config, "dump_blocks,");
+ if (dump_pages)
+ (void)strcat(config, "dump_pages,");
+
+ if ((ret = session->verify(session, name, config)) != 0) {
fprintf(stderr, "%s: verify(%s): %s\n",
progname, name, wiredtiger_strerror(ret));
goto err;
}
+
+ /* Verbose configures a progress counter, move to the next line. */
if (verbose)
printf("\n");
@@ -56,7 +78,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "verify file\n",
+ "verify [-d dump_address | dump_blocks | dump_pages] uri\n",
progname, usage_prefix);
return (1);
}
diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c
index a19e1613fa2..7c339916fcb 100644
--- a/test/bloom/test_bloom.c
+++ b/test/bloom/test_bloom.c
@@ -106,7 +106,7 @@ int setup(void)
int ret;
char config[512];
- (void)system("rm -f WildTiger WiredTiger.* *.bf");
+ (void)system("rm -f WiredTiger* *.bf");
/*
* This test doesn't test public Wired Tiger functionality, it still
diff --git a/test/fops/t.c b/test/fops/t.c
index 694554741ec..f697f7fe30e 100644
--- a/test/fops/t.c
+++ b/test/fops/t.c
@@ -150,7 +150,7 @@ wt_shutdown(void)
static void
shutdown(void)
{
- (void)system("rm -f WildTiger WiredTiger.* __wt*");
+ (void)system("rm -f WiredTiger* __wt*");
}
static int
diff --git a/test/suite/test_bug004.py b/test/suite/test_bug004.py
index 69bf0247e32..2013859c6b0 100644
--- a/test/suite/test_bug004.py
+++ b/test/suite/test_bug004.py
@@ -25,7 +25,7 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
-# test_bug003.py
+# test_bug004.py
# Regression tests.
import wiredtiger, wttest
diff --git a/test/suite/test_compact.py b/test/suite/test_compact.py
new file mode 100644
index 00000000000..c05ad088be0
--- /dev/null
+++ b/test/suite/test_compact.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+#
+# Public Domain 2008-2012 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest
+from helper import complex_populate, simple_populate, key_populate
+from suite_subprocess import suite_subprocess
+from wtscenario import multiply_scenarios, number_scenarios
+
+# test_compact.py
+# session level compact operation
+class test_compact(wttest.WiredTigerTestCase, suite_subprocess):
+    name = 'test_compact'
+
+    # Use a small page size because we want to create lots of pages.
+    config = 'leaf_page_max=512,value_format=S,key_format=S'
+    nentries = 40000
+
+    types = [
+        ('file', dict(uri='file:')),
+        ('table', dict(uri='table:'))
+    ]
+    compact = [
+        ('method', dict(utility=0,reopen=0)),
+        ('method_reopen', dict(utility=0,reopen=1)),
+        ('utility', dict(utility=1,reopen=0)),
+    ]
+    scenarios = number_scenarios(multiply_scenarios('.', types, compact))
+
+    def test_compact(self):
+        """Populate an object, truncate most of it, compact it, check size."""
+        # Populate an object: simple layout for files, complex for tables.
+        uri = self.uri + self.name
+        if self.uri == "file:":
+            simple_populate(self, uri, self.config, self.nentries)
+        else:
+            complex_populate(self, uri, self.config, self.nentries)
+
+        # Reopen the connection to force the object to disk.
+        self.reopen_conn()
+
+        # Remove most of the object: truncate from key 5 to key nentries - 5.
+        c1 = self.session.open_cursor(uri, None)
+        c1.set_key(key_populate(c1, 5))
+        c2 = self.session.open_cursor(uri, None)
+        c2.set_key(key_populate(c2, self.nentries - 5))
+        self.session.truncate(None, c1, c2, None)
+        c1.close()
+        c2.close()
+
+        # Compact it, using either the session method or the utility.
+        if self.utility == 1:
+            self.session.checkpoint(None)
+            self.close_conn()
+            self.runWt(["compact", uri])
+        else:
+            # Optionally reopen the connection so we do more on-disk tests.
+            if self.reopen == 1:
+                self.session.checkpoint(None)
+                self.reopen_conn()
+
+            self.session.compact(uri, None)
+
+        # If it's a simple object, confirm the file shrank below 10KB.
+        if self.uri == "file:":
+            self.assertLess(os.stat(self.name).st_size, 10 * 1024)
+
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_util10.py b/test/suite/test_util10.py
deleted file mode 100644
index 4e7641efe77..00000000000
--- a/test/suite/test_util10.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python
-#
-# Public Domain 2008-2012 WiredTiger, Inc.
-#
-# This is free and unencumbered software released into the public domain.
-#
-# Anyone is free to copy, modify, publish, use, compile, sell, or
-# distribute this software, either in source code form or as a compiled
-# binary, for any purpose, commercial or non-commercial, and by any
-# means.
-#
-# In jurisdictions that recognize copyright laws, the author or authors
-# of this software dedicate any and all copyright interest in the
-# software to the public domain. We make this dedication for the benefit
-# of the public at large and to the detriment of our heirs and
-# successors. We intend this dedication to be an overt act of
-# relinquishment in perpetuity of all present and future rights to this
-# software under copyright law.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-
-import os, struct
-from suite_subprocess import suite_subprocess
-import wiredtiger, wttest
-
-# test_util10.py
-# Utilities: wt dumpfile
-class test_util10(wttest.WiredTigerTestCase, suite_subprocess):
- tablename = 'test_util10.a'
- nentries = 1000
- session_params = 'key_format=S,value_format=S'
-
- def populate(self, tablename):
- """
- Insert some simple entries into the table
- """
- cursor = self.session.open_cursor('table:' + tablename, None, None)
- for i in range(0, self.nentries):
- key = 'KEY' + str(i)
- val = 'VAL' + str(i)
- cursor.set_key(key)
- cursor.set_value(val)
- cursor.insert()
- cursor.set_key('SOMEKEY')
- cursor.set_value('SOMEVALUE')
- cursor.insert()
- cursor.set_key('ANOTHERKEY')
- cursor.set_value('ANOTHERVALUE')
- cursor.insert()
- cursor.close()
-
- def test_dumpfile_empty(self):
- """
- Test read in a 'wt' process, using an empty table
- """
- self.session.create('table:' + self.tablename, self.session_params)
- outfile = "dumpfileout.txt"
- self.runWt(["dumpfile", self.tablename + ".wt"], outfilename=outfile)
- self.check_empty_file(outfile)
-
- def test_dumpfile_populated(self):
- """
- Test read in a 'wt' process, using an empty table
- """
- self.session.create('table:' + self.tablename, self.session_params)
- self.populate(self.tablename)
- outfile = "dumpfileout.txt"
- self.runWt(["dumpfile", self.tablename + ".wt"], outfilename=outfile)
-
- # Expected output is roughly K/V pairs in this format:
- # K {xxxxxx#00}
- # V {xxxxxx#00}
- # except that by default keys use prefix compression.
- # 'KEY340' would not be found in the output, but rather K {0#00}
- # because it appears immediately after 'KEY34' so uses the five
- # bytes of that key. We've chosen keys to find that will not be
- # compressed.
- self.check_file_contains(outfile, 'V {VAL22#00}')
- self.check_file_contains(outfile, 'K {KEY0#00}')
- self.check_file_contains(outfile, 'K {SOMEKEY#00}')
- self.check_file_contains(outfile, 'V {SOMEVALUE#00}')
- self.check_file_contains(outfile, 'K {SOMEKEY#00}')
- self.check_file_contains(outfile, 'V {ANOTHERVALUE#00}')
-
-
-if __name__ == '__main__':
- wttest.run()
diff --git a/test/thread/t.c b/test/thread/t.c
index 0f40e0424e4..9858ed08719 100644
--- a/test/thread/t.c
+++ b/test/thread/t.c
@@ -177,7 +177,7 @@ wt_shutdown(void)
static void
shutdown(void)
{
- (void)system("rm -f WildTiger WiredTiger.* __wt*");
+ /* Remove leftover files; "WiredTiger*" also matches a file named exactly "WiredTiger". */
+ (void)system("rm -f WiredTiger* __wt*");
}
static int