diff options
61 files changed, 1199 insertions, 590 deletions
diff --git a/build_posix/Make.base b/build_posix/Make.base index dc52d1ac247..2a7985f57b0 100644 --- a/build_posix/Make.base +++ b/build_posix/Make.base @@ -10,10 +10,10 @@ bin_PROGRAMS = wt wt_SOURCES =\ src/utilities/util_backup.c \ src/utilities/util_cpyright.c \ + src/utilities/util_compact.c \ src/utilities/util_create.c \ src/utilities/util_drop.c \ src/utilities/util_dump.c \ - src/utilities/util_dumpfile.c \ src/utilities/util_getopt.c \ src/utilities/util_list.c \ src/utilities/util_load.c \ diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index 77d90e00241..caf2287988a 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -70,7 +70,7 @@ AC_PROG_INSTALL AC_CHECK_LIB(pthread, pthread_create) AC_CHECK_LIB(dl, dlopen) AC_CHECK_LIB(rt, sched_yield) -AC_CHECK_FUNCS([clock_gettime gettimeofday fcntl posix_memalign]) +AC_CHECK_FUNCS([clock_gettime gettimeofday fcntl posix_fadvise posix_memalign]) AC_SYS_LARGEFILE AC_C_BIGENDIAN diff --git a/dist/api_data.py b/dist/api_data.py index 3f89dac26a1..c0e6b21ec5c 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -277,6 +277,8 @@ methods = { 'session.close' : Method([]), +'session.compact' : Method([]), + 'session.create' : Method(table_meta + file_config + source_meta + [ Config('exclusive', 'false', r''' fail if the object exists. 
When false (the default), if the @@ -291,7 +293,6 @@ methods = { type='boolean'), ]), -'session.dumpfile' : Method([]), 'session.log_printf' : Method([]), 'session.open_cursor' : Method([ @@ -368,7 +369,20 @@ methods = { ]), 'session.truncate' : Method([]), 'session.upgrade' : Method([]), -'session.verify' : Method([]), +'session.verify' : Method([ + Config('dump_address', 'false', r''' + Display addresses and page types as pages are verified, using + the application's message handler, intended for debugging''', + type='boolean'), + Config('dump_blocks', 'false', r''' + Display the contents of on-disk blocks as they are verified, using + the application's message handler, intended for debugging''', + type='boolean'), + Config('dump_pages', 'false', r''' + Display the contents of in-memory pages as they are verified, using + the application's message handler, intended for debugging''', + type='boolean') +]), 'session.begin_transaction' : Method([ Config('isolation', '', r''' @@ -400,6 +414,10 @@ methods = { including the named checkpoint. 
Checkpoints cannot be dropped while a hot backup is in progress or if open in a cursor''', type='list'), + Config('force', 'false', r''' + checkpoints may be skipped if the underlying object has not + been modified, this option forces the checkpoint''', + type='boolean'), Config('name', '', r''' if non-empty, specify a name for the checkpoint'''), Config('target', '', r''' diff --git a/dist/filelist b/dist/filelist index 9674aaedcf5..bbee6912f01 100644 --- a/dist/filelist +++ b/dist/filelist @@ -6,6 +6,7 @@ src/api/api_version.c src/block/block_addr.c src/block/block_ckpt.c src/block/block_cksum.c +src/block/block_compact.c src/block/block_ext.c src/block/block_mgr.c src/block/block_open.c @@ -16,6 +17,7 @@ src/block/block_write.c src/bloom/bloom.c src/btree/bt_bulk.c src/btree/bt_cache.c +src/btree/bt_compact.c src/btree/bt_curnext.c src/btree/bt_curprev.c src/btree/bt_cursor.c diff --git a/dist/s_string.ok b/dist/s_string.ok index b3fc3df2b65..9abf8bf4d2b 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -201,7 +201,6 @@ Vv VxWorks WIREDTIGER WeakHashLen -WildTiger WinNT WiredTiger WiredTiger's diff --git a/dist/stat_data.py b/dist/stat_data.py index 1fcdde0e930..f7663bd3c42 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -64,6 +64,7 @@ btree_stats = [ Stat('file_col_fix_pages', 'column-store fixed-size leaf pages'), Stat('file_col_int_pages', 'column-store internal pages'), Stat('file_col_var_pages', 'column-store variable-size leaf pages'), + Stat('file_compact_rewrite', 'pages rewritten by compaction'), Stat('file_entries', 'total entries'), Stat('file_fixed_len', 'fixed-record size'), Stat('file_magic', 'magic number'), diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 09a95d3aaee..99b2392a86e 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -400,14 +400,14 @@ session_ops(WT_SESSION *session) "table:mytable", "key_format=r,value_format=S,cache_resident=true"); /*! [Create a cache-resident object] */ + /*! 
[Compact a table] */ + ret = session->compact(session, "table:mytable", NULL); + /*! [Compact a table] */ + /*! [Drop a table] */ ret = session->drop(session, "table:mytable", NULL); /*! [Drop a table] */ - /*! [Dump a file] */ - ret = session->dumpfile(session, "file:myfile", NULL); - /*! [Dump a file] */ - /*! [Print to the message stream] */ ret = session->msg_printf( session, "process ID %" PRIuMAX, (uintmax_t)getpid()); @@ -895,6 +895,8 @@ main(void) const char *home = "WT_TEST"; ret = wiredtiger_open(home, NULL, "create,transactional", &conn); /*! [Open a connection] */ + + (void)conn->close(conn, NULL); } /*! [Get the WiredTiger library version #1] */ diff --git a/examples/c/ex_test_perf.c b/examples/c/ex_test_perf.c index e5d9c0b1541..74de4ef39b3 100644 --- a/examples/c/ex_test_perf.c +++ b/examples/c/ex_test_perf.c @@ -192,7 +192,9 @@ read_thread(void *arg) cursor->set_key(cursor, key_buf); cursor->search(cursor); } + session->close(session, NULL); + free(key_buf); return (arg); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index fc9aa37a31e..a581945aab0 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -231,6 +231,38 @@ __wt_block_checkpoint(WT_SESSION_IMPL *session, } /* + * __ckpt_extlist_read -- + * Read a checkpoint's extent lists. + */ +static int +__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt) +{ + WT_BLOCK_CKPT *ci; + + /* + * Allocate a checkpoint structure, crack the cookie and read the + * checkpoint's extent lists. + * + * Ignore the avail list: checkpoint avail lists are only useful if we + * are rolling forward from the particular checkpoint and they represent + * our best understanding of what blocks can be allocated. If we are + * not operating on the live checkpoint, subsequent checkpoints might + * have allocated those blocks, and the avail list is useless. We don't + * discard it, because it is useful as part of verification, but we + * don't re-write it either. 
+ */ + WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv)); + + ci = ckpt->bpriv; + WT_RET(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0)); + WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci)); + WT_RET(__wt_block_extlist_read(session, block, &ci->alloc)); + WT_RET(__wt_block_extlist_read(session, block, &ci->discard)); + + return (0); +} + +/* * __ckpt_extlist_fblocks -- * If a checkpoint's extent list is going away, free its blocks. */ @@ -261,7 +293,7 @@ __ckpt_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; - WT_CKPT *ckpt, *last_ckpt, *next_ckpt; + WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; @@ -297,55 +329,36 @@ __ckpt_process( session, &ci->ckpt_avail, "live", "ckpt_avail")); /* - * To delete a checkpoint, we'll need extent list for it, and we have to - * read that from the disk. + * To delete a checkpoint, we'll need checkpoint information for it and + * the subsequent checkpoint into which it gets rolled; read them from + * disk before we lock things down. */ - last_ckpt = NULL; deleting = 0; WT_CKPT_FOREACH(ckptbase, ckpt) { - if (F_ISSET(ckpt, WT_CKPT_FAKE)) + if (F_ISSET(ckpt, WT_CKPT_FAKE) || + !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; + deleting = 1; /* - * To delete a checkpoint, we'll need checkpoint information for - * it and the subsequent checkpoint. The test is tricky, load - * the current checkpoint's information if it's marked for - * deletion or if it follows a checkpoint marked for deletion, - * where the boundary cases are the first checkpoint in the list - * and the last checkpoint in the list: if we're deleting the - * last checkpoint in the list, there's no next checkpoint, the - * checkpoint will be merged into the live tree. 
+ * Read the checkpoint and next checkpoint extent lists if we + * haven't already read them (we may have already read these + * extent blocks if there is more than one deleted checkpoint). */ - if (!F_ISSET(ckpt, WT_CKPT_DELETE) && - (F_ISSET(ckpt, WT_CKPT_ADD) || - last_ckpt == NULL || !F_ISSET(last_ckpt, WT_CKPT_DELETE))) { - last_ckpt = ckpt; - continue; - } - last_ckpt = ckpt; - deleting = 1; + if (ckpt->bpriv == NULL) + WT_ERR(__ckpt_extlist_read(session, block, ckpt)); + + for (next_ckpt = ckpt + 1;; ++next_ckpt) + if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) + break; /* - * Allocate a checkpoint structure, crack the cookie and read - * the checkpoint's extent lists. - * - * Ignore the avail list: checkpoint avail lists are only useful - * if we are rolling forward from the particular checkpoint and - * they represent our best understanding of what blocks can be - * allocated. If we are not operating on the live checkpoint, - * subsequent checkpoints might have allocated those blocks, and - * the avail list is useless. We don't discard it, because it - * is useful as part of verification, but we don't re-write it - * either. + * The "next" checkpoint may be the live tree which has no + * extent blocks to read. */ - WT_ERR(__wt_calloc( - session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv)); - ci = ckpt->bpriv; - WT_ERR(__wt_block_ckpt_init(session, block, ci, ckpt->name, 0)); - WT_ERR(__wt_block_buffer_to_ckpt( - session, block, ckpt->raw.data, ci)); - WT_ERR(__wt_block_extlist_read(session, block, &ci->alloc)); - WT_ERR(__wt_block_extlist_read(session, block, &ci->discard)); + if (next_ckpt->bpriv == NULL && + !F_ISSET(next_ckpt, WT_CKPT_ADD)) + WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* @@ -369,7 +382,8 @@ __ckpt_process( * when writing the live extent lists. 
*/ WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + if (F_ISSET(ckpt, WT_CKPT_FAKE) || + !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_VERBOSE_ISSET(session, ckpt)) { diff --git a/src/block/block_compact.c b/src/block/block_compact.c new file mode 100644 index 00000000000..7ddc03b4b6a --- /dev/null +++ b/src/block/block_compact.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_block_compact_skip -- + * Return if compaction will shrink the file. + */ +int +__wt_block_compact_skip( + WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp) +{ + WT_EXT *ext; + WT_EXTLIST *el; + WT_FH *fh; + off_t avail, half; + int pct; + + fh = block->fh; + *skipp = 1; + + /* + * We do compaction by copying blocks from the end of the file to the + * beginning of the file, and we need some metrics to decide if it's + * worth doing. Ignore small files, and files where less than 30% of + * the file appears in the available list, and in the first half of + * the file. In other words, don't bother with compaction unless we + * have a reasonable expectation of moving 30% of the file from the + * last half of the file to the first half of the file. + */ +#define WT_COMPACT_TRIGGER 30 + if (fh->file_size <= 10 * 1024) + return (0); + + __wt_spin_lock(session, &block->live_lock); + + avail = 0; + half = fh->file_size / 2; + + el = &block->live.avail; + WT_EXT_FOREACH(ext, el->off) + if (ext->off < half) + avail += ext->size; + pct = (int)((avail * 100) / fh->file_size); + + __wt_spin_unlock(session, &block->live_lock); + + if (pct >= WT_COMPACT_TRIGGER) + *skipp = 0; + + WT_VERBOSE_RET(session, block, + "compaction %s" "useful, %d%% of the free space in the available " + "list appears in the first half of the file", + pct < WT_COMPACT_TRIGGER ? 
"not " : "", pct); + + return (0); +} + +/* + * __wt_block_compact_page_skip -- + * Return if writing a particular page will shrink the file. + */ +int +__wt_block_compact_page_skip(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size, int *skipp) +{ + WT_FH *fh; + off_t offset; + uint32_t size, cksum; + + WT_UNUSED(addr_size); + *skipp = 0; /* Paranoia: skip on error. */ + + fh = block->fh; + + /* Crack the cookie. */ + WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum)); + + /* + * If this block appears in the last half of the file, rewrite it. + * + * It's unclear we need to lock: the chances of a smashed read are close + * to non-existent and the worst thing that can happen is we rewrite a + * block we didn't want to rewrite. On the other hand, compaction is + * not expected to be a common operation in WiredTiger, we shouldn't be + * here a lot. + */ + __wt_spin_lock(session, &block->live_lock); + *skipp = offset > fh->file_size / 2 ? 0 : 1; + __wt_spin_unlock(session, &block->live_lock); + + return (0); +} diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 01941619942..d7f9ad8cb92 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -194,6 +194,38 @@ __wt_bm_checkpoint_unload(WT_SESSION_IMPL *session) } /* + * __wt_bm_compact_skip -- + * Return if a file can be compacted. + */ +int +__wt_bm_compact_skip(WT_SESSION_IMPL *session, int *skipp) +{ + WT_BLOCK *block; + + if ((block = session->btree->block) == NULL) + return (__bm_invalid(session)); + + return (__wt_block_compact_skip(session, block, skipp)); +} + +/* + * __wt_bm_compact_page_skip -- + * Return if a page is useful for compaction. 
+ */ +int +__wt_bm_compact_page_skip(WT_SESSION_IMPL *session, + const uint8_t *addr, uint32_t addr_size, int *skipp) +{ + WT_BLOCK *block; + + if ((block = session->btree->block) == NULL) + return (__bm_invalid(session)); + + return (__wt_block_compact_page_skip( + session, block, addr, addr_size, skipp)); +} + +/* * __wt_bm_truncate -- * Truncate a file. */ diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index a6fc72cdf49..7d9f687c384 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -46,7 +46,7 @@ __bloom_init(WT_SESSION_IMPL *session, err: if (bloom->uri != NULL) __wt_free(session, bloom->uri); if (bloom->config != NULL) - __wt_free(session, bloom->uri); + __wt_free(session, bloom->config); if (bloom->bitstring != NULL) __wt_free(session, bloom->bitstring); if (bloom != NULL) diff --git a/src/btree/bt_cache.c b/src/btree/bt_cache.c index 5aecf065976..ae952d768d7 100644 --- a/src/btree/bt_cache.c +++ b/src/btree/bt_cache.c @@ -71,14 +71,6 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) __wt_spin_init(session, &cache->evict_lock); /* - * Allocate the forced page eviction request array. We size it to - * allow one eviction page request per session. - */ - cache->max_evict_request = conn->session_size; - WT_ERR(__wt_calloc_def( - session, cache->max_evict_request, &cache->evict_request)); - - /* * We pull some values from the cache statistics (rather than have two * copies). Set them. */ @@ -128,7 +120,5 @@ __wt_cache_destroy(WT_CONNECTION_IMPL *conn) (void)__wt_cond_destroy(session, cache->evict_cond); __wt_spin_destroy(session, &cache->evict_lock); - __wt_free(session, cache->evict_request); - __wt_free(session, conn->cache); } diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c new file mode 100644 index 00000000000..2b57a5a321e --- /dev/null +++ b/src/btree/bt_compact.c @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __wt_compact -- + * Compact a file. + */ +int +__wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DECL_RET; + WT_PAGE *page; + int skip; + + WT_UNUSED(cfg); + + /* Check if compaction might be useful. */ + WT_RET(__wt_bm_compact_skip(session, &skip)); + if (skip) + return (0); + + /* + * Invoke the eviction server to review in-memory pages to see if they + * need to be re-written (we must use the eviction server because it's + * the only thread that can safely look at page reconciliation values). + */ + WT_RET(__wt_sync_file_serial(session, WT_SYNC_COMPACT)); + __wt_evict_server_wake(session); + __wt_cond_wait(session, session->cond, 0); + WT_RET(session->syncop_ret); + + /* + * Walk the tree reviewing all of the on-disk pages to see if they + * need to be re-written. + */ + for (page = NULL;;) { + WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT)); + if (page == NULL) + break; + + /* Mark the page and tree dirty, we want to write this page. */ + if ((ret = __wt_page_modify_init(session, page)) != 0) { + __wt_stack_release(session, page); + WT_RET(ret); + } + __wt_page_and_tree_modify_set(session, page); + + WT_BSTAT_INCR(session, file_compact_rewrite); + } + + return (0); +} + +/* + * __wt_compact_page_skip -- + * Return if the block-manager wants us to re-write this page. + */ +int +__wt_compact_page_skip( + WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp) +{ + uint32_t addr_size; + const uint8_t *addr; + + /* + * There's one compaction test we do before we read the page, to see + * if the block-manager thinks it useful to rewrite the page. If a + * rewrite won't help, we don't want to do I/O for nothing. For that + * reason, this check is done in a call from inside the tree-walking + * routine. + * + * Ignore everything but on-disk pages, the eviction server has already + * done a pass over the in-memory pages. 
+ */ + if (ref->state != WT_REF_DISK) { + *skipp = 1; + return (0); + } + + __wt_get_addr(parent, ref, &addr, &addr_size); + if (addr == NULL) { + *skipp = 1; + return (0); + } + + return (__wt_bm_compact_page_skip(session, addr, addr_size, skipp)); +} + +/* + * __wt_compact_evict -- + * Helper routine for the eviction thread to decide if a file's size would + * benefit from re-writing this page. + */ +int +__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_PAGE_MODIFY *mod; + int skip; + uint32_t addr_size; + const uint8_t *addr; + + mod = page->modify; + + /* + * We're using the eviction thread in compaction because it can safely + * look at page reconciliation information, no pages are being evicted + * if the eviction is busy here. That's not good for performance and + * implies compaction will impact performance, but right now it's the + * only way to safely look at reconciliation information. + * + * The reason we need to look at reconciliation information is that an + * in-memory page's original disk addresses might have been fine for + * compaction, but its replacement addresses might be a problem. + * + * Ignore the root: it may not have a replacement address, and besides, + * if anything else gets written, so will it. + */ + if (WT_PAGE_IS_ROOT(page)) + return (0); + + /* + * If the page is already dirty, skip some work, it will be written in + * any case. + */ + if (__wt_page_is_modified(page)) + return (0); + + /* + * If the page is clean, test the original addresses. + * If the page is a 1-to-1 replacement, test the replacement addresses. + * If the page is a split, ignore it, it will be merged into the parent. 
+ */ + if (mod == NULL) + goto disk; + + switch (F_ISSET(mod, WT_PM_REC_MASK)) { + case 0: +disk: __wt_get_addr(page->parent, page->ref, &addr, &addr_size); + if (addr == NULL) + return (0); + WT_RET( + __wt_bm_compact_page_skip(session, addr, addr_size, &skip)); + if (skip) + return (0); + break; + case WT_PM_REC_EMPTY: + return (0); + case WT_PM_REC_REPLACE: + WT_RET(__wt_bm_compact_page_skip( + session, mod->u.replace.addr, mod->u.replace.size, &skip)); + if (skip) + return (0); + break; + case WT_PM_REC_SPLIT: + case WT_PM_REC_SPLIT_MERGE: + return (0); + } + + /* Mark the page and tree dirty, we want to write this page. */ + WT_RET(__wt_page_modify_init(session, page)); + __wt_page_and_tree_modify_set(session, page); + + WT_BSTAT_INCR(session, file_compact_rewrite); + return (0); +} diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index e5bdf9eacd7..dec1651dd36 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -21,8 +21,9 @@ static int __evict_worker(WT_SESSION_IMPL *); * number of pages from each file's in-memory tree for each page we evict. */ #define WT_EVICT_GROUP 30 /* Consider N pages as LRU candidates */ -#define WT_EVICT_WALK_PER_TABLE 35 /* Pages to visit per file */ +#define WT_EVICT_WALK_PER_FILE 5 /* Pages to visit per file */ #define WT_EVICT_WALK_BASE 50 /* Pages tracked across file visits */ +#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ /* * __evict_list_clr -- @@ -156,30 +157,22 @@ __wt_cache_evict_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - int read_lockout; session = arg; conn = S2C(session); cache = conn->cache; while (F_ISSET(conn, WT_SERVER_RUN)) { - /* - * Use the same logic as application threads to decide whether - * there is work to do. - */ - __wt_eviction_check(session, &read_lockout, 0); - - if (!read_lockout) { - WT_VERBOSE_ERR(session, evictserver, "sleeping"); - __wt_cond_wait(session, cache->evict_cond); - } + /* Evict pages from the cache as needed. 
*/ + WT_ERR(__evict_worker(session)); if (!F_ISSET(conn, WT_SERVER_RUN)) break; - WT_VERBOSE_ERR(session, evictserver, "waking"); - /* Evict pages from the cache as needed. */ - WT_ERR(__evict_worker(session)); + WT_VERBOSE_ERR(session, evictserver, "sleeping"); + /* Don't rely on signals: check periodically. */ + __wt_cond_wait(session, cache->evict_cond, 100000); + WT_VERBOSE_ERR(session, evictserver, "waking"); } WT_VERBOSE_ERR(session, evictserver, "exiting"); @@ -213,7 +206,7 @@ __evict_worker(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - uint64_t bytes_start, bytes_inuse, bytes_max; + uint64_t bytes_inuse, bytes_max; int loop; conn = S2C(session); @@ -237,7 +230,7 @@ __evict_worker(WT_SESSION_IMPL *session) */ bytes_inuse = __wt_cache_bytes_inuse(cache); bytes_max = conn->cache_size; - if (bytes_inuse < cache->eviction_target * (bytes_max / 100)) + if (bytes_inuse < (cache->eviction_target * bytes_max) / 100) break; WT_RET(__evict_lru(session)); @@ -247,9 +240,7 @@ __evict_worker(WT_SESSION_IMPL *session) * any progress at all, go back to sleep, it's not something * we can fix. */ - bytes_start = bytes_inuse; - bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_start == bytes_inuse) { + if (__wt_cache_bytes_inuse(cache) >= bytes_inuse) { if (loop == 10) { WT_STAT_INCR(conn->stats, cache_evict_slow); WT_VERBOSE_RET(session, evictserver, @@ -392,9 +383,9 @@ __evict_file_request_walk(WT_SESSION_IMPL *session) __evict_list_clr_all(session, 0); /* - * Wait for LRU eviction activity to drain. It is much easier - * to reason about sync or forced eviction if we know there are - * no other threads evicting in the tree. + * Wait for LRU eviction activity to drain. It is much easier to + * reason about checkpoints if we know there are no other threads + * evicting in the tree. 
*/ while (request_session->btree->lru_count > 0) { __wt_spin_unlock(session, &cache->evict_lock); @@ -437,27 +428,21 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop) break; WT_ERR(__wt_tree_walk(session, &next_page, WT_TREE_EVICT)); - /* Write dirty pages for sync, and sync with discard. */ switch (syncop) { + case WT_SYNC_COMPACT: + WT_ERR(__wt_compact_evict(session, page)); + break; case WT_SYNC: case WT_SYNC_DISCARD: + /* Write dirty pages for sync and sync with discard. */ if (__wt_page_is_modified(page)) WT_ERR(__wt_rec_write( session, page, NULL, WT_REC_SINGLE)); - break; - case WT_SYNC_DISCARD_NOWRITE: - break; - } + if (syncop == WT_SYNC) + break; - /* - * Evict the page for sync with discard, simply discard the page - * for discard alone. - */ - switch (syncop) { - case WT_SYNC: - break; - case WT_SYNC_DISCARD: /* + * Evict the page for sync with discard. * Do not attempt to evict pages expected to be merged * into their parents, with the single exception that * the root page can't be merged into anything, it must @@ -471,10 +456,11 @@ __evict_file_request(WT_SESSION_IMPL *session, int syncop) break; case WT_SYNC_DISCARD_NOWRITE: /* - * When we discard the root page, clear the reference - * from the btree handle. It is important to do this - * here, so that future eviction doesn't see root_page - * pointing to freed memory. + * Simply discard the page for discard alone. When we + * discard the root page, clear the reference from the + * btree handle. It is important to do this here, so + * that future eviction doesn't see root_page pointing + * to freed memory. 
*/ if (WT_PAGE_IS_ROOT(page)) session->btree->root_page = NULL; @@ -532,17 +518,18 @@ __evict_walk(WT_SESSION_IMPL *session) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - u_int elem, file_count, i; + u_int elem, file_count, i, retries; conn = S2C(session); cache = S2C(session)->cache; + retries = 0; /* * Resize the array in which we're tracking pages, as necessary, then * get some pages from each underlying file. In practice, a realloc * is rarely needed, so it is worth avoiding the LRU lock. */ - elem = WT_EVICT_WALK_BASE + 2 * WT_EVICT_GROUP; + elem = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR; if (elem > cache->evict_entries) { __wt_spin_lock(session, &cache->evict_lock); /* Save the offset of the eviction point. */ @@ -561,7 +548,7 @@ __evict_walk(WT_SESSION_IMPL *session) * servicing eviction requests. */ i = WT_EVICT_WALK_BASE; - file_count = 0; +retry: file_count = 0; TAILQ_FOREACH(btree, &conn->btqh, q) { if (file_count++ < cache->evict_file_next) continue; @@ -591,6 +578,11 @@ __evict_walk(WT_SESSION_IMPL *session) } cache->evict_file_next = (btree == NULL) ? 0 : file_count; + /* In the extreme case, all of the pages have to come from one file. */ + if (ret == 0 && i < cache->evict_entries && + retries++ < WT_EVICT_WALK_INCR / WT_EVICT_WALK_PER_FILE) + goto retry; + if (0) { err: __wt_spin_unlock(session, &cache->evict_lock); } @@ -614,7 +606,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) btree = session->btree; cache = S2C(session)->cache; start = cache->evict + *slotp; - end = start + WT_EVICT_WALK_PER_TABLE; + end = start + WT_EVICT_WALK_PER_FILE; if (end > cache->evict + cache->evict_entries) end = cache->evict + cache->evict_entries; @@ -729,9 +721,7 @@ __evict_get_page( /* * Lock the page while holding the eviction mutex to prevent * multiple attempts to evict it. For pages that are already - * being evicted, including pages on the request queue for - * forced eviction, this operation will fail and we will move - * on. 
+ * being evicted, this operation will fail and we will move on. */ ref = evict->page->ref; WT_ASSERT(session, evict->page == ref->page); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 1a28f5f9e7b..bb05f966ee4 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -77,7 +77,7 @@ __wt_bt_cache_flush(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) */ WT_ERR(__wt_sync_file_serial(session, op)); __wt_evict_server_wake(session); - __wt_cond_wait(session, session->cond); + __wt_cond_wait(session, session->cond, 0); ret = session->syncop_ret; switch (op) { diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c index 574a2335796..fb675123d13 100644 --- a/src/btree/bt_upgrade.c +++ b/src/btree/bt_upgrade.c @@ -14,9 +14,9 @@ int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_UNUSED(session); WT_UNUSED(cfg); /* There's nothing to upgrade, yet. */ + WT_RET(__wt_progress(session, NULL, 1)); return (0); } diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index ed4309c1bad..84dc9e263e7 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -19,14 +19,16 @@ typedef struct { uint64_t fcnt; /* Progress counter */ - int dumpfile; /* Dump file stream */ + int dump_address; + int dump_pages; + int dump_blocks; WT_ITEM *tmp1; /* Temporary buffer */ WT_ITEM *tmp2; /* Temporary buffer */ } WT_VSTUFF; static void __verify_checkpoint_reset(WT_VSTUFF *); -static int __verify_int(WT_SESSION_IMPL *, int); +static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *); static int __verify_overflow( WT_SESSION_IMPL *, const uint8_t *, uint32_t, WT_VSTUFF *); static int __verify_overflow_cell( @@ -44,41 +46,6 @@ static int __verify_tree(WT_SESSION_IMPL *, WT_PAGE *, WT_VSTUFF *); int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_UNUSED(cfg); - - return (__verify_int(session, 0)); -} - -/* - * __wt_dumpfile -- - * Dump a file in debugging mode. 
- */ -int -__wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_UNUSED(cfg); - -#ifdef HAVE_DIAGNOSTIC - /* - * We use the verification code to do debugging dumps because if we're - * dumping in debugging mode, we want to confirm the page is OK before - * walking it. - */ - return (__verify_int(session, 1)); -#else - WT_RET_MSG(session, ENOTSUP, - "the WiredTiger library was not built in diagnostic mode"); -#endif -} - -/* - * __verify_int -- - * Internal version of verify: verify a Btree, optionally dumping each - * page in debugging mode. - */ -static int -__verify_int(WT_SESSION_IMPL *session, int dumpfile) -{ WT_BTREE *btree; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; @@ -90,12 +57,14 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile) WT_CLEAR(_vstuff); vs = &_vstuff; - vs->dumpfile = dumpfile; WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); + /* Check configuration strings. */ + WT_ERR(__verify_config(session, cfg, vs)); + /* Get a list of the checkpoints for this file. */ WT_ERR(__wt_meta_ckptlist_get(session, btree->name, &ckptbase)); @@ -106,6 +75,11 @@ __verify_int(WT_SESSION_IMPL *session, int dumpfile) WT_CKPT_FOREACH(ckptbase, ckpt) { WT_VERBOSE_ERR(session, verify, "%s: checkpoint %s", btree->name, ckpt->name); +#ifdef HAVE_DIAGNOSTIC + if (vs->dump_address || vs->dump_blocks || vs->dump_pages) + WT_ERR(__wt_msg(session, + "%s: checkpoint %s", btree->name, ckpt->name)); +#endif /* Fake checkpoints require no work. */ if (F_ISSET(ckpt, WT_CKPT_FAKE)) @@ -157,6 +131,48 @@ err: __wt_meta_ckptlist_free(session, ckptbase); } /* + * __verify_config -- + * Verification supports dumping pages in various formats. 
+ */ +static int +__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + + ret = __wt_config_gets(session, cfg, "dump_address", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_RET(ret); + if (ret == 0 && cval.val != 0) + vs->dump_address = 1; + + ret = __wt_config_gets(session, cfg, "dump_blocks", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_RET(ret); + if (ret == 0 && cval.val != 0) + vs->dump_blocks = 1; + + ret = __wt_config_gets(session, cfg, "dump_pages", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_RET(ret); + if (ret == 0 && cval.val != 0) + vs->dump_pages = 1; + +#ifdef HAVE_DIAGNOSTIC + /* + * We use the verification code to do debugging dumps because if we're + * dumping in debugging mode, we want to confirm the page is OK before + * walking it. + */ +#else + if (vs->dump_address || vs->dump_blocks || vs->dump_pages) + WT_RET_MSG(session, ENOTSUP, + "the WiredTiger library was not built in diagnostic mode"); +#endif + return (0); +} + +/* * __verify_checkpoint_reset -- * Reset anything needing to be reset for each new checkpoint verification. */ @@ -197,6 +213,12 @@ __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) WT_VERBOSE_RET(session, verify, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type)); +#ifdef HAVE_DIAGNOSTIC + if (vs->dump_address) + WT_RET(__wt_msg(session, "%s %s", + __wt_page_addr_string(session, vs->tmp1, page), + __wt_page_type_string(page->type))); +#endif /* * The page's physical structure was verified when it was read into @@ -226,11 +248,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) #ifdef HAVE_DIAGNOSTIC /* Optionally dump the page in debugging mode. 
*/ - if (vs->dumpfile) { + if (vs->dump_blocks && page->dsk != NULL) + WT_RET(__wt_debug_disk(session, page->dsk, NULL)); + if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); - if (page->dsk != NULL) - WT_RET(__wt_debug_disk(session, page->dsk, NULL)); - } #endif /* diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index ac3adbe0e75..7fcedbdd1e2 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -58,24 +58,10 @@ __tree_walk_delete( /* * If the page is already instantiated in-memory, other threads may be - * using it: no fast delete. - */ - if (ref->state != WT_REF_DISK) - return (0); - - /* - * If the page references overflow items, we have to clean it up during - * reconciliation, no fast delete. - */ - if (!__wt_off_page(page, ref->addr)) { - __wt_cell_unpack(ref->addr, &unpack); - if (unpack.raw != WT_CELL_ADDR_LNO) - return (0); - } - - /* - * Atomically switch the page's state to delete it. If the page state - * changed underneath us, no fast delete. + * using it, no fast delete. + * + * Atomically switch the page's state to lock it. If the page state + * changes underneath us, no fast delete. * * Possible optimization: if the page is already deleted and the delete * is visible to us (the delete has been committed), we could skip the @@ -83,29 +69,47 @@ __tree_walk_delete( * in the page. While that's a huge amount of work to no purpose, it's * unclear optimizing for overlapping range deletes is worth the effort. */ - if (!WT_ATOMIC_CAS(ref->state, WT_REF_DISK, WT_REF_READING)) + if (ref->state != WT_REF_DISK || + !WT_ATOMIC_CAS(ref->state, WT_REF_DISK, WT_REF_READING)) return (0); /* - * We have the reference "locked": + * If the page references overflow items, we have to clean it up during + * reconciliation, no fast delete. Check this after we have the page + * locked down, instantiating the page in memory and modifying it could + * theoretically point the address somewhere away from the on-page cell. 
+ */ + __wt_cell_unpack(ref->addr, &unpack); + if (unpack.raw != WT_CELL_ADDR_LNO) + goto err; + + /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_txn_modify_ref(session, ref)); /* - * This action dirties the page: mark it dirty now, because there's no + * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as - * we flush the tree. + * we write the tree. */ WT_ERR(__wt_page_modify_init(session, page)); __wt_page_modify_set(page); *skipp = 1; - /* Release the page. */ -err: WT_PUBLISH(ref->state, WT_REF_DELETED); + /* Delete the page. */ + WT_PUBLISH(ref->state, WT_REF_DELETED); + return (0); +err: /* + * Restore the page to on-disk status, we'll have to instantiate it. + * We're don't have to back out adding this node to the transaction + * modify list, that's OK because the rollback function ignores nodes + * that aren't set to WT_REF_DELETED. + */ + WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); } @@ -151,12 +155,14 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_PAGE **pagep, uint32_t flags) WT_PAGE *page, *t; WT_REF *ref; uint32_t slot; - int discard, eviction, prev, skip; + int compact, discard, eviction, prev, set_read_gen, skip; btree = session->btree; - /* We can currently only do fast-discard on row-store trees. */ + /* Fast-discard currently only works on row-store trees. */ discard = LF_ISSET(WT_TREE_DISCARD) && btree->type == BTREE_ROW ? 1 : 0; + + compact = LF_ISSET(WT_TREE_COMPACT) ? 1 : 0; eviction = LF_ISSET(WT_TREE_EVICT) ? 1 : 0; prev = LF_ISSET(WT_TREE_PREV) ? 1 : 0; @@ -229,43 +235,55 @@ descend: for (;;) { * the state to WT_REF_EVICT_WALK temporarily to avoid * the page being evicted by another thread while it is * being evaluated. - * - * We also return pages in the "evict-force" state, - * which indicates they are waiting on the eviction - * server getting to a request. 
A sync call in the - * meantime must write such a page to ensure all - * modifications are written. Since this is happening - * inside the eviction server, and an LRU walk will - * check the state before adding the page to the LRU - * queue, there is no way for an evict-force page to - * disappear from under us. */ + set_read_gen = 0; if (eviction) { if (!WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_EVICT_WALK)) break; + } else if (discard) { + /* + * If deleting a range, try to delete the page + * without instantiating it. + */ + WT_RET(__tree_walk_delete( + session, page, ref, &skip)); + if (skip) + break; + WT_RET(__wt_page_in(session, page, ref)); } else { - if (discard) { - /* - * If deleting a range, try to delete - * the page without instantiating it. - */ - WT_RET(__tree_walk_delete( + /* + * If iterating a cursor (or doing compaction), + * skip deleted pages that are visible to us. + */ + WT_RET(__tree_walk_read(session, ref, &skip)); + if (skip) + break; + + /* + * Test if the page is useful for compaction: + * we don't want to read it if it won't help. + * + * Pages read for compaction aren't "useful"; + * reset the page generation to 0 so the page + * is quickly chosen for eviction. (This can + * race of course, but it's unlikely and will + * only result in an incorrectly low page read + * generation.) + */ + set_read_gen = 0; + if (compact) { + WT_RET(__wt_compact_page_skip( session, page, ref, &skip)); if (skip) break; - } else { - /* - * If iterating a cursor, skip deleted - * pages that are visible to us. - */ - WT_RET(__tree_walk_read( - session, ref, &skip)); - if (skip) - break; + set_read_gen = + ref->state == WT_REF_DISK ? 
1 : 0; } WT_RET(__wt_page_in(session, page, ref)); + if (set_read_gen) + page->read_gen = 0; } page = ref->page; diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index 2ec317407d2..aba4edbd787 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -309,7 +309,7 @@ __rec_review(WT_SESSION_IMPL *session, * to evict split-merge pages, which means the only interesting case * is an empty page. If the eviction thread picked an "empty" page * for eviction, it must have had reason, probably the empty page got - * really, really full and is being forced out of the cache. + * really, really full. */ mod = page->modify; if (!top && (mod == NULL || !F_ISSET(mod, diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index 9356125aa8b..15b9b2da76f 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -239,28 +239,27 @@ static void __rec_dictionary_reset(WT_RECONCILE *); * * The reconciliation code is used in the following situations: * - * (1) by the eviction server during sync; - * (2) by the eviction server during forced eviction of a page; and - * (3) by any thread during LRU eviction. + * (1) by the eviction server during sync; and + * (2) by any thread during LRU eviction. * * The complexity is checking the page state of child pages when looking for * pages to merge. * * We clearly want to consider all normal, in-memory pages (WT_REF_MEM). * - * During LRU eviction in case (3), the eviction code has already locked the + * During LRU eviction in case (2), the eviction code has already locked the * subtree, so locked pages should be included in the merge (WT_REF_LOCKED). * * To make this tractable, the eviction server guarantees that no thread is - * doing LRU eviction in the tree when cases (1) and (2) occur. That is, the - * only state change that can occur during a sync or forced eviction is for a - * reference to a page on disk to cause a page to be read (WT_REF_READING). 
- * In the case of a read, we could safely ignore those pages because they are - * unmodified by definition -- they are being read from disk, however, in the - * current system, that state also includes fast-delete pages that are being - * instantiated. Those pages cannot be ignored, as they have been modified. - * For this reason, we have to wait for the WT_REF_READING state to be resolved - * to another state before we proceed. + * doing LRU eviction in the tree when case (1) occurs. That is, the only + * state change that can occur during a sync is for a reference to a page on + * disk to cause a page to be read (WT_REF_READING). In the case of a read, we + * could safely ignore those pages because they are unmodified by definition -- + * they are being read from disk, however, in the current system, that state + * also includes fast-delete pages that are being instantiated. Those pages + * cannot be ignored, as they have been modified. For this reason, we have to + * wait for the WT_REF_READING state to be resolved to another state before we + * proceed. */ static int __rec_page_modified(WT_SESSION_IMPL *session, @@ -1480,6 +1479,8 @@ __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk) return (0); } +#define WT_FIX_ENTRIES(btree, bytes) (((bytes) * 8) / (btree)->bitcnt) + /* * __wt_rec_col_fix_bulk_insert -- * Fixed-length column-store bulk insert. 
@@ -1504,7 +1505,7 @@ __wt_rec_col_fix_bulk_insert(WT_CURSOR_BULK *cbulk) entries > 0; entries -= page_entries, data += page_size) { page_entries = WT_MIN(entries, - r->space_avail * 8 / btree->bitcnt); + WT_FIX_ENTRIES(btree, r->space_avail)); page_size = __bitstr_size(page_entries * btree->bitcnt); memcpy(r->first_free, data, page_size); @@ -1533,7 +1534,7 @@ __wt_rec_col_fix_bulk_insert(WT_CURSOR_BULK *cbulk) WT_RET(__rec_split(session, r)); } cbulk->entry = 0; - cbulk->nrecs = r->space_avail * 8 / btree->bitcnt; + cbulk->nrecs = WT_FIX_ENTRIES(btree, r->space_avail); } __bit_setv(r->first_free, @@ -1733,7 +1734,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Calculate the number of entries per page remainder. */ entry = page->entries; - nrecs = (r->space_avail * 8 / btree->bitcnt) - page->entries; + nrecs = WT_FIX_ENTRIES(btree, r->space_avail) - page->entries; r->recno += entry; /* Walk any append list. */ @@ -1774,7 +1775,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Calculate the number of entries per page. */ entry = 0; - nrecs = r->space_avail * 8 / btree->bitcnt; + nrecs = WT_FIX_ENTRIES(btree, r->space_avail); } } @@ -1820,7 +1821,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, for (;;) { /* Calculate the number of entries per page. 
*/ entry = 0; - nrecs = r->space_avail * 8 / btree->bitcnt; + nrecs = WT_FIX_ENTRIES(btree, r->space_avail); for (; nrecs > 0 && salvage->missing > 0; --nrecs, --salvage->missing, ++entry) diff --git a/src/config/config_def.c b/src/config/config_def.c index 19bfe49a7fe..c35e7da4a82 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -184,11 +184,12 @@ __wt_confchk_session_begin_transaction[] = { const char * __wt_confdfl_session_checkpoint = - "drop=,name=,target="; + "drop=,force=0,name=,target="; WT_CONFIG_CHECK __wt_confchk_session_checkpoint[] = { { "drop", "list", NULL }, + { "force", "boolean", NULL }, { "name", "string", NULL }, { "target", "list", NULL }, { NULL, NULL, NULL } @@ -213,6 +214,15 @@ __wt_confchk_session_commit_transaction[] = { }; const char * +__wt_confdfl_session_compact = + ""; + +WT_CONFIG_CHECK +__wt_confchk_session_compact[] = { + { NULL, NULL, NULL } +}; + +const char * __wt_confdfl_session_create = "allocation_size=512B,block_compressor=,cache_resident=0,checksum=," "colgroups=,collator=,columns=,columns=,dictionary=0,exclusive=0," @@ -274,15 +284,6 @@ __wt_confchk_session_drop[] = { }; const char * -__wt_confdfl_session_dumpfile = - ""; - -WT_CONFIG_CHECK -__wt_confchk_session_dumpfile[] = { - { NULL, NULL, NULL } -}; - -const char * __wt_confdfl_session_log_printf = ""; @@ -373,10 +374,13 @@ __wt_confchk_session_upgrade[] = { const char * __wt_confdfl_session_verify = - ""; + "dump_address=0,dump_blocks=0,dump_pages=0"; WT_CONFIG_CHECK __wt_confchk_session_verify[] = { + { "dump_address", "boolean", NULL }, + { "dump_blocks", "boolean", NULL }, + { "dump_pages", "boolean", NULL }, { NULL, NULL, NULL } }; diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 39966177fda..1c4fd99e521 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -289,9 +289,9 @@ __curindex_close(WT_CURSOR *cursor) __wt_free(session, cindex->cg_cursors); if (cindex->key_plan != idx->key_plan) __wt_free(session, 
cindex->key_plan); - if (cindex->value_plan != idx->value_plan) - __wt_free(session, cindex->value_plan); if (cursor->value_format != cindex->table->value_format) + __wt_free(session, cursor->value_format); + if (cindex->value_plan != idx->value_plan) __wt_free(session, cindex->value_plan); WT_TRET(__wt_btcur_close(&cindex->cbt)); diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 22510ff44da..a1b72243e4e 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -615,6 +615,8 @@ __curtable_close(WT_CURSOR *cursor) if (ctable->plan != ctable->table->plan) __wt_free(session, ctable->plan); + if (cursor->value_format != ctable->table->value_format) + __wt_free(session, cursor->value_format); __wt_free(session, ctable->cg_cursors); __wt_free(session, ctable->idx_cursors); /* The URI is owned by the table. */ diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 34b703246a2..d20fe51b72c 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -48,6 +48,19 @@ database; the \c -t option changes the \c backup command to do a hot backup of only the named objects. <hr> +@section util_compact wt compact +Compact a table or file. + +The \c compact command attempts to rewrite the specified table or file +to consume less disk space. + +@subsection util_compact_synopsis Synopsis +<code>wt [-Vv] [-C config] [-h directory] compact uri</code> + +@subsection util_compact_options Options +The \c compact command has no command-specific options. + +<hr> @section util_create wt create Create a table or file. @@ -108,25 +121,6 @@ Dump all characters in a hexadecimal encoding (the default is to leave printable characters unencoded). <hr> -@section util_dumpfile wt dumpfile -Dump a file in a debugging format. - -The \c dumpfile command dumps the specified physical file in a non-portable, -debugging format, exiting success if the file is correct, and failure if the -file is corrupted. 
- -@subsection util_dumpfile_synopsis Synopsis -<code>wt [-Vv] [-C config] [-h directory] dumpfile [-f output] file</code> - -@subsection util_dumpfile_options Options -The following are command-specific options for the \c dumpfile command: - -@par <code>-f</code> -By default, the \c dumpfile command output is written to the standard -output; the \c -f option re-directs the output to the specified -file. - -<hr> @section util_list wt list List the tables and files in the database. diff --git a/src/include/api.h b/src/include/api.h index f5aa7bd9050..677742c4c6e 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -111,8 +111,9 @@ struct __wt_session_impl { size_t excl_allocated; /* Bytes allocated */ #define WT_SYNC 1 /* Sync the file */ -#define WT_SYNC_DISCARD 2 /* Sync the file, discard pages */ -#define WT_SYNC_DISCARD_NOWRITE 3 /* Discard the file */ +#define WT_SYNC_COMPACT 2 /* Compact the file */ +#define WT_SYNC_DISCARD 3 /* Sync the file, discard pages */ +#define WT_SYNC_DISCARD_NOWRITE 4 /* Discard the file */ int syncop; /* File operation */ int syncop_ret; /* Return value */ @@ -127,8 +128,9 @@ struct __wt_session_impl { * easily call a function to clear memory up to, but not including, the * hazard reference. */ - uint32_t hazard_size; /* Count of used hazard references */ - u_int nhazard; + uint32_t hazard_size; /* Allocated slots in hazard array. */ + uint32_t nhazard; /* Count of active hazard references */ + #define WT_SESSION_CLEAR(s) memset(s, 0, WT_PTRDIFF(&(s)->hazard, s)) WT_HAZARD *hazard; /* Hazard reference array */ }; diff --git a/src/include/btmem.h b/src/include/btmem.h index 796101f3737..fd79028dac7 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -279,10 +279,7 @@ struct __wt_page { */ uint32_t entries; - /* - * Memory attached to the page (although not exact or complete), used - * to force eviction of a page tying too much memory down. - */ + /* Memory attached to the page. 
*/ uint32_t memory_footprint; #define WT_PAGE_INVALID 0 /* Invalid page */ diff --git a/src/include/cache.h b/src/include/cache.h index 73dd5ab1c31..33fb370b413 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -55,12 +55,6 @@ struct __wt_cache { u_int evict_file_next; /* LRU: next file to search */ /* - * Forced-page eviction request information. - */ - WT_EVICT_ENTRY *evict_request; /* Forced page eviction request list */ - uint32_t max_evict_request; /* Size of the eviction request array */ - - /* * Sync/flush request information. */ volatile uint64_t sync_request; /* File sync requests */ diff --git a/src/include/cache.i b/src/include/cache.i index 702234c426c..6c3c796673d 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -31,7 +31,7 @@ __wt_eviction_check(WT_SESSION_IMPL *session, int *read_lockoutp, int wake) *read_lockoutp = (bytes_inuse > bytes_max); /* Wake eviction when we're over the trigger cache size. */ - if (wake && bytes_inuse > cache->eviction_trigger * (bytes_max / 100)) + if (wake && bytes_inuse >= (cache->eviction_trigger * bytes_max) / 100) __wt_evict_server_wake(session); } diff --git a/src/include/extern.h b/src/include/extern.h index 3827035f791..6fc339b3d39 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -49,6 +49,14 @@ extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block); extern uint32_t __wt_cksum(const void *chunk, size_t len); +extern int __wt_block_compact_skip( WT_SESSION_IMPL *session, + WT_BLOCK *block, + int *skipp); +extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, + WT_BLOCK *block, + const uint8_t *addr, + uint32_t addr_size, + int *skipp); extern int __wt_block_off_match(WT_EXTLIST *el, off_t off, off_t size); extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, @@ -132,6 +140,11 @@ extern int __wt_bm_checkpoint_load(WT_SESSION_IMPL *session, uint32_t 
addr_size, int readonly); extern int __wt_bm_checkpoint_unload(WT_SESSION_IMPL *session); +extern int __wt_bm_compact_skip(WT_SESSION_IMPL *session, int *skipp); +extern int __wt_bm_compact_page_skip(WT_SESSION_IMPL *session, + const uint8_t *addr, + uint32_t addr_size, + int *skipp); extern int __wt_bm_truncate(WT_SESSION_IMPL *session, const char *filename); extern int __wt_bm_free(WT_SESSION_IMPL *session, const uint8_t *addr, @@ -252,6 +265,12 @@ extern int __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern int __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern void __wt_cache_stats_update(WT_CONNECTION_IMPL *conn); extern void __wt_cache_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_compact_page_skip( WT_SESSION_IMPL *session, + WT_PAGE *parent, + WT_REF *ref, + int *skipp); +extern int __wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard); extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt); @@ -362,7 +381,6 @@ extern int __wt_bt_cache_flush(WT_SESSION_IMPL *session, int op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_dumpfile(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf); @@ -553,12 +571,12 @@ extern const char *__wt_confdfl_session_close; extern WT_CONFIG_CHECK __wt_confchk_session_close[]; extern const char *__wt_confdfl_session_commit_transaction; extern WT_CONFIG_CHECK __wt_confchk_session_commit_transaction[]; +extern const char *__wt_confdfl_session_compact; +extern WT_CONFIG_CHECK __wt_confchk_session_compact[]; extern const char *__wt_confdfl_session_create; extern WT_CONFIG_CHECK 
__wt_confchk_session_create[]; extern const char *__wt_confdfl_session_drop; extern WT_CONFIG_CHECK __wt_confchk_session_drop[]; -extern const char *__wt_confdfl_session_dumpfile; -extern WT_CONFIG_CHECK __wt_confchk_session_dumpfile[]; extern const char *__wt_confdfl_session_log_printf; extern WT_CONFIG_CHECK __wt_confchk_session_log_printf[]; extern const char *__wt_confdfl_session_open_cursor; @@ -696,7 +714,9 @@ extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, int start_chunk, int nchunks, WT_LSM_CHUNK *chunk); -extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_merge(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, + int stalls); extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_stat_init( WT_SESSION_IMPL *session, @@ -848,7 +868,9 @@ extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_locked, WT_CONDVAR **condp); -extern void __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond); +extern void __wt_cond_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, + uint64_t usecs); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, diff --git a/src/include/misc.h b/src/include/misc.h index 6b82155bb08..476ef39084f 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -174,9 +174,10 @@ #define WT_DECL_RET int ret = 0 /* Flags for the tree-walk function. 
*/ -#define WT_TREE_DISCARD 0x01 /* Discarding */ -#define WT_TREE_EVICT 0x02 /* Eviction */ -#define WT_TREE_PREV 0x04 /* Backward walk */ +#define WT_TREE_COMPACT 0x01 /* Compaction */ +#define WT_TREE_DISCARD 0x02 /* Discarding */ +#define WT_TREE_EVICT 0x04 /* Eviction */ +#define WT_TREE_PREV 0x08 /* Backward walk */ /* * In diagnostic mode we track the locations from which hazard references and diff --git a/src/include/stat.h b/src/include/stat.h index 098886904bb..ea29a8c7b1e 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -86,6 +86,7 @@ struct __wt_btree_stats { WT_STATS file_minor; WT_STATS file_overflow; WT_STATS file_allocsize; + WT_STATS file_compact_rewrite; WT_STATS rec_page_merge; WT_STATS rec_dictionary; WT_STATS rec_split_intl; diff --git a/src/include/txn.i b/src/include/txn.i index ccf04831f14..0f0425a5961 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -100,8 +100,18 @@ __wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id) if (id == txn->id) return (1); - /* Read-uncommitted transactions see all other changes. */ - if (txn->isolation == TXN_ISO_READ_UNCOMMITTED) + /* + * Read-uncommitted transactions see all other changes. + * + * All metadata reads are at read-uncommitted isolation. That's + * because once a schema-level operation completes, subsequent + * operations must see the current version of checkpoint metadata, or + * they may try to read blocks that may have been freed from a file. + * Metadata updates use non-transactional techniques (such as the + * schema and metadata locks) to protect access to in-flight updates. 
+ */ + if (txn->isolation == TXN_ISO_READ_UNCOMMITTED || + session->btree == session->metafile) return (1); /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index a7ae4c669e8..21b972ab9f5 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -706,6 +706,18 @@ struct __wt_session { int __F(create)(WT_SESSION *session, const char *name, const char *config); + /*! Compact an object. + * + * @snippet ex_all.c Compact a table + * + * @param session the session handle + * @param name the URI of the object to drop, such as \c "table:stock" + * @configempty{session.compact, see dist/api_data.py} + * @errors + */ + int __F(compact)(WT_SESSION *session, + const char *name, const char *config); + /*! Drop (delete) an object. * * @snippet ex_all.c Drop a table @@ -812,7 +824,17 @@ struct __wt_session { * * @param session the session handle * @param name the URI of the file or table to verify - * @configempty{session.verify, see dist/api_data.py} + * @configstart{session.verify, see dist/api_data.py} + * @config{dump_address, Display addresses and page types as pages are + * verified\, using the application's message handler\, intended for + * debugging.,a boolean flag; default \c false.} + * @config{dump_blocks, Display the contents of on-disk blocks as they + * are verified\, using the application's message handler\, intended for + * debugging.,a boolean flag; default \c false.} + * @config{dump_pages, Display the contents of in-memory pages as they + * are verified\, using the application's message handler\, intended for + * debugging.,a boolean flag; default \c false.} + * @configend * @errors */ int __F(verify)(WT_SESSION *session, @@ -904,6 +926,9 @@ struct __wt_session { * drop all checkpoints before and including the named checkpoint. 
* Checkpoints cannot be dropped while a hot backup is in progress or if * open in a cursor.,a list of strings; default empty.} + * @config{force, checkpoints may be skipped if the underlying object + * has not been modified\, this option forces the checkpoint.,a boolean + * flag; default \c false.} * @config{name, if non-empty\, specify a name for the checkpoint.,a * string; default empty.} * @config{target, if non-empty\, checkpoint the list of objects.,a list @@ -918,21 +943,6 @@ struct __wt_session { * @{ */ - /*! Dump a physical file in debugging mode. - * - * The specified file is displayed in a non-portable debugging mode to - * the application's standard output. - * - * @snippet ex_all.c Dump a file - * - * @param session the session handle - * @param name the URI of the file to dump - * @configempty{session.dumpfile, see dist/api_data.py} - * @errors - */ - int __F(dumpfile)(WT_SESSION *session, - const char *name, const char *config); - /*! Send a string to the message handler for debugging. * * @snippet ex_all.c Print to the message stream @@ -1774,34 +1784,36 @@ extern int wiredtiger_extension_init(WT_SESSION *session, #define WT_STAT_file_overflow 31 /*! page size allocation unit */ #define WT_STAT_file_allocsize 32 +/*! pages rewritten by compaction */ +#define WT_STAT_file_compact_rewrite 33 /*! reconcile: deleted or temporary pages merged */ -#define WT_STAT_rec_page_merge 33 +#define WT_STAT_rec_page_merge 34 /*! reconcile: dictionary match */ -#define WT_STAT_rec_dictionary 34 +#define WT_STAT_rec_dictionary 35 /*! reconcile: internal pages split */ -#define WT_STAT_rec_split_intl 35 +#define WT_STAT_rec_split_intl 36 /*! reconcile: leaf pages split */ -#define WT_STAT_rec_split_leaf 36 +#define WT_STAT_rec_split_leaf 37 /*! reconcile: overflow key */ -#define WT_STAT_rec_ovfl_key 37 +#define WT_STAT_rec_ovfl_key 38 /*! reconcile: overflow value */ -#define WT_STAT_rec_ovfl_value 38 +#define WT_STAT_rec_ovfl_value 39 /*! 
reconcile: pages deleted */ -#define WT_STAT_rec_page_delete 39 +#define WT_STAT_rec_page_delete 40 /*! reconcile: pages written */ -#define WT_STAT_rec_written 40 +#define WT_STAT_rec_written 41 /*! reconcile: unable to acquire hazard reference */ -#define WT_STAT_rec_hazard 41 +#define WT_STAT_rec_hazard 42 /*! row-store internal pages */ -#define WT_STAT_file_row_int_pages 42 +#define WT_STAT_file_row_int_pages 43 /*! row-store leaf pages */ -#define WT_STAT_file_row_leaf_pages 43 +#define WT_STAT_file_row_leaf_pages 44 /*! total entries */ -#define WT_STAT_file_entries 44 +#define WT_STAT_file_entries 45 /*! update conflicts */ -#define WT_STAT_update_conflict 45 +#define WT_STAT_update_conflict 46 /*! write generation conflicts */ -#define WT_STAT_file_write_conflicts 46 +#define WT_STAT_file_write_conflicts 47 /*! @} */ /*! diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index c09267ba79c..44957c6b34f 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -61,7 +61,7 @@ __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, * Merge a set of chunks of an LSM tree. */ int -__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int stalls) { WT_BLOOM *bloom; WT_CURSOR *src, *dest; @@ -85,7 +85,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * read. We need a copy, since other threads may alter the chunk count * while we are doing a merge. */ - nchunks = lsm_tree->nchunks - 1; + nchunks = lsm_tree->nchunks; /* * If there aren't any chunks to merge, or some of the chunks aren't @@ -106,8 +106,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) /* Only include chunks that are stable on disk. 
*/ end_chunk = nchunks - 1; while (end_chunk > 0 && - (!F_ISSET(lsm_tree->chunk[end_chunk], WT_LSM_CHUNK_ONDISK) || - lsm_tree->chunk[end_chunk]->ncursor > 0)) + !F_ISSET(lsm_tree->chunk[end_chunk], WT_LSM_CHUNK_ONDISK)) --end_chunk; /* @@ -136,6 +135,11 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (nchunks > 2 && chunk->count > 2 * record_count / nchunks) break; + /* Don't do any big merges until we have waited for 10s. */ + if (nchunks > 0 && stalls < 10 && + chunk->count > lsm_tree->chunk[end_chunk]->count * 2) + break; + record_count += chunk->count; --start_chunk; @@ -146,7 +150,8 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_ASSERT(session, nchunks <= max_chunks); - if (nchunks <= 1) + /* Don't do small merges unless we have waited for 2s. */ + if (nchunks <= 1 || (stalls < 2 && nchunks < max_chunks / 2)) return (WT_NOTFOUND); /* Allocate an ID for the merge. */ @@ -192,6 +197,11 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) wt_session, chunk->uri, NULL, "raw,bulk", &dest)); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { + if (insert_count % 1000 && + !F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { + ret = EINTR; + goto err; + } WT_ERR(src->get_key(src, &key)); dest->set_key(dest, &key); WT_ERR(src->get_value(src, &value)); @@ -237,11 +247,23 @@ err: if (src != NULL) WT_TRET(__wt_bloom_close(bloom)); __wt_scr_free(&bbuf); if (ret != 0) { + /* + * Ideally we would drop the new chunk on error, but that + * introduces potential deadlock problems. It is relatively + * harmless to leave the file - it does not interfere + * with later re-use. 
+ WT_WITH_SCHEMA_LOCK(session, + (void)wt_session->drop(wt_session, chunk->uri, NULL)); + */ __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); - WT_VERBOSE_VOID(session, lsm, - "Merge failed with %s\n", wiredtiger_strerror(ret)); + if (ret == EINTR) + WT_VERBOSE_VOID(session, lsm, + "Merge aborted due to close"); + else + WT_VERBOSE_VOID(session, lsm, + "Merge failed with %s", wiredtiger_strerror(ret)); } return (ret); } diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 87a151c5ada..6f47692a26e 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -67,9 +67,18 @@ __wt_lsm_stat_init( */ WT_ERR(__wt_buf_fmt( session, uribuf, "statistics:%s", chunk->uri)); - WT_ERR(__wt_curstat_open(session, uribuf->data, + ret = __wt_curstat_open(session, uribuf->data, F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg, - &stat_cursor)); + &stat_cursor); + /* + * XXX kludge: we may have an empty chunk where no checkpoint + * was written. If so, try to open the ordinary handle on that + * chunk instead. 
+ */ + if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) + ret = __wt_curstat_open( + session, uribuf->data, cfg, &stat_cursor); + WT_ERR(ret); stat_cursor->set_key(stat_cursor, WT_STAT_page_evict_fail); WT_ERR(stat_cursor->search(stat_cursor)); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 2be721349ea..84dac6b71d1 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -24,9 +24,18 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); + __wt_free(session, lsm_tree->name); + __wt_free(session, lsm_tree->config); + __wt_free(session, lsm_tree->key_format); + __wt_free(session, lsm_tree->value_format); + __wt_free(session, lsm_tree->file_config); + + if (lsm_tree->rwlock != NULL) + __wt_rwlock_destroy(session, &lsm_tree->rwlock); + + __wt_free(session, lsm_tree->stats); __wt_spin_destroy(session, &lsm_tree->lock); - __wt_free(session, lsm_tree->name); for (i = 0; i < lsm_tree->nchunks; i++) { if ((chunk = lsm_tree->chunk[i]) == NULL) continue; @@ -46,8 +55,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) __wt_free(session, chunk); } __wt_free(session, lsm_tree->old_chunks); - __wt_free(session, lsm_tree->stats); - __wt_free(session, lsm_tree); } @@ -160,9 +167,15 @@ __wt_lsm_tree_setup_chunk(WT_SESSION_IMPL *session, WT_DECL_ITEM(buf); WT_DECL_ITEM(bbuf); WT_DECL_RET; + const char *cfg[] = API_CONF_DEFAULTS(session, drop, "force"); WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, i, buf)); + /* + * Drop the chunk first - there may be some content hanging over + * from an aborted merge. 
+ */ + WT_ERR(__wt_schema_drop(session, buf->data, cfg)); WT_ERR(__wt_schema_create(session, buf->data, lsm_tree->file_config)); chunk->uri = __wt_buf_steal(session, buf, NULL); if (create_bloom) { diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index d7b395bcf8b..3e9951dbafb 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -21,10 +21,11 @@ __wt_lsm_worker(void *arg) { WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; - int progress; + int progress, stalls; lsm_tree = arg; session = lsm_tree->worker_session; + stalls = 0; while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { progress = 0; @@ -32,7 +33,8 @@ __wt_lsm_worker(void *arg) /* Clear any state from previous worker thread iterations. */ session->btree = NULL; - if (__wt_lsm_merge(session, lsm_tree) == 0) + /* Report stalls to merge in seconds. */ + if (__wt_lsm_merge(session, lsm_tree, stalls / 1000) == 0) progress = 1; /* Clear any state from previous worker thread iterations. */ @@ -42,8 +44,12 @@ __wt_lsm_worker(void *arg) __lsm_free_chunks(session, lsm_tree) == 0) progress = 1; - if (!progress) - __wt_sleep(0, 10); + if (progress) + stalls = 0; + else { + __wt_sleep(0, 1000); + ++stalls; + } } return (NULL); @@ -213,14 +219,14 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_DECL_RET; WT_LSM_CHUNK *chunk; const char *drop_cfg[] = { NULL }; - int found, i; + int locked, progress, i; - found = 0; + locked = progress = 0; for (i = 0; i < lsm_tree->nold_chunks; i++) { if ((chunk = lsm_tree->old_chunks[i]) == NULL) continue; - if (!found) { - found = 1; + if (!locked) { + locked = 1; /* TODO: Do we need the lsm_tree lock for all drops? */ __wt_spin_lock(session, &lsm_tree->lock); } @@ -232,6 +238,7 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * be positioned on this old chunk. 
*/ if (ret == 0) { + progress = 1; F_CLR(chunk, WT_LSM_CHUNK_BLOOM); __wt_free(session, chunk->bloom_uri); chunk->bloom_uri = NULL; @@ -250,6 +257,7 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * be positioned on this old chunk. */ if (ret == 0) { + progress = 1; __wt_free(session, chunk->uri); chunk->uri = NULL; } else if (ret != EBUSY) @@ -262,10 +270,14 @@ __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) ++lsm_tree->old_avail; } } - if (found) { -err: ret = __wt_lsm_meta_write(session, lsm_tree); + if (locked) { +err: WT_TRET(__wt_lsm_meta_write(session, lsm_tree)); __wt_spin_unlock(session, &lsm_tree->lock); } + /* Returning non-zero means there is no work to do. */ - return (found ? 0 : WT_NOTFOUND); + if (!progress) + WT_TRET(WT_NOTFOUND); + + return (ret); } diff --git a/src/os_posix/os_mtx.c b/src/os_posix/os_mtx.c index 4e7a1e2cc6e..805a8ce4e0f 100644 --- a/src/os_posix/os_mtx.c +++ b/src/os_posix/os_mtx.c @@ -46,7 +46,7 @@ err: __wt_free(session, cond); * Lock a mutex. */ void -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond) +__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) { WT_DECL_RET; @@ -65,7 +65,17 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond) * it's known to return these errors on some systems. 
*/ while (cond->locked) { - ret = pthread_cond_wait(&cond->cond, &cond->mtx); + if (usecs > 0) { + struct timeval tv; + struct timespec ts; + + gettimeofday(&tv, NULL); + ts.tv_sec = tv.tv_sec + (tv.tv_usec + usecs) / 1000000; + ts.tv_nsec = 1000L * ((tv.tv_usec + usecs) % 1000000); + ret = pthread_cond_timedwait( + &cond->cond, &cond->mtx, &ts); + } else + ret = pthread_cond_wait(&cond->cond, &cond->mtx); if (ret != 0 && ret != EINTR && #ifdef ETIME diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 055eb84efba..cedd3cf2bbd 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -90,6 +90,15 @@ __wt_open(WT_SESSION_IMPL *session, /* Windows clones: we always want to treat the file as a binary. */ f |= O_BINARY; #endif +#ifdef O_CLOEXEC + /* + * Security: + * The application may spawn a new process, and we don't want another + * process to have access to our file handles. + */ + f |= O_CLOEXEC; +#endif + if (ok_create) { f |= O_CREAT; if (exclusive) @@ -109,17 +118,24 @@ __wt_open(WT_SESSION_IMPL *session, if (ret != 0) WT_ERR_MSG(session, ret, "%s", name); -#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) +#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC) /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. There's an obvious - * race here... + * race here, so we prefer the flag to open if available. */ if ((f = fcntl(fd, F_GETFD)) == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name); #endif + +#if defined(HAVE_POSIX_FADVISE) + /* Disable read-ahead on trees: it slows down random read workloads. 
*/ + if (is_tree) + WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); +#endif + WT_ERR(__open_directory_sync(session)); WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c index 704ec92f3a2..4fc82e72480 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -96,6 +96,7 @@ __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx) __wt_free(session, idx->name); __wt_free(session, idx->source); __wt_free(session, idx->config); + __wt_free(session, idx->key_format); __wt_free(session, idx->key_plan); __wt_free(session, idx->value_plan); __wt_free(session, idx->idxkey_format); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index 8bb62999d7b..4e35dcd0497 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -54,6 +54,19 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } + + /* Compaction and checkpoint apply to index files as well. */ + if (func == __wt_compact || func == __wt_checkpoint) { + WT_RET(__wt_schema_open_indices(session, table)); + for (i = 0; i < table->nindices; i++) { + WT_RET(__wt_session_get_btree_ckpt( + session, table->indices[i]->source, + cfg, open_flags)); + ret = func(session, cfg); + WT_TRET(__wt_session_release_btree(session)); + WT_RET(ret); + } + } } else return (__wt_bad_object_type(session, uri)); diff --git a/src/session/session_api.c b/src/session/session_api.c index c515d2e885d..5c7f0acf2dd 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -7,6 +7,7 @@ #include "wt_internal.h" +static int __session_checkpoint(WT_SESSION *, const char *); static int __session_rollback_transaction(WT_SESSION *, const char *); /* @@ -280,42 +281,114 @@ err: API_END_NOTFOUND_MAP(session, ret); } /* - * __session_drop -- - * WT_SESSION->drop method. + * __session_compact_worker -- + * Worker function to do the actual compaction call. 
*/ static int -__session_drop(WT_SESSION *wt_session, const char *uri, const char *config) +__session_compact_worker( + WT_SESSION *wt_session, const char *uri, const char *config) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, drop, config, cfg); + SESSION_API_CALL(session, compact, config, cfg); WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_drop(session, uri, cfg)); + ret = __wt_schema_worker(session, uri, __wt_compact, cfg, 0)); -err: /* Note: drop operations cannot be unrolled (yet?). */ - API_END_NOTFOUND_MAP(session, ret); +err: API_END_NOTFOUND_MAP(session, ret); +} + +/* + * __session_compact -- + * WT_SESSION.compact method. + */ +static int +__session_compact(WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_DECL_RET; + WT_ITEM *t; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + /* Compaction makes no sense for LSM objects, ignore requests. */ + if (WT_PREFIX_MATCH(uri, "lsm:")) + return (0); + if (!WT_PREFIX_MATCH(uri, "colgroup:") && + !WT_PREFIX_MATCH(uri, "file:") && + !WT_PREFIX_MATCH(uri, "index:") && + !WT_PREFIX_MATCH(uri, "table:")) + return (__wt_bad_object_type(session, uri)); + + /* + * Compaction requires 2, and possibly 3 checkpoints, how many is block + * manager specific: all block managers will need the first checkpoint, + * but may or may not need the last two. + * + * The first checkpoint frees emptied pages to the underlying block + * manager (when rows are deleted, underlying blocks aren't freed until + * the page is reconciled, and checkpoint makes that happen). Because + * compaction is based on having available blocks in the block manager, + * compaction could do no work without the first checkpoint. + * + * After the first checkpoint, we compact the tree. 
+ * + * The second and third checkpoints are done because the default block + * manager does checkpoints in two steps: blocks made available for + * re-use during a checkpoint are put on a special checkpoint-available + * list and only moved onto the real available list once the metadata + * has been updated with the newly written checkpoint information. This + * means blocks allocated by the checkpoint itself cannot be taken from + * the blocks made available by the checkpoint. + * + * In other words, the second checkpoint puts the blocks from the end of + * the file that were freed by compaction onto the checkpoint-available + * list, but then potentially writes checkpoint blocks at the end of the + * file, which would prevent any file truncation. When the second + * checkpoint resolves, those blocks become available for the third + * checkpoint, so it's able to write its blocks toward the beginning of + * the file, and then the file can be truncated. + * + * We do the work here so applications don't get confused why compaction + * isn't helping until after multiple, subsequent checkpoint calls. + * + * Force the checkpoint: we don't want to skip it because the work we + * need to have done is done in the underlying block manager. + */ + WT_RET(__wt_scr_alloc(session, 0, &t)); + WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\")", uri)); + WT_ERR(__session_checkpoint(wt_session, t->data)); + + WT_ERR(__session_compact_worker(wt_session, uri, config)); + + WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + WT_ERR(__session_checkpoint(wt_session, t->data)); + WT_ERR(__session_checkpoint(wt_session, t->data)); + +err: __wt_scr_free(&t); + return (ret); } /* - * __session_dumpfile -- - * WT_SESSION->dumpfile method. + * __session_drop -- + * WT_SESSION->drop method. 
*/ static int -__session_dumpfile(WT_SESSION *wt_session, const char *uri, const char *config) +__session_drop(WT_SESSION *wt_session, const char *uri, const char *config) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; - SESSION_API_CALL(session, dumpfile, config, cfg); + SESSION_API_CALL(session, drop, config, cfg); + WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker(session, uri, - __wt_dumpfile, cfg, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY)); + ret = __wt_schema_drop(session, uri, cfg)); -err: API_END_NOTFOUND_MAP(session, ret); +err: /* Note: drop operations cannot be unrolled (yet?). */ + API_END_NOTFOUND_MAP(session, ret); } /* @@ -641,6 +714,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal, __session_reconfigure, __session_open_cursor, __session_create, + __session_compact, __session_drop, __session_rename, __session_salvage, @@ -651,7 +725,6 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, int internal, __session_commit_transaction, __session_rollback_transaction, __session_checkpoint, - __session_dumpfile, __session_msg_printf }; WT_DECL_RET; diff --git a/src/support/err.c b/src/support/err.c index 55f21a5bcd5..c332d18478e 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -382,6 +382,7 @@ __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri) WT_PREFIX_MATCH(uri, "config:") || WT_PREFIX_MATCH(uri, "file:") || WT_PREFIX_MATCH(uri, "index:") || + WT_PREFIX_MATCH(uri, "lsm:") || WT_PREFIX_MATCH(uri, "statistics:") || WT_PREFIX_MATCH(uri, "table:")) WT_RET_MSG(session, ENOTSUP, diff --git a/src/support/hazard.c b/src/support/hazard.c index df362278d25..78dc1b5a1d8 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -24,6 +24,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp { WT_BTREE *btree; WT_HAZARD *hp; + int restarts = 0; btree = session->btree; *busyp = 0; @@ -45,12 +46,23 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp * reference 
before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * references). + * + * For sessions with many active hazard references, skip most of the + * active slots: there may be a free slot in there, but checking is + * expensive. Most hazard references are released quickly: optimize + * for that case. */ - for (hp = session->hazard; ; ++hp) { + for (hp = session->hazard + session->nhazard;; ++hp) { /* Expand the number of hazard references if available.*/ if (hp >= session->hazard + session->hazard_size) { if (session->hazard_size >= S2C(session)->hazard_max) break; + /* Restart the search. */ + if (session->nhazard < session->hazard_size && + restarts++ == 0) { + hp = session->hazard; + continue; + } WT_PUBLISH(session->hazard_size, WT_MIN(session->hazard_size + WT_HAZARD_INCR, S2C(session)->hazard_max)); @@ -134,9 +146,13 @@ __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page) */ WT_ASSERT(session, page != NULL); - /* Clear the caller's hazard pointer. */ - for (hp = session->hazard; - hp < session->hazard + session->hazard_size; ++hp) + /* + * Clear the caller's hazard pointer. + * The common pattern is LIFO, so do a reverse search. 
+ */ + for (hp = session->hazard + session->hazard_size - 1; + hp >= session->hazard; + --hp) if (hp->page == page) { /* * We don't publish the hazard reference clear in the diff --git a/src/support/stat.c b/src/support/stat.c index c8f67d0ac12..23065e706a5 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -26,6 +26,7 @@ __wt_stat_alloc_btree_stats(WT_SESSION_IMPL *session, WT_BTREE_STATS **statsp) stats->file_col_int_pages.desc = "column-store internal pages"; stats->file_col_var_pages.desc = "column-store variable-size leaf pages"; + stats->file_compact_rewrite.desc = "pages rewritten by compaction"; stats->file_entries.desc = "total entries"; stats->file_fixed_len.desc = "fixed-record size"; stats->file_magic.desc = "magic number"; @@ -88,6 +89,7 @@ __wt_stat_clear_btree_stats(WT_STATS *stats_arg) stats->file_col_fix_pages.v = 0; stats->file_col_int_pages.v = 0; stats->file_col_var_pages.v = 0; + stats->file_compact_rewrite.v = 0; stats->file_entries.v = 0; stats->file_fixed_len.v = 0; stats->file_magic.v = 0; diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 44998fbbf9a..34eaf68ffd4 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -158,7 +158,7 @@ __ckpt_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) */ if (len < strlen(WT_CHECKPOINT)) return (0); - if (strncmp(name, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0) + if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) return (0); WT_RET_MSG(session, EINVAL, @@ -183,8 +183,7 @@ __drop(WT_CKPT *ckptbase, const char *name, size_t len) */ if (strncmp(WT_CHECKPOINT, name, len) == 0) { WT_CKPT_FOREACH(ckptbase, ckpt) - if (strncmp(ckpt->name, - WT_CHECKPOINT, strlen(WT_CHECKPOINT)) == 0) + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) F_SET(ckpt, WT_CKPT_DELETE); } else WT_CKPT_FOREACH(ckptbase, ckpt) @@ -273,7 +272,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_ISOLATION saved_isolation; const char *name; - int deleted, is_checkpoint, 
track_ckpt; + int deleted, force, is_checkpoint, track_ckpt; char *name_alloc; conn = S2C(session); @@ -392,7 +391,15 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * to open the checkpoint in a cursor after taking any checkpoint, which * means it must exist. */ + force = 0; if (!btree->modified) { + ret = __wt_config_gets(session, cfg, "force", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_ERR(ret); + if (ret == 0 && cval.val != 0) + force = 1; + } + if (!btree->modified && !force) { if (!is_checkpoint) goto skip; @@ -400,9 +407,18 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_DELETE)) ++deleted; + /* + * Complicated test: if we only deleted a single checkpoint, and + * it was the last checkpoint in the object, and it has the same + * name as the checkpoint we're taking (correcting for internal + * checkpoint names with their generational suffix numbers), we + * can skip the checkpoint, there's nothing to do. + */ if (deleted == 1 && F_ISSET(ckpt - 1, WT_CKPT_DELETE) && - strcmp(name, (ckpt - 1)->name) == 0) + (strcmp(name, (ckpt - 1)->name) == 0 || + (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && + WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT)))) goto skip; } @@ -433,9 +449,8 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * delete flag, and otherwise fail. 
*/ if (conn->ckpt_backup) { - if (strncmp(ckpt->name, - WT_CHECKPOINT, - strlen(WT_CHECKPOINT)) == 0) { + if (WT_PREFIX_MATCH( + ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } @@ -454,8 +469,8 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) __wt_session_lock_checkpoint(session, ckpt->name); if (ret == 0) continue; - if (ret == EBUSY && strncmp(ckpt->name, - WT_CHECKPOINT, strlen(WT_CHECKPOINT)) == 0) { + if (ret == EBUSY && + WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } diff --git a/src/utilities/util.h b/src/utilities/util.h index a736fdfcf85..b2dd594c757 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -42,11 +42,11 @@ extern char *util_optarg; /* argument associated with option */ int util_backup(WT_SESSION *, int, char *[]); int util_cerr(const char *, const char *, int); +int util_compact(WT_SESSION *, int, char *[]); void util_copyright(void); int util_create(WT_SESSION *, int, char *[]); int util_drop(WT_SESSION *, int, char *[]); int util_dump(WT_SESSION *, int, char *[]); -int util_dumpfile(WT_SESSION *, int, char *[]); int util_err(int, const char *, ...); int util_flush(WT_SESSION *, const char *); int util_getopt(int, char * const *, const char *); diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c new file mode 100644 index 00000000000..11ee6ddaf14 --- /dev/null +++ b/src/utilities/util_compact.c @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2008-2012 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" + +static int usage(void); + +int +util_compact(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *uri; + + uri = NULL; + while ((ch = util_getopt(argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= util_optind; + argv += util_optind; + + /* The remaining argument is the table name. 
*/ + if (argc != 1) + return (usage()); + if ((uri = util_name(*argv, "table", + UTIL_COLGROUP_OK | UTIL_FILE_OK | UTIL_INDEX_OK | + UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) + return (1); + + if ((ret = session->compact(session, uri, NULL)) != 0) { + fprintf(stderr, "%s: compact(%s): %s\n", + progname, uri, wiredtiger_strerror(ret)); + goto err; + } + + if (0) { +err: ret = 1; + } + + if (uri != NULL) + free(uri); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "compact uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/utilities/util_dumpfile.c b/src/utilities/util_dumpfile.c deleted file mode 100644 index 2edbbf88322..00000000000 --- a/src/utilities/util_dumpfile.c +++ /dev/null @@ -1,68 +0,0 @@ -/*- - * Copyright (c) 2008-2012 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "util.h" - -static int usage(void); - -int -util_dumpfile(WT_SESSION *session, int argc, char *argv[]) -{ - WT_DECL_RET; - int ch; - char *name; - - name = NULL; - while ((ch = util_getopt(argc, argv, "f:")) != EOF) - switch (ch) { - case 'f': /* output file */ - if (freopen(util_optarg, "w", stdout) == NULL) { - fprintf(stderr, "%s: %s: %s\n", - progname, util_optarg, strerror(errno)); - return (1); - } - break; - case '?': - default: - return (usage()); - } - argc -= util_optind; - argv += util_optind; - - /* The remaining argument is the file name. 
*/ - if (argc != 1) - return (usage()); - if ((name = util_name(*argv, "file", UTIL_FILE_OK)) == NULL) - return (1); - - if ((ret = session->dumpfile(session, name, NULL)) != 0) { - fprintf(stderr, "%s: dumpfile(%s): %s\n", - progname, name, wiredtiger_strerror(ret)); - goto err; - } - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - if (name != NULL) - free(name); - - return (ret); -} - -static int -usage(void) -{ - (void)fprintf(stderr, - "usage: %s %s " - "dumpfile [-f output-file] file\n", - progname, usage_prefix); - return (1); -} diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index d4be5a45500..6b8b91fa2db 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -111,6 +111,8 @@ main(int argc, char *argv[]) case 'c': if (strcmp(command, "create") == 0) ret = util_create(session, argc, argv); + else if (strcmp(command, "compact") == 0) + ret = util_compact(session, argc, argv); else ret = usage(); break; @@ -119,8 +121,6 @@ main(int argc, char *argv[]) ret = util_drop(session, argc, argv); else if (strcmp(command, "dump") == 0) ret = util_dump(session, argc, argv); - else if (strcmp(command, "dumpfile") == 0) - ret = util_dumpfile(session, argc, argv); else ret = usage(); break; @@ -193,28 +193,28 @@ usage(void) WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); fprintf(stderr, "global options:\n" - "\t-C\twiredtiger_open configuration\n" - "\t-h\tdatabase directory\n" - "\t-V\tdisplay library version and exit\n" - "\t-v\tverbose\n"); + "\t" "-C\twiredtiger_open configuration\n" + "\t" "-h\tdatabase directory\n" + "\t" "-V\tdisplay library version and exit\n" + "\t" "-v\tverbose\n"); fprintf(stderr, "commands:\n" - "\tbackup database backup\n" - "\tcopyright copyright information\n" - "\tcreate\t create an object\n" - "\tdrop\t drop an object\n" - "\tdump\t dump an object\n" - "\tdumpfile dump a physical file in debugging format\n" - "\tlist\t list database objects\n" - "\tload\t load an object\n" - 
"\tprintlog display the database log\n" - "\tread\t read values from an object\n" - "\trename\t rename an object\n" - "\tsalvage\t salvage a file\n" - "\tstat\t display statistics for an object\n" - "\tupgrade\t upgrade an object\n" - "\tverify\t verify an object\n" - "\twrite\t write values to an object\n"); + "\t" "backup\t database backup\n" + "\t" "compact\t compact an object\n" + "\t" "copyright copyright information\n" + "\t" "create\t create an object\n" + "\t" "drop\t drop an object\n" + "\t" "dump\t dump an object\n" + "\t" "list\t list database objects\n" + "\t" "load\t load an object\n" + "\t" "printlog display the database log\n" + "\t" "read\t read values from an object\n" + "\t" "rename\t rename an object\n" + "\t" "salvage\t salvage a file\n" + "\t" "stat\t display statistics for an object\n" + "\t" "upgrade\t upgrade an object\n" + "\t" "verify\t verify an object\n" + "\t" "write\t write values to an object\n"); return (EXIT_FAILURE); } @@ -231,43 +231,33 @@ util_name(const char *s, const char *type, u_int flags) char *name; copy = 0; - if (WT_PREFIX_MATCH(s, "colgroup:")) { - if (!(flags & UTIL_COLGROUP_OK)) { - fprintf(stderr, - "%s: %s: \"colgroup\" type not supported\n", - progname, command); - return (NULL); - } + if (WT_PREFIX_MATCH(s, "backup:")) { + goto type_err; + } else if (WT_PREFIX_MATCH(s, "colgroup:")) { + if (!(flags & UTIL_COLGROUP_OK)) + goto type_err; copy = 1; + } else if (WT_PREFIX_MATCH(s, "config:")) { + goto type_err; } else if (WT_PREFIX_MATCH(s, "file:")) { - if (!(flags & UTIL_FILE_OK)) { - fprintf(stderr, - "%s: %s: \"file\" type not supported\n", - progname, command); - return (NULL); - } + if (!(flags & UTIL_FILE_OK)) + goto type_err; copy = 1; } else if (WT_PREFIX_MATCH(s, "index:")) { - if (!(flags & UTIL_INDEX_OK)) { - fprintf(stderr, - "%s: %s: \"index\" type not supported\n", - progname, command); - return (NULL); - } + if (!(flags & UTIL_INDEX_OK)) + goto type_err; copy = 1; } else if (WT_PREFIX_MATCH(s, "lsm:")) 
{ - if (!(flags & UTIL_LSM_OK)) { - fprintf(stderr, - "%s: %s: \"lsm\" type not supported\n", - progname, command); - return (NULL); - } + if (!(flags & UTIL_LSM_OK)) + goto type_err; copy = 1; + } else if (WT_PREFIX_MATCH(s, "statistics:")) { + goto type_err; } else if (WT_PREFIX_MATCH(s, "table:")) { if (!(flags & UTIL_TABLE_OK)) { - fprintf(stderr, - "%s: %s: \"table\" type not supported\n", - progname, command); +type_err: fprintf(stderr, + "%s: %s: unsupported object type: %s\n", + progname, command, s); return (NULL); } copy = 1; diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index 55a90f96b55..c2ddb2998ad 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -39,8 +39,6 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) progname, uri, newname, wiredtiger_strerror(ret)); goto err; } - if (verbose) - printf("\n"); if (0) { err: ret = 1; diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c index 8821e673df1..82c247fe0ac 100644 --- a/src/utilities/util_salvage.c +++ b/src/utilities/util_salvage.c @@ -42,6 +42,8 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) progname, name, wiredtiger_strerror(ret)); goto err; } + + /* Verbose configures a progress counter, move to the next line. */ if (verbose) printf("\n"); @@ -60,7 +62,7 @@ usage(void) { (void)fprintf(stderr, "usage: %s %s " - "salvage [-F] file\n", + "salvage [-F] uri\n", progname, usage_prefix); return (1); } diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 7c4f2035fa2..89fca16e548 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -38,6 +38,8 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) progname, name, wiredtiger_strerror(ret)); goto err; } + + /* Verbose configures a progress counter, move to the next line. 
*/ if (verbose) printf("\n"); @@ -56,7 +58,7 @@ usage(void) { (void)fprintf(stderr, "usage: %s %s " - "upgrade file\n", + "upgrade uri\n", progname, usage_prefix); return (1); } diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index fda2d413346..3ae94594737 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -13,12 +13,23 @@ int util_verify(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; - int ch; - char *name; + int ch, dump_address, dump_blocks, dump_pages; + char *name, config[128]; name = NULL; - while ((ch = util_getopt(argc, argv, "")) != EOF) + dump_address = dump_blocks = dump_pages = 0; + while ((ch = util_getopt(argc, argv, "d:")) != EOF) switch (ch) { + case 'd': + if (strcmp(util_optarg, "dump_address") == 0) + dump_address = 1; + else if (strcmp(util_optarg, "dump_blocks") == 0) + dump_blocks = 1; + else if (strcmp(util_optarg, "dump_pages") == 0) + dump_pages = 1; + else + return (usage()); + break; case '?': default: return (usage()); @@ -33,11 +44,22 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) "table", UTIL_FILE_OK | UTIL_LSM_OK | UTIL_TABLE_OK)) == NULL) return (1); - if ((ret = session->verify(session, name, NULL)) != 0) { + /* Build the configuration string as necessary. */ + config[0] = '\0'; + if (dump_address) + (void)strcat(config, "dump_address,"); + if (dump_blocks) + (void)strcat(config, "dump_blocks,"); + if (dump_pages) + (void)strcat(config, "dump_pages,"); + + if ((ret = session->verify(session, name, config)) != 0) { fprintf(stderr, "%s: verify(%s): %s\n", progname, name, wiredtiger_strerror(ret)); goto err; } + + /* Verbose configures a progress counter, move to the next line. 
*/ if (verbose) printf("\n"); @@ -56,7 +78,7 @@ usage(void) { (void)fprintf(stderr, "usage: %s %s " - "verify file\n", + "verify [-d dump_address | dump_blocks | dump_pages] uri\n", progname, usage_prefix); return (1); } diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c index a19e1613fa2..7c339916fcb 100644 --- a/test/bloom/test_bloom.c +++ b/test/bloom/test_bloom.c @@ -106,7 +106,7 @@ int setup(void) int ret; char config[512]; - (void)system("rm -f WildTiger WiredTiger.* *.bf"); + (void)system("rm -f WiredTiger* *.bf"); /* * This test doesn't test public Wired Tiger functionality, it still diff --git a/test/fops/t.c b/test/fops/t.c index 694554741ec..f697f7fe30e 100644 --- a/test/fops/t.c +++ b/test/fops/t.c @@ -150,7 +150,7 @@ wt_shutdown(void) static void shutdown(void) { - (void)system("rm -f WildTiger WiredTiger.* __wt*"); + (void)system("rm -f WiredTiger* __wt*"); } static int diff --git a/test/suite/test_bug004.py b/test/suite/test_bug004.py index 69bf0247e32..2013859c6b0 100644 --- a/test/suite/test_bug004.py +++ b/test/suite/test_bug004.py @@ -25,7 +25,7 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. # -# test_bug003.py +# test_bug004.py # Regression tests. import wiredtiger, wttest diff --git a/test/suite/test_compact.py b/test/suite/test_compact.py new file mode 100644 index 00000000000..c05ad088be0 --- /dev/null +++ b/test/suite/test_compact.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# +# Public Domain 2008-2012 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +import wiredtiger, wttest +from helper import complex_populate, simple_populate, key_populate +from suite_subprocess import suite_subprocess +from wtscenario import multiply_scenarios, number_scenarios + +# test_compact.py +# session level compact operation +class test_compact(wttest.WiredTigerTestCase, suite_subprocess): + name = 'test_compact' + + # Use a small page size because we want to create lots of pages. + config = 'leaf_page_max=512,value_format=S,key_format=S' + nentries = 40000 + + types = [ + ('file', dict(uri='file:')), + ('table', dict(uri='table:')) + ] + compact = [ + ('method', dict(utility=0,reopen=0)), + ('method_reopen', dict(utility=0,reopen=1)), + ('utility', dict(utility=1,reopen=0)), + ] + scenarios = number_scenarios(multiply_scenarios('.', types, compact)) + + # Test compaction. 
+ def test_compact(self): + # Populate an object + uri = self.uri + self.name + if self.uri == "file:": + simple_populate(self, uri, self.config, self.nentries) + else: + complex_populate(self, uri, self.config, self.nentries) + + # Reopen the connection to force the object to disk. + self.reopen_conn() + + # Remove most of the object. + c1 = self.session.open_cursor(uri, None) + c1.set_key(key_populate(c1, 5)) + c2 = self.session.open_cursor(uri, None) + c2.set_key(key_populate(c2, self.nentries - 5)) + self.session.truncate(None, c1, c2, None) + c1.close() + c2.close() + + # Compact it, using either the session method or the utility. + if self.utility == 1: + self.session.checkpoint(None) + self.close_conn(); + self.runWt(["compact", uri]) + else: + # Optionally reopen the connection so we do more on-disk tests. + if self.reopen == 1: + self.session.checkpoint(None) + self.reopen_conn() + + self.session.compact(uri, None) + + # If it's a simple object, confirm it worked. + if self.uri == "file:": + self.assertLess(os.stat(self.name).st_size, 10 * 1024) + + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_util10.py b/test/suite/test_util10.py deleted file mode 100644 index 4e7641efe77..00000000000 --- a/test/suite/test_util10.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# -# Public Domain 2008-2012 WiredTiger, Inc. -# -# This is free and unencumbered software released into the public domain. -# -# Anyone is free to copy, modify, publish, use, compile, sell, or -# distribute this software, either in source code form or as a compiled -# binary, for any purpose, commercial or non-commercial, and by any -# means. -# -# In jurisdictions that recognize copyright laws, the author or authors -# of this software dedicate any and all copyright interest in the -# software to the public domain. We make this dedication for the benefit -# of the public at large and to the detriment of our heirs and -# successors. 
We intend this dedication to be an overt act of -# relinquishment in perpetuity of all present and future rights to this -# software under copyright law. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. - -import os, struct -from suite_subprocess import suite_subprocess -import wiredtiger, wttest - -# test_util10.py -# Utilities: wt dumpfile -class test_util10(wttest.WiredTigerTestCase, suite_subprocess): - tablename = 'test_util10.a' - nentries = 1000 - session_params = 'key_format=S,value_format=S' - - def populate(self, tablename): - """ - Insert some simple entries into the table - """ - cursor = self.session.open_cursor('table:' + tablename, None, None) - for i in range(0, self.nentries): - key = 'KEY' + str(i) - val = 'VAL' + str(i) - cursor.set_key(key) - cursor.set_value(val) - cursor.insert() - cursor.set_key('SOMEKEY') - cursor.set_value('SOMEVALUE') - cursor.insert() - cursor.set_key('ANOTHERKEY') - cursor.set_value('ANOTHERVALUE') - cursor.insert() - cursor.close() - - def test_dumpfile_empty(self): - """ - Test read in a 'wt' process, using an empty table - """ - self.session.create('table:' + self.tablename, self.session_params) - outfile = "dumpfileout.txt" - self.runWt(["dumpfile", self.tablename + ".wt"], outfilename=outfile) - self.check_empty_file(outfile) - - def test_dumpfile_populated(self): - """ - Test read in a 'wt' process, using an empty table - """ - self.session.create('table:' + self.tablename, self.session_params) - self.populate(self.tablename) - outfile = "dumpfileout.txt" - self.runWt(["dumpfile", self.tablename + ".wt"], 
outfilename=outfile) - - # Expected output is roughly K/V pairs in this format: - # K {xxxxxx#00} - # V {xxxxxx#00} - # except that by default keys use prefix compression. - # 'KEY340' would not be found in the output, but rather K {0#00} - # because it appears immediately after 'KEY34' so uses the five - # bytes of that key. We've chosen keys to find that will not be - # compressed. - self.check_file_contains(outfile, 'V {VAL22#00}') - self.check_file_contains(outfile, 'K {KEY0#00}') - self.check_file_contains(outfile, 'K {SOMEKEY#00}') - self.check_file_contains(outfile, 'V {SOMEVALUE#00}') - self.check_file_contains(outfile, 'K {SOMEKEY#00}') - self.check_file_contains(outfile, 'V {ANOTHERVALUE#00}') - - -if __name__ == '__main__': - wttest.run() diff --git a/test/thread/t.c b/test/thread/t.c index 0f40e0424e4..9858ed08719 100644 --- a/test/thread/t.c +++ b/test/thread/t.c @@ -177,7 +177,7 @@ wt_shutdown(void) static void shutdown(void) { - (void)system("rm -f WildTiger WiredTiger.* __wt*"); + (void)system("rm -f WiredTiger.* __wt*"); } static int |