diff options
Diffstat (limited to 'src')
285 files changed, 3949 insertions, 1664 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c index dc26f2d11c3..fea8714176b 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/async/async_op.c b/src/async/async_op.c index 7661a4383d6..130c704757b 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 6a5ec5feeb0..e692bc619a9 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 6d50e5f0f4e..b1f2fd9454a 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -14,7 +14,7 @@ * caller's buffer reference so it can be called repeatedly to load a buffer. 
*/ static int -__block_buffer_to_addr(WT_BLOCK *block, +__block_buffer_to_addr(uint32_t allocsize, const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump) { uint64_t o, s, c; @@ -39,8 +39,8 @@ __block_buffer_to_addr(WT_BLOCK *block, *offsetp = 0; *sizep = *cksump = 0; } else { - *offsetp = (wt_off_t)(o + 1) * block->allocsize; - *sizep = (uint32_t)s * block->allocsize; + *offsetp = (wt_off_t)(o + 1) * allocsize; + *sizep = (uint32_t)s * allocsize; *cksump = (uint32_t)c; } return (0); @@ -80,7 +80,8 @@ int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump) { - return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump)); + return (__block_buffer_to_addr( + block->allocsize, &p, offsetp, sizep, cksump)); } /* @@ -139,12 +140,12 @@ __wt_block_addr_string(WT_SESSION_IMPL *session, } /* - * __wt_block_buffer_to_ckpt -- + * __block_buffer_to_ckpt -- * Convert a checkpoint cookie into its components. */ -int -__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, - WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) +static int +__block_buffer_to_ckpt(WT_SESSION_IMPL *session, + uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci) { uint64_t a; const uint8_t **pp; @@ -154,13 +155,13 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version"); pp = &p; - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->root_offset, &ci->root_size, &ci->root_cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->avail.offset, &ci->avail.size, &ci->avail.cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->discard.offset, &ci->discard.size, 
&ci->discard.cksum)); WT_RET(__wt_vunpack_uint(pp, 0, &a)); ci->file_size = (wt_off_t)a; @@ -171,6 +172,32 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, } /* + * __wt_block_buffer_to_ckpt -- + * Convert a checkpoint cookie into its components, block manager version. + */ +int +__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) +{ + return (__block_buffer_to_ckpt(session, block->allocsize, p, ci)); +} + +/* + * __wt_block_ckpt_decode -- + * Convert a checkpoint cookie into its components, external utility + * version. + */ +int +__wt_block_ckpt_decode(WT_SESSION *wt_session, + size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci)); +} + +/* * __wt_block_ckpt_to_buffer -- * Convert the components into its checkpoint cookie. */ diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index adbcf0e3fdc..03059c8f23a 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_compact.c b/src/block/block_compact.c index d45d0a96da7..8c9be4f029c 100644 --- a/src/block/block_compact.c +++ b/src/block/block_compact.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -8,7 +8,7 @@ #include "wt_internal.h" -static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *); +static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool); /* * __wt_block_compact_start -- @@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Switch to first-fit allocation. 
*/ __wt_block_configure_first_fit(block, true); - block->compact_pct_tenths = 0; - return (0); } @@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) { + WT_DECL_RET; + WT_UNUSED(session); /* Restore the original allocation plan. */ __wt_block_configure_first_fit(block, false); - block->compact_pct_tenths = 0; + /* Dump the results of the compaction pass. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + __wt_spin_lock(session, &block->live_lock); + ret = __block_dump_avail(session, block, false); + __wt_spin_unlock(session, &block->live_lock); + } - return (0); + return (ret); } /* @@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) if (fh->size <= WT_MEGABYTE) return (0); + /* + * Reset the compaction state information. This is done here, not in the + * compaction "start" routine, because this function is called first to + * determine if compaction is useful. + */ + block->compact_pct_tenths = 0; + block->compact_pages_reviewed = 0; + block->compact_pages_skipped = 0; + block->compact_pages_written = 0; + __wt_spin_lock(session, &block->live_lock); + /* Dump the current state of the file. */ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) - WT_ERR(__block_dump_avail(session, block)); + WT_ERR(__block_dump_avail(session, block, true)); - /* Sum the available bytes in the first 80% and 90% of the file. */ + /* Sum the available bytes in the initial 80% and 90% of the file. 
*/ avail_eighty = avail_ninety = 0; ninety = fh->size - fh->size / 10; eighty = fh->size - ((fh->size / 10) * 2); @@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) avail_eighty += ext->size; } - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "80%% of the file", - block->name, - (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "90%% of the file", - block->name, - (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " - "90%% of the file to perform compaction, compaction %s", - block->name, - (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, - *skipp ? "skipped" : "proceeding")); - /* * Skip files where we can't recover at least 1MB. * @@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) block->compact_pct_tenths = 1; } + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "80%% of the file", + block->name, + (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "90%% of the file", + block->name, + (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " + "90%% of the file to perform compaction, compaction %s", + block->name, + (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, + *skipp ? 
"skipped" : "proceeding")); + err: __wt_spin_unlock(session, &block->live_lock); return (ret); @@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, } __wt_spin_unlock(session, &block->live_lock); + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + ++block->compact_pages_reviewed; + if (*skipp) + ++block->compact_pages_skipped; + else + ++block->compact_pages_written; + } + return (ret); } @@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, * Dump out the avail list so we can see what compaction will look like. */ static int -__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) +__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start) { WT_EXTLIST *el; WT_EXT *ext; @@ -196,6 +220,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) size = block->fh->size; WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "============ %s", + start ? "testing for compaction" : "ending compaction pass")); + + if (!start) { + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages reviewed: %" PRIuMAX, + block->compact_pages_reviewed)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages skipped: %" PRIuMAX, block->compact_pages_skipped)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages written: %" PRIuMAX, block->compact_pages_written)); + } + + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size, @@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) } #ifdef __VERBOSE_OUTPUT_PERCENTILE + /* + * The verbose output always displays 10% buckets, running this code + * as well also displays 1% buckets. 
+ */ for (i = 0; i < WT_ELEMENTS(percentile); ++i) { v = percentile[i] * 512; WT_RET(__wt_verbose(session, WT_VERB_COMPACT, diff --git a/src/block/block_ext.c b/src/block/block_ext.c index a56df220390..ab5d5604087 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_map.c b/src/block/block_map.c index 6dc270760d6..3d04a492269 100644 --- a/src/block/block_map.c +++ b/src/block/block_map.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index 7260cab75d9..6e2dc775362 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -221,6 +221,18 @@ __bm_free(WT_BM *bm, } /* + * __bm_is_mapped -- + * Return if the file is mapped into memory. + */ +static bool +__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session) +{ + WT_UNUSED(session); + + return (bm->map == NULL ? false : true); +} + +/* * __bm_stat -- * Block-manager statistics. 
*/ @@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->free = (int (*)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = (int (*) @@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->salvage_valid = (int (*)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = (int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly; @@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly) bm->compact_skip = __bm_compact_skip; bm->compact_start = __bm_compact_start; bm->free = __bm_free; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = __bm_salvage_end; bm->salvage_next = __bm_salvage_next; bm->salvage_start = __bm_salvage_start; bm->salvage_valid = __bm_salvage_valid; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = __bm_sync; bm->verify_addr = __bm_verify_addr; diff --git a/src/block/block_open.c b/src/block/block_open.c index 7cf12d36066..dd0f3f0716a 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -11,36 +11,13 @@ static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *); /* - * __wt_block_manager_truncate -- - * Truncate a file. + * __wt_block_manager_drop -- + * Drop a file. */ int -__wt_block_manager_truncate( - WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) +__wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename) { - WT_DECL_RET; - WT_FH *fh; - - /* Open the underlying file handle. 
*/ - WT_RET(__wt_open( - session, filename, false, false, WT_FILE_TYPE_DATA, &fh)); - - /* Truncate the file. */ - WT_ERR(__wt_block_truncate(session, fh, (wt_off_t)0)); - - /* Write out the file's meta-data. */ - WT_ERR(__wt_desc_init(session, fh, allocsize)); - - /* - * Ensure the truncated file has made it to disk, then the upper-level - * is never surprised. - */ - WT_ERR(__wt_fsync(session, fh)); - - /* Close the file handle. */ -err: WT_TRET(__wt_close(session, &fh)); - - return (ret); + return (__wt_remove_if_exists(session, filename)); } /* @@ -405,27 +382,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) * Reading from the live system's structure normally requires locking, * but it's an 8B statistics read, there's no need. */ - stats->allocation_size = block->allocsize; - stats->block_checkpoint_size = (int64_t)block->live.ckpt_size; - stats->block_magic = WT_BLOCK_MAGIC; - stats->block_major = WT_BLOCK_MAJOR_VERSION; - stats->block_minor = WT_BLOCK_MINOR_VERSION; - stats->block_reuse_bytes = (int64_t)block->live.avail.bytes; - stats->block_size = block->fh->size; + WT_STAT_WRITE(stats, allocation_size, block->allocsize); + WT_STAT_WRITE( + stats, block_checkpoint_size, (int64_t)block->live.ckpt_size); + WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC); + WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION); + WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION); + WT_STAT_WRITE( + stats, block_reuse_bytes, (int64_t)block->live.avail.bytes); + WT_STAT_WRITE(stats, block_size, block->fh->size); } /* * __wt_block_manager_size -- - * Set the size statistic for a file. + * Return the size of a live block handle. 
*/ int -__wt_block_manager_size( - WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats) +__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep) { - wt_off_t filesize; - - WT_RET(__wt_filesize_name(session, filename, false, &filesize)); - stats->block_size = filesize; + WT_UNUSED(session); + *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size; return (0); } + +/* + * __wt_block_manager_named_size -- + * Return the size of a named file. + */ +int +__wt_block_manager_named_size( + WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep) +{ + return (__wt_filesize_name(session, name, false, sizep)); +} diff --git a/src/block/block_read.c b/src/block/block_read.c index ca7797f17af..0e5911ecf2a 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_session.c b/src/block/block_session.c index 6683fdd20ce..268adb530cf 100644 --- a/src/block/block_session.c +++ b/src/block/block_session.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index 9f3093c741d..ef22c727db4 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index 9904dcccd14..35c7a2c218c 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/block/block_write.c b/src/block/block_write.c index 26efac54080..23f4d7650b9 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index e3a21f25dc1..505630f12cf 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -130,8 +130,8 @@ __bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner) c = NULL; WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c)); - /* XXX Layering violation: bump the cache priority for Bloom filters. */ - ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW; + /* Bump the cache priority for Bloom filters. */ + __wt_evict_priority_set(session, WT_EVICT_INT_SKEW); bloom->c = c; return (0); diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 8044d4f852d..12df19a7e04 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; WT_DECL_RET; + WT_MULTI *multi; WT_PAGE *page; WT_PAGE_MODIFY *mod; size_t addr_size; + uint32_t i; const uint8_t *addr; *skipp = true; /* Default skip. */ @@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * If the page is clean, test the original addresses. - * If the page is a 1-to-1 replacement, test the replacement addresses. + * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. 
*/ if (mod == NULL || mod->rec_result == 0) { __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); - WT_RET( + return ( bm->compact_page_skip(bm, session, addr, addr_size, skipp)); - } else if (mod->rec_result == WT_PM_REC_REPLACE) { - /* - * The page's modification information can change underfoot if - * the page is being reconciled, serialize with reconciliation. - */ + } + + /* + * The page's modification information can change underfoot if the page + * is being reconciled, serialize with reconciliation. + */ + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_RET(__wt_fair_lock(session, &page->page_lock)); + if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + if (multi->disk_image != NULL) + continue; + if ((ret = bm->compact_page_skip(bm, session, + multi->addr.addr, multi->addr.size, skipp)) != 0) + break; + if (!*skipp) + break; + } + + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_TRET(__wt_fair_unlock(session, &page->page_lock)); - WT_RET(ret); - } - return (0); + + return (ret); } /* @@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) if (skip) continue; - session->compaction = true; + session->compact_state = WT_COMPACT_SUCCESS; + /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 55843d1cae5..63b2e2abebc 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -31,13 +31,12 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage) return (WT_NOTFOUND); /* - * This code looks different from the cursor-previous code. The append - * list appears on the last page of the tree, but it may be preceded by - * other rows, which means the cursor's recno will be set to a value and - * we simply want to increment it. If the cursor's recno is NOT set, - * we're starting our iteration in a tree that has only appended items. - * In that case, recno will be 0 and happily enough the increment will - * set it to 1, which is correct. + * This code looks different from the cursor-previous code. The append + * list may be preceded by other rows, which means the cursor's recno + * will be set to a value and we simply want to increment it. If the + * cursor's recno is NOT set, we're starting an iteration in a tree with + * only appended items. In that case, recno will be 0 and happily enough + * the increment will set it to 1, which is correct. */ __cursor_set_recno(cbt, cbt->recno + 1); @@ -368,6 +367,140 @@ new_insert: if ((ins = cbt->ins) != NULL) { /* NOTREACHED */ } +#ifdef HAVE_DIAGNOSTIC +/* + * __cursor_key_order_check_col -- + * Check key ordering for column-store cursor movements. + */ +static int +__cursor_key_order_check_col( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + int cmp; + + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastrecno != WT_RECNO_OOB) { + if (cbt->lastrecno < cbt->recno) + cmp = -1; + if (cbt->lastrecno > cbt->recno) + cmp = 1; + } + + if (cbt->lastrecno == WT_RECNO_OOB || + (next && cmp < 0) || (!next && cmp > 0)) { + cbt->lastrecno = cbt->recno; + return (0); + } + + WT_PANIC_RET(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then " + "key %" PRIu64, + next ? "next" : "prev", cbt->lastrecno, cbt->recno); +} + +/* + * __cursor_key_order_check_row -- + * Check key ordering for row-store cursor movements. 
+ */ +static int +__cursor_key_order_check_row( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + WT_BTREE *btree; + WT_ITEM *key; + WT_DECL_RET; + WT_DECL_ITEM(a); + WT_DECL_ITEM(b); + int cmp; + + btree = S2BT(session); + key = &cbt->iface.key; + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastkey->size != 0) + WT_RET(__wt_compare( + session, btree->collator, cbt->lastkey, key, &cmp)); + + if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0)) + return (__wt_buf_set(session, + cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size)); + + WT_ERR(__wt_scr_alloc(session, 512, &a)); + WT_ERR(__wt_scr_alloc(session, 512, &b)); + + WT_PANIC_ERR(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %s then key %s", + next ? "next" : "prev", + __wt_buf_set_printable( + session, cbt->lastkey->data, cbt->lastkey->size, a), + __wt_buf_set_printable(session, key->data, key->size, b)); + +err: __wt_scr_free(session, &a); + __wt_scr_free(session, &b); + + return (ret); +} + +/* + * __wt_cursor_key_order_check -- + * Check key ordering for cursor movements. + */ +int +__wt_cursor_key_order_check( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + switch (cbt->ref->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + return (__cursor_key_order_check_col(session, cbt, next)); + case WT_PAGE_ROW_LEAF: + return (__cursor_key_order_check_row(session, cbt, next)); + WT_ILLEGAL_VALUE(session); + } + /* NOTREACHED */ +} + +/* + * __wt_cursor_key_order_init -- + * Initialize key ordering checks for cursor movements after a successful + * search. + */ +int +__wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + /* + * Cursor searches set the position for cursor movements, set the + * last-key value for diagnostic checking. 
+ */ + switch (cbt->ref->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + cbt->lastrecno = cbt->recno; + return (0); + case WT_PAGE_ROW_LEAF: + return (__wt_buf_set(session, + cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size)); + WT_ILLEGAL_VALUE(session); + } + /* NOTREACHED */ +} + +/* + * __wt_cursor_key_order_reset -- + * Turn off key ordering checks for cursor movements. + */ +void +__wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) +{ + /* + * Clear the last-key returned, it doesn't apply. + */ + cbt->lastkey->size = 0; + cbt->lastrecno = WT_RECNO_OOB; +} +#endif + /* * __wt_btcur_iterate_setup -- * Initialize a cursor for iteration, usually based on a search. @@ -393,10 +526,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) * If we don't have a search page, then we're done, we're starting at * the beginning or end of the tree, not as a result of a search. */ - if (cbt->ref == NULL) + if (cbt->ref == NULL) { +#ifdef HAVE_DIAGNOSTIC + __wt_cursor_key_order_reset(cbt); +#endif return; - page = cbt->ref->page; + } + page = cbt->ref->page; if (page->type == WT_PAGE_ROW_LEAF) { /* * For row-store pages, we need a single item that tells us the @@ -468,7 +605,6 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) */ for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; - WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { @@ -502,9 +638,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) break; /* - * The last page in a column-store has appended entries. - * We handle it separately from the usual cursor code: - * it's only that one page and it's in a simple format. + * Column-store pages may have appended entries. Handle + * it separately from the usual cursor code, it's in a + * simple format. 
*/ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { @@ -531,6 +667,11 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); +#endif + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 1d23b976edd..a083ec4016e 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -139,10 +139,20 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); } else { + /* Move to the previous record in the append list, if any. */ + if (cbt->ins != NULL && + cbt->recno <= WT_INSERT_RECNO(cbt->ins)) + WT_RET(__cursor_skip_prev(cbt)); + /* * Handle the special case of leading implicit records, that is, - * there aren't any records in the tree not on the append list, - * and the first record on the append list isn't record 1. + * there aren't any records in the page not on the append list, + * and the append list's first record isn't the first record on + * the page. (Although implemented as a test of the page values, + * this is really a test for a tree where the first inserted + * record wasn't record 1, any other page with only an append + * list will have a first page record number matching the first + * record in the append list.) * * The "right" place to handle this is probably in our caller. * The high-level cursor-previous routine would: @@ -156,27 +166,26 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) * into our caller. Anyway, if this code breaks for any reason, * that's the way I'd go. 
* - * If we're not pointing to a WT_INSERT entry, or we can't find - * a WT_INSERT record that precedes our record name-space, check - * if there are any records on the page. If there aren't, then - * we're in the magic zone, keep going until we get to a record - * number of 1. + * If we're not pointing to a WT_INSERT entry (we didn't find a + * WT_INSERT record preceding our record name-space), check if + * we've reached the beginning of this page, a possibility if a + * page had a large number of items appended, and then split. + * If not, check if there are any records on the page. If there + * aren't, then we're in the magic zone, keep going until we get + * to a record number matching the first record on the page. */ - if (cbt->ins != NULL && - cbt->recno <= WT_INSERT_RECNO(cbt->ins)) - WT_RET(__cursor_skip_prev(cbt)); if (cbt->ins == NULL && - (cbt->recno == 1 || __col_fix_last_recno(page) != 0)) + (cbt->recno == page->pg_fix_recno || + __col_fix_last_recno(page) != 0)) return (WT_NOTFOUND); } /* - * This code looks different from the cursor-next code. The append - * list appears on the last page of the tree and contains the last - * records in the tree. If we're iterating through the tree, starting - * at the last record in the tree, by definition we're starting a new - * iteration and we set the record number to the last record found in - * the tree. Otherwise, decrement the record. + * This code looks different from the cursor-next code. The append list + * may be preceded by other rows. If we're iterating through the tree, + * starting at the last record in the tree, by definition we're starting + * a new iteration and we set the record number to the last record found + * on the page. Otherwise, decrement the record. */ if (newpage) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); @@ -556,12 +565,11 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) */ for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? 
NULL : cbt->ref->page; - WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /* - * The last page in a column-store has appended entries. - * We handle it separately from the usual cursor code: - * it's only that one page and it's in a simple format. + * Column-store pages may have appended entries. Handle it + * separately from the usual cursor code, it's in a simple + * format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) @@ -618,6 +626,10 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); +#endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index f2bf2978320..c11b7d35de6 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) static inline int __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) { - return (btree->type == BTREE_COL_FIX && - !F_ISSET(cbt, WT_CBT_MAX_RECORD)); + /* + * When there's no exact match, column-store search returns the key + * nearest the searched-for key (continuing past keys smaller than the + * searched-for key to return the next-largest key). Therefore, if the + * returned comparison is -1, the searched-for key was larger than any + * row on the page's standard information or column-store insert list. + * + * If the returned comparison is NOT -1, there was a row equal to or + * larger than the searched-for key, and we implicitly create missing + * rows. 
+ */ + return (btree->type == BTREE_COL_FIX && cbt->compare != -1); } /* @@ -344,6 +354,11 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) } else ret = WT_NOTFOUND; +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_init(session, cbt)); +#endif + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); @@ -454,6 +469,11 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) exact = -1; } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_init(session, cbt)); +#endif + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) @@ -502,19 +522,14 @@ retry: WT_RET(__cursor_func_init(cbt, true)); case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring - * the application's record number). First we search for the - * maximum possible record number so the search ends on the - * last page. The real record number is assigned by the - * serialized append operation. + * the application's record number). The real record number + * is assigned by the serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = UINT64_MAX; + cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); - if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = WT_RECNO_OOB; - /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length @@ -830,6 +845,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; + wt_off_t size; uint64_t skip; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -866,10 +882,12 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) * !!! * Ideally, the number would be prime to avoid restart issues. 
*/ - if (cbt->next_random_sample_size != 0) + if (cbt->next_random_sample_size != 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); cbt->next_random_leaf_skip = (uint64_t) - ((btree->bm->block->fh->size / btree->allocsize) / + ((size / btree->allocsize) / cbt->next_random_sample_size) + 1; + } /* * Choose a leaf page from the tree. @@ -1225,6 +1243,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + +#ifdef HAVE_DIAGNOSTIC + cbt->lastkey = &cbt->_lastkey; + cbt->lastrecno = WT_RECNO_OOB; +#endif } /* @@ -1250,6 +1273,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); +#ifdef HAVE_DIAGNOSTIC + __wt_buf_free(session, &cbt->_lastkey); +#endif return (ret); } diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d52a94a6da2..7c7f8cab855 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) static inline void __debug_hex_byte(WT_DBG *ds, uint8_t v) { - static const char hex[] = "0123456789abcdef"; - - __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]); + __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]); } /* @@ -678,8 +676,12 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", evict-lru"); if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) __dmsg(ds, ", overflow-keys"); + if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) + __dmsg(ds, ", split-block"); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); + if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE)) + __dmsg(ds, ", update-ignore"); if (mod != NULL) switch (mod->rec_result) { diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 9dd72108e4b..ba16dd204e8 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 54b07513089..795111d53f9 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index a6330326954..7f0f37d95d6 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -105,14 +105,23 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_ERR(__wt_btree_tree_open( session, root_addr, root_addr_size)); - /* Warm the cache, if possible. 
*/ - WT_WITH_PAGE_INDEX(session, - ret = __btree_preload(session)); - WT_ERR(ret); - - /* Get the last record number in a column-store file. */ - if (btree->type != BTREE_ROW) - WT_ERR(__btree_get_last_recno(session)); + /* + * Rebalance uses the cache, but only wants the root + * page, nothing else. + */ + if (!F_ISSET(btree, WT_BTREE_REBALANCE)) { + /* Warm the cache, if possible. */ + WT_WITH_PAGE_INDEX(session, + ret = __btree_preload(session)); + WT_ERR(ret); + + /* + * Get the last record number in a column-store + * file. + */ + if (btree->type != BTREE_ROW) + WT_ERR(__btree_get_last_recno(session)); + } } } @@ -514,7 +523,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) /* Bulk loads require a leaf page for reconciliation: create it now. */ if (F_ISSET(btree, WT_BTREE_BULK)) { - WT_ERR(__wt_btree_new_leaf_page(session, &leaf)); + WT_ERR(__wt_btree_new_leaf_page(session, 1, &leaf)); ref->page = leaf; ref->state = WT_REF_MEM; WT_ERR(__wt_page_modify_init(session, leaf)); @@ -538,7 +547,8 @@ err: if (leaf != NULL) * Create an empty leaf page. 
*/ int -__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) +__wt_btree_new_leaf_page( + WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep) { WT_BTREE *btree; @@ -547,15 +557,15 @@ __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) switch (btree->type) { case BTREE_COL_FIX: WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_FIX, 1, 0, false, pagep)); + session, WT_PAGE_COL_FIX, recno, 0, false, pagep)); break; case BTREE_COL_VAR: WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_VAR, 1, 0, false, pagep)); + session, WT_PAGE_COL_VAR, recno, 0, false, pagep)); break; case BTREE_ROW: WT_RET(__wt_page_alloc( - session, WT_PAGE_ROW_LEAF, 0, 0, false, pagep)); + session, WT_PAGE_ROW_LEAF, WT_RECNO_OOB, 0, false, pagep)); break; WT_ILLEGAL_VALUE(session); } diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c index d9ff9616072..2c0238545fb 100644 --- a/src/btree/bt_huffman.c +++ b/src/btree/bt_huffman.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, for (tp = table, lineno = 1; (ret = fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF; ++tp, ++lineno) { - if (lineno > entries) + /* + * Entries is 0-based, that is, there are (entries +1) possible + * values that can be configured. The line number is 1-based, so + * adjust the test for too many entries, and report (entries +1) + * in the error as the maximum possible number of entries. 
+ */ + if (lineno > entries + 1) WT_ERR_MSG(session, EINVAL, "Huffman table file %.*s is corrupted, " "more than %" PRIu32 " entries", - (int)ip->len, ip->str, entries); + (int)ip->len, ip->str, entries + 1); if (ret != 2) WT_ERR_MSG(session, EINVAL, "line %u of Huffman table file %.*s is corrupted: " diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index 6481f514323..aaf906ca785 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c index a60499ef8b7..7f188502a0a 100644 --- a/src/btree/bt_misc.c +++ b/src/btree/bt_misc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -129,3 +129,19 @@ __wt_addr_string(WT_SESSION_IMPL *session, } return (buf->data); } + +/* + * __wt_buf_set_printable -- + * Set the contents of the buffer to a printable representation of a + * byte string. + */ +const char * +__wt_buf_set_printable( + WT_SESSION_IMPL *session, const void *p, size_t size, WT_ITEM *buf) +{ + if (__wt_raw_to_esc_hex(session, p, size, buf)) { + buf->data = "[Error]"; + buf->size = strlen("[Error]"); + } + return (buf->data); +} diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index 651cbc8d4ad..fbe361e000a 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 8808f0b1a85..9fa0145bbdd 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. 
* Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF **refp, *ref; - uint32_t i; + uint32_t hint, i; btree = S2BT(session); dsk = page->dsk; @@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) */ pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp++; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); ref->addr = cell; @@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF *ref, **refp; - uint32_t i; + uint32_t hint, i; bool overflow_keys; btree = S2BT(session); @@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; overflow_keys = false; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); switch (unpack->type) { diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index c50f97bbe14..ac9faef4ff2 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -379,7 +379,9 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) if (addr == NULL) { WT_ASSERT(session, previous_state == WT_REF_DELETED); - WT_ERR(__wt_btree_new_leaf_page(session, &page)); + WT_ERR(__wt_btree_new_leaf_page(session, + btree->type == BTREE_ROW ? 
WT_RECNO_OOB : ref->key.recno, + &page)); ref->page = page; goto done; } diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c new file mode 100644 index 00000000000..86360e83ddf --- /dev/null +++ b/src/btree/bt_rebalance.c @@ -0,0 +1,486 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * Shared rebalance information. + */ +typedef struct { + WT_REF **leaf; /* List of leaf pages */ + size_t leaf_next; /* Next entry */ + size_t leaf_allocated; /* Allocated bytes */ + + WT_ADDR *fl; /* List of objects to free */ + size_t fl_next; /* Next entry */ + size_t fl_allocated; /* Allocated bytes */ + + WT_PAGE *root; /* Created root page */ + + uint8_t type; /* Internal page type */ + +#define WT_REBALANCE_PROGRESS_INTERVAL 100 + uint64_t progress; /* Progress counter */ + + WT_ITEM *tmp1; /* Temporary buffers */ + WT_ITEM *tmp2; +} WT_REBALANCE_STUFF; + +/* + * __rebalance_discard -- + * Free the allocated information. + */ +static void +__rebalance_discard(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) +{ + while (rs->leaf_next > 0) { + --rs->leaf_next; + __wt_free_ref( + session, rs->leaf[rs->leaf_next], rs->type, false); + } + __wt_free(session, rs->leaf); + + while (rs->fl_next > 0) { + --rs->fl_next; + __wt_free(session, rs->fl[rs->fl_next].addr); + } + __wt_free(session, rs->fl); +} + +/* + * __rebalance_leaf_append -- + * Add a new entry to the list of leaf pages. 
+ */ +static int +__rebalance_leaf_append(WT_SESSION_IMPL *session, + const uint8_t *key, size_t key_len, uint64_t recno, + const uint8_t *addr, size_t addr_len, u_int addr_type, + WT_REBALANCE_STUFF *rs) +{ + WT_ADDR *copy_addr; + WT_REF *copy; + + WT_RET(__wt_verbose(session, WT_VERB_REBALANCE, + "rebalance leaf-list append %s, %s", + __wt_buf_set_printable(session, key, key_len, rs->tmp2), + __wt_addr_string(session, addr, addr_len, rs->tmp1))); + + /* Allocate and initialize a new leaf page reference. */ + WT_RET(__wt_realloc_def( + session, &rs->leaf_allocated, rs->leaf_next + 1, &rs->leaf)); + WT_RET(__wt_calloc_one(session, &copy)); + rs->leaf[rs->leaf_next++] = copy; + + copy->page = NULL; + copy->home = NULL; + copy->pindex_hint = 0; + copy->state = WT_REF_DISK; + + WT_RET(__wt_calloc_one(session, &copy_addr)); + copy->addr = copy_addr; + WT_RET(__wt_strndup(session, addr, addr_len, &copy_addr->addr)); + copy_addr->size = (uint8_t)addr_len; + copy_addr->type = (uint8_t)addr_type; + + if (recno == WT_RECNO_OOB) + WT_RET(__wt_row_ikey(session, 0, key, key_len, copy)); + else + copy->key.recno = recno; + + copy->page_del = NULL; + return (0); +} + +/* + * __rebalance_fl_append -- + * Add a new entry to the free list. + */ +static int +__rebalance_fl_append(WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_len, WT_REBALANCE_STUFF *rs) +{ + WT_ADDR *copy; + + WT_RET(__wt_realloc_def( + session, &rs->fl_allocated, rs->fl_next + 1, &rs->fl)); + copy = &rs->fl[rs->fl_next++]; + + WT_RET(__wt_strndup(session, addr, addr_len, &copy->addr)); + copy->size = (uint8_t)addr_len; + copy->type = 0; + + return (0); +} + +/* + * __rebalance_internal -- + * Build an in-memory page that references all of the leaf pages we've + * found. 
+ */ +static int +__rebalance_internal(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF **refp; + uint32_t i, leaf_next; + + btree = S2BT(session); + + /* + * There's a limit to the number of pages we can rebalance: the number + * of elements on a page is a 4B quantity and it's technically possible + * there could be more pages than that in a tree. + */ + if (rs->leaf_next > UINT32_MAX) + WT_RET_MSG(session, ENOTSUP, + "too many leaf pages to rebalance, %" WT_SIZET_FMT " pages " + "exceeds the maximum of %" PRIu32, + rs->leaf_next, UINT32_MAX); + leaf_next = (uint32_t)rs->leaf_next; + + /* Allocate a row-store root (internal) page and fill it in. */ + WT_RET(__wt_page_alloc(session, rs->type, + rs->type == WT_PAGE_COL_INT ? 1 : 0, leaf_next, false, &page)); + page->pg_intl_parent_ref = &btree->root; + WT_ERR(__wt_page_modify_init(session, page)); + __wt_page_modify_set(session, page); + + pindex = WT_INTL_INDEX_GET_SAFE(page); + for (refp = pindex->index, i = 0; i < leaf_next; ++i) { + rs->leaf[i]->home = page; + *refp++ = rs->leaf[i]; + rs->leaf[i] = NULL; + } + + rs->root = page; + return (0); + +err: __wt_page_out(session, &page); + return (ret); +} + +/* + * __rebalance_free_original -- + * Free the tracked internal pages and overflow keys. + */ +static int +__rebalance_free_original(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) +{ + WT_ADDR *addr; + uint64_t i; + + for (i = 0; i < rs->fl_next; ++i) { + addr = &rs->fl[i]; + + WT_RET(__wt_verbose(session, WT_VERB_REBALANCE, + "rebalance discarding %s", + __wt_addr_string( + session, addr->addr, addr->size, rs->tmp1))); + + WT_RET(__wt_btree_block_free(session, addr->addr, addr->size)); + } + return (0); +} + +/* + * __rebalance_col_walk -- + * Walk a column-store page and its descendants. 
+ */ +static int +__rebalance_col_walk( + WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_DECL_ITEM(buf); + WT_DECL_RET; + uint32_t i; + + btree = S2BT(session); + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + + /* Report progress periodically. */ + if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0) + WT_ERR(__wt_progress(session, NULL, rs->progress)); + + /* + * Walk the page, instantiating keys: the page contains sorted key and + * location cookie pairs. Keys are on-page/overflow items and location + * cookies are WT_CELL_ADDR_XXX items. + */ + WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + switch (unpack.type) { + case WT_CELL_ADDR_INT: + /* An internal page: read it and recursively walk it. */ + WT_ERR(__wt_bt_read( + session, buf, unpack.data, unpack.size)); + WT_ERR(__rebalance_col_walk(session, buf->data, rs)); + WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE, + "free-list append internal page: %s", + __wt_addr_string( + session, unpack.data, unpack.size, rs->tmp1))); + WT_ERR(__rebalance_fl_append( + session, unpack.data, unpack.size, rs)); + break; + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + WT_ERR(__rebalance_leaf_append(session, + NULL, 0, unpack.v, unpack.data, unpack.size, + unpack.type == WT_CELL_ADDR_LEAF ? + WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + } + +err: __wt_scr_free(session, &buf); + return (ret); +} + +/* + * __rebalance_row_leaf_key -- + * Acquire a copy of the key for a leaf page. + */ +static int +__rebalance_row_leaf_key(WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_len, WT_ITEM *key, WT_REBALANCE_STUFF *rs) +{ + WT_PAGE *page; + WT_DECL_RET; + + /* + * We need the first key from a leaf page. 
Leaf pages are relatively + * complex (Huffman encoding, prefix compression, and so on), do the + * work to instantiate the page and copy the first key to the buffer. + */ + WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); + WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page)); + ret = __wt_row_leaf_key_copy(session, page, &page->pg_row_d[0], key); + __wt_page_out(session, &page); + return (ret); +} + +/* + * __rebalance_row_walk -- + * Walk a row-store page and its descendants. + */ +static int +__rebalance_row_walk( + WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_REBALANCE_STUFF *rs) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK key, unpack; + WT_DECL_ITEM(buf); + WT_DECL_ITEM(leafkey); + WT_DECL_RET; + size_t len; + uint32_t i; + bool first_cell; + const void *p; + + btree = S2BT(session); + WT_CLEAR(key); /* [-Werror=maybe-uninitialized] */ + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_scr_alloc(session, 0, &leafkey)); + + /* Report progress periodically. */ + if (++rs->progress % WT_REBALANCE_PROGRESS_INTERVAL == 0) + WT_ERR(__wt_progress(session, NULL, rs->progress)); + + /* + * Walk the page, instantiating keys: the page contains sorted key and + * location cookie pairs. Keys are on-page/overflow items and location + * cookies are WT_CELL_ADDR_XXX items. + */ + first_cell = true; + WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) { + __wt_cell_unpack(cell, &unpack); + switch (unpack.type) { + case WT_CELL_KEY: + key = unpack; + break; + case WT_CELL_KEY_OVFL: + /* + * Any overflow key that references an internal page is + * of no further use, schedule its blocks to be freed. + * + * We could potentially use the same overflow key being + * freed here for the internal page we're creating, but + * that's more work to get reconciliation to understand + * and overflow keys are (well, should be), uncommon. 
+ */ + WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE, + "free-list append overflow key: %s", + __wt_addr_string( + session, unpack.data, unpack.size, rs->tmp1))); + + WT_ERR(__rebalance_fl_append( + session, unpack.data, unpack.size, rs)); + + key = unpack; + break; + case WT_CELL_ADDR_DEL: + /* + * A deleted leaf page: we're rebalancing this tree, + * which means no transaction can be active in it, + * which means no deleted leaf page is interesting, + * ignore it. + */ + first_cell = false; + break; + case WT_CELL_ADDR_INT: + /* An internal page, schedule its blocks to be freed. */ + WT_ERR(__wt_verbose(session, WT_VERB_REBALANCE, + "free-list append internal page: %s", + __wt_addr_string( + session, unpack.data, unpack.size, rs->tmp1))); + WT_ERR(__rebalance_fl_append( + session, unpack.data, unpack.size, rs)); + + /* Read and recursively walk the page. */ + WT_ERR(__wt_bt_read( + session, buf, unpack.data, unpack.size)); + WT_ERR(__rebalance_row_walk(session, buf->data, rs)); + break; + case WT_CELL_ADDR_LEAF: + case WT_CELL_ADDR_LEAF_NO: + /* + * A leaf page. + * We can't trust the 0th key on an internal page (we + * often don't store them in reconciliation because it + * saves space), get it from the underlying leaf page. + * Else, if the internal page key is an overflow key, + * instantiate it and use it. + * Else, we can use the internal page's key as is, it's + * sufficient for the page. + */ + if (first_cell) { + WT_ERR(__rebalance_row_leaf_key(session, + unpack.data, unpack.size, leafkey, rs)); + p = leafkey->data; + len = leafkey->size; + } else if (key.type == WT_CELL_KEY_OVFL) { + WT_ERR(__wt_dsk_cell_data_ref( + session, WT_PAGE_ROW_INT, &key, leafkey)); + p = leafkey->data; + len = leafkey->size; + } else { + p = key.data; + len = key.size; + } + WT_ERR(__rebalance_leaf_append(session, + p, len, WT_RECNO_OOB, unpack.data, unpack.size, + unpack.type == WT_CELL_ADDR_LEAF ? 
+ WT_ADDR_LEAF : WT_ADDR_LEAF_NO, rs)); + + first_cell = false; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + } + +err: __wt_scr_free(session, &buf); + __wt_scr_free(session, &leafkey); + return (ret); +} + +/* + * __wt_bt_rebalance -- + * Rebalance the last checkpoint in the file. + */ +int +__wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_REBALANCE_STUFF *rs, _rstuff; + bool evict_reset; + + WT_UNUSED(cfg); + + btree = S2BT(session); + + /* + * If the tree has never been written to disk, we're done, rebalance + * walks disk images, not in-memory pages. For the same reason, the + * tree has to be clean. + */ + if (btree->root.page->dsk == NULL) + return (0); + if (btree->modified) + WT_RET_MSG(session, EINVAL, + "tree is modified, only clean trees may be rebalanced"); + + WT_CLEAR(_rstuff); + rs = &_rstuff; + + WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &rs->tmp2)); + + /* Set the internal page tree type. */ + rs->type = btree->root.page->type; + + /* + * Get exclusive access to the file. (Not required, the only page in the + * cache is the root page, and that cannot be evicted; however, this way + * eviction ignores the tree entirely.) + */ + WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); + + /* Recursively walk the tree. */ + switch (rs->type) { + case WT_PAGE_ROW_INT: + WT_ERR( + __rebalance_row_walk(session, btree->root.page->dsk, rs)); + break; + case WT_PAGE_COL_INT: + WT_ERR( + __rebalance_col_walk(session, btree->root.page->dsk, rs)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Build a new root page. */ + WT_ERR(__rebalance_internal(session, rs)); + + /* + * Schedule the free of the original blocks (they shouldn't actually be + * freed until the next checkpoint completes). 
+ */ + WT_ERR(__rebalance_free_original(session, rs)); + + /* + * Swap the old root page for our newly built root page, writing the new + * root page as part of a checkpoint will finish the rebalance. + */ + __wt_page_out(session, &btree->root.page); + btree->root.page = rs->root; + rs->root = NULL; + +err: /* Discard any leftover root page we created. */ + if (rs->root != NULL) { + __wt_page_modify_clear(session, rs->root); + __wt_page_out(session, &rs->root); + } + + /* Discard any leftover leaf and internal page information. */ + __rebalance_discard(session, rs); + + __wt_scr_free(session, &rs->tmp1); + __wt_scr_free(session, &rs->tmp2); + + return (ret); +} diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index c7a4b8e22f4..ebc0499f6a2 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 756ffd98f3a..8d78bda79fb 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -595,22 +595,18 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, WT_ERR(__wt_row_leaf_key_copy(session, page, &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop)); - if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) { - WT_ERR(__wt_buf_set_printable(session, ss->tmp1, - trk->row_start.data, trk->row_start.size)); - WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, - "%s start key %.*s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp2), - (int)ss->tmp1->size, (char *)ss->tmp1->data)); - WT_ERR(__wt_buf_set_printable(session, ss->tmp1, - trk->row_stop.data, trk->row_stop.size)); - WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, - "%s stop key %.*s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, ss->tmp2), - (int)ss->tmp1->size, (char *)ss->tmp1->data)); - } + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s start key %s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, + trk->row_start.data, trk->row_start.size, ss->tmp2))); + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s stop key %s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable(session, + trk->row_stop.data, trk->row_stop.size, ss->tmp2))); /* Row-store pages can contain overflow items. */ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk)); @@ -1807,7 +1803,7 @@ err: if (page != NULL) */ static int __slvg_row_build_internal( - WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) + WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) { WT_ADDR *addr; WT_DECL_RET; @@ -1821,7 +1817,7 @@ __slvg_row_build_internal( /* Allocate a row-store root (internal) page and fill it in. 
*/ WT_RET(__wt_page_alloc( - session, WT_PAGE_ROW_INT, 0, leaf_cnt, true, &page)); + session, WT_PAGE_ROW_INT, WT_RECNO_OOB, leaf_cnt, true, &page)); WT_ERR(__slvg_modify_init(session, page)); pindex = WT_INTL_INDEX_GET_SAFE(page); @@ -1937,16 +1933,12 @@ __slvg_row_build_leaf( btree->collator, key, &trk->row_start, &cmp)); if (cmp >= 0) break; - if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) { - WT_ERR(__wt_buf_set_printable(session, - ss->tmp1, key->data, key->size)); - WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, - "%s merge discarding leading key %.*s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, - ss->tmp2), (int)ss->tmp1->size, - (char *)ss->tmp1->data)); - } + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s merge discarding leading key %.*s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable( + session, key->data, key->size, ss->tmp2))); ++skip_start; } if (F_ISSET(trk, WT_TRACK_CHECK_STOP)) @@ -1961,16 +1953,12 @@ __slvg_row_build_leaf( btree->collator, key, &trk->row_stop, &cmp)); if (cmp < 0) break; - if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) { - WT_ERR(__wt_buf_set_printable(session, - ss->tmp1, key->data, key->size)); - WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, - "%s merge discarding trailing key %.*s", - __wt_addr_string(session, - trk->trk_addr, trk->trk_addr_size, - ss->tmp2), (int)ss->tmp1->size, - (char *)ss->tmp1->data)); - } + WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, + "%s merge discarding trailing key %.*s", + __wt_addr_string(session, + trk->trk_addr, trk->trk_addr_size, ss->tmp1), + __wt_buf_set_printable( + session, key->data, key->size, ss->tmp2))); ++skip_stop; } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 12f4197e9e7..102265c0a8f 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. 
* Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -15,6 +15,22 @@ } while (0) /* + * A note on error handling: main split functions first allocate/initialize new + * structures; failures during that period are handled by discarding the memory + * and returning an error code, the caller knows the split didn't happen and + * proceeds accordingly. Second, split functions update the tree, and a failure + * in that period is catastrophic, any partial update to the tree requires a + * panic, we can't recover. Third, once the split is complete and the tree has + * been fully updated, we have to ignore most errors, the split is complete and + * correct, callers have to proceed accordingly. + */ +typedef enum { + WT_ERR_IGNORE, /* Ignore minor errors */ + WT_ERR_PANIC, /* Panic on all errors */ + WT_ERR_RETURN /* Clean up and return error */ +} WT_SPLIT_ERROR_PHASE; + +/* * __split_oldest_gen -- * Calculate the oldest active split generation. */ @@ -512,8 +528,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -539,7 +556,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -589,8 +606,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = root->type == WT_PAGE_COL_INT ? 
+ (*root_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, root->type, 0, slots, false, &child)); + session, root->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -605,12 +625,10 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*root_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (root->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ @@ -623,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for - * the page that has an page index entry for the WT_REF is about + * the page that has a page index entry for the WT_REF is about * to change. */ child_pindex = WT_INTL_INDEX_GET_SAFE(child); @@ -641,7 +659,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, false); @@ -661,7 +679,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__split_ref_step2(session, alloc_index, false)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. 
*/ alloc_index = NULL; @@ -687,15 +705,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_page_modify_set(session, root); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during root page split to deepen the tree"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during root page split " @@ -721,19 +739,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; + WT_SPLIT_ERROR_PHASE complete; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; - bool complete, empty_parent; + bool empty_parent; parent = ref->home; alloc_index = pindex = NULL; parent_decr = 0; parent_entries = 0; - complete = empty_parent = false; + empty_parent = false; + complete = WT_ERR_RETURN; /* The parent page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, parent)); @@ -751,7 +771,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * array anyway. Switch them to the special split state, so that any * reading thread will restart. 
*/ - WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); @@ -791,28 +811,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there are ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries. */ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree.
@@ -853,16 +885,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -946,7 +970,8 @@ err: __wt_scr_free(session, &scr); * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) { + switch (complete) { + case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) @@ -954,20 +979,28 @@ err: __wt_scr_free(session, &scr); } __wt_free_ref_index(session, NULL, alloc_index, false); - /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. */ if (empty_parent) - return (EBUSY); + ret = EBUSY; + break; + case WT_ERR_PANIC: + __wt_err(session, ret, "fatal error during parent page split"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during parent page " + "split"); + ret = 0; + } + break; } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? 
ret : 0); + return (ret); } /* @@ -983,8 +1016,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -1012,7 +1046,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1081,8 +1115,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp == pindex->index + chunk); for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = page->type == WT_PAGE_COL_INT ? + (*page_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, page->type, 0, slots, false, &child)); + session, page->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -1097,12 +1134,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*page_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (page->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*page_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. 
*/ @@ -1133,7 +1168,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, true); @@ -1157,7 +1192,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__split_ref_step2(session, alloc_index, true)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* * Push out the changes: not required for correctness, but no reason @@ -1193,16 +1228,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_page_modify_set(session, page); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " @@ -1654,10 +1689,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_DECL_RET; WT_DECL_ITEM(key); WT_INSERT *ins, **insp, *moved_ins, *prev_ins; - WT_INSERT_HEAD *ins_head; + WT_INSERT_HEAD *ins_head, *tmp_ins_head; WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_incr, right_incr; + uint8_t type; int i; WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); @@ -1666,6 +1702,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; + type = page->type; /* * Assert splitting makes sense; specifically assert the page is dirty, @@ -1679,9 +1716,12 @@ 
__split_insert(WT_SESSION_IMPL *session, WT_REF *ref) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* Find the last item on the page. */ - ins_head = page->pg_row_entries == 0 ? - WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + if (type == WT_PAGE_ROW_LEAF) + ins_head = page->pg_row_entries == 0 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + else + ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); /* @@ -1692,14 +1732,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The new WT_REF is not quite identical: we have to instantiate a key, * and the new reference is visible to readers once the split completes. * - * The key-instantiation code checks for races, leave the key fields - * zeroed we don't trigger them. - * * Don't copy any deleted page state: we may be splitting a page that * was instantiated after a truncate and that history should not be * carried onto these new child pages. */ WT_ERR(__wt_calloc_one(session, &split_ref[0])); + parent_incr += sizeof(WT_REF); child = split_ref[0]; child->page = ref->page; child->home = ref->home; @@ -1713,49 +1751,82 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ ref->addr = NULL; - /* - * Copy the first key from the original page into first ref in the new - * parent. Pages created in memory always have a "smallest" insert - * list, so look there first. If we don't find one, get the first key - * from the disk image. - * - * We can't just use the key from the original ref: it may have been - * suffix-compressed, and after the split the truncated key may not be - * valid. - */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { - key->data = WT_INSERT_KEY(ins); - key->size = WT_INSERT_KEY_SIZE(ins); + if (type == WT_PAGE_ROW_LEAF) { + /* + * Copy the first key from the original page into first ref in + * the new parent. 
Pages created in memory always have a + * "smallest" insert list, so look there first. If we don't + * find one, get the first key from the disk image. + * + * We can't just use the key from the original ref: it may have + * been suffix-compressed, and after the split the truncated key + * may not be valid. + */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + if ((ins = + WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } else + WT_ERR(__wt_row_leaf_key( + session, page, &page->pg_row_d[0], key, true)); + WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); + parent_incr += sizeof(WT_IKEY) + key->size; + __wt_scr_free(session, &key); } else - WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); - WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); - parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; - __wt_scr_free(session, &key); + child->key.recno = ref->key.recno; /* * The second page in the split is a new WT_REF/page pair. */ - WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, false, &right)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + if (type == WT_PAGE_ROW_LEAF) + WT_ERR(__wt_page_alloc(session, + type, WT_RECNO_OOB, 0, false, &right)); + else + WT_ERR(__wt_page_alloc(session, + type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); + + /* + * The new page is dirty by definition, column-store splits update the + * page-modify structure, so create it now. 
+ */ + WT_ERR(__wt_page_modify_init(session, right)); + __wt_page_modify_set(session, right); + + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + } else { + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append)); + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0])); + } right_incr += sizeof(WT_INSERT_HEAD); right_incr += sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_one(session, &split_ref[1])); + parent_incr += sizeof(WT_REF); child = split_ref[1]; child->page = right; child->state = WT_REF_MEM; - WT_ERR(__wt_row_ikey(session, 0, - WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), - child)); - parent_incr += - sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); - /* The new page is dirty by definition. */ - WT_ERR(__wt_page_modify_init(session, right)); - __wt_page_modify_set(session, right); + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_row_ikey(session, 0, + WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), + child)); + parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); + } else + child->key.recno = WT_INSERT_RECNO(moved_ins); + + /* + * Allocation operations completed, we're going to split. + * + * Record the split column-store page record, used in reconciliation. + */ + if (type != WT_PAGE_ROW_LEAF) { + WT_ASSERT(session, + page->modify->mod_split_recno == WT_RECNO_OOB); + page->modify->mod_split_recno = child->key.recno; + } /* * We modified the page above, which will have set the first dirty @@ -1779,15 +1850,16 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd)); /* - * Allocation operations completed, move the last insert list item from - * the original page to the new page. + * Move the last insert list item from the original page to the new + * page. * * First, update the item to the new child page. 
(Just append the entry * for simplicity, the previous skip list pointers originally allocated * can be ignored.) */ - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = moved_ins; + tmp_ins_head = type == WT_PAGE_ROW_LEAF ? + right->pg_row_ins[0] : right->modify->mod_append[0]; + tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins; /* * Remove the entry from the orig page (i.e truncate the skip list). @@ -1872,34 +1944,40 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. After this, the original page is no + * Split into the parent. On successful return, the original page is no * longer locked, so we cannot safely look at it. */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false, true)) != 0) { - /* - * Move the insert list element back to the original page list. - * For simplicity, the previous skip list pointers originally - * allocated can be ignored, just append the entry to the end of - * the level 0 list. As before, we depend on the list having - * multiple elements and ignore the edge cases small lists have. - */ - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = NULL; - ins_head->tail[0]->next[0] = moved_ins; - ins_head->tail[0] = moved_ins; + session, ref, split_ref, 2, parent_incr, false, true)) == 0) + return (0); - /* - * We marked the new page dirty; we're going to discard it, but - * first mark it clean and fix up the cache statistics. - */ - __wt_page_modify_clear(session, right); + /* + * Failure. + * + * Reset the split column-store page record. + */ + page->modify->mod_split_recno = WT_RECNO_OOB; - WT_ERR(ret); - } + /* + * Clear the allocated page's reference to the moved insert list element + * so it's not freed when we discard the page. + * + * Move the element back to the original page list. 
For simplicity, the + * previous skip list pointers originally allocated can be ignored, just + * append the entry to the end of the level 0 list. As before, we depend + * on the list having multiple elements and ignore the edge cases small + * lists have. + */ + if (type == WT_PAGE_ROW_LEAF) + right->pg_row_ins[0]->head[0] = + right->pg_row_ins[0]->tail[0] = NULL; + else + right->modify->mod_append[0]->head[0] = + right->modify->mod_append[0]->tail[0] = NULL; - return (0); + ins_head->tail[0]->next[0] = moved_ins; + ins_head->tail[0] = moved_ins; err: if (split_ref[0] != NULL) { /* @@ -1907,15 +1985,23 @@ err: if (split_ref[0] != NULL) { */ ref->addr = split_ref[0]->addr; - __wt_free(session, split_ref[0]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[0]->key.ikey); __wt_free(session, split_ref[0]); } if (split_ref[1] != NULL) { - __wt_free(session, split_ref[1]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[1]->key.ikey); __wt_free(session, split_ref[1]); } - if (right != NULL) + if (right != NULL) { + /* + * We marked the new page dirty; we're going to discard it, + * but first mark it clean and fix up the cache statistics. + */ + __wt_page_modify_clear(session, right); __wt_page_out(session, &right); + } __wt_scr_free(session, &key); return (ret); } diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 5dd75835b0b..3d5abf34147 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); - WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); - WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); + WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. */ diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 86607d8f187..5cbd8d1e996 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c index 6b403595ecc..a9ff16ad496 100644 --- a/src/btree/bt_upgrade.c +++ b/src/btree/bt_upgrade.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index d745210bdce..ae2c20be1b6 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -30,8 +30,7 @@ typedef struct { u_int depth, depth_internal[100], depth_leaf[100]; - WT_ITEM *tmp1; /* Temporary buffer */ - WT_ITEM *tmp2; /* Temporary buffer */ + WT_ITEM *tmp1, *tmp2, *tmp3, *tmp4; /* Temporary buffers */ } WT_VSTUFF; static void __verify_checkpoint_reset(WT_VSTUFF *); @@ -170,6 +169,8 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); + WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3)); + WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4)); /* Check configuration strings. */ WT_ERR(__verify_config(session, cfg, vs)); @@ -251,6 +252,8 @@ err: /* Inform the underlying block manager we're done. */ __wt_scr_free(session, &vs->max_addr); __wt_scr_free(session, &vs->tmp1); __wt_scr_free(session, &vs->tmp2); + __wt_scr_free(session, &vs->tmp3); + __wt_scr_free(session, &vs->tmp4); return (ret); } @@ -570,10 +573,14 @@ __verify_row_int_key_order(WT_SESSION_IMPL *session, WT_RET_MSG(session, WT_ERROR, "the internal key in entry %" PRIu32 " on the page at %s " "sorts before the last key appearing on page %s, earlier " - "in the tree", + "in the tree: %s, %s", entry, __wt_page_addr_string(session, ref, vs->tmp1), - (char *)vs->max_addr->data); + (char *)vs->max_addr->data, + __wt_buf_set_printable(session, + item.data, item.size, vs->tmp2), + __wt_buf_set_printable(session, + vs->max_key->data, vs->max_key->size, vs->tmp3)); /* Update the largest key we've seen to the key just checked. 
*/ WT_RET(__wt_buf_set(session, vs->max_key, item.data, item.size)); @@ -628,11 +635,15 @@ __verify_row_leaf_key_order( btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp)); if (cmp < 0) WT_RET_MSG(session, WT_ERROR, - "the first key on the page at %s sorts equal to or " - "less than a key appearing on the page at %s, " - "earlier in the tree", - __wt_page_addr_string(session, ref, vs->tmp1), - (char *)vs->max_addr->data); + "the first key on the page at %s sorts equal to " + "or less than the last key appearing on the page " + "at %s, earlier in the tree: %s, %s", + __wt_page_addr_string(session, ref, vs->tmp2), + (char *)vs->max_addr->data, + __wt_buf_set_printable(session, + vs->tmp1->data, vs->tmp1->size, vs->tmp3), + __wt_buf_set_printable(session, + vs->max_key->data, vs->max_key->size, vs->tmp4)); } /* Update the largest key we've seen to the last key on this page. */ diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index a703fbd540d..5480a25b5ec 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -197,6 +197,8 @@ __verify_dsk_row( WT_DECL_ITEM(current); WT_DECL_ITEM(last_ovfl); WT_DECL_ITEM(last_pfx); + WT_DECL_ITEM(tmp1); + WT_DECL_ITEM(tmp2); WT_DECL_RET; WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; @@ -213,6 +215,8 @@ __verify_dsk_row( WT_ERR(__wt_scr_alloc(session, 0, &current)); WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); + WT_ERR(__wt_scr_alloc(session, 0, &tmp1)); + WT_ERR(__wt_scr_alloc(session, 0, &tmp2)); last = last_ovfl; end = (uint8_t *)dsk + dsk->mem_size; @@ -402,8 +406,12 @@ key_compare: /* if (cmp >= 0) WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " - "page at %s are incorrectly sorted", - cell_num - 2, cell_num, tag); + "page at %s are incorrectly sorted: %s, %s", + cell_num - 2, cell_num, tag, + __wt_buf_set_printable(session, + last->data, last->size, tmp1), + __wt_buf_set_printable(session, + current->data, current->size, tmp2)); } /* @@ -464,6 +472,8 @@ err: if (ret == 0) __wt_scr_free(session, &current); __wt_scr_free(session, &last_pfx); __wt_scr_free(session, &last_ovfl); + __wt_scr_free(session, &tmp1); + __wt_scr_free(session, &tmp2); return (ret); } diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index abb18529041..49a59b89552 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -9,11 +9,11 @@ #include "wt_internal.h" /* - * __page_refp -- + * __ref_index_slot -- * Return the page's index and slot for a reference.
*/ static inline void -__page_refp(WT_SESSION_IMPL *session, +__ref_index_slot(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_PAGE_INDEX *pindex; @@ -32,37 +32,36 @@ retry: WT_INTL_INDEX_GET(session, ref->home, pindex); * loop is from the hint to the end of the list, and the second loop * is from the start of the list to the end of the list. (The second * loop overlaps the first, but that only happen in cases where we've - * deepened the tree and aren't going to find our slot at all, that's - * not worth optimizing.) + * split the tree and aren't going to find our slot at all, that's not + * worth optimizing.) * * It's not an error for the reference hint to be wrong, it just means * the first retrieval (which sets the hint for subsequent retrievals), * is slower. */ i = ref->pindex_hint; - if (i < pindex->entries && pindex->index[i]->page == ref->page) { + if (i < pindex->entries && pindex->index[i] == ref) { *pindexp = pindex; *slotp = i; return; } while (++i < pindex->entries) - if (pindex->index[i]->page == ref->page) { + if (pindex->index[i] == ref) { *pindexp = pindex; *slotp = ref->pindex_hint = i; return; } for (i = 0; i < pindex->entries; ++i) - if (pindex->index[i]->page == ref->page) { + if (pindex->index[i] == ref) { *pindexp = pindex; *slotp = ref->pindex_hint = i; return; } /* - * If we don't find our reference, the page split into a new level and - * our home pointer references the wrong page. After internal pages - * deepen, their reference structure home value are updated; yield and - * wait for that to happen. + * If we don't find our reference, the page split and our home pointer + * references the wrong page. When internal pages split, their WT_REF + * structure home values are updated; yield and wait for that to happen. 
*/ __wt_yield(); goto retry; @@ -116,13 +115,45 @@ __page_ascend(WT_SESSION_IMPL *session, parent_ref = ref->home->pg_intl_parent_ref; if (__wt_ref_is_root(parent_ref)) break; - __page_refp(session, parent_ref, pindexp, slotp); + __ref_index_slot(session, parent_ref, pindexp, slotp); /* - * When internal pages split, the WT_REF structures being moved - * are updated first. If the WT_REF we started with references - * the same page as we found on our search of the parent, there - * is a consistent view. + * There's a split race when a cursor moving forwards through + * the tree ascends the tree. If we're splitting an internal + * page into its parent, we move the WT_REF structures and + * then update the parent's page index before updating the split + * page's page index, and it's not an atomic update. A thread + * can read the split page's original page index and then read + * the parent page's replacement index. + * + * This can create a race for next-cursor movements. + * + * For example, imagine an internal page with 3 child pages, + * with the namespaces a-f, g-h and i-j; the first child page + * splits. The parent starts out with the following page-index: + * + * | ... | a | g | i | ... | + * + * which changes to this: + * + * | ... | a | c | e | g | i | ... | + * + * The split page starts out with the following page-index: + * + * | a | b | c | d | e | f | + * + * Imagine a cursor finishing the 'f' part of the namespace that + * starts its ascent to the parent's 'a' slot. Then the page + * splits and the parent page's page index is replaced. If the + * cursor then searches the parent's replacement page index for + * the 'a' slot, it finds it and then increments to the slot + * after the 'a' slot, the 'c' slot, and then it incorrectly + * repeats its traversal of part of the namespace. + * + * This function takes a WT_REF argument which is the page from + * which we start our ascent. 
If the parent's slot we find in + * our search doesn't point to the same page as that initial + * WT_REF, there's a race and we start over again. */ if (ref->home == parent_ref->page) break; @@ -132,6 +163,91 @@ __page_ascend(WT_SESSION_IMPL *session, } /* + * __page_descend -- + * Descend the tree one level. + */ +static void +__page_descend(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev) +{ + WT_PAGE_INDEX *pindex; + + /* + * Ref is a child page into which we're descending, and on which we + * have a hazard pointer. + */ + for (;; __wt_yield()) { + WT_INTL_INDEX_GET(session, page, pindex); + *slotp = prev ? pindex->entries - 1 : 0; + + /* + * There's a split race when a cursor moving backwards through + * the tree descends the tree. If we're splitting an internal + * page into its parent, we move the WT_REF structures and + * update the parent's page index before updating the split + * page's page index, and it's not an atomic update. A thread + * can read the parent page's replacement page index and then + * read the split page's original index. + * + * This can create a race for previous-cursor movements. + * + * For example, imagine an internal page with 3 child pages, + * with the namespaces a-f, g-h and i-j; the first child page + * splits. The parent starts out with the following page-index: + * + * | ... | a | g | i | ... | + * + * The split page starts out with the following page-index: + * + * | a | b | c | d | e | f | + * + * The first step is to move the c-f ranges into a new subtree, + * so, for example we might have two new internal pages 'c' and + * 'e', where the new 'c' page references the c-d namespace and + * the new 'e' page references the e-f namespace. The top of the + * subtree references the parent page, but until the parent's + * page index is updated, any threads in the subtree won't be + * able to ascend out of the subtree. 
However, once the parent + * page's page index is updated to this: + * + * | ... | a | c | e | g | i | ... | + * + * threads in the subtree can ascend into the parent. Imagine a + * cursor in the c-d part of the namespace that ascends to the + * parent's 'c' slot. It would then decrement to the slot before + * the 'c' slot, the 'a' slot. + * + * The previous-cursor movement selects the last slot in the 'a' + * page; if the split page's page-index hasn't been updated yet, + * it will select the 'f' slot, which is incorrect. Once the + * split page's page index is updated to this: + * + * | a | b | + * + * the previous-cursor movement will select the 'b' slot, which + * is correct. + * + * This function takes an argument which is the internal page + * from which we're descending. If the last slot on the page no + * longer points to the current page as its "home", the page is + * being split and part of its namespace moved. We have the + * correct page and we don't have to move, all we have to do is + * wait until the split page's page index is updated. + * + * No test is necessary for a next-cursor movement because we + * do right-hand splits on internal pages and the initial part + * of the page's namespace won't change as part of a split. + * Instead of testing the direction boolean, do the test the + * previous cursor movement requires in all cases, even though + * it will always succeed for a next-cursor movement. + */ + if (pindex->index[*slotp]->home == page) + break; + } + *pindexp = pindex; +} + +/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ @@ -225,7 +341,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, } /* Figure out the current slot in the WT_REF array. */ - __page_refp(session, ref, &pindex, &slot); + __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* @@ -270,12 +386,8 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * the parent can't have been evicted. 
*/ if (!LF_ISSET(WT_READ_SKIP_INTL)) { - if ((ret = __wt_page_swap( - session, couple, ref, flags)) != 0) { - WT_TRET(__wt_page_release( - session, couple, flags)); - WT_ERR(ret); - } + WT_ERR(__wt_page_swap( + session, couple, ref, flags)); *refp = ref; goto done; } @@ -389,7 +501,8 @@ __tree_walk_internal(WT_SESSION_IMPL *session, } } - ret = __wt_page_swap(session, couple, ref, flags); + ret = __wt_page_swap(session, couple, ref, + WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking @@ -434,7 +547,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; - __page_refp(session, ref, &pindex, &slot); + __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } @@ -446,9 +559,10 @@ __tree_walk_internal(WT_SESSION_IMPL *session, */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: couple = ref; - WT_INTL_INDEX_GET(session, ref->page, pindex); - slot = prev ? pindex->entries - 1 : 0; empty_internal = true; + + __page_descend( + session, ref->page, &pindex, &slot, prev); } else { /* * Optionally skip leaf pages, the second half. diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index bb2de3f444b..645d98d9c9b 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index e9fa570f97b..cb5a227495f 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -9,12 +9,60 @@ #include "wt_internal.h" /* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. 
+ */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_PAGE_INDEX *pindex; + uint32_t indx; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. + */ + cbt->compare = 0; + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + */ + if (recno < leaf->key.recno) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. + * + * !!! + * Check that "indx + 1" is a valid page-index entry first, because it + * also checks that "indx" is a valid page-index entry, and we have to + * do that latter check before looking at the indx slot of the array + * for a match to leaf (in other words, our page hint might be wrong). + */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx + 1 < pindex->entries && pindex->index[indx] == leaf) + if (recno >= pindex->index[indx + 1]->key.recno) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + + return (0); +} + +/* * __wt_col_search -- * Search a column-store tree for a specific record-based key. */ int __wt_col_search(WT_SESSION_IMPL *session, - uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) + uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_COL *cip; @@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; + uint64_t recno; uint32_t base, indx, limit; int depth; @@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session, __cursor_pos_clear(cbt); - /* We may only be searching a single leaf page, not the full tree. 
*/ + /* + * When appending a new record, the search record number will be an + * out-of-band value, search for the largest key in the table instead. + */ + if ((recno = search_recno) == WT_RECNO_OOB) + recno = UINT64_MAX; + + /* + * We may be searching only a single leaf page, not the full tree. In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + WT_ASSERT(session, search_recno != WT_RECNO_OOB); + + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, recno, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. + */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -103,7 +182,8 @@ descend: /* * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { + if ((ret = __wt_page_swap( + session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } @@ -120,7 +200,17 @@ leaf_only: page = current->page; cbt->ref = current; cbt->recno = recno; - cbt->compare = 0; + + /* + * Don't bother searching if the caller is appending a new record where + * we'll allocate the record number; we're not going to find a match by + * definition, and we figure out the record number and position when we + * do the work. + */ + if (search_recno == WT_RECNO_OOB) { + cbt->compare = -1; + return (0); + } /* * Set the on-page slot to an impossible value larger than any possible @@ -142,6 +232,7 @@ leaf_only: * that's impossibly large for the page. We do have additional setup to * do in that case, the record may be appended to the page. 
*/ + cbt->compare = 0; if (page->type == WT_PAGE_COL_FIX) { if (recno < page->pg_fix_recno) { cbt->compare = 1; @@ -190,18 +281,10 @@ past_end: * This is a rarely used path: we normally find exact matches, because * column-store files are dense, but in this case the caller searched * past the end of the table. - * - * Don't bother searching if the caller is appending a new record where - * we'll allocate the record number; we're not going to find a match by - * definition, and we figure out the position when we do the work. */ cbt->ins_head = WT_COL_APPEND(page); - if (recno == UINT64_MAX) - cbt->ins = NULL; - else - cbt->ins = __col_insert_search( - cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); - if (cbt->ins == NULL) + if ((cbt->ins = __col_insert_search( + cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL) cbt->compare = -1; else { cbt->recno = WT_INSERT_RECNO(cbt->ins); @@ -212,14 +295,5 @@ past_end: else cbt->compare = -1; } - - /* - * Note if the record is past the maximum record in the tree, the cursor - * search functions need to know for fixed-length column-stores because - * appended records implicitly create any skipped records, and cursor - * search functions have to handle that case. - */ - if (cbt->compare == -1) - F_SET(cbt, WT_CBT_MAX_RECORD); return (0); } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 6d24708e59c..8b9e858ec18 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 0fc02948dd3..176016bb340 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 079f9d3bad1..c06274cdb17 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -132,6 +132,76 @@ __wt_search_insert( } /* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. + */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_ITEM *item; + WT_PAGE_INDEX *pindex; + uint32_t indx; + int cmp; + + btree = S2BT(session); + collator = btree->collator; + item = cbt->tmp; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. + */ + cbt->compare = 0; + + /* + * First, confirm we have the right parent page-index slot, and quit if + * we don't. We don't search for the correct slot, that would make this + * cheap test expensive. + */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx >= pindex->entries || pindex->index[indx] != leaf) + return (0); + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + * + * We can't compare against slot 0 on a row-store internal page because + * reconciliation doesn't build it, it may not be a valid key. + */ + if (indx != 0) { + __wt_ref_key(leaf->home, leaf, &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp < 0) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. 
+ */ + ++indx; + if (indx < pindex->entries) { + __wt_ref_key( + leaf->home, pindex->index[indx], &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp >= 0) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + } + + return (0); +} + +/* * __wt_row_search -- * Search a row-store tree for a specific key. */ @@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session, append_check = insert && cbt->append_tree; descend_right = true; - /* We may only be searching a single leaf page, not the full tree. */ + /* + * We may be searching only a single leaf page, not the full tree. In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, srch_key, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. + */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -196,15 +287,6 @@ restart_page: page = current->page; WT_INTL_INDEX_GET(session, page, pindex); - /* - * Fast-path internal pages with one child, a common case for - * the root page in new trees. - */ - if (pindex->entries == 1) { - descent = pindex->index[0]; - goto descend; - } - /* Fast-path appends. */ if (append_check) { descent = pindex->index[pindex->entries - 1]; @@ -345,7 +427,8 @@ descend: /* * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. 
*/ - if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { + if ((ret = __wt_page_swap( + session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } @@ -542,12 +625,18 @@ err: /* int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_INSERT *p, *t; + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; WT_PAGE *page; - uint32_t cnt; + uint32_t choice, entries, i; + int level; page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + /* If the page has disk-based entries, select from them. */ if (page->pg_row_entries != 0) { cbt->compare = 0; cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; @@ -562,24 +651,115 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert - * list. Count how many records are in the list. + * list. */ F_SET(cbt, WT_CBT_SEARCH_SMALLEST); if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) return (WT_NOTFOUND); - for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt) - if ((p = WT_SKIP_NEXT(p)) == NULL) - break; /* - * Select a random number from 0 to (N - 1), return that record. + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. 
*/ - cnt = __wt_random(&session->rnd) % cnt; - for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p) - if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL) + for (ins_head = cbt->ins_head, + level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + while (level > 0) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. 
Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. */ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; cbt->compare = 0; - cbt->ins = t; return (0); } @@ -617,7 +797,8 @@ restart_root: * Swap the parent page for the child page; return on error, * the swap function ensures we're holding nothing on failure. 
*/ - if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { + if ((ret = __wt_page_swap( + session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index d3a0265c13a..1ef8dd32bb4 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **cstats; WT_DSRC_STATS **dstats; + int64_t v; conn = S2C(session); @@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) dstats = ((WT_CURSOR_BTREE *) conn->las_session->las_cursor)->btree->dhandle->stats; - WT_STAT_SET(session, cstats, - cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); - WT_STAT_SET(session, cstats, - cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); + v = WT_STAT_READ(dstats, cursor_insert); + WT_STAT_SET(session, cstats, cache_lookaside_insert, v); + v = WT_STAT_READ(dstats, cursor_remove); + WT_STAT_SET(session, cstats, cache_lookaside_remove, v); } /* @@ -139,18 +140,27 @@ __wt_las_is_written(WT_SESSION_IMPL *session) } /* - * __wt_las_cursor_create -- + * __wt_las_cursor_open -- * Open a new lookaside table cursor. 
*/ int -__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +__wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_BTREE *btree; + WT_DECL_RET; const char *open_cursor_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; - WT_RET(__wt_open_cursor( + WT_WITHOUT_DHANDLE(session, ret = __wt_open_cursor( session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); + WT_RET(ret); + + /* + * Retrieve the btree from the cursor, rather than the session because + * we don't always switch the LAS handle in to the session before + * entering this function. + */ + btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; /* * Set special flags for the lookaside table: the lookaside flag (used, @@ -161,7 +171,6 @@ __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) * opens (the first update is safe because it's single-threaded from * wiredtiger_open). */ - btree = S2BT(session); if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) F_SET(btree, WT_BTREE_LOOKASIDE); if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) diff --git a/src/config/config.c b/src/config/config.c index 505b843aa86..f480ab83dbd 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -365,6 +365,9 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) conf, "Unexpected character", EINVAL)); case A_DOWN: + if (conf->top == -1) + return (__config_err( + conf, "Unbalanced brackets", EINVAL)); --conf->depth; CAP(0); break; @@ -471,8 +474,7 @@ __config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value) if (conf->depth == 0) return (WT_NOTFOUND); - return (__config_err(conf, - "Closing brackets missing from config string", EINVAL)); + return (__config_err(conf, "Unbalanced brackets", EINVAL)); } /* diff --git a/src/config/config_api.c b/src/config/config_api.c index 2aba80ebcdd..b5228c4329c 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/config/config_check.c b/src/config/config_check.c index 6b9d6c563ad..c29013483f6 100644 --- a/src/config/config_check.c +++ b/src/config/config_check.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c index c997ac3a324..27bd6255a0a 100644 --- a/src/config/config_collapse.c +++ b/src/config/config_collapse.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/config/config_def.c b/src/config/config_def.c index 9d12e953498..879de670695 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -151,9 +151,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," - "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," + "\"shared_cache\",\"split\",\"temporary\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -291,6 +291,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { static const WT_CONFIG_CHECK confchk_WT_SESSION_drop[] = { { "force", "boolean", NULL, NULL, NULL, 0 }, + { "lock_wait", "boolean", NULL, NULL, NULL, 0 }, { "remove_files", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -563,9 +564,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," - "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," + "\"shared_cache\",\"split\",\"temporary\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -643,9 +644,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," 
"\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," - "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," + "\"shared_cache\",\"split\",\"temporary\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -718,9 +719,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," - "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," + "\"shared_cache\",\"split\",\"temporary\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -793,9 +794,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," "\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\",\"read\"," - "\"reconcile\",\"recovery\",\"salvage\",\"shared_cache\"," - "\"split\",\"temporary\",\"transaction\",\"verify\",\"version\"," - "\"write\"]", + "\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," + "\"shared_cache\",\"split\",\"temporary\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -904,8 +905,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_SESSION_create, 40 }, { "WT_SESSION.drop", - "force=0,remove_files=", - confchk_WT_SESSION_drop, 2 + "force=0,lock_wait=,remove_files=", + 
confchk_WT_SESSION_drop, 3 }, { "WT_SESSION.join", "bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=," @@ -926,6 +927,10 @@ static const WT_CONFIG_ENTRY config_entries[] = { "skip_sort_check=0,statistics=,target=", confchk_WT_SESSION_open_cursor, 12 }, + { "WT_SESSION.rebalance", + "", + NULL, 0 + }, { "WT_SESSION.reconfigure", "isolation=read-committed", confchk_WT_SESSION_reconfigure, 1 diff --git a/src/config/config_ext.c b/src/config/config_ext.c index 5102f354b02..56c0018f8c3 100644 --- a/src/config/config_ext.c +++ b/src/config/config_ext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/config/config_upgrade.c b/src/config/config_upgrade.c index 0bca1392b51..e9ba38c6693 100644 --- a/src/config/config_upgrade.c +++ b/src/config/config_upgrade.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/api_version.c b/src/conn/api_version.c index 6293d221417..a36cdb8d8eb 100644 --- a/src/conn/api_version.c +++ b/src/conn/api_version.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index bd14e1bf4fd..2f62950a36e 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -1605,6 +1605,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "mutex", WT_VERB_MUTEX }, { "overflow", WT_VERB_OVERFLOW }, { "read", WT_VERB_READ }, + { "rebalance", WT_VERB_REBALANCE }, { "reconcile", WT_VERB_RECONCILE }, { "recovery", WT_VERB_RECOVERY }, { "salvage", WT_VERB_SALVAGE }, @@ -1749,7 +1750,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_NOTFOUND_OK(ret); /* Flush the handle and rename the file into place. */ - ret = __wt_sync_and_rename_fp( + ret = __wt_sync_fp_and_rename( session, &fp, WT_BASECONFIG_SET, WT_BASECONFIG); if (0) { @@ -2003,6 +2004,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_sweep_config(session, cfg)); WT_ERR(__wt_verbose_config(session, cfg)); + /* Initialize the OS page size for mmap */ + conn->page_size = __wt_get_vm_pagesize(); + /* Now that we know if verbose is configured, output the version. */ WT_ERR(__wt_verbose( session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING)); @@ -2061,7 +2065,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, * DATABASE HOME, IT'S WHAT WE USE TO DECIDE IF WE'RE CREATING OR NOT. */ WT_ERR(__wt_turtle_init(session)); - WT_ERR(__wt_metadata_open(session)); + WT_ERR(__wt_metadata_cursor(session, NULL)); /* Start the worker threads and run recovery. */ WT_ERR(__wt_connection_workers(session, cfg)); diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index a1d509e75bd..1831aad5895 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 8d16f94c092..72f23b015b7 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. 
+ * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index b47e2550b23..a23350a5e46 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index c6d5b535b86..dedafc2b102 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -119,46 +119,29 @@ __wt_conn_dhandle_find( } /* - * __conn_dhandle_mark_dead -- - * Mark a data handle dead. - */ -static int -__conn_dhandle_mark_dead(WT_SESSION_IMPL *session) -{ - bool evict_reset; - - /* - * Handle forced discard (e.g., when dropping a file). - * - * We need exclusive access to the file -- disable ordinary - * eviction and drain any blocks already queued. - */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); - F_SET(session->dhandle, WT_DHANDLE_DEAD); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - return (0); -} - -/* * __wt_conn_btree_sync_and_close -- * Sync and close the underlying btree handle. 
*/ int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) { + WT_BM *bm; WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool marked_dead, no_schema_lock; + bool evict_reset, marked_dead, no_schema_lock; btree = S2BT(session); + bm = btree->bm; dhandle = session->dhandle; marked_dead = false; if (!F_ISSET(dhandle, WT_DHANDLE_OPEN)) return (0); + /* Ensure that we aren't racing with the eviction server */ + WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); + /* * If we don't already have the schema lock, make it an error to try * to acquire it. The problem is that we are holding an exclusive @@ -191,8 +174,16 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) */ if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { - if (force && (btree->bm == NULL || btree->bm->map == NULL)) { - WT_ERR(__conn_dhandle_mark_dead(session)); + if (force && (bm == NULL || !bm->is_mapped(bm, session))) { + F_SET(session->dhandle, WT_DHANDLE_DEAD); + + /* + * Reset the tree's eviction priority, and the tree is + * evictable by definition. + */ + __wt_evict_priority_clear(session); + F_CLR(S2BT(session), WT_BTREE_NO_EVICTION); + marked_dead = true; } if (!marked_dead || final) @@ -215,6 +206,9 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) err: __wt_spin_unlock(session, &dhandle->close_lock); + if (evict_reset) + __wt_evict_file_exclusive_off(session); + if (no_schema_lock) F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK); @@ -650,8 +644,9 @@ __wt_conn_dhandle_discard_single( F_SET(S2C(session)->cache, WT_CACHE_CLEAR_WALKS); /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_HANDLE_LIST_LOCK(session, - WT_TRET(__conn_dhandle_remove(session, final))); + WT_WITH_HANDLE_LIST_LOCK(session, tret, + tret = __conn_dhandle_remove(session, final)); + WT_TRET(tret); /* * After successfully removing the handle, clean it up. 
@@ -709,6 +704,15 @@ restart: __wt_session_close_cache(session); F_SET(session, WT_SESSION_NO_DATA_HANDLES); + /* + * The connection may have an open metadata cursor handle. We cannot + * close it before now because it's potentially used when discarding + * other open data handles. Close it before discarding the underlying + * metadata handle. + */ + if (session->meta_cursor != NULL) + WT_TRET(session->meta_cursor->close(session->meta_cursor)); + /* Close the metadata file handle. */ while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL) WT_WITH_DHANDLE(session, dhandle, diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index cc4e3ae2681..12b4e87e921 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 1d44d816467..ed226393fb0 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -511,7 +511,7 @@ typedef struct { * write_lsn in LSN order after the buffer is written to the log file. */ int -__wt_log_wrlsn(WT_SESSION_IMPL *session) +__wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -550,6 +550,8 @@ restart: * based on the release LSN, and then look for them in order. */ if (written_i > 0) { + if (yield != NULL) + *yield = 0; WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); /* @@ -660,22 +662,31 @@ __log_wrlsn_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; + int yield; session = arg; conn = S2C(session); + yield = 0; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Write out any log record buffers. 
*/ - WT_ERR(__wt_log_wrlsn(session)); - WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000)); + WT_ERR(__wt_log_wrlsn(session, &yield)); + /* + * If __wt_log_wrlsn did work we want to yield instead of sleep. + */ + if (yield++ < WT_THOUSAND) + __wt_yield(); + else + WT_ERR(__wt_cond_wait( + session, conn->log_wrlsn_cond, 10000)); } /* * On close we need to do this one more time because there could * be straggling log writes that need to be written. */ WT_ERR(__wt_log_force_write(session, 1)); - WT_ERR(__wt_log_wrlsn(session)); + WT_ERR(__wt_log_wrlsn(session, NULL)); if (0) { err: __wt_err(session, ret, "log wrlsn server error"); } @@ -694,12 +705,12 @@ __log_server(void *arg) WT_LOG *log; WT_SESSION_IMPL *session; int freq_per_sec; - bool signalled; + bool locked, signalled; session = arg; conn = S2C(session); log = conn->log; - signalled = false; + locked = signalled = false; /* * Set this to the number of times per second we want to force out the @@ -740,8 +751,22 @@ __log_server(void *arg) /* * Perform log pre-allocation. */ - if (conn->log_prealloc > 0) - WT_ERR(__log_prealloc_once(session)); + if (conn->log_prealloc > 0) { + /* + * Log file pre-allocation is disabled when a + * hot backup cursor is open because we have + * agreed not to rename or remove any files in + * the database directory. + */ + WT_ERR(__wt_readlock( + session, conn->hot_backup_lock)); + locked = true; + if (!conn->hot_backup) + WT_ERR(__log_prealloc_once(session)); + WT_ERR(__wt_readunlock( + session, conn->hot_backup_lock)); + locked = false; + } /* * Perform the archive. 
@@ -768,6 +793,9 @@ __log_server(void *arg) if (0) { err: __wt_err(session, ret, "log server error"); + if (locked) + WT_TRET(__wt_readunlock( + session, conn->hot_backup_lock)); } return (WT_THREAD_RET_VALUE); } diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 4fe1db1c524..58577b4587d 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 31438e10606..9edc6091b10 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -340,8 +340,8 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * any that match the list of object sources. */ if (conn->stat_sources != NULL) { - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_conn_btree_apply( + WT_WITH_HANDLE_LIST_LOCK(session, ret, + ret = __wt_conn_btree_apply( session, false, NULL, __statlog_apply, NULL)); WT_RET(ret); } diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index b9b46f3211c..a15aabdd6fe 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -64,11 +64,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool evict_reset; btree = S2BT(session); dhandle = session->dhandle; - evict_reset = false; /* * Acquire an exclusive lock on the handle and mark it dead. 
@@ -92,9 +90,6 @@ __sweep_expire_one(WT_SESSION_IMPL *session) !__wt_txn_visible_all(session, btree->rec_max_txn)) goto err; - /* Ensure that we aren't racing with the eviction server */ - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - /* * Mark the handle as dead and close the underlying file * handle. Closing the handle decrements the open file count, @@ -102,9 +97,6 @@ __sweep_expire_one(WT_SESSION_IMPL *session) */ ret = __wt_conn_btree_sync_and_close(session, false, true); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); return (ret); @@ -243,7 +235,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __sweep_remove_one(session, dhandle)); if (ret == 0) WT_STAT_FAST_CONN_INCR(session, dh_sweep_remove); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 62ac2203b97..6d5d68000ee 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -80,13 +80,14 @@ __curbackup_close(WT_CURSOR *cursor) int tret; cb = (WT_CURSOR_BACKUP *)cursor; + CURSOR_API_CALL(cursor, session, close, NULL); WT_TRET(__backup_cleanup_handles(session, cb)); WT_TRET(__wt_cursor_close(cursor)); session->bkp_cursor = NULL; - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, tret, tret = __backup_stop(session)); /* Stop the backup. */ WT_TRET(tret); @@ -139,7 +140,8 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, * Start the backup and fill in the cursor's list. Acquire the schema * lock, we need a consistent view when creating a copy. 
*/ - WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg)); + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __backup_start(session, cb, cfg)); WT_ERR(ret); /* __wt_cursor_init is last so we don't have to clean up on error. */ @@ -339,11 +341,8 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) cursor = NULL; - /* - * Open a cursor on the metadata file and copy all of the entries to - * the hot backup file. - */ - WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + /* Copy all of the metadata entries to the hot backup file. */ + WT_RET(__wt_metadata_cursor(session, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &key)); WT_ERR(cursor->get_value(cursor, &value)); @@ -375,13 +374,13 @@ __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) } WT_ERR_NOTFOUND_OK(ret); + WT_ERR(__wt_metadata_cursor_release(session, &cursor)); + /* Build a list of the file objects that need to be copied. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_meta_btree_apply( - session, __backup_list_all_append, NULL)); + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = + __wt_meta_btree_apply(session, __backup_list_all_append, NULL)); -err: if (cursor != NULL) - WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } diff --git a/src/cursor/cur_bulk.c b/src/cursor/cur_bulk.c index b996b934464..c013383fa61 100644 --- a/src/cursor/cur_bulk.c +++ b/src/cursor/cur_bulk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -9,6 +9,25 @@ #include "wt_internal.h" /* + * __bulk_col_keycmp_err -- + * Error routine when column-store keys inserted out-of-order. 
+ */ +static int +__bulk_col_keycmp_err(WT_CURSOR_BULK *cbulk) +{ + WT_CURSOR *cursor; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + cursor = &cbulk->cbt.iface; + + WT_RET_MSG(session, EINVAL, + "bulk-load presented with out-of-order keys: %" PRIu64 " is less " + "than previously inserted key %" PRIu64, + cursor->recno, cbulk->recno); +} + +/* * __curbulk_insert_fix -- * Fixed-length column-store bulk cursor insert. */ @@ -19,6 +38,7 @@ __curbulk_insert_fix(WT_CURSOR *cursor) WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; + uint64_t recno; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; @@ -29,13 +49,63 @@ __curbulk_insert_fix(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); - WT_CURSOR_NEEDVALUE(cursor); + /* + * If the "append" flag was configured, the application doesn't have to + * supply a key, else require a key. + */ + if (F_ISSET(cursor, WT_CURSTD_APPEND)) + recno = cbulk->recno + 1; + else { + WT_CURSOR_CHECKKEY(cursor); + if ((recno = cursor->recno) <= cbulk->recno) + WT_ERR(__bulk_col_keycmp_err(cbulk)); + } + WT_CURSOR_CHECKVALUE(cursor); + + /* + * Insert any skipped records as deleted records, update the current + * record count. + */ + for (; recno != cbulk->recno + 1; ++cbulk->recno) + WT_ERR(__wt_bulk_insert_fix(session, cbulk, true)); + cbulk->recno = recno; + + /* Insert the current record. */ + ret = __wt_bulk_insert_fix(session, cbulk, false); + +err: API_END_RET(session, ret); +} + +/* + * __curbulk_insert_fix_bitmap -- + * Fixed-length column-store bulk cursor insert for bitmaps. 
+ */ +static int +__curbulk_insert_fix_bitmap(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; - WT_ERR(__wt_bulk_insert_fix(session, cbulk)); + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + /* + * Bulk cursor inserts are updates, but don't need auto-commit + * transactions because they are single-threaded and not visible + * until the bulk cursor is closed. + */ + CURSOR_API_CALL(cursor, session, insert, btree); WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + WT_CURSOR_CHECKVALUE(cursor); + + /* Insert the current record. */ + ret = __wt_bulk_insert_fix_bitmap(session, cbulk); + err: API_END_RET(session, ret); } @@ -50,7 +120,7 @@ __curbulk_insert_var(WT_CURSOR *cursor) WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; - bool duplicate; + uint64_t recno; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; @@ -61,45 +131,63 @@ __curbulk_insert_var(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); - - WT_CURSOR_NEEDVALUE(cursor); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); /* - * If this isn't the first value inserted, compare it against the last - * value and increment the RLE count. - * - * Instead of a "first time" variable, I'm using the RLE count, because - * it is only zero before the first row is inserted. + * If the "append" flag was configured, the application doesn't have to + * supply a key, else require a key. 
*/ - duplicate = false; - if (cbulk->rle != 0) { - if (cbulk->last.size == cursor->value.size && - memcmp(cbulk->last.data, cursor->value.data, - cursor->value.size) == 0) { - ++cbulk->rle; - duplicate = true; - } else - WT_ERR(__wt_bulk_insert_var(session, cbulk)); + if (F_ISSET(cursor, WT_CURSTD_APPEND)) + recno = cbulk->recno + 1; + else { + WT_CURSOR_CHECKKEY(cursor); + if ((recno = cursor->recno) <= cbulk->recno) + WT_ERR(__bulk_col_keycmp_err(cbulk)); } + WT_CURSOR_CHECKVALUE(cursor); + + if (!cbulk->first_insert) { + /* + * If not the first insert and the key space is sequential, + * compare the current value against the last value; if the + * same, just increment the RLE count. + */ + if (recno == cbulk->recno + 1 && + cbulk->last.size == cursor->value.size && + memcmp(cbulk->last.data, + cursor->value.data, cursor->value.size) == 0) { + ++cbulk->rle; + ++cbulk->recno; + goto duplicate; + } + + /* Insert the previous key/value pair. */ + WT_ERR(__wt_bulk_insert_var(session, cbulk, false)); + } else + cbulk->first_insert = false; /* - * Save a copy of the value for the next comparison and reset the RLE - * counter. + * Insert any skipped records as deleted records, update the current + * record count and RLE counter. */ - if (!duplicate) { - WT_ERR(__wt_buf_set(session, - &cbulk->last, cursor->value.data, cursor->value.size)); - cbulk->rle = 1; + if (recno != cbulk->recno + 1) { + cbulk->rle = (recno - cbulk->recno) - 1; + WT_ERR(__wt_bulk_insert_var(session, cbulk, true)); } + cbulk->rle = 1; + cbulk->recno = recno; - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + /* Save a copy of the value for the next comparison. */ + ret = __wt_buf_set(session, + &cbulk->last, cursor->value.data, cursor->value.size); +duplicate: err: API_END_RET(session, ret); } /* * __bulk_row_keycmp_err -- - * Error routine when keys inserted out-of-order. + * Error routine when row-store keys inserted out-of-order. 
*/ static int __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk) @@ -116,16 +204,13 @@ __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk) WT_ERR(__wt_scr_alloc(session, 512, &a)); WT_ERR(__wt_scr_alloc(session, 512, &b)); - WT_ERR(__wt_buf_set_printable( - session, a, cursor->key.data, cursor->key.size)); - WT_ERR(__wt_buf_set_printable( - session, b, cbulk->last.data, cbulk->last.size)); - WT_ERR_MSG(session, EINVAL, - "bulk-load presented with out-of-order keys: %.*s compares smaller " - "than previously inserted key %.*s", - (int)a->size, (const char *)a->data, - (int)b->size, (const char *)b->data); + "bulk-load presented with out-of-order keys: %s compares smaller " + "than previously inserted key %s", + __wt_buf_set_printable( + session, cursor->key.data, cursor->key.size, a), + __wt_buf_set_printable( + session, cbulk->last.data, cbulk->last.size, b)); err: __wt_scr_free(session, &a); __wt_scr_free(session, &b); @@ -154,6 +239,7 @@ __curbulk_insert_row(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_CHECKVALUE(cursor); @@ -161,28 +247,20 @@ __curbulk_insert_row(WT_CURSOR *cursor) /* * If this isn't the first key inserted, compare it against the last key * to ensure the application doesn't accidentally corrupt the table. - * - * Instead of a "first time" variable, I'm using the RLE count, because - * it is only zero before the first row is inserted. */ - if (cbulk->rle != 0) { + if (!cbulk->first_insert) { WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &cbulk->last, &cmp)); if (cmp <= 0) WT_ERR(__bulk_row_keycmp_err(cbulk)); - } + } else + cbulk->first_insert = false; - /* - * Save a copy of the key for the next comparison and set the RLE - * counter. - */ + /* Save a copy of the key for the next comparison. 
*/ WT_ERR(__wt_buf_set(session, &cbulk->last, cursor->key.data, cursor->key.size)); - cbulk->rle = 1; - - WT_ERR(__wt_bulk_insert_row(session, cbulk)); - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + ret = __wt_bulk_insert_row(session, cbulk); err: API_END_RET(session, ret); } @@ -208,13 +286,12 @@ __curbulk_insert_row_skip_check(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); - - WT_ERR(__wt_bulk_insert_row(session, cbulk)); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + ret = __wt_bulk_insert_row(session, cbulk); err: API_END_RET(session, ret); } @@ -237,18 +314,25 @@ __wt_curbulk_init(WT_SESSION_IMPL *session, __wt_cursor_set_notsup(c); switch (cbt->btree->type) { case BTREE_COL_FIX: - c->insert = __curbulk_insert_fix; + c->insert = bitmap ? + __curbulk_insert_fix_bitmap : __curbulk_insert_fix; break; case BTREE_COL_VAR: c->insert = __curbulk_insert_var; break; case BTREE_ROW: + /* + * Row-store order comparisons are expensive, so we optionally + * skip them when we know the input is correct. + */ c->insert = skip_sort_check ? __curbulk_insert_row_skip_check : __curbulk_insert_row; break; WT_ILLEGAL_VALUE(session); } + cbulk->first_insert = true; + cbulk->recno = 0; cbulk->bitmap = bitmap; if (bitmap) F_SET(c, WT_CURSTD_RAW); diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index 348cfbab1dd..1b2fec0eb89 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index ccc19717612..2a598c99523 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index e5799fbad05..3324efd96cc 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index b955b292292..8bbe1cc8eda 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -545,8 +545,8 @@ __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, * failing with EBUSY due to a database-wide checkpoint. */ if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) - WT_WITH_CHECKPOINT_LOCK(session, ret = - __wt_session_get_btree_ckpt( + WT_WITH_CHECKPOINT_LOCK(session, ret, + ret = __wt_session_get_btree_ckpt( session, uri, cfg, flags)); else ret = __wt_session_get_btree_ckpt( diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index a909eaece99..6822055131a 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 395da22a80c..2cbefa68c5e 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. 
* All rights reserved. * @@ -383,17 +383,14 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, allocbuf = NULL; if ((cursor = endpoint->cursor) != NULL) { if (entry->index != NULL) { + /* Extract and save the index's logical key. */ cindex = (WT_CURSOR_INDEX *)endpoint->cursor; - if (cindex->index->extractor == NULL) { - WT_ERR(__wt_struct_repack(session, - cindex->child->key_format, - entry->main->value_format, - &cindex->child->key, &endpoint->key, - &allocbuf)); - if (allocbuf != NULL) - F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); - } else - endpoint->key = cindex->child->key; + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, + cindex->iface.key_format, + &cindex->child->key, &endpoint->key, &allocbuf)); + if (allocbuf != NULL) + F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); } else { k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key; if (WT_CURSOR_RECNO(cursor)) { diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index 8f858a5012f..fcb66d3e8b3 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -313,7 +313,6 @@ size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) { char abbrev; - u_char h; if (!force_unicode) { if (isprint(ch) && ch != '\\' && ch != '"') { @@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) *buf++ = 'u'; *buf++ = '0'; *buf++ = '0'; - h = (((u_char)ch) >> 4) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; - h = ((u_char)ch) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; + *buf++ = __wt_hex[(ch & 0xf0) >> 4]; + *buf++ = __wt_hex[ch & 0x0f]; } return (6); } diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index ade9fd18962..35a2d00e6ec 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index 55da93859a6..df66ef34ddd 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -477,8 +477,12 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, cursor->key_format = "S"; cursor->value_format = "S"; - /* Open the file cursor for operations on the regular metadata */ - WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor)); + /* + * Open the file cursor for operations on the regular metadata; don't + * use the existing, cached session metadata cursor, the configuration + * may not be the same. 
+ */ + WT_ERR(__wt_metadata_cursor_open(session, cfg[1], &mdc->file_cursor)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index e1d5b8eb91a..00a6ade21c6 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + wt_off_t size; const char *filename; /* @@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session, if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); - WT_RET(__wt_block_manager_size( - session, filename, &cst->u.dsrc_stats)); + WT_RET(__wt_block_manager_named_size(session, filename, &size)); + cst->u.dsrc_stats.block_size = size; __wt_curstat_dsrc_final(cst); return (0); } @@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, /* * We return the statistics field's offset as the key, and a string - * description, a string value, and a uint64_t value as the value + * description, a string value, and a uint64_t value as the value * columns. */ cursor->key_format = "i"; diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index da38988b6c2..051f36c8854 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index dca72a16ee5..d986577f640 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -758,6 +758,7 @@ err: API_END_RET(session, ret); static int __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) { + WT_DECL_RET; WT_SESSION_IMPL *session; WT_TABLE *table; WT_CURSOR **cp; @@ -776,8 +777,10 @@ __curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[]) /* If the table is incomplete, wait on the table lock and recheck. */ complete = table->cg_complete; - if (!complete) - WT_WITH_TABLE_LOCK(session, complete = table->cg_complete); + if (!complete) { + WT_WITH_TABLE_LOCK(session, ret, complete = table->cg_complete); + WT_RET(ret); + } if (!complete) WT_RET_MSG(session, EINVAL, "Can't use '%s' until all column groups are created", @@ -968,8 +971,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session, WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1])); if (0) { -err: WT_TRET(__curtable_close(cursor)); - *cursorp = NULL; +err: if (*cursorp != NULL) { + WT_TRET(__wt_cursor_close(*cursorp)); + *cursorp = NULL; + } + WT_TRET(__curtable_close(cursor)); } __wt_scr_free(session, &tmp); diff --git a/src/docs/build-javadoc.sh b/src/docs/build-javadoc.sh index 39c9d989b6c..be886937070 100755 --- a/src/docs/build-javadoc.sh +++ b/src/docs/build-javadoc.sh @@ -8,5 +8,5 @@ CLASSPATH=$THRIFT_HOME/libthrift.jar:$SLF4J_JAR javadoc -public -d $DOCS/java \ -stylesheetfile $DOCS/style/javadoc.css \ -use -link http://java.sun.com/j2se/1.5.0/docs/api/ \ -header '<b>WiredTiger API</b><br><font size="-1"> version '$WT_VERSION'</font>' \ - -windowtitle 'WiredTiger Java API' -bottom '<font size=1>Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved.</font>' \ + -windowtitle 'WiredTiger Java API' -bottom '<font size=1>Copyright (c) 2008-2016 MongoDB, Inc. 
All rights reserved.</font>' \ com.wiredtiger com.wiredtiger.util diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 745c5051be3..e2b376d5e3f 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -32,7 +32,7 @@ on success and non-zero on error. The \c wt tool supports several commands. If configured in the underlying database, some commands will run recovery when opening the database. If -the user wants to force recovery on any command, use the \c -r option. +the user wants to force recovery on any command, use the \c -R option. In general, commands that modify the database or tables will run recovery by default and commands that only read data will not run recovery. @@ -46,7 +46,7 @@ opened as a WiredTiger database. See @ref backup for more information, and @ref file_permissions for specifics on the copied file permissions. @subsection util_backup_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] backup [-t uri] directory</code> @subsection util_backup_options Options The following are command-specific options for the \c backup command: @@ -64,7 +64,7 @@ The \c compact command attempts to rewrite the specified table or file to consume less disk space. @subsection util_compact_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> @subsection util_compact_options Options The \c compact command has no command-specific options. @@ -78,7 +78,7 @@ configuration. It is equivalent to a call to WT_SESSION::create with the specified string arguments. 
@subsection util_create_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] create [-c config] uri</code> @subsection util_create_options Options The following are command-specific options for the \c create command: @@ -94,7 +94,7 @@ The \c drop command drops the specified \c uri. It is equivalent to a call to WT_SESSION::drop with the "force" configuration argument. @subsection util_drop_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] drop uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] drop uri</code> @subsection util_drop_options Options The \c drop command has no command-specific options. @@ -109,7 +109,7 @@ which can be re-loaded into a new table using the \c load command. See @subpage dump_formats for details of the dump file formats. @subsection util_dump_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] dump [-jrx] [-c checkpoint] [-f output] uri</code> @subsection util_dump_options Options The following are command-specific options for the \c dump command: @@ -143,7 +143,7 @@ the database. If a URI is specified as an argument, only information about that data source is printed. @subsection util_list_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] list [-cv] [uri]</code> @subsection util_list_options Options The following are command-specific options for the \c list command: @@ -170,7 +170,7 @@ table will be overwritten by the new data (use the \c -n option to make an attempt to overwrite existing data return an error). 
@subsection util_load_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> @subsection util_load_options Options The following are command-specific options for the \c load command: @@ -244,7 +244,7 @@ row-store table or file already exists, data in the table or file will be overwritten by the new data. @subsection util_loadtext_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> @subsection util_loadtext_options Options The following are command-specific options for the \c loadtext command: @@ -260,7 +260,7 @@ Display the database log. The \c printlog command outputs the database log. @subsection util_printlog_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] printlog [-p] [-f output]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] printlog [-x] [-f output]</code> @subsection util_printlog_options Options The following are command-specific options for the \c printlog command: @@ -269,8 +269,9 @@ The following are command-specific options for the \c printlog command: By default, the \c printlog command output is written to the standard output; the \c -f option re-directs the output to the specified file. -@par <code>-p</code> -Display the log in a printable format. +@par <code>-x</code> +Keys and value items in the log are printed in hex format in addition +to the default string format. <hr> @section util_read wt read @@ -283,7 +284,7 @@ with string or record number keys and string values. The \c read command exits non-zero if a specified record is not found. 
@subsection util_read_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] read uri key ...</code> @subsection util_read_options Options The \c read command has no command-specific options. @@ -295,7 +296,7 @@ Rename a table or file. The \c rename command renames the specified table or file. @subsection util_rename_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> @subsection util_rename_options Options The \c rename command has no command-specific options. @@ -309,7 +310,7 @@ data that cannot be recovered. Underlying files are re-written in place, overwriting the original file contents. @subsection util_salvage_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> @subsection util_salvage_options Options The following are command-specific options for the \c salvage command: @@ -327,7 +328,7 @@ The \c stat command outputs run-time statistics for the WiredTiger engine, or, if specified, for the URI on the command-line. @subsection util_stat_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] stat [-f] [uri]</code> @subsection util_stat_options Options The following are command-specific options for the \c stat command: @@ -345,7 +346,7 @@ success if the data source is up-to-date, and failure if the data source cannot be upgraded. 
@subsection util_upgrade_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> @subsection util_upgrade_options Options The \c upgrade command has no command-specific options. @@ -359,7 +360,7 @@ success if the data source is correct, and failure if the data source is corrupted. @subsection util_verify_synopsis Synopsis -<code>wt [-rVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> @subsection util_verify_options Options The \c verify command has no command-specific options. @@ -381,9 +382,9 @@ Attempting to overwrite an already existing record will fail. @subsection util_write_synopsis Synopsis <code> -wt [-rVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ... +wt [-RVv] [-C config] [-E secretkey ] [-h directory] write -a uri value ... <br> -wt [-rVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ... +wt [-RVv] [-C config] [-E secretkey ] [-h directory] write [-o] uri key value ... </code> @subsection util_write_options Options diff --git a/src/docs/license.dox b/src/docs/license.dox index f34ebad19a7..febced2c6af 100644 --- a/src/docs/license.dox +++ b/src/docs/license.dox @@ -13,6 +13,19 @@ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the <b>GNU General Public License</b></a> for details. +Additionally, portions of the WiredTiger distribution are distributed +under the terms of the +<a href="http://www.opensource.org/licenses/BSD-3-Clause"> +BSD-3-Clause License</a>. These files have +<a href="http://www.opensource.org/licenses/BSD-3-Clause"> +BSD-3-Clause License</a> +copyright notices, and may be freely used and redistributed under the +terms of that notice. + +Additionally, portions of the WiredTiger distribution are public domain +software. 
Public domain files have notices releasing the software into +the public domain and may be freely used and redistributed. + For a license to use the WiredTiger software under conditions other than those described above, or for technical support for this software, please contact MongoDB, Inc. at @@ -28,7 +41,7 @@ of the WiredTiger library should comply with these copyrights. @hrow{Distribution Files, Copyright Holder, License} @row{\c src/include/bitstring.i, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} @row{\c src/include/queue.h, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} -@row{\c src/os_posix/getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} +@row{\c src/os_posix/os_getopt.c, University of California\, Berkeley, <a href="http://www.opensource.org/licenses/BSD-3-Clause">BSD-3-Clause License</a>} @row{\c src/support/hash_city.c, Google\, Inc., <a href="http://www.opensource.org/licenses/MIT">The MIT License</a>} @row{\c src/support/hash_fnv.c, Authors, Public Domain} </table> @@ -63,10 +76,4 @@ selected portions of the WiredTiger sources, please review the copyright notices and LICENSE files included in the WiredTiger distribution for the terms and conditions of such redistribution. -@section license_public_domain Public domain software - -Many portions of the WiredTiger distribution are public domain software. -Public domain files have notices releasing the software into the public -domain and may be freely used and redistributed. - */ diff --git a/src/docs/programming.dox b/src/docs/programming.dox index f005f6d3e2d..5d79edd660b 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -40,11 +40,12 @@ each of which is ordered by one or more columns. 
- @subpage compact - @subpage checkpoint - @subpage durability +- @subpage cursor_join +- @subpage cursor_log - @ref transaction_named_snapshots +- @subpage rebalance - @subpage shared_cache - @subpage statistics -- @subpage cursor_join -- @subpage cursor_log - @subpage_single upgrade @m_if{c} diff --git a/src/docs/rebalance.dox b/src/docs/rebalance.dox new file mode 100644 index 00000000000..a6acfe07ef5 --- /dev/null +++ b/src/docs/rebalance.dox @@ -0,0 +1,14 @@ +/*! @m_page{{c,java},rebalance,Rebalance} + +The WT_SESSION::rebalance method can be used to rebalance data sources' +underlying btrees. If a tree has become unbalanced (that is, one part of +the tree is excessively deep), WT_SESSION::rebalance rewrites the tree +as a balanced tree. + +The data source must be quiescent. + +The WT_SESSION::rebalance method should never be needed, as WiredTiger +btrees are maintained as balanced trees. It is only provided as a tool +to handle the unexpected. + + */ diff --git a/src/docs/schema.dox b/src/docs/schema.dox index 66f8046965e..65ad7f6919c 100644 --- a/src/docs/schema.dox +++ b/src/docs/schema.dox @@ -89,6 +89,10 @@ struct module to describe the types of columns in a table: The \c 'r' type is used for record number keys in column stores. It is otherwise identical to the \c 'Q' type. +The \c 's' type is used for fixed-length strings. If it is preceded by +a size, that indicates the number of bytes to store; the default is a +length of 1 byte. + The \c 'S' type is encoded as a C language string terminated by a NUL character. 
@m_if{java} diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 86af82d8fd2..80597302cbb 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -66,6 +66,7 @@ NoSQL OPTYPE PRELOAD README +Rebalance RedHat RepMgr Riak @@ -120,6 +121,7 @@ boolean booleans br btree +btrees bufs builtin builtins @@ -378,6 +380,7 @@ readlock realclean realloc realloc'd +rebalance recno recnoN recnum diff --git a/src/docs/style/footer.html b/src/docs/style/footer.html index 83f1254fa42..e5a7b30eef5 100644 --- a/src/docs/style/footer.html +++ b/src/docs/style/footer.html @@ -3,13 +3,13 @@ <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> <ul> $navpath - <li class="footer">Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.</li> + <li class="footer">Copyright (c) 2008-2016 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information.</li> </ul> </div> <!--END GENERATE_TREEVIEW--> <!--BEGIN !GENERATE_TREEVIEW--> <hr class="footer"/><address class="footer"><small> -Copyright (c) 2008-2015 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information. +Copyright (c) 2008-2016 MongoDB, Inc. All rights reserved. Contact <a href="mailto:info@wiredtiger.com">info@wiredtiger.com</a> for more information. </small></address> <!--END !GENERATE_TREEVIEW--> </body> diff --git a/src/docs/tools/doxfilter.py b/src/docs/tools/doxfilter.py index 8ca68c0a1fe..b2d5f857df1 100755 --- a/src/docs/tools/doxfilter.py +++ b/src/docs/tools/doxfilter.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2014-2016 MongoDB, Inc. # Public Domain 2008-2014 WiredTiger, Inc. # # This is free and unencumbered software released into the public domain. 
diff --git a/src/docs/tools/fixlinks.py b/src/docs/tools/fixlinks.py index 84f56d219f8..7163246e3bd 100755 --- a/src/docs/tools/fixlinks.py +++ b/src/docs/tools/fixlinks.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2014-2016 MongoDB, Inc. # Public Domain 2008-2014 WiredTiger, Inc. # # This is free and unencumbered software released into the public domain. diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index ef2f5bf15a0..7e670541e7d 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 2.6.1</b> (current), +@row{<b>WiredTiger 2.7.0</b> (current), + <a href="releases/wiredtiger-2.7.0.tar.bz2"><b>[Release package]</b></a>, + <a href="2.7.0/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 2.6.1</b> (previous), <a href="releases/wiredtiger-2.6.1.tar.bz2"><b>[Release package]</b></a>, <a href="2.6.1/index.html"><b>[Documentation]</b></a>} -@row{<b>WiredTiger 2.5.3</b> (previous), - <a href="releases/wiredtiger-2.5.3.tar.bz2"><b>[Release package]</b></a>, - <a href="2.5.3/index.html"><b>[Documentation]</b></a>} @row{<b>Development branch</b>, <a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>, <a href="develop/index.html"><b>[Documentation]</b></a>} diff --git a/src/docs/tune-bulk-load.dox b/src/docs/tune-bulk-load.dox index 8ee1061c76c..f5d28436dca 100644 --- a/src/docs/tune-bulk-load.dox +++ b/src/docs/tune-bulk-load.dox @@ -15,8 +15,12 @@ WT_CURSOR::close methods. Bulk load inserts are non-transactional: they cannot be rolled back and ignore the transactional state of the WT_SESSION in which they are opened. -When bulk-loading row-store objects, keys must be loaded in sorted -order. +When doing a bulk-load insert, keys must be inserted in sorted order. 
+When doing a bulk-load insert into a column-store object, any skipped +records will be created as already-deleted rows. If a column-store +bulk-load cursor is configured with \c append, the cursor key will be +ignored and each inserted row will be assigned the next sequential +record number. When using the \c sort utility on a Linux or other POSIX-like system to pre-sort keys, the locale specified by the environment affects the sort diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 34f391b27f1..e0239919f0b 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,5 +1,33 @@ /*! @page upgrading Upgrading WiredTiger applications +@section version_271 Upgrading to Version 2.7.1 +<dl> +<dt>Column-store bulk-load cursors</dt> +<dd> +Historically, bulk-load of a column-store object ignored any key set in +the cursor and automatically assigned each inserted row the next +sequential record number for its key. In the 2.7.1 release, column-store +objects match row-store behavior and require the cursor key be set +before an insert. (This also allows sparse tables to be created +in column-store objects, any skipped records are created as +already-deleted rows.) To match the previous behavior, specify the +\c append configuration string when opening the column-store bulk-load +cursor; this causes the cursor's key to be ignored and each inserted row +will be assigned the next record number. +</dd> + +<dt>Change to WT_SESSION::truncate with URI</dt> +<dd> +If using the WT_SESSION::truncate API with a file: URI for a full table +truncate, underlying algorithmic changes result in some visible differences. +This call can now return WT_ROLLBACK. Applications should be prepared to +handle this error. This method no longer requires exclusive access to the +table. Also the underlying disk space may not be immediately +reclaimed when the call returns. The performance of this API may differ +from earlier releases.
+</dd> + +</dl><hr> @section version_270 Upgrading to Version 2.7.0 <dl> diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index fb46a91a62c..64e25978dd8 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -173,6 +173,10 @@ taken to do the drop. @par icount (unsigned int, default=5000) number of records to initially populate. If multiple tables are configured the count is spread evenly across all tables. +@par idle_table_cycle (unsigned int, default=0) +Enable regular create and drop of idle tables, value is the maximum +number of seconds a create or drop is allowed before flagging an +error. Default 0 which means disabled. @par index (boolean, default=false) Whether to create an index on the value field. @par insert_rmw (boolean, default=false) @@ -182,11 +186,17 @@ key size @par log_partial (boolean, default=false) perform partial logging on first table only. @par min_throughput (unsigned int, default=0) -abort if any throughput measured is less than this amount. Requires +notify if any throughput measured is less than this amount. Aborts or +prints warning based on min_throughput_fatal setting. Requires sample_interval to be configured +@par min_throughput_fatal (boolean, default=false) +print warning (false) or abort (true) of min_throughput failure. @par max_latency (unsigned int, default=0) -abort if any latency measured exceeds this number of -milliseconds.Requires sample_interval to be configured +notify if any latency measured exceeds this number of +milliseconds. Aborts or prints warning based on max_latency_fatal +setting. Requires sample_interval to be configured +@par max_latency_fatal (boolean, default=false) +print warning (false) or abort (true) of max_latency failure. @par pareto (unsigned int, default=0) use pareto distribution for random numbers. Zero to disable, otherwise a percentage indicating how aggressive the distribution should be.
@@ -200,6 +210,8 @@ if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value +@par read_range (unsigned int, default=0) +scan a range of keys after each search @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) @@ -230,8 +242,9 @@ threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may -be specified; for example, a more complex threads configuration might -be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' +be specified per threads configuration; for example, a more complex +threads configuration might be +'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'reads', 'inserts', diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index c5f6ae3d4d1..641864a8baa 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -85,7 +85,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || __wt_page_can_evict(session, ref, NULL)); - __wt_evict_page_clean_update(session, ref, true); + __wt_ref_out(session, ref); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index ac481581c23..0536a06bc22 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -27,9 +27,12 @@ static int __evict_server_work(WT_SESSION_IMPL *); static inline uint64_t __evict_read_gen(const WT_EVICT_ENTRY *entry) { + WT_BTREE *btree; WT_PAGE *page; uint64_t read_gen; + btree = entry->btree; + /* Never prioritize empty slots. */ if (entry->ref == NULL) return (UINT64_MAX); @@ -40,15 +43,23 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry) if (page->read_gen == WT_READGEN_OLDEST) return (WT_READGEN_OLDEST); + /* + * Any leaf page from a dead tree is a great choice (not internal pages, + * they may have children and are not yet evictable). + */ + if (!WT_PAGE_IS_INTERNAL(page) && + F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + return (WT_READGEN_OLDEST); + /* Any empty page (leaf or internal), is a good choice. */ if (__wt_page_is_empty(page)) return (WT_READGEN_OLDEST); /* - * Skew the read generation for internal pages, we prefer to evict leaf - * pages. + * The base read-generation is skewed by the eviction priority. + * Internal pages are also adjusted, we prefer to evict leaf pages. 
*/ - read_gen = page->read_gen + entry->btree->evict_priority; + read_gen = page->read_gen + btree->evict_priority; if (WT_PAGE_IS_INTERNAL(page)) read_gen += WT_EVICT_INT_SKEW; @@ -727,6 +738,10 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) F_CLR(session, WT_SESSION_CLEAR_EVICT_WALK); + /* An error is unexpected - flag the failure. */ + if (ret != 0) + __wt_err(session, ret, "Failed to clear eviction walk point"); + return (ret); } @@ -760,20 +775,18 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) { WT_BTREE *btree; WT_CACHE *cache; + WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem; + *evict_resetp = false; + btree = S2BT(session); cache = S2C(session)->cache; - /* - * If the file isn't evictable, there's no work to do. - */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - *evict_resetp = false; + /* If the file wasn't evictable, there's no work to do. */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) return (0); - } - *evict_resetp = true; /* * Hold the walk lock to set the "no eviction" flag: no new pages from @@ -784,7 +797,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) __wt_spin_unlock(session, &cache->evict_walk_lock); /* Clear any existing LRU eviction walk for the file. */ - WT_RET(__evict_request_walk_clear(session)); + WT_ERR(__evict_request_walk_clear(session)); /* Hold the evict lock to remove any queued pages from this file. 
*/ __wt_spin_lock(session, &cache->evict_lock); @@ -806,7 +819,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) while (btree->evict_busy > 0) __wt_yield(); + *evict_resetp = true; return (0); + +err: F_CLR(btree, WT_BTREE_NO_EVICTION); + return (ret); } /* @@ -852,9 +869,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; - WT_EVICT_ENTRY *evict; uint64_t cutoff; - uint32_t candidates, entries, i; + uint32_t candidates, entries; cache = S2C(session)->cache; @@ -872,6 +888,14 @@ __evict_lru_walk(WT_SESSION_IMPL *session) while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL) --entries; + /* + * If we have more entries than the maximum tracked between walks, + * clear them. Do this before figuring out how many of the entries are + * candidates so we never end up with more candidates than entries. + */ + while (entries > WT_EVICT_WALK_BASE) + __evict_list_clear(session, &cache->evict_queue[--entries]); + cache->evict_entries = entries; if (entries == 0) { @@ -916,15 +940,6 @@ __evict_lru_walk(WT_SESSION_IMPL *session) cache->evict_candidates = candidates; } - /* If we have more than the minimum number of entries, clear them. 
*/ - if (cache->evict_entries > WT_EVICT_WALK_BASE) { - for (i = WT_EVICT_WALK_BASE, evict = cache->evict_queue + i; - i < cache->evict_entries; - i++, evict++) - __evict_list_clear(session, evict); - cache->evict_entries = WT_EVICT_WALK_BASE; - } - cache->evict_current = cache->evict_queue; __wt_spin_unlock(session, &cache->evict_lock); @@ -982,6 +997,7 @@ __evict_walk(WT_SESSION_IMPL *session) conn = S2C(session); cache = S2C(session)->cache; + btree = NULL; dhandle = NULL; dhandle_locked = incr = false; retries = 0; @@ -1041,6 +1057,7 @@ retry: while (slot < max_entries && ret == 0) { (void)__wt_atomic_subi32( &dhandle->session_inuse, 1); incr = false; + cache->evict_file_next = NULL; } dhandle = TAILQ_NEXT(dhandle, q); } @@ -1096,6 +1113,9 @@ retry: while (slot < max_entries && ret == 0) { * exclusive access when a handle is being closed. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + /* Remember the file to visit first, next loop. */ + cache->evict_file_next = dhandle; + WT_WITH_DHANDLE(session, dhandle, ret = __evict_walk_file(session, &slot)); WT_ASSERT(session, session->split_gen == 0); @@ -1115,9 +1135,6 @@ retry: while (slot < max_entries && ret == 0) { } if (incr) { - /* Remember the file we should visit first, next loop. */ - cache->evict_file_next = dhandle; - WT_ASSERT(session, dhandle->session_inuse > 0); (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = false; @@ -1170,7 +1187,7 @@ __evict_init_candidate( evict->ref = ref; evict->btree = S2BT(session); - /* Mark the page on the list */ + /* Mark the page on the list; set last to flush the other updates. 
*/ F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU); } @@ -1197,15 +1214,17 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) conn = S2C(session); btree = S2BT(session); cache = conn->cache; - start = cache->evict_queue + *slotp; - end = WT_MIN(start + WT_EVICT_WALK_PER_FILE, - cache->evict_queue + cache->evict_slots); internal_pages = restarts = 0; enough = false; - walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | - WT_READ_NO_GEN | WT_READ_NO_WAIT; + start = cache->evict_queue + *slotp; + end = start + WT_EVICT_WALK_PER_FILE; + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + end > cache->evict_queue + cache->evict_slots) + end = cache->evict_queue + cache->evict_slots; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; if (F_ISSET(cache, WT_CACHE_WALK_REVERSE)) walk_flags |= WT_READ_PREV; @@ -1247,7 +1266,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) continue; /* Pages we no longer need (clean or dirty), are found money. */ - if (__wt_page_is_empty(page)) + if (__wt_page_is_empty(page) || + F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; /* Skip clean pages if appropriate. */ @@ -1508,8 +1528,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) if (txn_busy && pct_full < 100) return (0); - if (busy == 1) - txn_busy = 1; + if (busy) + txn_busy = true; /* Wake the eviction server if we need to do work. */ WT_RET(__wt_evict_server_wake(session)); @@ -1570,6 +1590,26 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) /* NOTREACHED */ } +/* + * __wt_evict_priority_set -- + * Set a tree's eviction priority. + */ +void +__wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) +{ + S2BT(session)->evict_priority = v; +} + +/* + * __wt_evict_priority_clear -- + * Clear a tree's eviction priority. 
+ */ +void +__wt_evict_priority_clear(WT_SESSION_IMPL *session) +{ + S2BT(session)->evict_priority = 0; +} + #ifdef HAVE_DIAGNOSTIC /* * __wt_cache_dump -- diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 1cdf07a9a55..72c07eaa05d 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -8,8 +8,9 @@ #include "wt_internal.h" -static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool); -static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool *, bool); +static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool); +static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool); +static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool *, bool); /* * __evict_exclusive_clear -- @@ -117,7 +118,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Pages that belong to dead trees never write back to disk * and can't support page splits. */ - WT_ERR(__wt_evict_page_clean_update( + WT_ERR(__evict_page_clean_update( session, ref, tree_dead || closing)); else WT_ERR(__evict_page_dirty_update(session, ref, closing)); @@ -200,12 +201,11 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) } /* - * __wt_evict_page_clean_update -- + * __evict_page_clean_update -- * Update a clean page's reference on eviction. */ -int -__wt_evict_page_clean_update( - WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +static int +__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { WT_DECL_RET; diff --git a/src/include/api.h b/src/include/api.h index 4821b450f9e..c6a5af40698 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/include/async.h b/src/include/async.h index fb9a64e774d..7a415a4a17a 100644 --- a/src/include/async.h +++ b/src/include/async.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/bitstring.i b/src/include/bitstring.i index 5449ffe6209..0d30e55d1ef 100644 --- a/src/include/bitstring.i +++ b/src/include/bitstring.i @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. diff --git a/src/include/block.h b/src/include/block.h index 4bff6c82783..27a140b73a4 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -173,6 +173,7 @@ struct __wt_bm { int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *); int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *); int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); int (*read) (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); @@ -182,6 +183,7 @@ struct __wt_bm { int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); int (*salvage_valid) (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool); + int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *); int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool); int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); @@ -244,7 +246,10 @@ struct __wt_block { bool ckpt_inprogress;/* Live checkpoint in progress */ /* Compaction support */ - int compact_pct_tenths; /* Percent to compact */ + int compact_pct_tenths; /* Percent to compact */ + uint64_t compact_pages_reviewed;/* Pages reviewed */ + uint64_t compact_pages_skipped; /* Pages skipped */ + uint64_t compact_pages_written; /* Pages rewritten */ /* Salvage support */ wt_off_t slvg_off; /* Salvage file offset */ diff --git a/src/include/bloom.h b/src/include/bloom.h index a673ee9add2..ddc2d64a118 100644 --- a/src/include/bloom.h +++ b/src/include/bloom.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/btmem.h b/src/include/btmem.h index 6ee74c61a38..cfbd87f0cae 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -305,7 +305,7 @@ struct __wt_page_modify { struct { /* * Appended items to column-stores: there is only a single one - * of these per column-store tree. + * of these active at a time per column-store tree. */ WT_INSERT_HEAD **append; @@ -319,9 +319,18 @@ struct __wt_page_modify { * huge. */ WT_INSERT_HEAD **update; + + /* + * Split-saved last column-store page record. If a column-store + * page is split, we save the first record number moved so that + * during reconciliation we know the page's last record and can + * write any implicitly created deleted records for the page. + */ + uint64_t split_recno; } leaf; #define mod_append u2.leaf.append #define mod_update u2.leaf.update +#define mod_split_recno u2.leaf.split_recno } u2; /* @@ -478,7 +487,7 @@ struct __wt_page { #define pg_row_ins u.row.ins #undef pg_row_upd #define pg_row_upd u.row.upd -#define pg_row_entries u.row.entries +#undef pg_row_entries #define pg_row_entries u.row.entries /* Fixed-length column-store leaf page. */ @@ -544,8 +553,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ -#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */ +#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */ +#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ #define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ @@ -1049,7 +1058,7 @@ struct __wt_insert_head { uint64_t __prev_split_gen = (session)->split_gen; \ if (__prev_split_gen == 0) \ do { \ - WT_PUBLISH((session)->split_gen, \ + WT_PUBLISH((session)->split_gen, \ S2C(session)->split_gen); \ } while ((session)->split_gen != S2C(session)->split_gen) diff --git a/src/include/btree.h 
b/src/include/btree.h index a1d8e395cfc..703de0f2fc6 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -153,16 +153,18 @@ struct __wt_btree { #define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ #define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ #define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ -#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */ +#define WT_BTREE_REBALANCE 0x04000 /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 0x08000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x10000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x20000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x40000 /* Handle is for verify */ uint32_t flags; }; /* Flags that make a btree handle special (not for normal use). */ #define WT_BTREE_SPECIAL_FLAGS \ - (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY) + (WT_BTREE_BULK | WT_BTREE_REBALANCE | \ + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY) /* * WT_SALVAGE_COOKIE -- diff --git a/src/include/btree.i b/src/include/btree.i index 23e0dfea2cd..94111397abd 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -1046,15 +1046,16 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * do it without making the appending threads wait. See if it's worth * doing a split to let the threads continue before doing eviction. 
* - * Ignore anything other than large, dirty row-store leaf pages. The - * split code only supports row-store pages, and we depend on the page - * being dirty for correctness (the page must be reconciled again + * Ignore anything other than large, dirty leaf pages. We depend on the + * page being dirty for correctness (the page must be reconciled again * before being evicted after the split, information from a previous * reconciliation will be wrong, so we can't evict immediately). */ - if (page->type != WT_PAGE_ROW_LEAF || - page->memory_footprint < btree->splitmempage || - !__wt_page_is_modified(page)) + if (page->memory_footprint < btree->splitmempage) + return (false); + if (WT_PAGE_IS_INTERNAL(page)) + return (false); + if (!__wt_page_is_modified(page)) return (false); /* @@ -1071,9 +1072,11 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) #define WT_MIN_SPLIT_COUNT 30 #define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ - ins_head = page->pg_row_entries == 0 ? + ins_head = page->type == WT_PAGE_ROW_LEAF ? + (page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : + WT_COL_APPEND(page); if (ins_head == NULL) return (false); for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; @@ -1280,8 +1283,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * coupling up/down the tree. */ static inline int -__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, - WT_REF *want, uint32_t flags +__wt_page_swap_func( + WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif @@ -1310,20 +1313,40 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, #endif ); - /* Expected failures: page not found or restart. 
*/ - if (ret == WT_NOTFOUND || ret == WT_RESTART) - return (ret); + /* + * Expected failures: page not found or restart. Our callers list the + * errors they're expecting to handle. + */ + if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) + return (WT_RESTART); - /* Discard the original held page. */ + /* Discard the original held page on either success or error. */ acquired = ret == 0; WT_TRET(__wt_page_release(session, held, flags)); + /* Fast-path expected success. */ + if (ret == 0) + return (0); + /* - * If there was an error discarding the original held page, discard - * the acquired page too, keeping it is never useful. + * If there was an error at any point that our caller isn't prepared to + * handle, discard any page we acquired. */ - if (acquired && ret != 0) + if (acquired) WT_TRET(__wt_page_release(session, want, flags)); + + /* + * If we're returning an error, don't let it be one our caller expects + * to handle as returned by page-in: the expectation includes the held + * page not having been released, and that's not the case. + */ + if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND) + return (EINVAL); + if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART) + return (EINVAL); + return (ret); } @@ -1437,17 +1460,54 @@ __wt_split_intl_race( * * There's a page-split race when we walk the tree: if we're splitting * an internal page into its parent, we update the parent's page index - * and then update the page being split, and it's not an atomic update. - * A thread could read the parent page's original page index, and then - * read the page's replacement index. Because internal page splits work - * by replacing the original page with the initial part of the original - * page, the result of this race is we will have a key that's past the - * end of the current page, and the parent's page index will have moved. 
+ * before updating the split page's page index, and it's not an atomic + * update. A thread can read the parent page's original page index and + * then read the split page's replacement index. + * + * Because internal page splits work by truncating the original page to + * the initial part of the original page, the result of this race is we + * will have a search key that points past the end of the current page. + * This is only an issue when we search past the end of the page, if we + * find a WT_REF in the page with the namespace we're searching for, we + * don't care if the WT_REF moved or not while we were searching, we + * have the correct page. + * + * For example, imagine an internal page with 3 child pages, with the + * namespaces a-f, g-h and i-j; the first child page splits. The parent + * starts out with the following page-index: + * + * | ... | a | g | i | ... | + * + * which changes to this: + * + * | ... | a | c | e | g | i | ... | + * + * The child starts out with the following page-index: + * + * | a | b | c | d | e | f | + * + * which changes to this: + * + * | a | b | + * + * The thread searches the original parent page index for the key "cat", + * it couples to the "a" child page; if it uses the replacement child + * page index, it will search past the end of the page and couple to the + * "b" page, which is wrong. + * + * To detect the problem, we remember the parent page's page index used + * to descend the tree. Whenever we search past the end of a page, we + * check to see if the parent's page index has changed since our use of + * it during descent. As the problem only appears if we read the split + * page's replacement index, the parent page's index must already have + * changed, ensuring we detect the problem. * - * It's also possible a thread could read the parent page's replacement - * page index, and then read the page's original index. 
Because internal - * splits work by truncating the original page, the original page's old - * content is compatible, this isn't a problem and we ignore this race. + * It's possible for the opposite race to happen (a thread could read + * the parent page's replacement page index and then read the split + * page's original index). This isn't a problem because internal splits + * work by truncating the split page, so the split page search is for + * content the split page retains after the split, and we ignore this + * race. */ WT_INTL_INDEX_GET(session, parent, pindex); return (pindex != saved_pindex); diff --git a/src/include/btree_cmp.i b/src/include/btree_cmp.i index 8a7fe19a22f..1993c1be293 100644 --- a/src/include/btree_cmp.i +++ b/src/include/btree_cmp.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/buf.i b/src/include/buf.i index b8849396f01..95d945ec6d3 100644 --- a/src/include/buf.i +++ b/src/include/buf.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -92,18 +92,6 @@ __wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s) } /* - * __wt_buf_set_printable -- - * Set the contents of the buffer to a printable representation of a - * byte string. - */ -static inline int -__wt_buf_set_printable( - WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size) -{ - return (__wt_raw_to_esc_hex(session, from_arg, size, buf)); -} - -/* * __wt_buf_free -- * Free a buffer. */ diff --git a/src/include/cache.h b/src/include/cache.h index a0440f23a00..a3961d6043e 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. 
* All rights reserved. * diff --git a/src/include/cache.i b/src/include/cache.i index 7cbd72853c3..ee13eee84c5 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/cell.i b/src/include/cell.i index 1410b30bb82..481d2a29764 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/column.i b/src/include/column.i index fc1f372b2a9..9f3e2101f6f 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead, continue; } + /* + * When no exact match is found, the search returns the smallest + * key larger than the searched-for key, or the largest key + * smaller than the searched-for key, if there is no larger key. + * Our callers depend on that: specifically, the fixed-length + * column store cursor code interprets returning a key smaller + * than the searched-for key to mean the searched-for key is + * larger than any key on the page. Don't change that behavior, + * things will break. + */ ins_recno = WT_INSERT_RECNO(ret_ins); cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; @@ -204,9 +214,9 @@ __col_var_last_recno(WT_PAGE *page) WT_COL_RLE *repeat; /* - * If there's an append list (the last page), then there may be more - * records on the page. This function ignores those records, so our - * callers have to handle that explicitly, if they care. + * If there's an append list, there may be more records on the page. 
+ * This function ignores those records, our callers must handle that + * explicitly, if they care. */ if (page->pg_var_nrepeats == 0) return (page->pg_var_entries == 0 ? 0 : @@ -225,9 +235,9 @@ static inline uint64_t __col_fix_last_recno(WT_PAGE *page) { /* - * If there's an append list (the last page), then there may be more - * records on the page. This function ignores those records, so our - * callers have to handle that explicitly, if they care. + * If there's an append list, there may be more records on the page. + * This function ignores those records, our callers must handle that + * explicitly, if they care. */ return (page->pg_fix_entries == 0 ? 0 : page->pg_fix_recno + (page->pg_fix_entries - 1)); @@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop) start_recno = repeat->recno + repeat->rle; } - if (recno >= start_recno + (page->pg_var_entries - start_indx)) + /* + * !!! + * The test could be written more simply as: + * + * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * + * It's split into two parts because the simpler test will overflow if + * searching for large record numbers. + */ + if (recno >= start_recno && + recno - start_recno >= page->pg_var_entries - start_indx) return (NULL); return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); diff --git a/src/include/compact.h b/src/include/compact.h index 0698bf7b1a4..2bba52e7173 100644 --- a/src/include/compact.h +++ b/src/include/compact.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/config.h b/src/include/config.h index e836abaccba..e63db0e76cf 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -72,25 +72,26 @@ struct __wt_config_parser_impl { #define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20 #define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21 #define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22 -#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 23 -#define WT_CONFIG_ENTRY_WT_SESSION_rename 24 -#define WT_CONFIG_ENTRY_WT_SESSION_reset 25 -#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 26 -#define WT_CONFIG_ENTRY_WT_SESSION_salvage 27 -#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 28 -#define WT_CONFIG_ENTRY_WT_SESSION_strerror 29 -#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 30 -#define WT_CONFIG_ENTRY_WT_SESSION_truncate 31 -#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 32 -#define WT_CONFIG_ENTRY_WT_SESSION_verify 33 -#define WT_CONFIG_ENTRY_colgroup_meta 34 -#define WT_CONFIG_ENTRY_file_meta 35 -#define WT_CONFIG_ENTRY_index_meta 36 -#define WT_CONFIG_ENTRY_table_meta 37 -#define WT_CONFIG_ENTRY_wiredtiger_open 38 -#define WT_CONFIG_ENTRY_wiredtiger_open_all 39 -#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 40 -#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 41 +#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 23 +#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 24 +#define WT_CONFIG_ENTRY_WT_SESSION_rename 25 +#define WT_CONFIG_ENTRY_WT_SESSION_reset 26 +#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 27 +#define WT_CONFIG_ENTRY_WT_SESSION_salvage 28 +#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 29 +#define WT_CONFIG_ENTRY_WT_SESSION_strerror 30 +#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 31 +#define WT_CONFIG_ENTRY_WT_SESSION_truncate 32 +#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33 +#define WT_CONFIG_ENTRY_WT_SESSION_verify 34 +#define WT_CONFIG_ENTRY_colgroup_meta 35 +#define WT_CONFIG_ENTRY_file_meta 36 +#define WT_CONFIG_ENTRY_index_meta 37 +#define WT_CONFIG_ENTRY_table_meta 38 +#define WT_CONFIG_ENTRY_wiredtiger_open 39 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 40 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 41 +#define 
WT_CONFIG_ENTRY_wiredtiger_open_usercfg 42 /* * configuration section: END * DO NOT EDIT: automatically built by dist/flags.py. diff --git a/src/include/connection.h b/src/include/connection.h index 2367f5a0035..5d61f9456b3 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -415,6 +415,7 @@ struct __wt_connection_impl { uint32_t direct_io; uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */ bool mmap; /* mmap configuration */ + int page_size; /* OS page size for mmap alignment */ uint32_t verbose; uint32_t flags; diff --git a/src/include/cursor.h b/src/include/cursor.h index 275e2f2db46..7f7b5dceb79 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -200,18 +200,23 @@ struct __wt_cursor_btree { uint8_t append_tree; /* Cursor appended to the tree */ +#ifdef HAVE_DIAGNOSTIC + /* Check that cursor next/prev never returns keys out-of-order. */ + WT_ITEM *lastkey, _lastkey; + uint64_t lastrecno; +#endif + #define WT_CBT_ACTIVE 0x01 /* Active in the tree */ #define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */ #define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ #define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ -#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ -#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor +#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor (e.g. 
on a checkpoint) */ -#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */ +#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST) + WT_CBT_SEARCH_SMALLEST) uint8_t flags; }; @@ -219,33 +224,32 @@ struct __wt_cursor_btree { struct __wt_cursor_bulk { WT_CURSOR_BTREE cbt; - WT_REF *ref; /* The leaf page */ - WT_PAGE *leaf; - /* * Variable-length column store compares values during bulk load as * part of RLE compression, row-store compares keys during bulk load * to avoid corruption. */ - WT_ITEM last; /* Last key/value seen */ + bool first_insert; /* First insert */ + WT_ITEM last; /* Last key/value inserted */ /* - * Variable-length column-store RLE counter (also overloaded to mean - * the first time through the bulk-load insert routine, when set to 0). + * Additional column-store bulk load support. */ - uint64_t rle; + uint64_t recno; /* Record number */ + uint64_t rle; /* Variable-length RLE counter */ /* - * Fixed-length column-store current entry in memory chunk count, and - * the maximum number of records per chunk. + * Additional fixed-length column store bitmap bulk load support: + * current entry in memory chunk count, and the maximum number of + * records per chunk. */ + bool bitmap; /* Bitmap bulk load */ uint32_t entry; /* Entry count */ uint32_t nrecs; /* Max records per chunk */ - /* Special bitmap bulk load for fixed-length column stores. 
*/ - bool bitmap; - - void *reconcile; /* Reconciliation information */ + void *reconcile; /* Reconciliation support */ + WT_REF *ref; /* The leaf page */ + WT_PAGE *leaf; }; struct __wt_cursor_config { diff --git a/src/include/cursor.i b/src/include/cursor.i index 2e382591313..8ab96c0a69d 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -246,8 +246,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) session = (WT_SESSION_IMPL *)cbt->iface.session; - if (reenter) + if (reenter) { +#ifdef HAVE_DIAGNOSTIC + __wt_cursor_key_order_reset(cbt); +#endif WT_RET(__curfile_leave(cbt)); + } /* * Any old insert position is now invalid. We rely on this being diff --git a/src/include/dhandle.h b/src/include/dhandle.h index 9a54b4ddb66..8b313428d06 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -33,6 +33,10 @@ (F_ISSET(dhandle, WT_DHANDLE_DEAD) || \ !F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN)) +/* The metadata cursor's data handle. */ +#define WT_SESSION_META_DHANDLE(s) \ + (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. diff --git a/src/include/dlh.h b/src/include/dlh.h index c374ec36fb0..9e49c2ff3cb 100644 --- a/src/include/dlh.h +++ b/src/include/dlh.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/include/error.h b/src/include/error.h index e721855ce7c..5f24d205af9 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/extern.h b/src/include/extern.h index d84403cc16d..b71f4b12486 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -14,6 +14,7 @@ extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t extern int __wt_block_addr_invalid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live); extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci); extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci); extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name); extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint); @@ -43,14 +44,15 @@ extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie); extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie); extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp); -extern int 
__wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize); +extern int __wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename); extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize); extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on); extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp); extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); -extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats); +extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep); +extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep); extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset); @@ -89,6 +91,9 @@ extern int __wt_bloom_close(WT_BLOOM *bloom); extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); +extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next); +extern int __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt); extern void 
__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating); @@ -129,7 +134,7 @@ extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); -extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep); +extern int __wt_btree_new_leaf_page( WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep); extern void __wt_btree_evictable(WT_SESSION_IMPL *session, bool on); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); @@ -139,6 +144,7 @@ extern const char *__wt_page_type_string(u_int type); extern const char *__wt_cell_type_string(uint8_t type); extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf); extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); +extern const char *__wt_buf_set_printable( WT_SESSION_IMPL *session, const void *p, size_t size, WT_ITEM *buf); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store); extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell); @@ -151,6 +157,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags , const char *file, int line #endif ); +extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, 
const char *cfg[]); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); @@ -170,7 +177,7 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flag extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove); -extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); +extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate); @@ -192,7 +199,7 @@ extern int __wt_las_create(WT_SESSION_IMPL *session); extern int __wt_las_destroy(WT_SESSION_IMPL *session); extern void __wt_las_set_written(WT_SESSION_IMPL *session); extern bool __wt_las_is_written(WT_SESSION_IMPL *session); -extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); +extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags); extern int __wt_las_sweep(WT_SESSION_IMPL *session); @@ -255,7 +262,7 @@ extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg); extern int 
__wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]); -extern int __wt_log_wrlsn(WT_SESSION_IMPL *session); +extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield); extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_logmgr_open(WT_SESSION_IMPL *session); extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); @@ -333,9 +340,10 @@ extern int __wt_evict_destroy(WT_SESSION_IMPL *session); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full); +extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); +extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session); extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing); -extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start); extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); @@ -362,23 +370,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep); extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value); extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep); -extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int 
__wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno); extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop); -extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop); extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp); -extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value); extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep); -extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key); extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp); -extern int 
__wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode); extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep); -extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); -extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); +extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced); extern int __wt_log_slot_new(WT_SESSION_IMPL *session); @@ -447,12 +455,13 @@ extern int __wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_ses extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value); extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep); extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase); -extern int __wt_metadata_open(WT_SESSION_IMPL *session); -extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp); +extern int __wt_metadata_cursor_open( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp); +extern int 
__wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); +extern int __wt_metadata_cursor_release(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value); extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value); extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key); -extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, char **valuep); +extern int __wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep); extern void __wt_meta_track_discard(WT_SESSION_IMPL *session); extern int __wt_meta_track_on(WT_SESSION_IMPL *session); extern int __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll); @@ -468,7 +477,7 @@ extern int __wt_meta_track_init(WT_SESSION_IMPL *session); extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session); extern int __wt_turtle_init(WT_SESSION_IMPL *session); extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep); -extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value); +extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value); extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); @@ -488,7 +497,7 @@ extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep); extern int __wt_filesize_name(WT_SESSION_IMPL *session, const char *filename, bool silent, wt_off_t *sizep); extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, bool lock); extern int __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh); -extern int __wt_directory_sync(WT_SESSION_IMPL *session, char 
*path); +extern int __wt_directory_sync(WT_SESSION_IMPL *session, const char *path); extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh); extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh); extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len); @@ -514,6 +523,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); extern int __wt_once(void (*init_routine)(void)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp); extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp); +extern int __wt_get_vm_pagesize(void); extern bool __wt_absolute_path(const char *path); extern const char *__wt_path_separator(void); extern bool __wt_has_priv(void); @@ -558,8 +568,9 @@ extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); -extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); -extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); +extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); @@ -606,6 +617,7 @@ extern int 
__wt_session_release_resources(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); +extern int __wt_session_range_truncate(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *start, WT_CURSOR *stop); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp); @@ -639,8 +651,9 @@ extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri); extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path); extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path); extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name); -extern int __wt_sync_and_rename_fh( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to); -extern int __wt_sync_and_rename_fp( WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to); +extern int __wt_rename_and_sync_directory( WT_SESSION_IMPL *session, const char *from, const char *to); +extern int __wt_fh_sync_and_rename( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to); +extern int __wt_sync_fp_and_rename( WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to); extern int __wt_library_init(void); extern int __wt_breakpoint(void); extern void __wt_attach(WT_SESSION_IMPL *session); @@ -654,6 +667,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF 
*ref, bool *busyp ); extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_hazard_close(WT_SESSION_IMPL *session); +extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp); extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_hex2byte(const u_char *from, u_char *to); @@ -671,6 +685,7 @@ extern uint32_t __wt_log2_int(uint32_t n); extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); +extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); @@ -732,7 +747,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp); extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); -extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags); extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval); diff --git a/src/include/flags.h b/src/include/flags.h index bafff92fbc0..24fae4abccd 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -39,15 +39,17 @@ #define WT_LOG_SYNC_ENABLED 0x00000010 #define WT_READ_CACHE 0x00000001 #define WT_READ_COMPACT 0x00000002 -#define WT_READ_NO_EMPTY 0x00000004 -#define WT_READ_NO_EVICT 0x00000008 -#define WT_READ_NO_GEN 0x00000010 -#define WT_READ_NO_WAIT 0x00000020 -#define WT_READ_PREV 0x00000040 -#define WT_READ_SKIP_INTL 0x00000080 -#define WT_READ_SKIP_LEAF 0x00000100 -#define WT_READ_TRUNCATE 0x00000200 -#define WT_READ_WONT_NEED 0x00000400 +#define WT_READ_NOTFOUND_OK 0x00000004 +#define WT_READ_NO_EMPTY 0x00000008 +#define WT_READ_NO_EVICT 0x00000010 +#define WT_READ_NO_GEN 0x00000020 +#define WT_READ_NO_WAIT 0x00000040 +#define WT_READ_PREV 0x00000080 +#define WT_READ_RESTART_OK 0x00000100 +#define WT_READ_SKIP_INTL 0x00000200 +#define WT_READ_SKIP_LEAF 0x00000400 +#define WT_READ_TRUNCATE 0x00000800 +#define WT_READ_WONT_NEED 0x00001000 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_INTERNAL 0x00000004 @@ -57,15 +59,16 @@ #define 
WT_SESSION_LOCKED_SLOT 0x00000040 #define WT_SESSION_LOCKED_TABLE 0x00000080 #define WT_SESSION_LOCKED_TURTLE 0x00000100 -#define WT_SESSION_LOGGING_INMEM 0x00000200 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000400 -#define WT_SESSION_NO_CACHE 0x00000800 -#define WT_SESSION_NO_DATA_HANDLES 0x00001000 -#define WT_SESSION_NO_EVICTION 0x00002000 -#define WT_SESSION_NO_LOGGING 0x00004000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00008000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00010000 -#define WT_SESSION_SERVER_ASYNC 0x00020000 +#define WT_SESSION_LOCK_NO_WAIT 0x00000200 +#define WT_SESSION_LOGGING_INMEM 0x00000400 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 +#define WT_SESSION_NO_CACHE 0x00001000 +#define WT_SESSION_NO_DATA_HANDLES 0x00002000 +#define WT_SESSION_NO_EVICTION 0x00004000 +#define WT_SESSION_NO_LOGGING 0x00008000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 +#define WT_SESSION_SERVER_ASYNC 0x00040000 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 @@ -85,16 +88,17 @@ #define WT_VERB_MUTEX 0x00000800 #define WT_VERB_OVERFLOW 0x00001000 #define WT_VERB_READ 0x00002000 -#define WT_VERB_RECONCILE 0x00004000 -#define WT_VERB_RECOVERY 0x00008000 -#define WT_VERB_SALVAGE 0x00010000 -#define WT_VERB_SHARED_CACHE 0x00020000 -#define WT_VERB_SPLIT 0x00040000 -#define WT_VERB_TEMPORARY 0x00080000 -#define WT_VERB_TRANSACTION 0x00100000 -#define WT_VERB_VERIFY 0x00200000 -#define WT_VERB_VERSION 0x00400000 -#define WT_VERB_WRITE 0x00800000 +#define WT_VERB_REBALANCE 0x00004000 +#define WT_VERB_RECONCILE 0x00008000 +#define WT_VERB_RECOVERY 0x00010000 +#define WT_VERB_SALVAGE 0x00020000 +#define WT_VERB_SHARED_CACHE 0x00040000 +#define WT_VERB_SPLIT 0x00080000 +#define WT_VERB_TEMPORARY 0x00100000 +#define WT_VERB_TRANSACTION 0x00200000 +#define WT_VERB_VERIFY 0x00400000 +#define WT_VERB_VERSION 0x00800000 +#define WT_VERB_WRITE 0x01000000 
#define WT_VISIBILITY_ERR 0x00000010 /* * flags section: END diff --git a/src/include/gcc.h b/src/include/gcc.h index 01e33792d73..6ccc0de3c03 100644 --- a/src/include/gcc.h +++ b/src/include/gcc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #if defined(x86_64) || defined(__x86_64__) /* Pause instruction to prevent excess processor bus usage */ -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") - +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("mfence" ::: "memory"); \ } while (0) @@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) } while (0) #elif defined(i386) || defined(__i386__) -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \ } while (0) @@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #define WT_WRITE_BARRIER() WT_FULL_BARRIER() #elif defined(__PPC64__) || defined(PPC64) +/* ori 0,0,0 is the PPC64 noop instruction */ #define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory") -#define WT_FULL_BARRIER() do { +#define WT_FULL_BARRIER() do { \ __asm__ volatile ("sync" ::: "memory"); \ } while (0) -#define WT_READ_BARRIER() WT_FULL_BARRIER() -#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +/* TODO: ISA 2.07 Elemental Memory Barriers would be better, + specifically mbll, and mbss, but they are not supported by POWER 8 */ +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) #elif defined(__aarch64__) #define WT_PAUSE() __asm__ volatile("yield" ::: 
"memory") #define WT_FULL_BARRIER() do { \ - __asm__ volatile ("dsb sy" ::: "memory"); \ + __asm__ volatile ("dsb sy" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("dsb ld" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("dsb st" ::: "memory"); \ +} while (0) + +#elif defined(__s390x__) +#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory") +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("bcr 15,0\n" ::: "memory"); \ } while (0) +#define WT_READ_BARRIER() WT_FULL_BARRIER() +#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +#elif defined(__sparc__) +#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory") + +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("membar #StoreLoad" ::: "memory"); \ +} while (0) + +/* + * On UltraSparc machines, TSO is used, and so there is no need for membar. + * READ_BARRIER = #LoadLoad, and WRITE_BARRIER = #StoreStore are noop. + */ #define WT_READ_BARRIER() do { \ - __asm__ volatile ("dsb ld" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) + #define WT_WRITE_BARRIER() do { \ - __asm__ volatile ("dsb st" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) #else diff --git a/src/include/hardware.h b/src/include/hardware.h index 1ab2c3d39c4..93ed8a868b6 100644 --- a/src/include/hardware.h +++ b/src/include/hardware.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/intpack.i b/src/include/intpack.i index a13ad05451d..b27afd24e6c 100644 --- a/src/include/intpack.i +++ b/src/include/intpack.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/include/lint.h b/src/include/lint.h index f288fb98683..f8b17022968 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/log.h b/src/include/log.h index 521de567fc0..577f6a888a3 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -160,9 +160,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot { #define WT_SLOT_INIT_FLAGS 0 -#define WT_WITH_SLOT_LOCK(session, log, op) do { \ +#define WT_WITH_SLOT_LOCK(session, log, ret, op) do { \ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \ - WT_WITH_LOCK(session, \ + WT_WITH_LOCK(session, ret, \ &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ } while (0) @@ -267,6 +267,11 @@ struct __wt_log_desc { }; /* + * Flags for __wt_txn_op_printlog. + */ +#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */ + +/* * WT_LOG_REC_DESC -- * A descriptor for a log record type. */ diff --git a/src/include/log.i b/src/include/log.i index ff309c31265..fcdbc72c388 100644 --- a/src/include/log.i +++ b/src/include/log.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/lsm.h b/src/include/lsm.h index d15dab3aa45..7cb3ccc895d 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/include/meta.h b/src/include/meta.h index 938101e9caa..e29ec4202dc 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -43,9 +43,9 @@ * WT_WITH_TURTLE_LOCK -- * Acquire the turtle file lock, perform an operation, drop the lock. */ -#define WT_WITH_TURTLE_LOCK(session, op) do { \ +#define WT_WITH_TURTLE_LOCK(session, ret, op) do { \ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_TURTLE));\ - WT_WITH_LOCK(session, \ + WT_WITH_LOCK(session, ret, \ &S2C(session)->turtle_lock, WT_SESSION_LOCKED_TURTLE, op); \ } while (0) diff --git a/src/include/misc.h b/src/include/misc.h index e542baec642..78997661851 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -268,3 +268,6 @@ union __wt_rand_state { uint32_t w, z; } x; }; + +/* Shared array for converting to hex */ +extern const u_char __wt_hex[]; diff --git a/src/include/misc.i b/src/include/misc.i index 75068706b70..04376441340 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/msvc.h b/src/include/msvc.h index 8f5aa9abde8..99260a44875 100644 --- a/src/include/msvc.h +++ b/src/include/msvc.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/include/mutex.h b/src/include/mutex.h index b67e5e610e8..f798bfb3ece 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/mutex.i b/src/include/mutex.i index 7eb042dd79f..52250f84ab3 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/os.h b/src/include/os.h index d135fd9eb1f..fbba7f05f88 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/os_windows.h b/src/include/os_windows.h index de97143335f..65938ac9f17 100644 --- a/src/include/os_windows.h +++ b/src/include/os_windows.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/packing.i b/src/include/packing.i index 9be38251703..784a55ef2ae 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -25,7 +25,8 @@ typedef struct { char type; } WT_PACK_VALUE; -#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 } +/* Default to size = 1 if there is no size prefix. 
*/ +#define WT_PACK_VALUE_INIT { { 0 }, 1, 0, 0 } #define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT typedef struct { @@ -151,7 +152,14 @@ next: if (pack->cur == pack->end) switch (pv->type) { case 'S': + return (0); case 's': + if (pv->size < 1) + WT_RET_MSG(pack->session, EINVAL, + "Fixed length strings must be at least 1 byte " + "in format '%.*s'", + (int)(pack->end - pack->orig), pack->orig); + return (0); case 'x': return (0); case 't': @@ -266,9 +274,10 @@ __pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv) return (s); case 's': case 'S': - if (pv->type == 's' || pv->havesize) + if (pv->type == 's' || pv->havesize) { s = pv->size; - else + WT_ASSERT(session, s != 0); + } else s = strlen(pv->u.s) + 1; return (s); case 'U': @@ -460,9 +469,10 @@ __unpack_read(WT_SESSION_IMPL *session, break; case 's': case 'S': - if (pv->type == 's' || pv->havesize) + if (pv->type == 's' || pv->havesize) { s = pv->size; - else + WT_ASSERT(session, s != 0); + } else s = strlen((const char *)*pp) + 1; if (s > 0) pv->u.s = (const char *)*pp; @@ -667,7 +677,6 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, if (fmt[0] != '\0' && fmt[1] == '\0') { pv.type = fmt[0]; - pv.size = 1; if ((ret = __unpack_read(session, &pv, &p, size)) == 0) WT_UNPACK_PUT(session, pv, ap); return (0); diff --git a/src/include/posix.h b/src/include/posix.h index 1aa629c98e7..2593c7b6797 100644 --- a/src/include/posix.h +++ b/src/include/posix.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/schema.h b/src/include/schema.h index 023fd398f1c..88a3a39f8b3 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -82,9 +82,17 @@ struct __wt_table { * WT_WITH_LOCK -- * Acquire a lock, perform an operation, drop the lock. */ -#define WT_WITH_LOCK(session, lock, flag, op) do { \ +#define WT_WITH_LOCK(session, ret, lock, flag, op) do { \ + ret = 0; \ if (F_ISSET(session, (flag))) { \ op; \ + } else if (F_ISSET(session, WT_SESSION_LOCK_NO_WAIT)) { \ + if ((ret = __wt_spin_trylock(session, (lock))) == 0) { \ + F_SET(session, (flag)); \ + op; \ + F_CLR(session, (flag)); \ + __wt_spin_unlock(session, (lock)); \ + } \ } else { \ __wt_spin_lock(session, (lock)); \ F_SET(session, (flag)); \ @@ -98,16 +106,16 @@ struct __wt_table { * WT_WITH_CHECKPOINT_LOCK -- * Acquire the checkpoint lock, perform an operation, drop the lock. */ -#define WT_WITH_CHECKPOINT_LOCK(session, op) \ - WT_WITH_LOCK(session, \ +#define WT_WITH_CHECKPOINT_LOCK(session, ret, op) \ + WT_WITH_LOCK(session, ret, \ &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) /* * WT_WITH_HANDLE_LIST_LOCK -- * Acquire the data handle list lock, perform an operation, drop the lock. */ -#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ - WT_WITH_LOCK(session, \ +#define WT_WITH_HANDLE_LIST_LOCK(session, ret, op) \ + WT_WITH_LOCK(session, ret, \ &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) /* * WT_WITH_SCHEMA_LOCK -- @@ -115,12 +123,12 @@ struct __wt_table { * Check that we are not already holding some other lock: the schema lock * must be taken first. */ -#define WT_WITH_SCHEMA_LOCK(session, op) do { \ +#define WT_WITH_SCHEMA_LOCK(session, ret, op) do { \ WT_ASSERT(session, \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || \ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST | \ WT_SESSION_NO_SCHEMA_LOCK | WT_SESSION_LOCKED_TABLE)); \ - WT_WITH_LOCK(session, \ + WT_WITH_LOCK(session, ret, \ &S2C(session)->schema_lock, WT_SESSION_LOCKED_SCHEMA, op); \ } while (0) @@ -128,11 +136,11 @@ struct __wt_table { * WT_WITH_TABLE_LOCK -- * Acquire the table lock, perform an operation, drop the lock. 
*/ -#define WT_WITH_TABLE_LOCK(session, op) do { \ +#define WT_WITH_TABLE_LOCK(session, ret, op) do { \ WT_ASSERT(session, \ F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK(session, \ + WT_WITH_LOCK(session, ret, \ &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ } while (0) diff --git a/src/include/serial.i b/src/include/serial.i index ca22ce12d81..fa920de7e37 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/session.h b/src/include/session.h index 5c3bcfb8ed0..5c3291230b4 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -74,19 +74,22 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { TAILQ_HEAD(__cursors, __wt_cursor) cursors; WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ - WT_COMPACT *compact; /* Compact state */ + + WT_COMPACT *compact; /* Compaction information */ + enum { WT_COMPACT_NONE=0, + WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state; /* * Lookaside table cursor, sweep and eviction worker threads only. 
*/ WT_CURSOR *las_cursor; /* Lookaside table cursor */ - WT_DATA_HANDLE *meta_dhandle; /* Metadata file */ - void *meta_track; /* Metadata operation tracking */ - void *meta_track_next; /* Current position */ - void *meta_track_sub; /* Child transaction / save point */ - size_t meta_track_alloc; /* Currently allocated */ - int meta_track_nest; /* Nesting level of meta transaction */ + WT_CURSOR *meta_cursor; /* Metadata file */ + void *meta_track; /* Metadata operation tracking */ + void *meta_track_next; /* Current position */ + void *meta_track_sub; /* Child transaction / save point */ + size_t meta_track_alloc; /* Currently allocated */ + int meta_track_nest; /* Nesting level of meta transaction */ #define WT_META_TRACKING(session) (session->meta_track_next != NULL) /* @@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); - bool compaction; /* Compaction did some work */ - uint32_t flags; /* diff --git a/src/include/stat.h b/src/include/stat.h index dfe7ee5c6cd..51d2fa332e7 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot) */ #define WT_STAT_READ(stats, fld) \ __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)) -#define WT_STAT_WRITE(session, stats, fld) \ - ((stats)[WT_STATS_SLOT_ID(session)]->fld); +#define WT_STAT_WRITE(stats, fld, v) \ + (stats)->fld = (int64_t)(v) #define WT_STAT_DECRV(session, stats, fld, value) \ (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value) diff --git a/src/include/txn.h b/src/include/txn.h index 936164fa9a7..1e82e2d982a 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. 
+ * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/txn.i b/src/include/txn.i index 1005d4a395d..46f2ff3e5f1 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -185,9 +185,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) if (id == WT_TXN_ABORTED) return (false); - /* - * Read-uncommitted transactions see all other changes. - */ + /* Read-uncommitted transactions see all other changes. */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) return (true); diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 6a97def12be..477b9b7c134 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index bdd8bb65910..676f95d9b05 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -566,20 +566,21 @@ struct __wt_cursor { */ const char *internal_uri; -#define WT_CURSTD_APPEND 0x0001 -#define WT_CURSTD_BULK 0x0002 -#define WT_CURSTD_DUMP_HEX 0x0004 -#define WT_CURSTD_DUMP_JSON 0x0008 -#define WT_CURSTD_DUMP_PRINT 0x0010 -#define WT_CURSTD_KEY_EXT 0x0020 /* Key points out of the tree. */ -#define WT_CURSTD_KEY_INT 0x0040 /* Key points into the tree. 
*/ +#define WT_CURSTD_APPEND 0x00001 +#define WT_CURSTD_BULK 0x00002 +#define WT_CURSTD_DUMP_HEX 0x00004 +#define WT_CURSTD_DUMP_JSON 0x00008 +#define WT_CURSTD_DUMP_PRINT 0x00010 +#define WT_CURSTD_JOINED 0x00020 +#define WT_CURSTD_KEY_EXT 0x00040 /* Key points out of the tree. */ +#define WT_CURSTD_KEY_INT 0x00080 /* Key points into the tree. */ #define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT) -#define WT_CURSTD_JOINED 0x0080 -#define WT_CURSTD_OPEN 0x0100 -#define WT_CURSTD_OVERWRITE 0x0200 -#define WT_CURSTD_RAW 0x0400 -#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */ +#define WT_CURSTD_META_INUSE 0x00100 +#define WT_CURSTD_OPEN 0x00200 +#define WT_CURSTD_OVERWRITE 0x00400 +#define WT_CURSTD_RAW 0x00800 +#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -1236,6 +1237,9 @@ struct __wt_session { * @configstart{WT_SESSION.drop, see dist/api_data.py} * @config{force, return success if the object does not exist., a * boolean flag; default \c false.} + * @config{lock_wait, wait for locks\, if \c lock_wait=false\, fail if + * any required locks are not available immediately., a boolean flag; + * default \c true.} * @config{remove_files, should the underlying files be removed?., a * boolean flag; default \c true.} * @configend @@ -1329,6 +1333,19 @@ struct __wt_session { int __F(log_printf)(WT_SESSION *session, const char *fmt, ...); /*! + * Rebalance a table, see @ref rebalance. 
+ * + * @snippet ex_all.c Rebalance a table + * + * @param session the session handle + * @param uri the current URI of the object, such as \c "table:mytable" + * @configempty{WT_SESSION.rebalance, see dist/api_data.py} + * @ebusy_errors + */ + int __F(rebalance)( + WT_SESSION *session, const char *uri, const char *config); + + /*! * Rename an object. * * @snippet ex_all.c Rename a table @@ -1920,9 +1937,10 @@ struct __wt_connection { * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c * "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c - * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2405,9 +2423,9 @@ struct __wt_connection { * values chosen from the following options: \c "api"\, \c "block"\, \c * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, * \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c - * "overflow"\, \c "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c - * "shared_cache"\, \c "split"\, \c "temporary"\, \c "transaction"\, \c - * "verify"\, \c "version"\, \c "write"; default empty.} + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as <code>"write_through=[data]"</code>. 
Configuring \c write_through requires diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h index 28fd8e18329..0db876b56f3 100644 --- a/src/include/wiredtiger_ext.h +++ b/src/include/wiredtiger_ext.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 0a1e143ce70..54b5dfd19f4 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/log/log.c b/src/log/log.c index 118e081c3ec..3bf04d025d8 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -47,7 +47,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; WT_RET(__wt_log_force_write(session, 1)); - WT_RET(__wt_log_wrlsn(session)); + WT_RET(__wt_log_wrlsn(session, NULL)); if (start) *lsn = log->write_start_lsn; else @@ -669,8 +669,7 @@ __log_openfile(WT_SESSION_IMPL *session, * check that the magic number and versions are correct. 
*/ if (!ok_create) { - __wt_scr_free(session, &buf); - WT_ERR(__wt_scr_alloc(session, allocsize, &buf)); + WT_ERR(__wt_buf_grow(session, buf, allocsize)); memset(buf->mem, 0, allocsize); WT_ERR(__wt_read(session, *fh, 0, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; @@ -771,7 +770,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); while (log->log_close_fh != NULL) { WT_STAT_FAST_CONN_INCR(session, log_close_yields); - WT_RET(__wt_log_wrlsn(session)); + WT_RET(__wt_log_wrlsn(session, NULL)); if (++yield_cnt > 10000) return (EBUSY); __wt_yield(); @@ -791,9 +790,10 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) WT_FULL_BARRIER(); /* * If we're pre-allocating log files, look for one. If there aren't any - * or we're not pre-allocating, then create one. + * or we're not pre-allocating, or a backup cursor is open, then + * create one. */ - if (conn->log_prealloc > 0) { + if (conn->log_prealloc > 0 && !conn->hot_backup) { ret = __log_alloc_prealloc(session, log->fileid); /* * If ret is 0 it means we found a pre-allocated file. @@ -1120,7 +1120,7 @@ __wt_log_open(WT_SESSION_IMPL *session) * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ - WT_WITH_SLOT_LOCK(session, log, + WT_WITH_SLOT_LOCK(session, log, ret, ret = __log_newfile(session, true, NULL)); WT_ERR(ret); @@ -1970,6 +1970,14 @@ err: myslot.slot != NULL) ret = myslot.slot->slot_error; + /* + * If one of the sync flags is set, assert the proper LSN has moved to + * match. 
+ */ + WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) || + __wt_log_cmp(&log->write_lsn, &lsn) >= 0); + WT_ASSERT(session, + !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } diff --git a/src/log/log_auto.c b/src/log/log_auto.c index 5a1d03b1976..54df01d01ab 100644 --- a/src/log/log_auto.c +++ b/src/log/log_auto.c @@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src, } static int -__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) { size_t needed; @@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) return (0); } +static int +__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +{ + size_t needed; + + needed = item->size * 2 + 1; + WT_RET(__wt_realloc(session, NULL, needed, destp)); + __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL); + return (0); +} + int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, @@ -121,7 +132,8 @@ __wt_logop_col_put_unpack( int __wt_logop_col_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -138,9 +150,14 @@ __wt_logop_col_put_print( " \"fileid\": \"%" PRIu32 "\",\n", fileid)); WT_ERR(__wt_fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack( int 
__wt_logop_col_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t recno; + WT_UNUSED(flags); WT_RET(__wt_logop_col_remove_unpack( session, pp, end, &fileid, &recno)); @@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack( int __wt_logop_col_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t start; uint64_t stop; + WT_UNUSED(flags); WT_RET(__wt_logop_col_truncate_unpack( session, pp, end, &fileid, &start, &stop)); @@ -307,7 +328,8 @@ __wt_logop_row_put_unpack( int __wt_logop_row_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -322,12 +344,22 @@ __wt_logop_row_put_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + " \"key-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -375,7 +407,8 @@ 
__wt_logop_row_remove_unpack( int __wt_logop_row_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -389,9 +422,14 @@ __wt_logop_row_remove_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + ",\n \"key-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack( int __wt_logop_row_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -455,12 +494,22 @@ __wt_logop_row_truncate_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &start)); + WT_ERR(__logrec_make_json_str(session, &escaped, &start)); WT_ERR(__wt_fprintf(out, " \"start\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &stop)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &start)); + WT_ERR(__wt_fprintf(out, + " \"start-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &stop)); WT_ERR(__wt_fprintf(out, " \"stop\": \"%s\",\n", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &stop)); + WT_ERR(__wt_fprintf(out, + " \"stop-hex\": 
\"%s\",\n", escaped)); + } WT_ERR(__wt_fprintf(out, " \"mode\": \"%" PRIu32 "\"", mode)); @@ -470,7 +519,8 @@ err: __wt_free(session, escaped); int __wt_txn_op_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t optype, opsize; @@ -480,27 +530,33 @@ __wt_txn_op_printlog( switch (optype) { case WT_LOGOP_COL_PUT: - WT_RET(__wt_logop_col_put_print(session, pp, end, out)); + WT_RET(__wt_logop_col_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_REMOVE: - WT_RET(__wt_logop_col_remove_print(session, pp, end, out)); + WT_RET(__wt_logop_col_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_TRUNCATE: - WT_RET(__wt_logop_col_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_col_truncate_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_PUT: - WT_RET(__wt_logop_row_put_print(session, pp, end, out)); + WT_RET(__wt_logop_row_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_REMOVE: - WT_RET(__wt_logop_row_remove_print(session, pp, end, out)); + WT_RET(__wt_logop_row_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_TRUNCATE: - WT_RET(__wt_logop_row_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_row_truncate_print(session, pp, end, out, + flags)); break; WT_ILLEGAL_VALUE(session); diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 8155397d823..760e8888de6 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -187,7 +187,7 @@ __wt_log_slot_switch( * because we are responsible for setting up the new slot. 
*/ do { - WT_WITH_SLOT_LOCK(session, log, + WT_WITH_SLOT_LOCK(session, log, ret, ret = __log_slot_switch_internal(session, myslot, forced)); if (ret == EBUSY) { WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 953698476ef..1bb9a7238fe 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -254,7 +254,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) break; -open: WT_WITH_SCHEMA_LOCK(session, +open: WT_WITH_SCHEMA_LOCK(session, ret, ret = __clsm_open_cursors(clsm, update, 0, 0)); WT_RET(ret); } @@ -710,7 +710,7 @@ __wt_clsm_init_merge( F_SET(clsm, WT_CLSM_MINOR_MERGE); clsm->nchunks = nchunks; - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __clsm_open_cursors(clsm, false, start_chunk, start_id)); return (ret); } @@ -1543,7 +1543,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session, bulk = cval.val != 0; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); /* * Check whether the exclusive open for a bulk load succeeded, and diff --git a/src/lsm/lsm_cursor_bulk.c b/src/lsm/lsm_cursor_bulk.c index 65e8fe1e9a7..607ca0c9705 100644 --- a/src/lsm/lsm_cursor_bulk.c +++ b/src/lsm/lsm_cursor_bulk.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -114,7 +114,7 @@ __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) * switch inline, since switch needs a schema lock and online index * creation opens a bulk cursor while holding the schema lock. 
*/ - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_RET(ret); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index d8cf36f2cc1..dac8d987328 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 1a2608803e4..29325066da7 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -437,7 +437,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks)); - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); WT_ERR(ret); if (create_bloom) { @@ -607,12 +607,13 @@ err: if (locked) if (ret != 0 && created_chunk) { /* Drop the newly-created files on error. */ if (chunk->uri != NULL) { - WT_WITH_SCHEMA_LOCK(session, tret = - __wt_schema_drop(session, chunk->uri, drop_cfg)); + WT_WITH_SCHEMA_LOCK(session, tret, + tret = __wt_schema_drop( + session, chunk->uri, drop_cfg)); WT_TRET(tret); } if (create_bloom && chunk->bloom_uri != NULL) { - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, tret, tret = __wt_schema_drop( session, chunk->bloom_uri, drop_cfg)); WT_TRET(tret); diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index 64ca283e2c8..d76b2a48aa7 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index c1eb7a2a389..c147cf5774a 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -33,7 +33,7 @@ __curstat_lsm_init( "checkpoint=" WT_CHECKPOINT, NULL, NULL }; locked = false; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); WT_RET(ret); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); @@ -91,7 +91,7 @@ __curstat_lsm_init( * top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->lsm_generation_max = chunk->generation; + WT_STAT_WRITE(new, lsm_generation_max, chunk->generation); /* Aggregate statistics from each new chunk. */ __wt_stat_dsrc_aggregate_single(new, stats); @@ -115,37 +115,40 @@ __curstat_lsm_init( * into the top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->bloom_size = - (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8); - new->bloom_page_evict = - new->cache_eviction_clean + new->cache_eviction_dirty; - new->bloom_page_read = new->cache_read; + WT_STAT_WRITE(new, bloom_size, + (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8)); + WT_STAT_WRITE(new, bloom_page_evict, + new->cache_eviction_clean + new->cache_eviction_dirty); + WT_STAT_WRITE(new, bloom_page_read, new->cache_read); __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } /* Set statistics that aren't aggregated directly into the cursor */ - stats->bloom_count = bloom_count; - stats->lsm_chunk_count = lsm_tree->nchunks; + WT_STAT_WRITE(stats, bloom_count, bloom_count); + WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks); /* Include, and optionally clear, LSM-level specific information. 
*/ - stats->bloom_miss = lsm_tree->bloom_miss; + WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_miss = 0; - stats->bloom_hit = lsm_tree->bloom_hit; + WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_hit = 0; - stats->bloom_false_positive = lsm_tree->bloom_false_positive; + WT_STAT_WRITE( + stats, bloom_false_positive, lsm_tree->bloom_false_positive); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_false_positive = 0; - stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom; + WT_STAT_WRITE( + stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_lookup_no_bloom = 0; - stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle; + WT_STAT_WRITE( + stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle; + WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_merge_throttle = 0; @@ -173,7 +176,7 @@ __wt_curstat_lsm_init( * Grab the schema lock because we will be locking the LSM tree and we * may need to open some files. */ - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __curstat_lsm_init(session, uri, cst)); return (ret); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 5d819607413..ff6e66fd1a1 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -243,7 +243,7 @@ __lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri) WT_RET(__wt_exist(session, uri + strlen("file:"), &exists)); if (exists) - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_drop(session, uri, cfg)); return (ret); } @@ -315,7 +315,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, char *tmpconfig; /* If the tree is open, it already exists. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); if (ret == 0) { __wt_lsm_tree_release(session, lsm_tree); @@ -447,7 +447,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * tracking macros handle cleaning up on failure. */ if (ret == 0) - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -954,13 +954,14 @@ __wt_lsm_tree_drop( WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; + int tret; u_int i; bool locked; locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); @@ -996,8 +997,9 @@ __wt_lsm_tree_drop( err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); - WT_WITH_HANDLE_LIST_LOCK(session, - WT_TRET(__lsm_tree_discard(session, lsm_tree, false))); + WT_WITH_HANDLE_LIST_LOCK(session, tret, + tret = __lsm_tree_discard(session, lsm_tree, false)); + WT_TRET(tret); return (ret); } @@ -1013,6 +1015,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; const char *old; + int tret; u_int i; bool locked; @@ -1020,7 +1023,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = false; /* Get the LSM tree. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); WT_RET(ret); @@ -1070,8 +1073,9 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. */ - WT_WITH_HANDLE_LIST_LOCK(session, - WT_TRET(__lsm_tree_discard(session, lsm_tree, false))); + WT_WITH_HANDLE_LIST_LOCK(session, tret, + tret = __lsm_tree_discard(session, lsm_tree, false)); + WT_TRET(tret); return (ret); } @@ -1086,6 +1090,7 @@ __wt_lsm_tree_truncate( WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; + int tret; bool locked; WT_UNUSED(cfg); @@ -1093,7 +1098,7 @@ __wt_lsm_tree_truncate( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); @@ -1132,8 +1137,9 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - WT_TRET(__lsm_tree_discard(session, lsm_tree, false))); + WT_WITH_HANDLE_LIST_LOCK(session, tret, + tret = __lsm_tree_discard(session, lsm_tree, false)); + WT_TRET(tret); } return (ret); } @@ -1231,7 +1237,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) /* Tell __wt_schema_worker not to look inside the LSM tree. 
*/ *skipp = true; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, name, false, &lsm_tree)); WT_RET(ret); @@ -1429,7 +1435,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = false; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); WT_RET(ret); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 4741cf52608..4faa25967ad 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -168,7 +168,7 @@ __wt_lsm_work_switch( *entryp = NULL; if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_lsm_tree_switch(session, entry->lsm_tree)); /* Failing to complete the switch is fine */ if (ret == EBUSY) { @@ -272,7 +272,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; @@ -336,7 +336,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * necessary handle locks. */ WT_ERR(__wt_meta_track_on(session)); - WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) @@ -505,7 +506,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); WT_RET(ret); @@ -514,7 +515,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * results in the hot backup lock being taken when it updates the * metadata (which would be too late to prevent our drop). */ - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_drop(session, uri, drop_cfg)); if (ret == 0) diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index 625783ac16c..7562cb1cae3 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index 95c5b9807ca..92766213b33 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -9,27 +9,23 @@ #include "wt_internal.h" /* - * __wt_meta_btree_apply -- + * __meta_btree_apply -- * Apply a function to all files listed in the metadata, apart from the * metadata file. 
*/ -int -__wt_meta_btree_apply(WT_SESSION_IMPL *session, +static inline int +__meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) { - WT_CURSOR *cursor; - WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; const char *uri; - int cmp, tret; + int cmp; - saved_dhandle = session->dhandle; - WT_RET(__wt_metadata_cursor(session, NULL, &cursor)); cursor->set_key(cursor, "file:"); - if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) - tret = cursor->next(cursor); - for (; tret == 0; tret = cursor->next(cursor)) { - WT_ERR(cursor->get_key(cursor, &uri)); + if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_RET(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "file:")) break; if (strcmp(uri, WT_METAFILE_URI) == 0) @@ -43,8 +39,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, */ ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = func(session, cfg)); + WT_SAVE_DHANDLE(session, ret = func(session, cfg)); if (WT_META_TRACKING(session)) WT_TRET(__wt_meta_track_handle_lock( session, false)); @@ -53,12 +48,29 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, } else if (ret == EBUSY) ret = __wt_conn_btree_apply_single( session, uri, NULL, func, cfg); - WT_ERR(ret); + WT_RET(ret); } + WT_RET_NOTFOUND_OK(ret); + + return (0); +} + +/* + * __wt_meta_btree_apply -- + * Apply a function to all files listed in the metadata, apart from the + * metadata file. 
+ */ +int +__wt_meta_btree_apply(WT_SESSION_IMPL *session, + int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + + WT_RET(__wt_metadata_cursor(session, &cursor)); + WT_SAVE_DHANDLE(session, + ret = __meta_btree_apply(session, cursor, func, cfg)); + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); - if (tret != WT_NOTFOUND) - WT_TRET(tret); -err: WT_TRET(cursor->close(cursor)); - session->dhandle = saved_dhandle; return (ret); } diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index 70c9bf8dfcd..f7da8525639 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/meta/meta_ext.c b/src/meta/meta_ext.c index 423b7d2e76b..b48f7205807 100644 --- a/src/meta/meta_ext.c +++ b/src/meta/meta_ext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index e7074a9c1b5..9938cb07a5c 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -31,21 +31,28 @@ __metadata_turtle(const char *key) } /* - * __wt_metadata_open -- - * Opens the metadata file, sets session->meta_dhandle. + * __wt_metadata_cursor_open -- + * Opens a cursor on the metadata. 
*/ int -__wt_metadata_open(WT_SESSION_IMPL *session) +__wt_metadata_cursor_open( + WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp) { WT_BTREE *btree; + WT_DECL_RET; + const char *open_cursor_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; - if (session->meta_dhandle != NULL) - return (0); - - WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0)); + WT_WITHOUT_DHANDLE(session, ret = __wt_open_cursor( + session, WT_METAFILE_URI, NULL, open_cursor_cfg, cursorp)); + WT_RET(ret); - session->meta_dhandle = session->dhandle; - WT_ASSERT(session, session->meta_dhandle != NULL); + /* + * Retrieve the btree from the cursor, rather than the session because + * we don't always switch the metadata handle in to the session before + * entering this function. + */ + btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; /* * Set special flags for the metadata file: eviction (the metadata file @@ -56,7 +63,6 @@ __wt_metadata_open(WT_SESSION_IMPL *session) * opens (the first update is safe because it's single-threaded from * wiredtiger_open). */ - btree = S2BT(session); if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) F_SET(btree, WT_BTREE_IN_MEMORY); if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) @@ -64,44 +70,81 @@ __wt_metadata_open(WT_SESSION_IMPL *session) if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) F_CLR(btree, WT_BTREE_NO_LOGGING); - /* The metadata handle doesn't need to stay locked -- release it. */ - return (__wt_session_release_btree(session)); + return (0); } /* * __wt_metadata_cursor -- - * Opens a cursor on the metadata. + * Returns the session's cached metadata cursor, unless it's in use, in + * which case it opens and returns another metadata cursor. 
*/ int -__wt_metadata_cursor( - WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp) +__wt_metadata_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { - WT_DATA_HANDLE *saved_dhandle; - WT_DECL_RET; - bool is_dead; - const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; + WT_CURSOR *cursor; - saved_dhandle = session->dhandle; - WT_ERR(__wt_metadata_open(session)); + /* + * If we don't have a cached metadata cursor, or it's already in use, + * we'll need to open a new one. + */ + cursor = NULL; + if (session->meta_cursor == NULL || + F_ISSET(session->meta_cursor, WT_CURSTD_META_INUSE)) { + WT_RET(__wt_metadata_cursor_open(session, NULL, &cursor)); + if (session->meta_cursor == NULL) { + session->meta_cursor = cursor; + cursor = NULL; + } + } - session->dhandle = session->meta_dhandle; + /* + * If there's no cursor return, we're done, our caller should have just + * been triggering the creation of the session's cached cursor. There + * should not be an open local cursor in that case, but caution doesn't + * cost anything. + */ + if (cursorp == NULL) + return (cursor == NULL ? 0 : cursor->close(cursor)); - /* - * We use the metadata a lot, so we have a handle cached; lock it and - * increment the in-use counter once the cursor is open. + /* + * If the cached cursor is in use, return the newly opened cursor, else + * mark the cached cursor in use and return it. */ - WT_ERR(__wt_session_lock_dhandle(session, 0, &is_dead)); + if (F_ISSET(session->meta_cursor, WT_CURSTD_META_INUSE)) + *cursorp = cursor; + else { + *cursorp = session->meta_cursor; + F_SET(session->meta_cursor, WT_CURSTD_META_INUSE); + } + return (0); +} - /* The metadata should never be closed. */ - WT_ASSERT(session, !is_dead); +/* + * __wt_metadata_cursor_release -- + * Release a metadata cursor. 
+ */ +int +__wt_metadata_cursor_release(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +{ + WT_CURSOR *cursor; - WT_ERR(__wt_curfile_create(session, NULL, cfg, false, false, cursorp)); - __wt_cursor_dhandle_incr_use(session); + WT_UNUSED(session); - /* Restore the caller's btree. */ -err: session->dhandle = saved_dhandle; - return (ret); + if ((cursor = *cursorp) == NULL) + return (0); + *cursorp = NULL; + + /* + * If using the session's cached metadata cursor, clear the in-use flag + * and reset it, otherwise, discard the cursor. + */ + if (F_ISSET(cursor, WT_CURSTD_META_INUSE)) { + WT_ASSERT(session, cursor == session->meta_cursor); + + F_CLR(cursor, WT_CURSTD_META_INUSE); + return (cursor->reset(cursor)); + } + return (cursor->close(cursor)); } /* @@ -124,14 +167,13 @@ __wt_metadata_insert( WT_RET_MSG(session, EINVAL, "%s: insert not supported on the turtle file", key); - WT_RET(__wt_metadata_cursor(session, NULL, &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); cursor->set_value(cursor, value); WT_ERR(cursor->insert(cursor)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_insert(session, key)); - -err: WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -152,7 +194,7 @@ __wt_metadata_update( __metadata_turtle(key) ? "" : "not ")); if (__metadata_turtle(key)) { - WT_WITH_TURTLE_LOCK(session, + WT_WITH_TURTLE_LOCK(session, ret, ret = __wt_turtle_update(session, key, value)); return (ret); } @@ -160,12 +202,14 @@ __wt_metadata_update( if (WT_META_TRACKING(session)) WT_RET(__wt_meta_track_update(session, key)); - WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); + /* This cursor needs to have overwrite semantics. 
*/ + WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_OVERWRITE)); + cursor->set_key(cursor, key); cursor->set_value(cursor, value); WT_ERR(cursor->insert(cursor)); - -err: WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -188,14 +232,13 @@ __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key) WT_RET_MSG(session, EINVAL, "%s: remove not supported on the turtle file", key); - WT_RET(__wt_metadata_cursor(session, NULL, &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); WT_ERR(cursor->search(cursor)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_update(session, key)); WT_ERR(cursor->remove(cursor)); - -err: WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -205,8 +248,7 @@ err: WT_TRET(cursor->close(cursor)); * The caller is responsible for freeing the allocated memory. */ int -__wt_metadata_search( - WT_SESSION_IMPL *session, const char *key, char **valuep) +__wt_metadata_search(WT_SESSION_IMPL *session, const char *key, char **valuep) { WT_CURSOR *cursor; WT_DECL_RET; @@ -230,7 +272,7 @@ __wt_metadata_search( * Metadata updates use non-transactional techniques (such as the * schema and metadata locks) to protect access to in-flight updates. 
*/ - WT_RET(__wt_metadata_cursor(session, NULL, &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, key); WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED, ret = cursor->search(cursor)); @@ -238,7 +280,6 @@ __wt_metadata_search( WT_ERR(cursor->get_value(cursor, &value)); WT_ERR(__wt_strdup(session, value, valuep)); - -err: WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c index ea1757129c5..1baab2deae1 100644 --- a/src/meta/meta_track.c +++ b/src/meta/meta_track.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -141,7 +141,7 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk) ret = bm->checkpoint_resolve(bm, session)); break; case WT_ST_DROP_COMMIT: - if ((ret = __wt_remove_if_exists(session, trk->a)) != 0) + if ((ret = __wt_block_manager_drop(session, trk->a)) != 0) __wt_err(session, ret, "metadata remove dropped file %s", trk->a); break; @@ -189,7 +189,7 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk) * For removes, b is NULL. */ if (trk->a != NULL && trk->b != NULL && - (ret = __wt_rename(session, + (ret = __wt_rename_and_sync_directory(session, trk->b + strlen("file:"), trk->a + strlen("file:"))) != 0) __wt_err(session, ret, "metadata unroll rename %s to %s", trk->b, trk->a); @@ -262,16 +262,17 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) } /* - * If we don't have the metadata handle (e.g, we're in the process of + * If we don't have the metadata cursor (e.g, we're in the process of * creating the metadata), we can't sync it. 
*/ - if (!need_sync || session->meta_dhandle == NULL || + if (!need_sync || session->meta_cursor == NULL || F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) goto done; /* If we're logging, make sure the metadata update was flushed. */ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) { - WT_WITH_DHANDLE(session, session->meta_dhandle, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), ret = __wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); WT_RET(ret); @@ -284,12 +285,14 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) */ ckpt_session->txn.id = session->txn.id; F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA); - WT_WITH_DHANDLE(ckpt_session, session->meta_dhandle, ret = - __wt_checkpoint(ckpt_session, NULL)); + WT_WITH_DHANDLE(ckpt_session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(ckpt_session, NULL)); F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA); ckpt_session->txn.id = WT_TXN_NONE; WT_RET(ret); - WT_WITH_DHANDLE(session, session->meta_dhandle, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL)); WT_RET(ret); } diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 13e8b31916f..7182bb0fe5f 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -55,7 +55,7 @@ __metadata_init(WT_SESSION_IMPL *session) * We're single-threaded, but acquire the schema lock regardless: the * lower level code checks that it is appropriately synchronized. 
*/ - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_create(session, WT_METAFILE_URI, NULL)); return (ret); @@ -120,7 +120,7 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) * If a file was being bulk-loaded during the hot backup, it will appear * in the metadata file, but the file won't exist. Create on demand. */ - WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &key)); if (!WT_PREFIX_SKIP(key, "file:")) @@ -141,9 +141,7 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) } WT_ERR_NOTFOUND_OK(ret); -err: if (cursor != NULL) - WT_TRET(cursor->close(cursor)); - +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -202,7 +200,8 @@ __wt_turtle_init(WT_SESSION_IMPL *session) /* Create the turtle file. */ WT_RET(__metadata_config(session, &metaconf)); - WT_WITH_TURTLE_LOCK(session, ret = __wt_turtle_update( + WT_WITH_TURTLE_LOCK(session, ret, + ret = __wt_turtle_update( session, WT_METAFILE_URI, metaconf)); WT_ERR(ret); } @@ -271,8 +270,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ)); * Update the turtle file. */ int -__wt_turtle_update( - WT_SESSION_IMPL *session, const char *key, const char *value) +__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) { WT_FH *fh; WT_DECL_ITEM(buf); @@ -299,7 +297,7 @@ __wt_turtle_update( WT_ERR(__wt_write(session, fh, 0, buf->size, buf->data)); /* Flush the handle and rename the file into place. */ - ret = __wt_sync_and_rename_fh( + ret = __wt_fh_sync_and_rename( session, &fh, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE); /* Close any file handle left open, remove any temporary file. */ diff --git a/src/os_posix/os_abort.c b/src/os_posix/os_abort.c index 5bb6aeb6e16..034eedcfbf8 100644 --- a/src/os_posix/os_abort.c +++ b/src/os_posix/os_abort.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. 
+ * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c index eb2482723ec..3876f9a1afe 100644 --- a/src/os_posix/os_alloc.c +++ b/src/os_posix/os_alloc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c index 9eba641ca51..83e77aa5312 100644 --- a/src/os_posix/os_dir.c +++ b/src/os_posix/os_dir.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c index 2b5fa249163..9a74eb4813d 100644 --- a/src/os_posix/os_dlopen.c +++ b/src/os_posix/os_dlopen.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_errno.c b/src/os_posix/os_errno.c index 229b68e0008..a58ae88447e 100644 --- a/src/os_posix/os_errno.c +++ b/src/os_posix/os_errno.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_exist.c b/src/os_posix/os_exist.c index 644a27dca9a..87f0e219d2e 100644 --- a/src/os_posix/os_exist.c +++ b/src/os_posix/os_exist.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_posix/os_fallocate.c b/src/os_posix/os_fallocate.c index 6280e334afb..9d160afd179 100644 --- a/src/os_posix/os_fallocate.c +++ b/src/os_posix/os_fallocate.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_filesize.c b/src/os_posix/os_filesize.c index c58f73b0665..72242e351bf 100644 --- a/src/os_posix/os_filesize.c +++ b/src/os_posix/os_filesize.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_flock.c b/src/os_posix/os_flock.c index 07393481e7d..e2056f7636c 100644 --- a/src/os_posix/os_flock.c +++ b/src/os_posix/os_flock.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c index b0c04e98258..f5afddc557b 100644 --- a/src/os_posix/os_fsync.c +++ b/src/os_posix/os_fsync.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -75,12 +75,13 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) * Flush a directory to ensure a file creation is durable. 
+ */ int -__wt_directory_sync(WT_SESSION_IMPL *session, char *path) +__wt_directory_sync(WT_SESSION_IMPL *session, const char *path) { #ifdef __linux__ WT_DECL_RET; int fd, tret; - char *dir; + const char *dir; + char *copy; /* * POSIX 1003.1 does not require that fsync of a file handle ensures the @@ -88,15 +89,22 @@ __wt_directory_sync(WT_SESSION_IMPL *session, char *path) * there are historic Linux filesystems requiring this), do an explicit * fsync on a file descriptor for the directory to be sure. */ - if (path == NULL || (dir = strrchr(path, '/')) == NULL) { - dir = NULL; - path = (char *)S2C(session)->home; - } else - *dir = '\0'; + copy = NULL; + if (path == NULL || (dir = strrchr(path, '/')) == NULL) + path = S2C(session)->home; + else { + /* + * Copy the directory name, leaving the trailing slash in place, + * so a path of "/foo" doesn't result in an empty string. + */ + WT_RET(__wt_strndup( + session, path, (size_t)(dir - path) + 1, &copy)); + path = copy; + } + WT_SYSCALL_RETRY(((fd = open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret); - if (dir != NULL) - *dir = '/'; + __wt_free(session, copy); if (ret != 0) WT_RET_MSG(session, ret, "%s: open", path); diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c index 696d8da54f4..2af90512f26 100644 --- a/src/os_posix/os_ftruncate.c +++ b/src/os_posix/os_ftruncate.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_getenv.c b/src/os_posix/os_getenv.c index e1e0051a120..7a086145cee 100644 --- a/src/os_posix/os_getenv.c +++ b/src/os_posix/os_getenv.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_posix/os_getline.c b/src/os_posix/os_getline.c index 7c4ee8d1746..c0ca96852de 100644 --- a/src/os_posix/os_getline.c +++ b/src/os_posix/os_getline.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_getopt.c b/src/os_posix/os_getopt.c index 486d85286bc..0306ad1d79d 100644 --- a/src/os_posix/os_getopt.c +++ b/src/os_posix/os_getopt.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c index e95ccb0ade2..42aeeac4a5e 100644 --- a/src/os_posix/os_map.c +++ b/src/os_posix/os_map.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session, return (0); } -#define WT_VM_PAGESIZE 4096 - /* * __wt_mmap_preload -- * Cause a section of a memory map to be faulted in. @@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ + WT_CONNECTION_IMPL *conn = S2C(session); WT_BM *bm = S2BT(session)->bm; WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ @@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) * Manual pages aren't clear on whether alignment is required for the * size, so we will be conservative. 
*/ - size &= ~(size_t)(WT_VM_PAGESIZE - 1); + size &= ~(size_t)(conn->page_size - 1); - if (size > WT_VM_PAGESIZE && + if (size > (size_t)conn->page_size && (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0) WT_RET_MSG(session, ret, "posix_madvise will need"); #else @@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ + WT_CONNECTION_IMPL *conn = S2C(session); WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0) diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index d5fc86b648b..5f4e9a7cf2b 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c index 46f134feabb..b6876cdfbdc 100644 --- a/src/os_posix/os_mtx_rw.c +++ b/src/os_posix/os_mtx_rw.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. diff --git a/src/os_posix/os_once.c b/src/os_posix/os_once.c index bfe0b9819ac..8d900042330 100644 --- a/src/os_posix/os_once.c +++ b/src/os_posix/os_once.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index a87272db391..b085676c53b 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_pagesize.c b/src/os_posix/os_pagesize.c new file mode 100644 index 00000000000..4a7e7084cc6 --- /dev/null +++ b/src/os_posix/os_pagesize.c @@ -0,0 +1,19 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. + */ +int +__wt_get_vm_pagesize(void) +{ + return (getpagesize()); +} diff --git a/src/os_posix/os_path.c b/src/os_posix/os_path.c index af28e1b3b56..6dc54675eb8 100644 --- a/src/os_posix/os_path.c +++ b/src/os_posix/os_path.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_priv.c b/src/os_posix/os_priv.c index a8479668d67..5ffbbf7a1f2 100644 --- a/src/os_posix/os_priv.c +++ b/src/os_posix/os_priv.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c index 96bbba9bab2..bc244c12e46 100644 --- a/src/os_posix/os_remove.c +++ b/src/os_posix/os_remove.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_posix/os_rename.c b/src/os_posix/os_rename.c index 811604e7f0f..301190305c4 100644 --- a/src/os_posix/os_rename.c +++ b/src/os_posix/os_rename.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c index 24d6d1aa879..8733bfe0f53 100644 --- a/src/os_posix/os_rw.c +++ b/src/os_posix/os_rw.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c index 4e90edabc53..8633b8d1ec0 100644 --- a/src/os_posix/os_sleep.c +++ b/src/os_posix/os_sleep.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_stdio.c b/src/os_posix/os_stdio.c index da880f5521e..7ab107eda1e 100644 --- a/src/os_posix/os_stdio.c +++ b/src/os_posix/os_stdio.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_strtouq.c b/src/os_posix/os_strtouq.c index 0b7a540959c..0ae604fc761 100644 --- a/src/os_posix/os_strtouq.c +++ b/src/os_posix/os_strtouq.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c index c7222aac6c4..35a23622ddc 100644 --- a/src/os_posix/os_thread.c +++ b/src/os_posix/os_thread.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. 
* All rights reserved. * diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c index c3052df62e7..0e5a1cdadfb 100644 --- a/src/os_posix/os_time.c +++ b/src/os_posix/os_time.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c index 297ec7deaee..052a46940b7 100644 --- a/src/os_posix/os_yield.c +++ b/src/os_posix/os_yield.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_dir.c b/src/os_win/os_dir.c index aff916c25f5..00ec4f252e4 100644 --- a/src/os_win/os_dir.c +++ b/src/os_win/os_dir.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_dlopen.c b/src/os_win/os_dlopen.c index 1c57d5f8073..0bad39d681d 100644 --- a/src/os_win/os_dlopen.c +++ b/src/os_win/os_dlopen.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c index a9d3d521052..6a9daf8443f 100644 --- a/src/os_win/os_errno.c +++ b/src/os_win/os_errno.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_exist.c b/src/os_win/os_exist.c index 4a727801569..ec1369cc727 100644 --- a/src/os_win/os_exist.c +++ b/src/os_win/os_exist.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_win/os_fallocate.c b/src/os_win/os_fallocate.c index 030c2e4c6c7..cdc7a1c46ee 100644 --- a/src/os_win/os_fallocate.c +++ b/src/os_win/os_fallocate.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_filesize.c b/src/os_win/os_filesize.c index 7f231b5ba9a..c9925fb18a8 100644 --- a/src/os_win/os_filesize.c +++ b/src/os_win/os_filesize.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_flock.c b/src/os_win/os_flock.c index 947d7bdcde7..60a981499a5 100644 --- a/src/os_win/os_flock.c +++ b/src/os_win/os_flock.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_fsync.c b/src/os_win/os_fsync.c index 7a01b5cd61d..913b7ca5a4e 100644 --- a/src/os_win/os_fsync.c +++ b/src/os_win/os_fsync.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -25,7 +25,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) * Flush a directory to ensure a file creation is durable. */ int -__wt_directory_sync(WT_SESSION_IMPL *session, char *path) +__wt_directory_sync(WT_SESSION_IMPL *session, const char *path) { WT_UNUSED(session); WT_UNUSED(path); diff --git a/src/os_win/os_ftruncate.c b/src/os_win/os_ftruncate.c index cc635306a71..0c11b5509b7 100644 --- a/src/os_win/os_ftruncate.c +++ b/src/os_win/os_ftruncate.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_win/os_getenv.c b/src/os_win/os_getenv.c index 9b3a20abad7..c9084769cd5 100644 --- a/src/os_win/os_getenv.c +++ b/src/os_win/os_getenv.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_map.c b/src/os_win/os_map.c index 3c4edb59ea8..dc040b4fa54 100644 --- a/src/os_win/os_map.c +++ b/src/os_win/os_map.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index b909afa9ba6..14bac2a99d9 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_once.c b/src/os_win/os_once.c index bb5e059452e..9ea3fe044eb 100644 --- a/src/os_win/os_once.c +++ b/src/os_win/os_once.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c index c3106763452..3ec53daf001 100644 --- a/src/os_win/os_open.c +++ b/src/os_win/os_open.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_pagesize.c b/src/os_win/os_pagesize.c new file mode 100644 index 00000000000..648105c0e7c --- /dev/null +++ b/src/os_win/os_pagesize.c @@ -0,0 +1,23 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. + */ +int +__wt_get_vm_pagesize(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + + return (system_info.dwPageSize); +} diff --git a/src/os_win/os_path.c b/src/os_win/os_path.c index 9d001e50571..e9532de2b38 100644 --- a/src/os_win/os_path.c +++ b/src/os_win/os_path.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_priv.c b/src/os_win/os_priv.c index 5c32d6b5999..8c1f3893920 100644 --- a/src/os_win/os_priv.c +++ b/src/os_win/os_priv.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c index 55b50030064..5682a25d7f2 100644 --- a/src/os_win/os_remove.c +++ b/src/os_win/os_remove.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_rename.c b/src/os_win/os_rename.c index a0f33843218..829ab1d16e9 100644 --- a/src/os_win/os_rename.c +++ b/src/os_win/os_rename.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_rw.c b/src/os_win/os_rw.c index bafefcfba24..49f011001a4 100644 --- a/src/os_win/os_rw.c +++ b/src/os_win/os_rw.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/os_win/os_sleep.c b/src/os_win/os_sleep.c index 33e04c1d8a9..1d4b316488a 100644 --- a/src/os_win/os_sleep.c +++ b/src/os_win/os_sleep.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_snprintf.c b/src/os_win/os_snprintf.c index ebb14fd32e8..a6056ff9342 100644 --- a/src/os_win/os_snprintf.c +++ b/src/os_win/os_snprintf.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c index b5f13aea4e9..3be0ccb9393 100644 --- a/src/os_win/os_thread.c +++ b/src/os_win/os_thread.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_time.c b/src/os_win/os_time.c index 2292c317a64..e784b5d8a36 100644 --- a/src/os_win/os_time.c +++ b/src/os_win/os_time.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_vsnprintf.c b/src/os_win/os_vsnprintf.c index 205b63751c7..63f96e79d5b 100644 --- a/src/os_win/os_vsnprintf.c +++ b/src/os_win/os_vsnprintf.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/os_win/os_yield.c b/src/os_win/os_yield.c index dd3eb67de8b..aab1559e072 100644 --- a/src/os_win/os_yield.c +++ b/src/os_win/os_yield.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/packing/pack_api.c b/src/packing/pack_api.c index efe999505bf..4c65406cd64 100644 --- a/src/packing/pack_api.c +++ b/src/packing/pack_api.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/packing/pack_impl.c b/src/packing/pack_impl.c index 30d28dfb63c..0e3ed44ba6a 100644 --- a/src/packing/pack_impl.c +++ b/src/packing/pack_impl.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -176,6 +176,8 @@ __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, /* Outfmt should complete before infmt */ while ((ret = __pack_next(&packout, &pvout)) == 0) { + if (p >= end) + WT_ERR(EINVAL); WT_ERR(__pack_next(&packin, &pvin)); before = p; WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p))); diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c index 1f3449d79d3..98da5b405c3 100644 --- a/src/packing/pack_stream.c +++ b/src/packing/pack_stream.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c index 18ed5c6b551..4a3a8a7e988 100644 --- a/src/reconcile/rec_track.c +++ b/src/reconcile/rec_track.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 21cc68ed119..332449027a9 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. 
* All rights reserved. * @@ -630,12 +630,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) */ switch (page->type) { case WT_PAGE_COL_INT: - WT_RET(__wt_page_alloc(session, - WT_PAGE_COL_INT, 1, mod->mod_multi_entries, false, &next)); + WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT, + 1, mod->mod_multi_entries, false, &next)); break; case WT_PAGE_ROW_INT: - WT_RET(__wt_page_alloc(session, - WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, false, &next)); + WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT, + WT_RECNO_OOB, mod->mod_multi_entries, false, &next)); break; WT_ILLEGAL_VALUE(session); } @@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, for (upd = upd_list; upd->next != NULL; upd = upd->next) ; upd->next = append; + __wt_cache_page_inmem_incr( + session, page, WT_UPDATE_MEMSIZE(append)); } /* @@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key) * Figure out the maximum leaf page size for the reconciliation. */ static inline uint32_t -__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) { WT_BTREE *btree; WT_PAGE *page; @@ -3263,7 +3265,14 @@ supd_check_complete: memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header); bnd->cksum = __wt_cksum(buf->data, buf->size); - if (mod->rec_result == WT_PM_REC_MULTIBLOCK && + /* + * One last check: don't reuse blocks if compacting, the reason + * for compaction is to move blocks to different locations. We + * do this check after calculating the checksums, hopefully the + * next write can be skipped. 
+ */ + if (session->compact_state == WT_COMPACT_NONE && + mod->rec_result == WT_PM_REC_MULTIBLOCK && mod->mod_multi_entries > bnd_slot) { multi = &mod->mod_multi[bnd_slot]; if (multi->size == bnd->size && @@ -3502,7 +3511,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) break; case BTREE_COL_VAR: if (cbulk->rle != 0) - WT_RET(__wt_bulk_insert_var(session, cbulk)); + WT_RET(__wt_bulk_insert_var(session, cbulk, false)); break; case BTREE_ROW: break; @@ -3625,43 +3634,20 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * Fixed-length column-store bulk insert. */ int -__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +__wt_bulk_insert_fix( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) { WT_BTREE *btree; WT_CURSOR *cursor; WT_RECONCILE *r; - uint32_t entries, offset, page_entries, page_size; - const uint8_t *data; r = cbulk->reconcile; btree = S2BT(session); cursor = &cbulk->cbt.iface; - if (cbulk->bitmap) { - if (((r->recno - 1) * btree->bitcnt) & 0x7) - WT_RET_MSG(session, EINVAL, - "Bulk bitmap load not aligned on a byte boundary"); - for (data = cursor->value.data, - entries = (uint32_t)cursor->value.size; - entries > 0; - entries -= page_entries, data += page_size) { - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - - page_entries = - WT_MIN(entries, cbulk->nrecs - cbulk->entry); - page_size = __bitstr_size(page_entries * btree->bitcnt); - offset = __bitstr_size(cbulk->entry * btree->bitcnt); - memcpy(r->first_free + offset, data, page_size); - cbulk->entry += page_entries; - r->recno += page_entries; - } - return (0); - } - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - - __bit_setv(r->first_free, - cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]); + __bit_setv(r->first_free, cbulk->entry, + btree->bitcnt, deleted ? 
0 : ((uint8_t *)cursor->value.data)[0]); ++cbulk->entry; ++r->recno; @@ -3669,11 +3655,48 @@ __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) } /* + * __wt_bulk_insert_fix_bitmap -- + * Fixed-length column-store bulk insert. + */ +int +__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + uint32_t entries, offset, page_entries, page_size; + const uint8_t *data; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + if (((r->recno - 1) * btree->bitcnt) & 0x7) + WT_RET_MSG(session, EINVAL, + "Bulk bitmap load not aligned on a byte boundary"); + for (data = cursor->value.data, + entries = (uint32_t)cursor->value.size; + entries > 0; + entries -= page_entries, data += page_size) { + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry); + page_size = __bitstr_size(page_entries * btree->bitcnt); + offset = __bitstr_size(cbulk->entry * btree->bitcnt); + memcpy(r->first_free + offset, data, page_size); + cbulk->entry += page_entries; + r->recno += page_entries; + } + return (0); +} + +/* * __wt_bulk_insert_var -- * Variable-length column-store bulk insert. */ int -__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +__wt_bulk_insert_var( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) { WT_BTREE *btree; WT_KV *val; @@ -3682,14 +3705,20 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) r = cbulk->reconcile; btree = S2BT(session); - /* - * Store the bulk cursor's last buffer, not the current value, we're - * creating a duplicate count, which means we want the previous value - * seen, not the current value. 
- */ val = &r->v; - WT_RET(__rec_cell_build_val( - session, r, cbulk->last.data, cbulk->last.size, cbulk->rle)); + if (deleted) { + val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle); + val->buf.data = NULL; + val->buf.size = 0; + val->len = val->cell_len; + } else + /* + * Store the bulk cursor's last buffer, not the current value, + * we're tracking duplicates, which means we want the previous + * value seen, not the current value. + */ + WT_RET(__rec_cell_build_val(session, + r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ if (val->len > r->space_avail) @@ -3923,16 +3952,49 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->recno += entry; /* Walk any append list. */ - WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { - WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; + for (ins = + WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) { + if (ins == NULL) { + /* + * If the page split, instantiate any missing records in + * the page's name space. (Imagine record 98 is + * transactionally visible, 99 wasn't created or is not + * yet visible, 100 is visible. Then the page splits and + * record 100 moves to another page. When we reconcile + * the original page, we write record 98, then we don't + * see record 99 for whatever reason. If we've moved + * record 1000, we don't know to write a deleted record + * 99 on the page.) + * + * The record number recorded during the split is the + * first key on the split page, that is, one larger than + * the last key on this page, we have to decrement it. + */ + if ((recno = + page->modify->mod_split_recno) == WT_RECNO_OOB) + break; + recno -= 1; + + /* + * The following loop assumes records to write, and the + * previous key might have been visible. 
+ */ + if (r->recno > recno) + break; + upd = NULL; + } else { + WT_RET( + __rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + recno = WT_INSERT_RECNO(ins); + } for (;;) { /* * The application may have inserted records which left * gaps in the name space. */ - for (recno = WT_INSERT_RECNO(ins); + for (; nrecs > 0 && r->recno < recno; --nrecs, ++entry, ++r->recno) __bit_setv( @@ -3940,6 +4002,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (nrecs > 0) { __bit_setv(r->first_free, entry, btree->bitcnt, + upd == NULL ? 0 : ((uint8_t *)WT_UPDATE_DATA(upd))[0]); --nrecs; ++entry; @@ -3961,6 +4024,13 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) entry = 0; nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail); } + + /* + * Execute this loop once without an insert item to catch any + * missing records due to a split, then quit. + */ + if (ins == NULL) + break; } /* Update the counters. */ @@ -4441,11 +4511,36 @@ compare: /* } /* Walk any append list. */ - WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { - WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; - for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + for (ins = + WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) { + if (ins == NULL) { + /* + * If the page split, instantiate any missing records in + * the page's name space. (Imagine record 98 is + * transactionally visible, 99 wasn't created or is not + * yet visible, 100 is visible. Then the page splits and + * record 100 moves to another page. When we reconcile + * the original page, we write record 98, then we don't + * see record 99 for whatever reason. If we've moved + * record 1000, we don't know to write a deleted record + * 99 on the page.) 
+ * + * The record number recorded during the split is the + * first key on the split page, that is, one larger than + * the last key on this page, we have to decrement it. + */ + if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB) + break; + n -= 1; + upd = NULL; + } else { + WT_ERR( + __rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) + continue; + n = WT_INSERT_RECNO(ins); + } + while (src_recno <= n) { /* * The application may have inserted records which left * gaps in the name space, and these gaps can be huge. @@ -4468,7 +4563,8 @@ compare: /* src_recno += skip; } } else { - deleted = WT_UPDATE_DELETED_ISSET(upd); + deleted = upd == NULL || + WT_UPDATE_DELETED_ISSET(upd); if (!deleted) { data = WT_UPDATE_DATA(upd); size = upd->size; @@ -4485,7 +4581,7 @@ compare: /* last->size == size && memcmp(last->data, data, size) == 0)) { ++rle; - continue; + goto next; } WT_ERR(__rec_col_var_helper(session, r, salvage, last, last_deleted, 0, rle)); @@ -4504,7 +4600,23 @@ compare: /* } last_deleted = deleted; rle = 1; + + /* + * Move to the next record. It's not a simple increment + * because if it's the maximum record, incrementing it + * wraps to 0 and this turns into an infinite loop. + */ +next: if (src_recno == UINT64_MAX) + break; + ++src_recno; } + + /* + * Execute this loop once without an insert item to catch any + * missing records due to a split, then quit. + */ + if (ins == NULL) + break; } /* If we were tracking a record, write it. 
*/ @@ -5343,11 +5455,10 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - WT_ERR(__wt_buf_set_printable( - session, tkey, bnd->key.data, bnd->key.size)); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "starting key %.*s", - (int)tkey->size, (const char *)tkey->data)); + "starting key %s", + __wt_buf_set_printable( + session, bnd->key.data, bnd->key.size, tkey))); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index db4658cbd0e..8cdcbbcad54 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -74,11 +74,11 @@ __create_file(WT_SESSION_IMPL *session, { WT_DECL_ITEM(val); WT_DECL_RET; - uint32_t allocsize; - bool is_metadata; const char *filename, **p, *filecfg[] = { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL }; char *fileconf; + uint32_t allocsize; + bool is_metadata; fileconf = NULL; @@ -97,7 +97,7 @@ __create_file(WT_SESSION_IMPL *session, } /* Sanity check the allocation size. */ - WT_RET(__wt_direct_io_size_check( + WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); /* Create the file. 
*/ @@ -197,13 +197,15 @@ __create_colgroup(WT_SESSION_IMPL *session, { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *cgname, *source, *sourceconf, *tablename; - char *cgconf, *oldconf; + char *cgconf, *origconf; + bool exists; sourceconf = NULL; - cgconf = oldconf = NULL; + cgconf = origconf = NULL; WT_CLEAR(fmt); WT_CLEAR(confbuf); WT_CLEAR(namebuf); + exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "colgroup:")) @@ -228,6 +230,14 @@ __create_colgroup(WT_SESSION_IMPL *session, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); + /* Check if the column group already exists. */ + if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { + if (exclusive) + WT_ERR(EEXIST); + exists = true; + } + WT_ERR_NOTFOUND_OK(ret); + /* Find the first NULL entry in the cfg stack. */ for (cfgp = &cfg[1]; *cfgp; cfgp++) ; @@ -262,25 +272,22 @@ __create_colgroup(WT_SESSION_IMPL *session, } sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); - WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); - if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) { - /* - * If the entry already exists in the metadata, we're done. - * This is an error for exclusive creates but okay otherwise. - */ - if (ret == WT_DUPLICATE_KEY) - ret = exclusive ? 
EEXIST : 0; + if (exists) { + if (strcmp(cgconf, origconf) != 0) + WT_ERR_MSG(session, EINVAL, + "%s: does not match existing configuration", name); goto err; } + WT_ERR(__wt_metadata_insert(session, name, cgconf)); WT_ERR(__wt_schema_open_colgroups(session, table)); err: __wt_free(session, cgconf); __wt_free(session, sourceconf); - __wt_free(session, oldconf); + __wt_free(session, origconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); @@ -382,18 +389,18 @@ __create_index(WT_SESSION_IMPL *session, { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *source, *sourceconf, *idxname, *tablename; - char *idxconf; + char *idxconf, *origconf; size_t tlen; - bool have_extractor; + bool exists, have_extractor; u_int i, npublic_cols; sourceconf = NULL; - idxconf = NULL; + idxconf = origconf = NULL; WT_CLEAR(confbuf); WT_CLEAR(fmt); WT_CLEAR(extra_cols); WT_CLEAR(namebuf); - have_extractor = false; + exists = have_extractor = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "index:")) @@ -411,9 +418,17 @@ __create_index(WT_SESSION_IMPL *session, (int)tlen, tablename); if (table->is_simple) - WT_RET_MSG(session, EINVAL, + WT_ERR_MSG(session, EINVAL, "%s requires a table with named columns", name); + /* Check if the index already exists. */ + if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { + if (exclusive) + WT_ERR(EEXIST); + exists = true; + } + WT_ERR_NOTFOUND_OK(ret); + if (__wt_config_getones(session, config, "source", &cval) == 0) { WT_ERR(__wt_buf_fmt(session, &namebuf, "%.*s", (int)cval.len, cval.str)); @@ -488,8 +503,7 @@ __create_index(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_catfmt( session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str)); } - if (ret != 0 && ret != WT_NOTFOUND) - goto err; + WT_ERR_NOTFOUND_OK(ret); /* Index values are empty: all columns are packed into the index key. 
*/ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format=")); @@ -525,23 +539,22 @@ __create_index(WT_SESSION_IMPL *session, cfg[1] = sourceconf; cfg[2] = confbuf.data; WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); - if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) { - /* - * If the entry already exists in the metadata, we're done. - * This is an error for exclusive creates but okay otherwise. - */ - if (ret == WT_DUPLICATE_KEY) - ret = exclusive ? EEXIST : 0; + if (exists) { + if (strcmp(idxconf, origconf) != 0) + WT_ERR_MSG(session, EINVAL, + "%s: does not match existing configuration", name); goto err; } + WT_ERR(__wt_metadata_insert(session, name, idxconf)); /* Make sure that the configuration is valid. */ WT_ERR(__wt_schema_open_index( session, table, idxname, strlen(idxname), &idx)); - - WT_ERR(__fill_index(session, table, idx)); + if (!exists) + WT_ERR(__fill_index(session, table, idx)); err: __wt_free(session, idxconf); + __wt_free(session, origconf); __wt_free(session, sourceconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &extra_cols); @@ -570,10 +583,12 @@ __create_table(WT_SESSION_IMPL *session, char *tableconf, *cgname; size_t cgsize; int ncolgroups; + bool exists; cgname = NULL; table = NULL; tableconf = NULL; + exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "table:")) @@ -581,8 +596,9 @@ __create_table(WT_SESSION_IMPL *session, if ((ret = __wt_schema_get_table(session, tablename, strlen(tablename), false, &table)) == 0) { - __wt_schema_release_table(session, table); - return (exclusive ? EEXIST : 0); + if (exclusive) + WT_ERR(EEXIST); + exists = true; } WT_RET_NOTFOUND_OK(ret); @@ -595,15 +611,13 @@ __create_table(WT_SESSION_IMPL *session, WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); - if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) { - /* - * If the entry already exists in the metadata, we're done. 
- * This is an error for exclusive creates but okay otherwise. - */ - if (ret == WT_DUPLICATE_KEY) - ret = exclusive ? EEXIST : 0; + if (exists) { + if (strcmp(tableconf, table->config) != 0) + WT_ERR_MSG(session, EINVAL, + "%s: does not match existing configuration", name); goto err; } + WT_ERR(__wt_metadata_insert(session, name, tableconf)); /* Attempt to open the table now to catch any errors. */ WT_ERR(__wt_schema_get_table( diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index 9b9f3a23961..6ac76930c9a 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -29,7 +29,7 @@ __drop_file( return (EINVAL); /* Close all btree handles associated with this file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c index da5f033ad40..5e9caf94b7a 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -21,9 +21,9 @@ __schema_add_table(WT_SESSION_IMPL *session, uint64_t bucket; /* Make sure the metadata is open before getting other locks. */ - WT_RET(__wt_metadata_open(session)); + WT_RET(__wt_metadata_cursor(session, NULL)); - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_LOCK(session, ret, ret = __wt_schema_open_table( session, name, namelen, ok_incomplete, &table)); WT_RET(ret); diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index ba8664f2e39..49318f80959 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. 
+ * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -291,7 +291,7 @@ __schema_open_index(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename)); /* Find matching indices. */ - WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + WT_ERR(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, tmp->data); if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) ret = cursor->next(cursor); @@ -379,10 +379,10 @@ __schema_open_index(WT_SESSION_IMPL *session, table->idx_complete = true; } -err: __wt_scr_free(session, &tmp); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); WT_TRET(__wt_schema_destroy_index(session, &idx)); - if (cursor != NULL) - WT_TRET(cursor->close(cursor)); + + __wt_scr_free(session, &tmp); return (ret); } @@ -438,7 +438,7 @@ __schema_open_table(WT_SESSION_IMPL *session, WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name)); WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename)); - WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + WT_ERR(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, tablename); WT_ERR(cursor->search(cursor)); WT_ERR(cursor->get_value(cursor, &tconfig)); @@ -508,8 +508,7 @@ __schema_open_table(WT_SESSION_IMPL *session, if (0) { err: WT_TRET(__wt_schema_destroy_table(session, &table)); } - if (cursor != NULL) - WT_TRET(cursor->close(cursor)); + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); __wt_free(session, tablename); __wt_scr_free(session, &buf); diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c index 066e666190b..612a2d2d192 100644 --- a/src/schema/schema_plan.c +++ b/src/schema/schema_plan.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/schema/schema_project.c b/src/schema/schema_project.c index be5f73b48ed..4d29b2baa13 100644 --- a/src/schema/schema_project.c +++ b/src/schema/schema_project.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index 3f368417d40..4ec126394dd 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -30,7 +30,7 @@ __rename_file( return (EINVAL); /* Close any btree handles in the file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_conn_dhandle_close_all(session, uri, false)); WT_ERR(ret); diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c index 82c2e2a15dc..d3d0605c60a 100644 --- a/src/schema/schema_stat.c +++ b/src/schema/schema_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index c39bba4753c..e7752b60ca4 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -9,43 +9,6 @@ #include "wt_internal.h" /* - * __truncate_file -- - * WT_SESSION::truncate for a file. - */ -static int -__truncate_file(WT_SESSION_IMPL *session, const char *uri) -{ - WT_DECL_RET; - const char *filename; - uint32_t allocsize; - - filename = uri; - if (!WT_PREFIX_SKIP(filename, "file:")) - return (EINVAL); - - /* Open and lock the file. 
*/ - WT_RET(__wt_session_get_btree( - session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); - WT_STAT_FAST_DATA_INCR(session, cursor_truncate); - - /* Get the allocation size. */ - allocsize = S2BT(session)->allocsize; - - WT_RET(__wt_session_release_btree(session)); - - /* Close any btree handles in the file. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_dhandle_close_all(session, uri, false)); - WT_RET(ret); - - /* Delete the root address and truncate the file. */ - WT_RET(__wt_meta_checkpoint_clear(session, uri)); - WT_RET(__wt_block_manager_truncate(session, filename, allocsize)); - - return (0); -} - -/* * __truncate_table -- * WT_SESSION::truncate for a table. */ @@ -112,9 +75,12 @@ __wt_schema_truncate( tablename = uri; - if (WT_PREFIX_MATCH(uri, "file:")) { - ret = __truncate_file(session, uri); - } else if (WT_PREFIX_MATCH(uri, "lsm:")) + if (WT_PREFIX_MATCH(uri, "file:")) + /* + * File truncate translates into a range truncate. + */ + ret = __wt_session_range_truncate(session, uri, NULL, NULL); + else if (WT_PREFIX_MATCH(uri, "lsm:")) ret = __wt_lsm_tree_truncate(session, uri, cfg); else if (WT_PREFIX_SKIP(tablename, "table:")) ret = __truncate_table(session, tablename, cfg); diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c index 1e810e8adc9..d1c84dc8d85 100644 --- a/src/schema/schema_util.c +++ b/src/schema/schema_util.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index 64218923173..a2fe5244c4d 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. */ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_conn_dhandle_close_all( session, uri, false)); WT_ERR(ret); @@ -63,7 +63,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, } else if (ret == EBUSY) { WT_ASSERT(session, !FLD_ISSET( open_flags, WT_DHANDLE_EXCLUSIVE)); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_conn_btree_apply_single_ckpt( session, uri, file_func, cfg)); } diff --git a/src/session/session_api.c b/src/session/session_api.c index 053f69ee7f8..c03b5fdc044 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config) * via the registered close callback. */ if (session->event_handler->handle_close != NULL && - !WT_STREQ(cursor->uri, WT_LAS_URI)) + !WT_STREQ(cursor->internal_uri, WT_LAS_URI)) WT_TRET(session->event_handler->handle_close( session->event_handler, wt_session, cursor)); WT_TRET(cursor->close(cursor)); @@ -442,8 +442,8 @@ __wt_session_create( { WT_DECL_RET; - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, ret = __wt_schema_create(session, uri, config))); return (ret); } @@ -554,6 +554,32 @@ err: API_END_RET(session, ret); } /* + * __session_rebalance -- + * WT_SESSION->rebalance method. 
+ */ +static int +__session_rebalance(WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + SESSION_API_CALL(session, rebalance, config, cfg); + + if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + WT_ERR(ENOTSUP); + + /* Block out checkpoints to avoid spurious EBUSY errors. */ + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker(session, uri, __wt_bt_rebalance, + NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_REBALANCE))); + +err: API_END_RET_NOTFOUND_MAP(session, ret); +} + +/* * __session_rename -- * WT_SESSION->rename method. */ @@ -571,8 +597,8 @@ __session_rename(WT_SESSION *wt_session, WT_ERR(__wt_str_name_check(session, uri)); WT_ERR(__wt_str_name_check(session, newuri)); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, ret = __wt_schema_rename(session, uri, newuri, cfg))); err: API_END_RET_NOTFOUND_MAP(session, ret); @@ -611,10 +637,22 @@ int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) { WT_DECL_RET; + WT_CONFIG_ITEM cval; + bool lock_wait; + + WT_RET(__wt_config_gets_def(session, cfg, "lock_wait", 1, &cval)); + lock_wait = cval.val != 0 || F_ISSET(session, WT_SESSION_LOCK_NO_WAIT); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + if (!lock_wait) + F_SET(session, WT_SESSION_LOCK_NO_WAIT); + + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, ret = __wt_schema_drop(session, uri, cfg))); + + if (!lock_wait) + F_CLR(session, WT_SESSION_LOCK_NO_WAIT); + return (ret); } @@ -648,6 +686,7 @@ static int __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor, WT_CURSOR *ref_cursor, const char *config) { + WT_CURSOR *firstcg; WT_CONFIG_ITEM cval; WT_CURSOR_INDEX *cindex; WT_CURSOR_JOIN *cjoin; @@ -661,6 +700,7 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR 
*join_cursor, uint8_t flags, range; count = 0; + firstcg = NULL; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, join, config, cfg); table = NULL; @@ -672,15 +712,18 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor, cindex = (WT_CURSOR_INDEX *)ref_cursor; idx = cindex->index; table = cindex->table; - WT_CURSOR_CHECKKEY(ref_cursor); + firstcg = cindex->cg_cursors[0]; } else if (WT_PREFIX_MATCH(ref_cursor->uri, "table:")) { idx = NULL; ctable = (WT_CURSOR_TABLE *)ref_cursor; table = ctable->table; - WT_CURSOR_CHECKKEY(ctable->cg_cursors[0]); + firstcg = ctable->cg_cursors[0]; } else WT_ERR_MSG(session, EINVAL, "not an index or table cursor"); + if (!F_ISSET(firstcg, WT_CURSTD_KEY_SET)) + WT_ERR_MSG(session, EINVAL, + "requires reference cursor be positioned"); cjoin = (WT_CURSOR_JOIN *)join_cursor; if (cjoin->table != table) WT_ERR_MSG(session, EINVAL, @@ -771,71 +814,48 @@ __session_salvage(WT_SESSION *wt_session, const char *uri, const char *config) WT_ERR(ENOTSUP); /* Block out checkpoints to avoid spurious EBUSY errors. */ - WT_WITH_CHECKPOINT_LOCK(session, - WT_WITH_SCHEMA_LOCK(session, ret = - __wt_schema_worker(session, uri, __wt_salvage, + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker(session, uri, __wt_salvage, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE))); err: API_END_RET_NOTFOUND_MAP(session, ret); } /* - * __session_truncate -- - * WT_SESSION->truncate method. + * __wt_session_range_truncate -- + * Session handling of a range truncate. 
*/ -static int -__session_truncate(WT_SESSION *wt_session, - const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config) +int +__wt_session_range_truncate(WT_SESSION_IMPL *session, + const char *uri, WT_CURSOR *start, WT_CURSOR *stop) { - WT_DECL_RET; - WT_SESSION_IMPL *session; WT_CURSOR *cursor; + WT_DECL_RET; int cmp; bool local_start; local_start = false; - - session = (WT_SESSION_IMPL *)wt_session; - SESSION_TXN_API_CALL(session, truncate, config, cfg); - WT_STAT_FAST_CONN_INCR(session, cursor_truncate); - - /* - * If the URI is specified, we don't need a start/stop, if start/stop - * is specified, we don't need a URI. One exception is the log URI - * which may truncate (archive) log files for a backup cursor. - * - * If no URI is specified, and both cursors are specified, start/stop - * must reference the same object. - * - * Any specified cursor must have been initialized. - */ - if ((uri == NULL && start == NULL && stop == NULL) || - (uri != NULL && !WT_PREFIX_MATCH(uri, "log:") && - (start != NULL || stop != NULL))) - WT_ERR_MSG(session, EINVAL, - "the truncate method should be passed either a URI or " - "start/stop cursors, but not both"); - if (uri != NULL) { - /* Disallow objects in the WiredTiger name space. */ - WT_ERR(__wt_str_name_check(session, uri)); - - if (WT_PREFIX_MATCH(uri, "log:")) { + WT_ASSERT(session, WT_PREFIX_MATCH(uri, "file:")); + /* + * A URI file truncate becomes a range truncate where we + * set a start cursor at the beginning. We already + * know the NULL stop goes to the end of the range. + */ + WT_ERR(__session_open_cursor( + (WT_SESSION *)session, uri, NULL, NULL, &start)); + local_start = true; + ret = start->next(start); + if (ret == WT_NOTFOUND) { /* - * Verify the user only gave the URI prefix and not - * a specific target name after that. + * If there are no elements, there is nothing + * to do. 
*/ - if (!WT_STREQ(uri, "log:")) - WT_ERR_MSG(session, EINVAL, - "the truncate method should not specify any" - "target after the log: URI prefix."); - ret = __wt_log_truncate_files(session, start, cfg); - } else - /* Wait for checkpoints to avoid EBUSY errors. */ - WT_WITH_CHECKPOINT_LOCK(session, - WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_truncate(session, uri, cfg))); - goto done; + ret = 0; + goto done; + } + WT_ERR(ret); } /* @@ -893,7 +913,7 @@ __session_truncate(WT_SESSION *wt_session, */ if (start == NULL) { WT_ERR(__session_open_cursor( - wt_session, stop->uri, NULL, NULL, &start)); + (WT_SESSION *)session, stop->uri, NULL, NULL, &start)); local_start = true; WT_ERR(start->next(start)); } @@ -910,13 +930,72 @@ __session_truncate(WT_SESSION *wt_session, WT_ERR(__wt_schema_range_truncate(session, start, stop)); done: -err: TXN_API_END_RETRY(session, ret, 0); - - /* +err: /* * Close any locally-opened start cursor. */ if (local_start) WT_TRET(start->close(start)); + return (ret); +} + +/* + * __session_truncate -- + * WT_SESSION->truncate method. + */ +static int +__session_truncate(WT_SESSION *wt_session, + const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + SESSION_TXN_API_CALL(session, truncate, config, cfg); + WT_STAT_FAST_CONN_INCR(session, cursor_truncate); + + /* + * If the URI is specified, we don't need a start/stop, if start/stop + * is specified, we don't need a URI. One exception is the log URI + * which may truncate (archive) log files for a backup cursor. + * + * If no URI is specified, and both cursors are specified, start/stop + * must reference the same object. + * + * Any specified cursor must have been initialized. 
+ */ + if ((uri == NULL && start == NULL && stop == NULL) || + (uri != NULL && !WT_PREFIX_MATCH(uri, "log:") && + (start != NULL || stop != NULL))) + WT_ERR_MSG(session, EINVAL, + "the truncate method should be passed either a URI or " + "start/stop cursors, but not both"); + + if (uri != NULL) { + /* Disallow objects in the WiredTiger name space. */ + WT_ERR(__wt_str_name_check(session, uri)); + + if (WT_PREFIX_MATCH(uri, "log:")) { + /* + * Verify the user only gave the URI prefix and not + * a specific target name after that. + */ + if (!WT_STREQ(uri, "log:")) + WT_ERR_MSG(session, EINVAL, + "the truncate method should not specify any" + "target after the log: URI prefix."); + WT_ERR(__wt_log_truncate_files(session, start, cfg)); + } else if (WT_PREFIX_MATCH(uri, "file:")) + WT_ERR(__wt_session_range_truncate( + session, uri, start, stop)); + else + /* Wait for checkpoints to avoid EBUSY errors. */ + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_truncate(session, uri, cfg))); + } else + WT_ERR(__wt_session_range_truncate(session, uri, start, stop)); + +err: TXN_API_END_RETRY(session, ret, 0); /* * Only map WT_NOTFOUND to ENOENT if a URI was specified. @@ -938,8 +1017,8 @@ __session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, upgrade, config, cfg); /* Block out checkpoints to avoid spurious EBUSY errors. */ - WT_WITH_CHECKPOINT_LOCK(session, - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker(session, uri, __wt_upgrade, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE))); @@ -964,8 +1043,8 @@ __session_verify(WT_SESSION *wt_session, const char *uri, const char *config) WT_ERR(ENOTSUP); /* Block out checkpoints to avoid spurious EBUSY errors. 
*/ - WT_WITH_CHECKPOINT_LOCK(session, - WT_WITH_SCHEMA_LOCK(session, + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker(session, uri, __wt_verify, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY))); @@ -1287,6 +1366,7 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_join, __session_log_flush, __session_log_printf, + __session_rebalance, __session_rename, __session_reset, __session_salvage, @@ -1443,7 +1523,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, */ if (open_metadata) { WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - if ((ret = __wt_metadata_open(session)) != 0) { + if ((ret = __wt_metadata_cursor(session, NULL)) != 0) { wt_session = &session->iface; WT_TRET(wt_session->close(wt_session, NULL)); return (ret); @@ -1486,14 +1566,11 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, * deadlocked getting the cursor late in the process. Be defensive, * get it now. */ - if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { - WT_WITHOUT_DHANDLE(session, ret = - __wt_las_cursor_create(session, &session->las_cursor)); - if (ret != 0) { - wt_session = &session->iface; - WT_TRET(wt_session->close(wt_session, NULL)); - return (ret); - } + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR) && + (ret = __wt_las_cursor_open(session, &session->las_cursor)) != 0) { + wt_session = &session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + return (ret); } *sessionp = session; diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 456fcd3ce03..5abccbd1366 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) for (i = 0; i < 100; ++i) { WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); - session->compaction = false; - WT_WITH_SCHEMA_LOCK(session, + session->compact_state = WT_COMPACT_RUNNING; + WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, uri, __wt_compact, NULL, cfg, 0)); WT_ERR(ret); - if (!session->compaction) + if (session->compact_state != WT_COMPACT_SUCCESS) break; WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); @@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_ERR(__session_compact_check_timeout(session, start_time)); } -err: __wt_scr_free(session, &t); +err: session->compact_state = WT_COMPACT_NONE; + + __wt_scr_free(session, &t); return (ret); } @@ -226,7 +228,8 @@ __wt_session_compact( session->compact->max_time = (uint64_t)cval.val; /* Find the types of data sources are being compacted. */ - WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker( session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); WT_ERR(ret); diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index dd5094fb480..1ac758c0cee 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -453,8 +453,8 @@ __session_get_dhandle( * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __session_find_shared_dhandle(session, uri, checkpoint)); + WT_WITH_HANDLE_LIST_LOCK(session, ret, + ret = __session_find_shared_dhandle(session, uri, checkpoint)); if (ret == 0) ret = __session_add_dhandle(session, NULL); @@ -509,9 +509,9 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); WT_RET(__wt_writeunlock(session, dhandle->rwlock)); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_session_get_btree( + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_HANDLE_LIST_LOCK(session, ret, + ret = __wt_session_get_btree( session, uri, checkpoint, cfg, flags))); return (ret); diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c index 07f68e4c194..983b28dd8ea 100644 --- a/src/session/session_salvage.c +++ b/src/session/session_salvage.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/support/cksum.c b/src/support/cksum.c index a8b5823100d..c2982c40015 100644 --- a/src/support/cksum.c +++ b/src/support/cksum.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. diff --git a/src/support/crypto.c b/src/support/crypto.c index b1102163e7b..1049621fb44 100644 --- a/src/support/crypto.c +++ b/src/support/crypto.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/support/err.c b/src/support/err.c index de518cbf08b..875bd3efcf3 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. 
* Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/support/filename.c b/src/support/filename.c index 02a83803e25..215f5b47997 100644 --- a/src/support/filename.c +++ b/src/support/filename.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -65,11 +65,49 @@ __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name) } /* - * __wt_sync_and_rename_fh -- + * __wt_rename_and_sync_directory -- + * Rename a file and sync the enclosing directory. + */ +int +__wt_rename_and_sync_directory( + WT_SESSION_IMPL *session, const char *from, const char *to) +{ + const char *fp, *tp; + bool same_directory; + + /* Rename the source file to the target. */ + WT_RET(__wt_rename(session, from, to)); + + /* + * Flush the backing directory to guarantee the rename. My reading of + * POSIX 1003.1 is there's no guarantee flushing only one of the from + * or to directories, or flushing a common parent, is sufficient, and + * even if POSIX were to make that guarantee, existing filesystems are + * known to not provide the guarantee or only provide the guarantee + * with specific mount options. Flush both of the from/to directories + * until it's a performance problem. + */ + WT_RET(__wt_directory_sync(session, from)); + + /* + * In almost all cases, we're going to be renaming files in the same + * directory, we can at least fast-path that. + */ + fp = strrchr(from, '/'); + tp = strrchr(to, '/'); + same_directory = (fp == NULL && tp == NULL) || + (fp != NULL && tp != NULL && + fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0); + + return (same_directory ? 0 : __wt_directory_sync(session, to)); +} + +/* + * __wt_fh_sync_and_rename -- * Sync and close a file, and swap it into place. 
*/ int -__wt_sync_and_rename_fh( +__wt_fh_sync_and_rename( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to) { WT_DECL_RET; @@ -83,19 +121,15 @@ __wt_sync_and_rename_fh( WT_TRET(__wt_close(session, &fh)); WT_RET(ret); - /* Rename the source file to the target. */ - WT_RET(__wt_rename(session, from, to)); - - /* Flush the backing directory to guarantee the rename. */ - return (__wt_directory_sync(session, NULL)); + return (__wt_rename_and_sync_directory(session, from, to)); } /* - * __wt_sync_and_rename_fp -- + * __wt_sync_fp_and_rename -- * Sync and close a file, and swap it into place. */ int -__wt_sync_and_rename_fp( +__wt_sync_fp_and_rename( WT_SESSION_IMPL *session, FILE **fpp, const char *from, const char *to) { FILE *fp; @@ -106,9 +140,5 @@ __wt_sync_and_rename_fp( /* Flush to disk and close the handle. */ WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); - /* Rename the source file to the target. */ - WT_RET(__wt_rename(session, from, to)); - - /* Flush the backing directory to guarantee the rename. */ - return (__wt_directory_sync(session, NULL)); + return (__wt_rename_and_sync_directory(session, from, to)); } diff --git a/src/support/global.c b/src/support/global.c index 1e32f5b4453..0234455b6ce 100644 --- a/src/support/global.c +++ b/src/support/global.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -12,28 +12,6 @@ WT_PROCESS __wt_process; /* Per-process structure */ static int __wt_pthread_once_failed; /* If initialization failed */ /* - * __system_is_little_endian -- - * Check if the system is little endian. 
- */ -static int -__system_is_little_endian(void) -{ - uint64_t v; - bool little; - - v = 1; - little = *((uint8_t *)&v) != 0; - - if (little) - return (0); - - fprintf(stderr, - "This release of the WiredTiger data engine does not support " - "big-endian systems; contact WiredTiger for more information.\n"); - return (EINVAL); -} - -/* * __wt_global_once -- * Global initialization, run once. */ @@ -42,11 +20,6 @@ __wt_global_once(void) { WT_DECL_RET; - if ((ret = __system_is_little_endian()) != 0) { - __wt_pthread_once_failed = ret; - return; - } - if ((ret = __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) { __wt_pthread_once_failed = ret; @@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session) /* Sleep forever, the debugger will interrupt us when it attaches. */ for (;;) - __wt_sleep(100, 0); + __wt_sleep(10, 0); #else WT_UNUSED(session); #endif diff --git a/src/support/hash_city.c b/src/support/hash_city.c index 9a4a6464f40..5780cd7b459 100644 --- a/src/support/hash_city.c +++ b/src/support/hash_city.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. @@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) { #define bswap_32(x) OSSwapInt32(x) #define bswap_64(x) OSSwapInt64(x) +#elif defined(__sun) + +#include <sys/byteorder.h> +#define bswap_32 BSWAP_32 +#define bswap_64 BSWAP_64 + #else #include <byteswap.h> #endif diff --git a/src/support/hash_fnv.c b/src/support/hash_fnv.c index e780931454d..35e7e5f3a73 100644 --- a/src/support/hash_fnv.c +++ b/src/support/hash_fnv.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. 
diff --git a/src/support/hazard.c b/src/support/hazard.c index 0fc7051fb90..13e0eb3b9ac 100644 --- a/src/support/hazard.c +++ b/src/support/hazard.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/support/hex.c b/src/support/hex.c index eb9f420911a..d42a84154ca 100644 --- a/src/support/hex.c +++ b/src/support/hex.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -8,7 +8,7 @@ #include "wt_internal.h" -static const u_char hex[] = "0123456789abcdef"; +const u_char __wt_hex[] = "0123456789abcdef"; /* * __fill_hex -- @@ -25,8 +25,8 @@ __fill_hex(const uint8_t *src, size_t src_max, --dest_max; for (; src_max > 0 && dest_max > 1; src_max -= 1, dest_max -= 2, ++src) { - *dest++ = hex[(*src & 0xf0) >> 4]; - *dest++ = hex[*src & 0x0f]; + *dest++ = __wt_hex[(*src & 0xf0) >> 4]; + *dest++ = __wt_hex[*src & 0x0f]; } *dest++ = '\0'; if (lenp != NULL) @@ -34,6 +34,17 @@ __fill_hex(const uint8_t *src, size_t src_max, } /* + * __wt_fill_hex -- + * In-memory conversion of raw bytes to a hexadecimal representation. + */ +void +__wt_fill_hex(const uint8_t *src, size_t src_max, + uint8_t *dest, size_t dest_max, size_t *lenp) +{ + __fill_hex(src, src_max, dest, dest_max, lenp); +} + +/* * __wt_raw_to_hex -- * Convert a chunk of data to a nul-terminated printable hex string. */ @@ -72,10 +83,6 @@ __wt_raw_to_esc_hex( */ WT_RET(__wt_buf_init(session, to, size * 3 + 1)); - /* - * In the worst case, every character takes up 3 spaces, plus a - * trailing nul byte. 
- */ for (p = from, t = to->mem, i = size; i > 0; --i, ++p) if (isprint((int)*p)) { if (*p == '\\') @@ -83,8 +90,8 @@ __wt_raw_to_esc_hex( *t++ = *p; } else { *t++ = '\\'; - *t++ = hex[(*p & 0xf0) >> 4]; - *t++ = hex[*p & 0x0f]; + *t++ = __wt_hex[(*p & 0xf0) >> 4]; + *t++ = __wt_hex[*p & 0x0f]; } *t++ = '\0'; to->size = WT_PTRDIFF(t, to->mem); diff --git a/src/support/huffman.c b/src/support/huffman.c index 4bda365cb10..edd0bc9f648 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -1,9 +1,31 @@ -/*- - * Copyright (c) 2014-2015 MongoDB, Inc. +/* + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * - * See the file LICENSE for redistribution information. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name MongoDB or the name WiredTiger + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include "wt_internal.h" diff --git a/src/support/pow.c b/src/support/pow.c index 0f50bfe56a1..028263581d3 100644 --- a/src/support/pow.c +++ b/src/support/pow.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. diff --git a/src/support/rand.c b/src/support/rand.c index f5ecb12633e..d2e4cd27aab 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -1,5 +1,5 @@ /*- - * Public Domain 2014-2015 MongoDB, Inc. + * Public Domain 2014-2016 MongoDB, Inc. * Public Domain 2008-2014 WiredTiger, Inc. * * This is free and unencumbered software released into the public domain. @@ -60,6 +60,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state) } /* + * __wt_random_init_seed -- + * Initialize the state of a 32-bit pseudo-random number. + * Use this, instead of __wt_random_init if we are running with multiple + * threads and we want each thread to initialize its own random state based + * on a different random seed. 
+ */ +int +__wt_random_init_seed( + WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state) +{ + struct timespec ts; + WT_RAND_STATE rnd; + + WT_RET(__wt_epoch(session, &ts)); + M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629); + M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069); + + *rnd_state = rnd; + + return (0); +} + +/* * __wt_random -- * Return a 32-bit pseudo-random number. */ diff --git a/src/support/scratch.c b/src/support/scratch.c index f0c403c9ec8..94020ba2621 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/support/stat.c b/src/support/stat.c index 4d7cd65fd18..7a615131628 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single( to->block_alloc += from->block_alloc; to->block_free += from->block_free; to->block_checkpoint_size += from->block_checkpoint_size; - to->allocation_size = from->allocation_size; + if (from->allocation_size > to->allocation_size) + to->allocation_size = from->allocation_size; to->block_reuse_bytes += from->block_reuse_bytes; - to->block_magic = from->block_magic; - to->block_major = from->block_major; + if (from->block_magic > to->block_magic) + to->block_magic = from->block_magic; + if (from->block_major > to->block_major) + to->block_major = from->block_major; to->block_size += from->block_size; - to->block_minor = from->block_minor; + if (from->block_minor > to->block_minor) + to->block_minor = from->block_minor; to->btree_checkpoint_generation += from->btree_checkpoint_generation; to->btree_column_fix += from->btree_column_fix; to->btree_column_internal += from->btree_column_internal; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; to->btree_column_rle += from->btree_column_rle; - to->btree_fixed_len = 
from->btree_fixed_len; + if (from->btree_fixed_len > to->btree_fixed_len) + to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) to->btree_maxintlkey = from->btree_maxintlkey; if (from->btree_maxintlpage > to->btree_maxintlpage) @@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate( to->block_free += WT_STAT_READ(from, block_free); to->block_checkpoint_size += WT_STAT_READ(from, block_checkpoint_size); - to->allocation_size = from[0]->allocation_size; + if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size) + to->allocation_size = v; to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes); - to->block_magic = from[0]->block_magic; - to->block_major = from[0]->block_major; + if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic) + to->block_magic = v; + if ((v = WT_STAT_READ(from, block_major)) > to->block_major) + to->block_major = v; to->block_size += WT_STAT_READ(from, block_size); - to->block_minor = from[0]->block_minor; + if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor) + to->block_minor = v; to->btree_checkpoint_generation += WT_STAT_READ(from, btree_checkpoint_generation); to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); @@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate( to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); - to->btree_fixed_len = from[0]->btree_fixed_len; - if ((v = WT_STAT_READ(from, btree_maxintlkey)) > - to->btree_maxintlkey) + if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len) + to->btree_fixed_len = v; + if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) to->btree_maxintlkey = v; if ((v = WT_STAT_READ(from, btree_maxintlpage)) > to->btree_maxintlpage) to->btree_maxintlpage = v; - if ((v = WT_STAT_READ(from, btree_maxleafkey)) > - to->btree_maxleafkey) + if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey) 
to->btree_maxleafkey = v; if ((v = WT_STAT_READ(from, btree_maxleafpage)) > to->btree_maxleafpage) diff --git a/src/txn/txn.c b/src/txn/txn.c index f835fea8f67..e8fd8c0c119 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) conn = S2C(session); txn_global = &conn->txn_global; +retry: current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; @@ -287,43 +288,60 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ - if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && - __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, last_running)) - last_running = id; - if ((id = s->snap_min) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; - } - - if (WT_TXNID_LT(last_running, oldest_id)) - oldest_id = last_running; - -#ifdef HAVE_DIAGNOSTIC + if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) { /* - * Make sure the ID doesn't move past any named snapshots. - * - * Don't include the read/assignment in the assert statement. - * Coverity complains if there are assignments only done in - * diagnostic builds, and when the read is from a volatile. + * We know we want to update. Check if we're racing. 
*/ - id = txn_global->nsnap_oldest_id; - WT_ASSERT(session, - id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; + i < session_cnt; i++, s++) { + if ((id = s->id) != WT_TXN_NONE && + WT_TXNID_LT(id, last_running)) + last_running = id; + if ((id = s->snap_min) != WT_TXN_NONE && + WT_TXNID_LT(id, oldest_id)) + oldest_id = id; + } + + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + +#ifdef HAVE_DIAGNOSTIC + /* + * Make sure the ID doesn't move past any named + * snapshots. + * + * Don't include the read/assignment in the assert + * statement. Coverity complains if there are + * assignments only done in diagnostic builds, and + * when the read is from a volatile. + */ + id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, + id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif - if (WT_TXNID_LT(txn_global->last_running, last_running)) - txn_global->last_running = last_running; - if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) - txn_global->oldest_id = oldest_id; - WT_ASSERT(session, txn_global->scan_count == -1); - txn_global->scan_count = 0; + if (WT_TXNID_LT(txn_global->last_running, last_running)) + txn_global->last_running = last_running; + if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) + txn_global->oldest_id = oldest_id; + WT_ASSERT(session, txn_global->scan_count == -1); + txn_global->scan_count = 0; + } else { + /* + * We wanted to update the oldest ID but we're racing + * another thread. Retry if this is a forced update. 
+ */ + WT_ASSERT(session, txn_global->scan_count > 0); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); + if (force) { + __wt_yield(); + goto retry; + } + } } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && last_running_moved && - oldest_session != NULL) { + current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index bc1537ca878..7d4d4d5c27c 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -56,7 +56,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) * confirm the metadata file contains no non-file objects. */ if (uri == NULL) { - WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); + WT_RET(__wt_metadata_cursor(session, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && @@ -79,8 +79,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail); -err: if (cursor != NULL) - WT_TRET(cursor->close(cursor)); +err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -185,7 +184,7 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], session->ckpt_handle[i].dhandle, ret = (*op)(session, cfg)); else - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __wt_conn_btree_apply_single(session, session->ckpt_handle[i].name, NULL, op, cfg)); WT_RET(ret); @@ -371,7 +370,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) full = idle = logging = tracking = false; /* Ensure the metadata table is open before taking 
any locks. */ - WT_RET(__wt_metadata_open(session)); + WT_RET(__wt_metadata_cursor(session, NULL)); /* * Do a pass over the configuration arguments and figure out what kind @@ -386,9 +385,9 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Get a list of handles we want to flush; this may pull closed objects * into the session cache, but we're going to do that eventually anyway. */ - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + WT_WITH_HANDLE_LIST_LOCK(session, ret, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_list, NULL)))); WT_ERR(ret); @@ -551,14 +550,16 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; WT_WITH_DHANDLE(session, - session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(session, cfg)); session->meta_track_next = saved_meta_next; WT_ERR(ret); WT_ERR(__checkpoint_verbose_track(session, "metadata sync completed", &verb_timer)); } else - WT_WITH_DHANDLE(session, session->meta_dhandle, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), ret = __wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); @@ -601,8 +602,8 @@ err: /* */ if (full && logging) { if (ret == 0 && - F_ISSET((WT_BTREE *)session->meta_dhandle->handle, - WT_BTREE_SKIP_CKPT)) + F_ISSET(((WT_CURSOR_BTREE *) + session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT)) idle = true; WT_TRET(__wt_txn_checkpoint_log(session, full, (ret == 0 && !idle) ? 
@@ -665,7 +666,8 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1); - WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint(session, cfg)); + WT_WITH_CHECKPOINT_LOCK(session, ret, + ret = __txn_checkpoint(session, cfg)); WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); @@ -1037,12 +1039,13 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); "for a bulk-loaded file"); fake_ckpt = true; goto fake; + case WT_BTREE_REBALANCE: case WT_BTREE_SALVAGE: case WT_BTREE_UPGRADE: case WT_BTREE_VERIFY: WT_ERR_MSG(session, EINVAL, - "checkpoints are blocked during salvage, upgrade " - "or verify operations"); + "checkpoints are blocked during rebalance, " + "salvage, upgrade or verify operations"); } /* diff --git a/src/txn/txn_ext.c b/src/txn/txn_ext.c index 36d42a8996f..9ea1af6c4f8 100644 --- a/src/txn/txn_ext.c +++ b/src/txn/txn_ext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index c5fa52dea6a..4c4a7fb3132 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -8,6 +8,12 @@ #include "wt_internal.h" +/* Cookie passed to __txn_printlog. */ +typedef struct { + FILE *out; + uint32_t flags; +} WT_TXN_PRINTLOG_ARGS; + /* * __txn_op_log -- * Log an operation for the current transaction. 
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key); */ static int __txn_commit_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, + uint32_t flags) { bool firstrecord; @@ -79,7 +86,7 @@ __txn_commit_printlog( firstrecord = false; - WT_RET(__wt_txn_op_printlog(session, pp, end, out)); + WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags)); WT_RET(__wt_fprintf(out, "\n }")); } @@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session, FILE *out; WT_LOG_RECORD *logrec; WT_LSN ckpt_lsn; + WT_TXN_PRINTLOG_ARGS *args; const uint8_t *end, *p; const char *msg; uint64_t txnid; @@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session, bool compressed; WT_UNUSED(next_lsnp); - out = cookie; + args = cookie; + out = args->out; p = WT_LOG_SKIP_HEADER(rawrec->data); end = (const uint8_t *)rawrec->data + rawrec->size; @@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n")); WT_RET(__wt_fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid)); - WT_RET(__txn_commit_printlog(session, &p, end, out)); + WT_RET(__txn_commit_printlog(session, &p, end, out, + args->flags)); break; case WT_LOGREC_FILE_SYNC: @@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session, * Print the log in a human-readable format. 
*/ int -__wt_txn_printlog(WT_SESSION *wt_session, FILE *out) +__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags) { WT_SESSION_IMPL *session; + WT_TXN_PRINTLOG_ARGS args; session = (WT_SESSION_IMPL *)wt_session; + args.out = out; + args.flags = flags; WT_RET(__wt_fprintf(out, "[\n")); WT_RET(__wt_log_scan( - session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); + session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args)); WT_RET(__wt_fprintf(out, "\n]\n")); return (0); diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 169929a46de..eddcca9248f 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index d0b3b909f09..8051d059d7e 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -427,7 +427,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); - WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); + WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; diff --git a/src/utilities/util.h b/src/utilities/util.h index 08d0537956f..3882d814e3a 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -42,6 +42,7 @@ char *util_name(WT_SESSION *, const char *, const char *); int util_printlog(WT_SESSION *, int, char *[]); int util_read(WT_SESSION *, int, char *[]); int util_read_line(WT_SESSION *, ULINE *, bool, bool *); +int util_rebalance(WT_SESSION *, int, char *[]); int util_rename(WT_SESSION *, int, char *[]); int util_salvage(WT_SESSION *, int, char *[]); int util_stat(WT_SESSION *, int, char *[]); diff --git a/src/utilities/util_backup.c b/src/utilities/util_backup.c index d07c99afc19..b3afc78e9e8 100644 --- a/src/utilities/util_backup.c +++ b/src/utilities/util_backup.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c index 153d2d11a6d..c114eb207fa 100644 --- a/src/utilities/util_compact.c +++ b/src/utilities/util_compact.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_cpyright.c b/src/utilities/util_cpyright.c index df135b68d2c..7de0eab6dc6 100644 --- a/src/utilities/util_cpyright.c +++ b/src/utilities/util_cpyright.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -11,7 +11,7 @@ void util_copyright(void) { - printf("%s\n", "Copyright (c) 2008-2015 MongoDB, Inc."); + printf("%s\n", "Copyright (c) 2008-2016 MongoDB, Inc."); printf("%s\n\n", "All rights reserved."); printf("%s\n\n", diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c index 06ea5edd8cc..4e609736f2d 100644 --- a/src/utilities/util_create.c +++ b/src/utilities/util_create.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. 
* Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c index 9717b102857..ba41445dfb6 100644 --- a/src/utilities/util_drop.c +++ b/src/utilities/util_drop.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 7dfac50b724..ca761a52d8a 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index 99a1455a74e..c7afea04b1c 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -8,6 +8,7 @@ #include "util.h" +static int list_get_allocsize(WT_SESSION *, const char *, size_t *); static int list_print(WT_SESSION *, const char *, bool, bool); static int list_print_checkpoint(WT_SESSION *, const char *); static int usage(void); @@ -56,6 +57,48 @@ util_list(WT_SESSION *session, int argc, char *argv[]) } /* + * list_get_allocsize -- + * Get the allocation size for this file from the metadata. 
+ */ +static int +list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) +{ + WT_CONFIG_ITEM szvalue; + WT_CONFIG_PARSER *parser; + WT_DECL_RET; + WT_EXTENSION_API *wt_api; + char *config; + + wt_api = session->connection->get_extension_api(session->connection); + if ((ret = + wt_api->metadata_search(wt_api, session, key, &config)) != 0) { + fprintf(stderr, "%s: %s: extension_api.metadata_search: %s\n", + progname, key, session->strerror(session, ret)); + return (ret); + } + if ((ret = wt_api->config_parser_open(wt_api, session, config, + strlen(config), &parser)) != 0) { + fprintf(stderr, "%s: extension_api.config_parser_open: %s\n", + progname, session->strerror(session, ret)); + return (ret); + } + if ((ret = parser->get(parser, "allocation_size", &szvalue)) != 0) { + if (ret != WT_NOTFOUND) + fprintf(stderr, "%s: config_parser.get: %s\n", + progname, session->strerror(session, ret)); + (void)parser->close(parser); + return (ret); + } + if ((ret = parser->close(parser)) != 0) { + fprintf(stderr, "%s: config_parser.close: %s\n", + progname, session->strerror(session, ret)); + return (ret); + } + *allocsize = (size_t)szvalue.val; + return (0); +} + +/* * list_print -- * List the high-level objects in the database. */ @@ -137,9 +180,10 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) static int list_print_checkpoint(WT_SESSION *session, const char *key) { + WT_BLOCK_CKPT ci; WT_DECL_RET; WT_CKPT *ckpt, *ckptbase; - size_t len; + size_t allocsize, len; time_t t; uint64_t v; @@ -151,6 +195,14 @@ list_print_checkpoint(WT_SESSION *session, const char *key) if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); + /* We need the allocation size for decoding the checkpoint addr */ + if ((ret = list_get_allocsize(session, key, &allocsize)) != 0) { + if (ret == WT_NOTFOUND) + allocsize = 0; + else + return (ret); + } + /* Find the longest name, so we can pretty-print. 
*/ len = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -158,7 +210,15 @@ list_print_checkpoint(WT_SESSION *session, const char *key) len = strlen(ckpt->name); ++len; + memset(&ci, 0, sizeof(ci)); WT_CKPT_FOREACH(ckptbase, ckpt) { + if (allocsize != 0 && (ret = __wt_block_ckpt_decode( + session, allocsize, ckpt->raw.data, &ci)) != 0) { + fprintf(stderr, "%s: __wt_block_buffer_to_ckpt: %s\n", + progname, session->strerror(session, ret)); + /* continue if damaged */ + ci.root_size = 0; + } /* * Call ctime, not ctime_r; ctime_r has portability problems, * the Solaris version is different from the POSIX standard. @@ -179,6 +239,17 @@ list_print_checkpoint(WT_SESSION *session, const char *key) printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE); else printf(" (%" PRIu64 " B)\n", v); + if (ci.root_size != 0) { + printf("\t\t" "root offset: %" PRIuMAX + " (0x%" PRIxMAX ")\n", + (intmax_t)ci.root_offset, (intmax_t)ci.root_offset); + printf("\t\t" "root size: %" PRIu32 + " (0x%" PRIx32 ")\n", + ci.root_size, ci.root_size); + printf("\t\t" "root checksum: %" PRIu32 + " (0x%" PRIx32 ")\n", + ci.root_cksum, ci.root_cksum); + } } __wt_metadata_free_ckptlist(session, ckptbase); diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index a40fa60361f..696dc68630a 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_load.h b/src/utilities/util_load.h index ca359ce662b..710b18bfe83 100644 --- a/src/utilities/util_load.h +++ b/src/utilities/util_load.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index c7d4893ae20..9349d39bb1e 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c index c6cd264c423..f9c5b6e9a1f 100644 --- a/src/utilities/util_loadtext.c +++ b/src/utilities/util_loadtext.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 9cbda08690e..e18d8d7d1f5 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -159,6 +159,8 @@ main(int argc, char *argv[]) case 'r': if (strcmp(command, "read") == 0) func = util_read; + else if (strcmp(command, "rebalance") == 0) + func = util_rebalance; else if (strcmp(command, "rename") == 0) func = util_rename; break; @@ -226,7 +228,6 @@ main(int argc, char *argv[]) ret = func(session, argc, argv); /* Close the database. 
*/ - err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) ret = tret; @@ -260,9 +261,10 @@ usage(void) "\t" "dump\t dump an object\n" "\t" "list\t list database objects\n" "\t" "load\t load an object\n" - "\t" "loadtext\t load an object from a text file\n" + "\t" "loadtext load an object from a text file\n" "\t" "printlog display the database log\n" "\t" "read\t read values from an object\n" + "\t" "rebalance rebalance an object\n" "\t" "rename\t rename an object\n" "\t" "salvage\t salvage a file\n" "\t" "stat\t display statistics for an object\n" diff --git a/src/utilities/util_misc.c b/src/utilities/util_misc.c index 76cb37b30dc..f45f6b339f2 100644 --- a/src/utilities/util_misc.c +++ b/src/utilities/util_misc.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index d202b09b228..9a2bdc8a9ba 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. 
* @@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - bool printable; + uint32_t flags; - printable = false; - while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF) + flags = 0; + while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) switch (ch) { case 'f': /* output file */ if (freopen(__wt_optarg, "w", stdout) == NULL) { @@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) return (1); } break; - case 'p': - printable = true; + case 'x': /* hex output */ + LF_SET(WT_TXN_PRINTLOG_HEX); break; case '?': default: @@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - WT_UNUSED(printable); - ret = __wt_txn_printlog(session, stdout); + ret = __wt_txn_printlog(session, stdout, flags); if (ret != 0) { fprintf(stderr, "%s: printlog failed: %s\n", @@ -61,7 +60,7 @@ usage(void) { (void)fprintf(stderr, "usage: %s %s " - "printlog [-p] [-f output-file]\n", + "printlog [-x] [-f output-file]\n", progname, usage_prefix); return (1); } diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c index a2fcc330c7d..2e766377aa9 100644 --- a/src/utilities/util_read.c +++ b/src/utilities/util_read.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c new file mode 100644 index 00000000000..45f161487e5 --- /dev/null +++ b/src/utilities/util_rebalance.c @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int usage(void); + +int +util_rebalance(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *name; + + name = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the table name. */ + if (argc != 1) + return (usage()); + if ((name = util_name(session, *argv, "table")) == NULL) + return (1); + + if ((ret = session->rebalance(session, name, NULL)) != 0) { + fprintf(stderr, "%s: rebalance(%s): %s\n", + progname, name, session->strerror(session, ret)); + goto err; + } + + /* Verbose configures a progress counter, move to the next line. */ + if (verbose) + printf("\n"); + + if (0) { +err: ret = 1; + } + + free(name); + + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "rebalance uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index 29347690ccc..aee299c6e63 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c index e791d2f1dda..679d1074457 100644 --- a/src/utilities/util_salvage.c +++ b/src/utilities/util_salvage.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c index b7558ee3be0..e511ca4f7e8 100644 --- a/src/utilities/util_stat.c +++ b/src/utilities/util_stat.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. 
* Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 0f2e655d1dd..63b23f28c16 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_verbose.c b/src/utilities/util_verbose.c index 084cce3f610..e568ec0a414 100644 --- a/src/utilities/util_verbose.c +++ b/src/utilities/util_verbose.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index 12f76e9d4ed..2df4fa65f43 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index 7871040411b..7d9bce02b36 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * |