From d845b75e5f0837f801bdf371babd985308a1ad80 Mon Sep 17 00:00:00 2001 From: Ramon Fernandez Date: Thu, 7 Jan 2016 16:31:22 -0500 Subject: Import wiredtiger-wiredtiger-2.7.0-269-g44463c5.tar.gz from wiredtiger branch mongodb-3.4 ref: 3c2ad56..44463c5 SERVER-21833 Compact does not release space to the system with WiredTiger WT-2060 Simplify aggregation of statistics WT-2099 Seeing memory underflow messages WT-2113 truncate01 sometimes fails WT-2177 Add a per-thread seed to random number generator WT-2198 bulk load and column store appends WT-2231 pinned page cursor searches could check parent keys WT-2235 wt printlog option without unicode WT-2245 WTPERF Truncate has no ability to catch up when it falls behind WT-2246 column-store append searches the leaf page; the maximum record number fails CRUD operations WT-2256 WTPERFs throttle option fires in bursts WT-2257 wtperf doesn't handle overriding workload config WT-2259 __wt_evict_file_exclusive_on() should clear WT_BTREE_NO_EVICTION on error WT-2260 Workloads evict internal pages unexpectedly WT-2262 Random sampling is skewed by tree shape WT-2265 Wiredtiger related change in ppc64le specific code block in gcc.h WT-2266 Add wtperf config to set if perf thresholds are fatal WT-2269 wtperf should dump its config everytime it runs WT-2272 Stress test assertion in the sweep server WT-2275 broken DB after application crash WT-2276 tool to decode checkpoint addr WT-2277 Remove WT check against big-endian systems WT-2279 Define WT_PAUSE(), WT_FULL_BARRIER(), etc when s390x is defined WT-2281 wtperf smoke.sh fails on ppc64le WT-2282 error in wt_txn_update_oldest verbose message test WT-2283 retry in txn_update_oldest results in a hang WT-2285 configure should set BUFFER_ALIGNMENT_DEFAULT to 4kb on linux WT-2289 failure in fast key check WT-2290 WT_SESSION.compact could be more effective. 
WT-2291 Random cursor walk inefficient in skip list only trees WT-2297 Fix off-by-one error in Huffman config file parsing WT-2299 upper-level WiredTiger code is reaching into the block manager WT-2301 Add reading a range to wtperf WT-2303 Build warning in wtperf WT-2304 wtperf crash dumping config WT-2307 Internal page splits can corrupt cursor iteration WT-2311 Support Sparc --- src/third_party/wiredtiger/src/block/block_addr.c | 51 +++-- .../wiredtiger/src/block/block_compact.c | 92 ++++++--- src/third_party/wiredtiger/src/block/block_mgr.c | 16 ++ src/third_party/wiredtiger/src/block/block_open.c | 38 ++-- src/third_party/wiredtiger/src/btree/bt_compact.c | 42 +++- src/third_party/wiredtiger/src/btree/bt_curnext.c | 111 +++++++++++ src/third_party/wiredtiger/src/btree/bt_curprev.c | 4 + src/third_party/wiredtiger/src/btree/bt_cursor.c | 40 ++-- src/third_party/wiredtiger/src/btree/bt_debug.c | 4 +- src/third_party/wiredtiger/src/btree/bt_huffman.c | 10 +- src/third_party/wiredtiger/src/btree/bt_page.c | 8 +- src/third_party/wiredtiger/src/btree/bt_slvg.c | 2 +- src/third_party/wiredtiger/src/btree/bt_split.c | 133 +++++++------ src/third_party/wiredtiger/src/btree/bt_stat.c | 4 +- src/third_party/wiredtiger/src/btree/col_srch.c | 117 +++++++++-- src/third_party/wiredtiger/src/btree/row_srch.c | 221 +++++++++++++++++++-- src/third_party/wiredtiger/src/cache/cache_las.c | 9 +- src/third_party/wiredtiger/src/conn/conn_api.c | 3 + src/third_party/wiredtiger/src/conn/conn_dhandle.c | 4 +- src/third_party/wiredtiger/src/cursor/cur_bulk.c | 179 ++++++++++++----- src/third_party/wiredtiger/src/cursor/cur_json.c | 13 +- src/third_party/wiredtiger/src/cursor/cur_stat.c | 7 +- src/third_party/wiredtiger/src/cursor/cur_table.c | 7 +- src/third_party/wiredtiger/src/evict/evict_lru.c | 22 +- src/third_party/wiredtiger/src/include/block.h | 7 +- src/third_party/wiredtiger/src/include/btmem.h | 4 +- src/third_party/wiredtiger/src/include/column.i | 22 +- 
.../wiredtiger/src/include/connection.h | 1 + src/third_party/wiredtiger/src/include/cursor.h | 38 ++-- src/third_party/wiredtiger/src/include/extern.h | 33 +-- src/third_party/wiredtiger/src/include/gcc.h | 52 ++++- src/third_party/wiredtiger/src/include/log.h | 5 + src/third_party/wiredtiger/src/include/misc.h | 3 + src/third_party/wiredtiger/src/include/session.h | 7 +- src/third_party/wiredtiger/src/include/stat.h | 4 +- src/third_party/wiredtiger/src/log/log_auto.c | 96 +++++++-- src/third_party/wiredtiger/src/lsm/lsm_stat.c | 31 +-- src/third_party/wiredtiger/src/meta/meta_turtle.c | 3 +- src/third_party/wiredtiger/src/os_posix/os_map.c | 12 +- .../wiredtiger/src/os_posix/os_pagesize.c | 19 ++ .../wiredtiger/src/os_win/os_pagesize.c | 23 +++ .../wiredtiger/src/reconcile/rec_write.c | 118 +++++++---- .../wiredtiger/src/session/session_api.c | 2 +- .../wiredtiger/src/session/session_compact.c | 8 +- src/third_party/wiredtiger/src/support/global.c | 29 +-- src/third_party/wiredtiger/src/support/hash_city.c | 6 + src/third_party/wiredtiger/src/support/hex.c | 21 +- src/third_party/wiredtiger/src/support/huffman.c | 26 ++- src/third_party/wiredtiger/src/support/rand.c | 23 +++ src/third_party/wiredtiger/src/support/stat.c | 36 ++-- src/third_party/wiredtiger/src/txn/txn.c | 82 +++++--- src/third_party/wiredtiger/src/txn/txn_log.c | 25 ++- .../wiredtiger/src/utilities/util_list.c | 73 ++++++- .../wiredtiger/src/utilities/util_main.c | 1 - .../wiredtiger/src/utilities/util_printlog.c | 15 +- 55 files changed, 1466 insertions(+), 496 deletions(-) create mode 100644 src/third_party/wiredtiger/src/os_posix/os_pagesize.c create mode 100644 src/third_party/wiredtiger/src/os_win/os_pagesize.c (limited to 'src/third_party/wiredtiger/src') diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c index 6d50e5f0f4e..9ba4ec4a8b2 100644 --- a/src/third_party/wiredtiger/src/block/block_addr.c +++ 
b/src/third_party/wiredtiger/src/block/block_addr.c @@ -14,7 +14,7 @@ * caller's buffer reference so it can be called repeatedly to load a buffer. */ static int -__block_buffer_to_addr(WT_BLOCK *block, +__block_buffer_to_addr(uint32_t allocsize, const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump) { uint64_t o, s, c; @@ -39,8 +39,8 @@ __block_buffer_to_addr(WT_BLOCK *block, *offsetp = 0; *sizep = *cksump = 0; } else { - *offsetp = (wt_off_t)(o + 1) * block->allocsize; - *sizep = (uint32_t)s * block->allocsize; + *offsetp = (wt_off_t)(o + 1) * allocsize; + *sizep = (uint32_t)s * allocsize; *cksump = (uint32_t)c; } return (0); @@ -80,7 +80,8 @@ int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump) { - return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump)); + return (__block_buffer_to_addr( + block->allocsize, &p, offsetp, sizep, cksump)); } /* @@ -139,12 +140,12 @@ __wt_block_addr_string(WT_SESSION_IMPL *session, } /* - * __wt_block_buffer_to_ckpt -- + * __block_buffer_to_ckpt -- * Convert a checkpoint cookie into its components. 
*/ -int -__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, - WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) +static int +__block_buffer_to_ckpt(WT_SESSION_IMPL *session, + uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci) { uint64_t a; const uint8_t **pp; @@ -154,13 +155,13 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version"); pp = &p; - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->root_offset, &ci->root_size, &ci->root_cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->avail.offset, &ci->avail.size, &ci->avail.cksum)); - WT_RET(__block_buffer_to_addr(block, pp, + WT_RET(__block_buffer_to_addr(allocsize, pp, &ci->discard.offset, &ci->discard.size, &ci->discard.cksum)); WT_RET(__wt_vunpack_uint(pp, 0, &a)); ci->file_size = (wt_off_t)a; @@ -170,6 +171,32 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, return (0); } +/* + * __wt_block_buffer_to_ckpt -- + * Convert a checkpoint cookie into its components, block manager version. + */ +int +__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, + WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci) +{ + return (__block_buffer_to_ckpt(session, block->allocsize, p, ci)); +} + +/* + * __wt_block_ckpt_decode -- + * Convert a checkpoint cookie into its components, external utility + * version. + */ +int +__wt_block_ckpt_decode(WT_SESSION *wt_session, + size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci)); +} + /* * __wt_block_ckpt_to_buffer -- * Convert the components into its checkpoint cookie. 
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c index d45d0a96da7..cd304b848d4 100644 --- a/src/third_party/wiredtiger/src/block/block_compact.c +++ b/src/third_party/wiredtiger/src/block/block_compact.c @@ -8,7 +8,7 @@ #include "wt_internal.h" -static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *); +static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool); /* * __wt_block_compact_start -- @@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Switch to first-fit allocation. */ __wt_block_configure_first_fit(block, true); - block->compact_pct_tenths = 0; - return (0); } @@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block) int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block) { + WT_DECL_RET; + WT_UNUSED(session); /* Restore the original allocation plan. */ __wt_block_configure_first_fit(block, false); - block->compact_pct_tenths = 0; + /* Dump the results of the compaction pass. */ + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + __wt_spin_lock(session, &block->live_lock); + ret = __block_dump_avail(session, block, false); + __wt_spin_unlock(session, &block->live_lock); + } - return (0); + return (ret); } /* @@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) if (fh->size <= WT_MEGABYTE) return (0); + /* + * Reset the compaction state information. This is done here, not in the + * compaction "start" routine, because this function is called first to + * determine if compaction is useful. + */ + block->compact_pct_tenths = 0; + block->compact_pages_reviewed = 0; + block->compact_pages_skipped = 0; + block->compact_pages_written = 0; + __wt_spin_lock(session, &block->live_lock); + /* Dump the current state of the file. 
*/ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) - WT_ERR(__block_dump_avail(session, block)); + WT_ERR(__block_dump_avail(session, block, true)); - /* Sum the available bytes in the first 80% and 90% of the file. */ + /* Sum the available bytes in the initial 80% and 90% of the file. */ avail_eighty = avail_ninety = 0; ninety = fh->size - fh->size / 10; eighty = fh->size - ((fh->size / 10) * 2); @@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) avail_eighty += ext->size; } - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "80%% of the file", - block->name, - (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " - "90%% of the file", - block->name, - (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); - WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, - "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " - "90%% of the file to perform compaction, compaction %s", - block->name, - (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, - *skipp ? "skipped" : "proceeding")); - /* * Skip files where we can't recover at least 1MB. 
* @@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp) block->compact_pct_tenths = 1; } + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "80%% of the file", + block->name, + (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " + "90%% of the file", + block->name, + (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety)); + WT_ERR(__wt_verbose(session, WT_VERB_COMPACT, + "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first " + "90%% of the file to perform compaction, compaction %s", + block->name, + (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, + *skipp ? "skipped" : "proceeding")); + err: __wt_spin_unlock(session, &block->live_lock); return (ret); @@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, } __wt_spin_unlock(session, &block->live_lock); + if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) { + ++block->compact_pages_reviewed; + if (*skipp) + ++block->compact_pages_skipped; + else + ++block->compact_pages_written; + } + return (ret); } @@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session, * Dump out the avail list so we can see what compaction will look like. */ static int -__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) +__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start) { WT_EXTLIST *el; WT_EXT *ext; @@ -195,6 +219,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) el = &block->live.avail; size = block->fh->size; + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "============ %s", + start ? 
"testing for compaction" : "ending compaction pass")); + + if (!start) { + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages reviewed: %" PRIuMAX, + block->compact_pages_reviewed)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages skipped: %" PRIuMAX, block->compact_pages_skipped)); + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, + "pages written: %" PRIuMAX, block->compact_pages_written)); + } + WT_RET(__wt_verbose(session, WT_VERB_COMPACT, "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX "%% space available %" PRIuMAX "MB (%" PRIuMAX ")", @@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block) } #ifdef __VERBOSE_OUTPUT_PERCENTILE + /* + * The verbose output always displays 10% buckets, running this code + * as well also displays 1% buckets. + */ for (i = 0; i < WT_ELEMENTS(percentile); ++i) { v = percentile[i] * 512; WT_RET(__wt_verbose(session, WT_VERB_COMPACT, diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c index 7260cab75d9..f9f66e05d7f 100644 --- a/src/third_party/wiredtiger/src/block/block_mgr.c +++ b/src/third_party/wiredtiger/src/block/block_mgr.c @@ -220,6 +220,18 @@ __bm_free(WT_BM *bm, return (__wt_block_free(session, bm->block, addr, addr_size)); } +/* + * __bm_is_mapped -- + * Return if the file is mapped into memory. + */ +static bool +__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session) +{ + WT_UNUSED(session); + + return (bm->map == NULL ? false : true); +} + /* * __bm_stat -- * Block-manager statistics. 
@@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->free = (int (*)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = (int (*) @@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly) (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; bm->salvage_valid = (int (*)(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = (int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly; @@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly) bm->compact_skip = __bm_compact_skip; bm->compact_start = __bm_compact_start; bm->free = __bm_free; + bm->is_mapped = __bm_is_mapped; bm->preload = __wt_bm_preload; bm->read = __wt_bm_read; bm->salvage_end = __bm_salvage_end; bm->salvage_next = __bm_salvage_next; bm->salvage_start = __bm_salvage_start; bm->salvage_valid = __bm_salvage_valid; + bm->size = __wt_block_manager_size; bm->stat = __bm_stat; bm->sync = __bm_sync; bm->verify_addr = __bm_verify_addr; diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 7cf12d36066..ff70b765d1f 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -405,27 +405,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) * Reading from the live system's structure normally requires locking, * but it's an 8B statistics read, there's no need. 
*/ - stats->allocation_size = block->allocsize; - stats->block_checkpoint_size = (int64_t)block->live.ckpt_size; - stats->block_magic = WT_BLOCK_MAGIC; - stats->block_major = WT_BLOCK_MAJOR_VERSION; - stats->block_minor = WT_BLOCK_MINOR_VERSION; - stats->block_reuse_bytes = (int64_t)block->live.avail.bytes; - stats->block_size = block->fh->size; + WT_STAT_WRITE(stats, allocation_size, block->allocsize); + WT_STAT_WRITE( + stats, block_checkpoint_size, (int64_t)block->live.ckpt_size); + WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC); + WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION); + WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION); + WT_STAT_WRITE( + stats, block_reuse_bytes, (int64_t)block->live.avail.bytes); + WT_STAT_WRITE(stats, block_size, block->fh->size); } /* * __wt_block_manager_size -- - * Set the size statistic for a file. + * Return the size of a live block handle. */ int -__wt_block_manager_size( - WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats) +__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep) { - wt_off_t filesize; - - WT_RET(__wt_filesize_name(session, filename, false, &filesize)); - stats->block_size = filesize; + WT_UNUSED(session); + *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size; return (0); } + +/* + * __wt_block_manager_named_size -- + * Return the size of a named file. 
+ */ +int +__wt_block_manager_named_size( + WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep) +{ + return (__wt_filesize_name(session, name, false, sizep)); +} diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c index 8044d4f852d..8935d39b696 100644 --- a/src/third_party/wiredtiger/src/btree/bt_compact.c +++ b/src/third_party/wiredtiger/src/btree/bt_compact.c @@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; WT_DECL_RET; + WT_MULTI *multi; WT_PAGE *page; WT_PAGE_MODIFY *mod; size_t addr_size; + uint32_t i; const uint8_t *addr; *skipp = true; /* Default skip. */ @@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * If the page is clean, test the original addresses. - * If the page is a 1-to-1 replacement, test the replacement addresses. + * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. */ if (mod == NULL || mod->rec_result == 0) { __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); - WT_RET( + return ( bm->compact_page_skip(bm, session, addr, addr_size, skipp)); - } else if (mod->rec_result == WT_PM_REC_REPLACE) { - /* - * The page's modification information can change underfoot if - * the page is being reconciled, serialize with reconciliation. - */ + } + + /* + * The page's modification information can change underfoot if the page + * is being reconciled, serialize with reconciliation. 
+ */ + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_RET(__wt_fair_lock(session, &page->page_lock)); + if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + if (multi->disk_image != NULL) + continue; + if ((ret = bm->compact_page_skip(bm, session, + multi->addr.addr, multi->addr.size, skipp)) != 0) + break; + if (!*skipp) + break; + } + + if (mod->rec_result == WT_PM_REC_REPLACE || + mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_TRET(__wt_fair_unlock(session, &page->page_lock)); - WT_RET(ret); - } - return (0); + + return (ret); } /* @@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) if (skip) continue; - session->compaction = true; + session->compact_state = WT_COMPACT_SUCCESS; + /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 55843d1cae5..6573bc60165 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -389,6 +389,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) */ cbt->page_deleted_count = 0; +#ifdef HAVE_DIAGNOSTIC + /* + * If starting a new iteration, clear the last-key returned, it doesn't + * apply. + */ + cbt->lastkey->size = 0; + cbt->lastrecno = WT_RECNO_OOB; +#endif /* * If we don't have a search page, then we're done, we're starting at * the beginning or end of the tree, not as a result of a search. 
@@ -430,6 +438,104 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) } } +#ifdef HAVE_DIAGNOSTIC +/* + * __cursor_key_order_check_col -- + * Check key ordering for column-store cursor movements. + */ +static int +__cursor_key_order_check_col( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + int cmp; + + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastrecno != WT_RECNO_OOB) { + if (cbt->lastrecno < cbt->recno) + cmp = -1; + if (cbt->lastrecno > cbt->recno) + cmp = 1; + } + + if (cbt->lastrecno == WT_RECNO_OOB || + (next && cmp < 0) || (!next && cmp > 0)) { + cbt->lastrecno = cbt->recno; + return (0); + } + + WT_PANIC_RET(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then " + "key %" PRIu64, + next ? "next" : "prev", cbt->lastrecno, cbt->recno); +} + +/* + * __cursor_key_order_check_row -- + * Check key ordering for row-store cursor movements. + */ +static int +__cursor_key_order_check_row( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + WT_BTREE *btree; + WT_ITEM *key; + WT_DECL_RET; + WT_DECL_ITEM(a); + WT_DECL_ITEM(b); + int cmp; + + btree = S2BT(session); + key = &cbt->iface.key; + cmp = 0; /* -Werror=maybe-uninitialized */ + + if (cbt->lastkey->size != 0) + WT_RET(__wt_compare( + session, btree->collator, cbt->lastkey, key, &cmp)); + + if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0)) + return (__wt_buf_set(session, cbt->lastkey, + cbt->iface.key.data, cbt->iface.key.size)); + + WT_ERR(__wt_scr_alloc(session, 512, &a)); + WT_ERR(__wt_buf_set_printable( + session, a, cbt->lastkey->data, cbt->lastkey->size)); + + WT_ERR(__wt_scr_alloc(session, 512, &b)); + WT_ERR(__wt_buf_set_printable(session, b, key->data, key->size)); + + WT_PANIC_ERR(session, EINVAL, + "WT_CURSOR.%s out-of-order returns: returned key %.*s then " + "key %.*s", + next ? 
"next" : "prev", + (int)a->size, (const char *)a->data, + (int)b->size, (const char *)b->data); + +err: __wt_scr_free(session, &a); + __wt_scr_free(session, &b); + + return (ret); +} + +/* + * __wt_cursor_key_order_check -- + * Check key ordering for cursor movements. + */ +int +__wt_cursor_key_order_check( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) +{ + switch (cbt->ref->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + return (__cursor_key_order_check_col(session, cbt, next)); + case WT_PAGE_ROW_LEAF: + return (__cursor_key_order_check_row(session, cbt, next)); + WT_ILLEGAL_VALUE(session); + } +} +#endif + /* * __wt_btcur_next -- * Move to the next record in the tree. @@ -531,6 +637,11 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); +#endif + err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 1d23b976edd..1e4b1daa090 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -618,6 +618,10 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } +#ifdef HAVE_DIAGNOSTIC + if (ret == 0) + WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); +#endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index f2bf2978320..28b51fd2865 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) static inline int __cursor_fix_implicit(WT_BTREE *btree, 
WT_CURSOR_BTREE *cbt) { - return (btree->type == BTREE_COL_FIX && - !F_ISSET(cbt, WT_CBT_MAX_RECORD)); + /* + * When there's no exact match, column-store search returns the key + * nearest the searched-for key (continuing past keys smaller than the + * searched-for key to return the next-largest key). Therefore, if the + * returned comparison is -1, the searched-for key was larger than any + * row on the page's standard information or column-store insert list. + * + * If the returned comparison is NOT -1, there was a row equal to or + * larger than the searched-for key, and we implicitly create missing + * rows. + */ + return (btree->type == BTREE_COL_FIX && cbt->compare != -1); } /* @@ -502,19 +512,14 @@ retry: WT_RET(__cursor_func_init(cbt, true)); case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring - * the application's record number). First we search for the - * maximum possible record number so the search ends on the - * last page. The real record number is assigned by the - * serialized append operation. + * the application's record number). The real record number + * is assigned by the serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = UINT64_MAX; + cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); - if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = WT_RECNO_OOB; - /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length @@ -830,6 +835,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; + wt_off_t size; uint64_t skip; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -866,10 +872,12 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) * !!! * Ideally, the number would be prime to avoid restart issues. 
*/ - if (cbt->next_random_sample_size != 0) + if (cbt->next_random_sample_size != 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); cbt->next_random_leaf_skip = (uint64_t) - ((btree->bm->block->fh->size / btree->allocsize) / + ((size / btree->allocsize) / cbt->next_random_sample_size) + 1; + } /* * Choose a leaf page from the tree. @@ -1225,6 +1233,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) { cbt->row_key = &cbt->_row_key; cbt->tmp = &cbt->_tmp; + +#ifdef HAVE_DIAGNOSTIC + cbt->lastkey = &cbt->_lastkey; + cbt->lastrecno = WT_RECNO_OOB; +#endif } /* @@ -1250,6 +1263,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); +#ifdef HAVE_DIAGNOSTIC + __wt_buf_free(session, &cbt->_lastkey); +#endif return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index d52a94a6da2..393f869ece9 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) static inline void __debug_hex_byte(WT_DBG *ds, uint8_t v) { - static const char hex[] = "0123456789abcdef"; - - __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]); + __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c index d9ff9616072..a34e57796a8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_huffman.c +++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c @@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, for (tp = table, lineno = 1; (ret = fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF; ++tp, ++lineno) { - if (lineno > entries) + /* + * Entries is 0-based, that is, there are (entries +1) possible + * values that can be configured. 
The line number is 1-based, so + * adjust the test for too many entries, and report (entries +1) + * in the error as the maximum possible number of entries. + */ + if (lineno > entries + 1) WT_ERR_MSG(session, EINVAL, "Huffman table file %.*s is corrupted, " "more than %" PRIu32 " entries", - (int)ip->len, ip->str, entries); + (int)ip->len, ip->str, entries + 1); if (ret != 2) WT_ERR_MSG(session, EINVAL, "line %u of Huffman table file %.*s is corrupted: " diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 8808f0b1a85..fdccf033828 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF **refp, *ref; - uint32_t i; + uint32_t hint, i; btree = S2BT(session); dsk = page->dsk; @@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) */ pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp++; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); ref->addr = cell; @@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) const WT_PAGE_HEADER *dsk; WT_PAGE_INDEX *pindex; WT_REF *ref, **refp; - uint32_t i; + uint32_t hint, i; bool overflow_keys; btree = S2BT(session); @@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) pindex = WT_INTL_INDEX_GET_SAFE(page); refp = pindex->index; overflow_keys = false; + hint = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ref = *refp; ref->home = page; + ref->pindex_hint = hint++; __wt_cell_unpack(cell, unpack); switch (unpack->type) { diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 756ffd98f3a..b5c299b9ea9 100644 --- 
a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -1807,7 +1807,7 @@ err: if (page != NULL) */ static int __slvg_row_build_internal( - WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) + WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss) { WT_ADDR *addr; WT_DECL_RET; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 12f4197e9e7..69c787c9385 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -14,6 +14,22 @@ to_incr += __len; \ } while (0) +/* + * A note on error handling: main split functions first allocate/initialize new + * structures; failures during that period are handled by discarding the memory + * and returning an error code, the caller knows the split didn't happen and + * proceeds accordingly. Second, split functions update the tree, and a failure + * in that period is catastrophic, any partial update to the tree requires a + * panic, we can't recover. Third, once the split is complete and the tree has + * been fully updated, we have to ignore most errors, the split is complete and + * correct, callers have to proceed accordingly. + */ +typedef enum { + WT_ERR_IGNORE, /* Ignore minor errors */ + WT_ERR_PANIC, /* Panic on all errors */ + WT_ERR_RETURN /* Clean up and return error */ +} WT_SPLIT_ERROR_PHASE; + /* * __split_oldest_gen -- * Calculate the oldest active split generation. 
@@ -512,25 +528,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; - /* - * A note on error handling: this function first allocates/initializes - * new structures; failures during that period are handled by discarding - * the memory and returning an error code, our caller knows the split - * didn't happen and proceeds accordingly. Second, this function updates - * the tree, and a failure in that period is catastrophic, any partial - * update to the tree requires a panic, we can't recover. Third, once - * the split is complete and the tree has been fully updated, we have to - * ignore most errors because the split is complete and correct, callers - * have to proceed accordingly. - */ - enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); @@ -539,7 +543,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -623,7 +627,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for - * the page that has an page index entry for the WT_REF is about + * the page that has a page index entry for the WT_REF is about * to change. 
*/ child_pindex = WT_INTL_INDEX_GET_SAFE(child); @@ -641,7 +645,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, false); @@ -661,7 +665,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__split_ref_step2(session, alloc_index, false)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -687,15 +691,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_page_modify_set(session, root); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during root page split to deepen the tree"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during root page split " @@ -721,19 +725,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; + WT_SPLIT_ERROR_PHASE complete; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; - bool complete, empty_parent; + bool empty_parent; parent = ref->home; alloc_index = pindex = NULL; parent_decr = 0; parent_entries = 0; - complete = empty_parent = false; + empty_parent = false; + complete = WT_ERR_RETURN; /* The parent page will be marked dirty, make sure that will succeed. 
*/ WT_RET(__wt_page_modify_init(session, parent)); @@ -751,7 +757,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * array anyway. Switch them to the special split state, so that any * reading thread will restart. */ - WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); @@ -791,28 +797,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there are ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries.
*/ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -853,16 +871,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -946,7 +956,8 @@ err: __wt_scr_free(session, &scr); * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) { + switch (complete) { + case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) @@ -954,20 +965,28 @@ err: __wt_scr_free(session, &scr); } __wt_free_ref_index(session, NULL, alloc_index, false); - /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. 
*/ if (empty_parent) - return (EBUSY); + ret = EBUSY; + break; + case WT_ERR_PANIC: + __wt_err(session, ret, "fatal error during parent page split"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during parent page " + "split"); + ret = 0; + } + break; } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + return (ret); } /* @@ -983,25 +1002,13 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; - /* - * A note on error handling: this function first allocates/initializes - * new structures; failures during that period are handled by discarding - * the memory and returning an error code, our caller knows the split - * didn't happen and proceeds accordingly. Second, this function updates - * the tree, and a failure in that period is catastrophic, any partial - * update to the tree requires a panic, we can't recover. Third, once - * the split is complete and the tree has been fully updated, we have to - * ignore most errors because the split is complete and correct, callers - * have to proceed accordingly. 
- */ - enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; - WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); @@ -1012,7 +1019,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1133,7 +1140,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, true); @@ -1157,7 +1164,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__split_ref_step2(session, alloc_index, true)); /* The split is complete and correct, ignore benign errors. 
*/ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* * Push out the changes: not required for correctness, but no reason @@ -1193,16 +1200,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_page_modify_set(session, page); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 5dd75835b0b..ef70160aa72 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); - WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); - WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); + WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. 
*/ diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index e9fa570f97b..c5e2abbe440 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -8,13 +8,61 @@ #include "wt_internal.h" +/* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. + */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_PAGE_INDEX *pindex; + uint32_t indx; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. + */ + cbt->compare = 0; + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + */ + if (recno < leaf->key.recno) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. + * + * !!! + * Check that "indx + 1" is a valid page-index entry first, because it + * also checks that "indx" is a valid page-index entry, and we have to + * do that latter check before looking at the indx slot of the array + * for a match to leaf (in other words, our page hint might be wrong). + */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx + 1 < pindex->entries && pindex->index[indx] == leaf) + if (recno >= pindex->index[indx + 1]->key.recno) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + + return (0); +} + /* * __wt_col_search -- * Search a column-store tree for a specific record-based key. 
*/ int __wt_col_search(WT_SESSION_IMPL *session, - uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) + uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_COL *cip; @@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; + uint64_t recno; uint32_t base, indx, limit; int depth; @@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session, __cursor_pos_clear(cbt); - /* We may only be searching a single leaf page, not the full tree. */ + /* + * When appending a new record, the search record number will be an + * out-of-band value, search for the largest key in the table instead. + */ + if ((recno = search_recno) == WT_RECNO_OOB) + recno = UINT64_MAX; + + /* + * We may be searching only a single leaf page, not the full tree. In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + WT_ASSERT(session, search_recno != WT_RECNO_OOB); + + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, recno, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. + */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -120,7 +199,17 @@ leaf_only: page = current->page; cbt->ref = current; cbt->recno = recno; - cbt->compare = 0; + + /* + * Don't bother searching if the caller is appending a new record where + * we'll allocate the record number; we're not going to find a match by + * definition, and we figure out the record number and position when we + * do the work. 
+ */ + if (search_recno == WT_RECNO_OOB) { + cbt->compare = -1; + return (0); + } /* * Set the on-page slot to an impossible value larger than any possible @@ -142,6 +231,7 @@ leaf_only: * that's impossibly large for the page. We do have additional setup to * do in that case, the record may be appended to the page. */ + cbt->compare = 0; if (page->type == WT_PAGE_COL_FIX) { if (recno < page->pg_fix_recno) { cbt->compare = 1; @@ -190,18 +280,10 @@ past_end: * This is a rarely used path: we normally find exact matches, because * column-store files are dense, but in this case the caller searched * past the end of the table. - * - * Don't bother searching if the caller is appending a new record where - * we'll allocate the record number; we're not going to find a match by - * definition, and we figure out the position when we do the work. */ cbt->ins_head = WT_COL_APPEND(page); - if (recno == UINT64_MAX) - cbt->ins = NULL; - else - cbt->ins = __col_insert_search( - cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); - if (cbt->ins == NULL) + if ((cbt->ins = __col_insert_search( + cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL) cbt->compare = -1; else { cbt->recno = WT_INSERT_RECNO(cbt->ins); @@ -212,14 +294,5 @@ past_end: else cbt->compare = -1; } - - /* - * Note if the record is past the maximum record in the tree, the cursor - * search functions need to know for fixed-length column-stores because - * appended records implicitly create any skipped records, and cursor - * search functions have to handle that case. 
- */ - if (cbt->compare == -1) - F_SET(cbt, WT_CBT_MAX_RECORD); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 079f9d3bad1..e98d30152ab 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -131,6 +131,76 @@ __wt_search_insert( return (0); } +/* + * __check_leaf_key_range -- + * Check the search key is in the leaf page's key range. + */ +static inline int +__check_leaf_key_range(WT_SESSION_IMPL *session, + WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_COLLATOR *collator; + WT_ITEM *item; + WT_PAGE_INDEX *pindex; + uint32_t indx; + int cmp; + + btree = S2BT(session); + collator = btree->collator; + item = cbt->tmp; + + /* + * There are reasons we can't do the fast checks, and we continue with + * the leaf page search in those cases, only skipping the complete leaf + * page search if we know it's not going to work. + */ + cbt->compare = 0; + + /* + * First, confirm we have the right parent page-index slot, and quit if + * we don't. We don't search for the correct slot, that would make this + * cheap test expensive. + */ + WT_INTL_INDEX_GET(session, leaf->home, pindex); + indx = leaf->pindex_hint; + if (indx >= pindex->entries || pindex->index[indx] != leaf) + return (0); + + /* + * Check if the search key is smaller than the parent's starting key for + * this page. + * + * We can't compare against slot 0 on a row-store internal page because + * reconciliation doesn't build it, it may not be a valid key. + */ + if (indx != 0) { + __wt_ref_key(leaf->home, leaf, &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp < 0) { + cbt->compare = 1; /* page keys > search key */ + return (0); + } + } + + /* + * Check if the search key is greater than or equal to the starting key + * for the parent's next page. 
+ */ + ++indx; + if (indx < pindex->entries) { + __wt_ref_key( + leaf->home, pindex->index[indx], &item->data, &item->size); + WT_RET(__wt_compare(session, collator, srch_key, item, &cmp)); + if (cmp >= 0) { + cbt->compare = -1; /* page keys < search key */ + return (0); + } + } + + return (0); +} + /* * __wt_row_search -- * Search a row-store tree for a specific key. @@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session, append_check = insert && cbt->append_tree; descend_right = true; - /* We may only be searching a single leaf page, not the full tree. */ + /* + * We may be searching only a single leaf page, not the full tree. In + * the normal case where the page links to a parent, check the page's + * parent keys before doing the full search, it's faster when the + * cursor is being re-positioned. (One case where the page doesn't + * have a parent is if it is being re-instantiated in memory as part + * of a split). + */ if (leaf != NULL) { + if (leaf->home != NULL) { + WT_RET(__check_leaf_key_range( + session, srch_key, leaf, cbt)); + if (cbt->compare != 0) { + /* + * !!! + * WT_CURSOR.search_near uses the slot value to + * decide if there was an on-page match. + */ + cbt->slot = 0; + return (0); + } + } + current = leaf; goto leaf_only; } @@ -196,15 +287,6 @@ restart_page: page = current->page; WT_INTL_INDEX_GET(session, page, pindex); - /* - * Fast-path internal pages with one child, a common case for - * the root page in new trees. - */ - if (pindex->entries == 1) { - descent = pindex->index[0]; - goto descend; - } - /* Fast-path appends. 
*/ if (append_check) { descent = pindex->index[pindex->entries - 1]; @@ -542,12 +624,18 @@ err: /* int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_INSERT *p, *t; + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; WT_PAGE *page; - uint32_t cnt; + uint32_t choice, entries, i; + int level; page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + /* If the page has disk-based entries, select from them. */ if (page->pg_row_entries != 0) { cbt->compare = 0; cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; @@ -562,24 +650,115 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert - * list. Count how many records are in the list. + * list. */ F_SET(cbt, WT_CBT_SEARCH_SMALLEST); if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) return (WT_NOTFOUND); - for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt) - if ((p = WT_SKIP_NEXT(p)) == NULL) - break; /* - * Select a random number from 0 to (N - 1), return that record. + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. */ - cnt = __wt_random(&session->rnd) % cnt; - for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p) - if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL) + for (ins_head = cbt->ins_head, + level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. 
+ */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + while (level > 0) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. + * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. 
+ */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. */ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. + */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; cbt->compare = 0; - cbt->ins = t; return (0); } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index d3a0265c13a..e943f01236e 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS **cstats; WT_DSRC_STATS **dstats; + int64_t v; conn = S2C(session); @@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) dstats = ((WT_CURSOR_BTREE *) conn->las_session->las_cursor)->btree->dhandle->stats; - WT_STAT_SET(session, cstats, - cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); - WT_STAT_SET(session, cstats, - cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); + v = WT_STAT_READ(dstats, cursor_insert); + WT_STAT_SET(session, cstats, cache_lookaside_insert, v); + v = WT_STAT_READ(dstats, cursor_remove); + WT_STAT_SET(session, cstats, cache_lookaside_remove, v); } /* diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index bd14e1bf4fd..ee9935828e2 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ 
b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -2003,6 +2003,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_sweep_config(session, cfg)); WT_ERR(__wt_verbose_config(session, cfg)); + /* Initialize the OS page size for mmap */ + conn->page_size = __wt_get_vm_pagesize(); + /* Now that we know if verbose is configured, output the version. */ WT_ERR(__wt_verbose( session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING)); diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index c6d5b535b86..0821238fbd7 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -147,12 +147,14 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session) int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) { + WT_BM *bm; WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; bool marked_dead, no_schema_lock; btree = S2BT(session); + bm = btree->bm; dhandle = session->dhandle; marked_dead = false; @@ -191,7 +193,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) */ if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { - if (force && (btree->bm == NULL || btree->bm->map == NULL)) { + if (force && (bm == NULL || !bm->is_mapped(bm, session))) { WT_ERR(__conn_dhandle_mark_dead(session)); marked_dead = true; } diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c index b996b934464..db64d2ad498 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_bulk.c +++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c @@ -8,6 +8,25 @@ #include "wt_internal.h" +/* + * __bulk_col_keycmp_err -- + * Error routine when column-store keys inserted out-of-order. 
+ */ +static int +__bulk_col_keycmp_err(WT_CURSOR_BULK *cbulk) +{ + WT_CURSOR *cursor; + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; + cursor = &cbulk->cbt.iface; + + WT_RET_MSG(session, EINVAL, + "bulk-load presented with out-of-order keys: %" PRIu64 " is less " + "than previously inserted key %" PRIu64, + cursor->recno, cbulk->recno); +} + /* * __curbulk_insert_fix -- * Fixed-length column-store bulk cursor insert. @@ -19,6 +38,7 @@ __curbulk_insert_fix(WT_CURSOR *cursor) WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; + uint64_t recno; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; @@ -29,13 +49,63 @@ __curbulk_insert_fix(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); - WT_CURSOR_NEEDVALUE(cursor); + /* + * If the "append" flag was configured, the application doesn't have to + * supply a key, else require a key. + */ + if (F_ISSET(cursor, WT_CURSTD_APPEND)) + recno = cbulk->recno + 1; + else { + WT_CURSOR_CHECKKEY(cursor); + if ((recno = cursor->recno) <= cbulk->recno) + WT_ERR(__bulk_col_keycmp_err(cbulk)); + } + WT_CURSOR_CHECKVALUE(cursor); - WT_ERR(__wt_bulk_insert_fix(session, cbulk)); + /* + * Insert any skipped records as deleted records, update the current + * record count. + */ + for (; recno != cbulk->recno + 1; ++cbulk->recno) + WT_ERR(__wt_bulk_insert_fix(session, cbulk, true)); + cbulk->recno = recno; + + /* Insert the current record. */ + ret = __wt_bulk_insert_fix(session, cbulk, false); + +err: API_END_RET(session, ret); +} + +/* + * __curbulk_insert_fix_bitmap -- + * Fixed-length column-store bulk cursor insert for bitmaps. 
+ */ +static int +__curbulk_insert_fix_bitmap(WT_CURSOR *cursor) +{ + WT_BTREE *btree; + WT_CURSOR_BULK *cbulk; + WT_DECL_RET; + WT_SESSION_IMPL *session; + cbulk = (WT_CURSOR_BULK *)cursor; + btree = cbulk->cbt.btree; + + /* + * Bulk cursor inserts are updates, but don't need auto-commit + * transactions because they are single-threaded and not visible + * until the bulk cursor is closed. + */ + CURSOR_API_CALL(cursor, session, insert, btree); WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + WT_CURSOR_CHECKVALUE(cursor); + + /* Insert the current record. */ + ret = __wt_bulk_insert_fix_bitmap(session, cbulk); + err: API_END_RET(session, ret); } @@ -50,7 +120,7 @@ __curbulk_insert_var(WT_CURSOR *cursor) WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; - bool duplicate; + uint64_t recno; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; @@ -61,45 +131,63 @@ __curbulk_insert_var(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); - - WT_CURSOR_NEEDVALUE(cursor); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); /* - * If this isn't the first value inserted, compare it against the last - * value and increment the RLE count. - * - * Instead of a "first time" variable, I'm using the RLE count, because - * it is only zero before the first row is inserted. + * If the "append" flag was configured, the application doesn't have to + * supply a key, else require a key. 
*/ - duplicate = false; - if (cbulk->rle != 0) { - if (cbulk->last.size == cursor->value.size && - memcmp(cbulk->last.data, cursor->value.data, - cursor->value.size) == 0) { - ++cbulk->rle; - duplicate = true; - } else - WT_ERR(__wt_bulk_insert_var(session, cbulk)); + if (F_ISSET(cursor, WT_CURSTD_APPEND)) + recno = cbulk->recno + 1; + else { + WT_CURSOR_CHECKKEY(cursor); + if ((recno = cursor->recno) <= cbulk->recno) + WT_ERR(__bulk_col_keycmp_err(cbulk)); } + WT_CURSOR_CHECKVALUE(cursor); + + if (!cbulk->first_insert) { + /* + * If not the first insert and the key space is sequential, + * compare the current value against the last value; if the + * same, just increment the RLE count. + */ + if (recno == cbulk->recno + 1 && + cbulk->last.size == cursor->value.size && + memcmp(cbulk->last.data, + cursor->value.data, cursor->value.size) == 0) { + ++cbulk->rle; + ++cbulk->recno; + goto duplicate; + } + + /* Insert the previous key/value pair. */ + WT_ERR(__wt_bulk_insert_var(session, cbulk, false)); + } else + cbulk->first_insert = false; /* - * Save a copy of the value for the next comparison and reset the RLE - * counter. + * Insert any skipped records as deleted records, update the current + * record count and RLE counter. */ - if (!duplicate) { - WT_ERR(__wt_buf_set(session, - &cbulk->last, cursor->value.data, cursor->value.size)); - cbulk->rle = 1; + if (recno != cbulk->recno + 1) { + cbulk->rle = (recno - cbulk->recno) - 1; + WT_ERR(__wt_bulk_insert_var(session, cbulk, true)); } + cbulk->rle = 1; + cbulk->recno = recno; - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + /* Save a copy of the value for the next comparison. */ + ret = __wt_buf_set(session, + &cbulk->last, cursor->value.data, cursor->value.size); +duplicate: err: API_END_RET(session, ret); } /* * __bulk_row_keycmp_err -- - * Error routine when keys inserted out-of-order. + * Error routine when row-store keys inserted out-of-order. 
*/ static int __bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk) @@ -154,6 +242,7 @@ __curbulk_insert_row(WT_CURSOR *cursor) * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_CHECKVALUE(cursor); @@ -161,28 +250,20 @@ __curbulk_insert_row(WT_CURSOR *cursor) /* * If this isn't the first key inserted, compare it against the last key * to ensure the application doesn't accidentally corrupt the table. - * - * Instead of a "first time" variable, I'm using the RLE count, because - * it is only zero before the first row is inserted. */ - if (cbulk->rle != 0) { + if (!cbulk->first_insert) { WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &cbulk->last, &cmp)); if (cmp <= 0) WT_ERR(__bulk_row_keycmp_err(cbulk)); - } + } else + cbulk->first_insert = false; - /* - * Save a copy of the key for the next comparison and set the RLE - * counter. - */ + /* Save a copy of the key for the next comparison. */ WT_ERR(__wt_buf_set(session, &cbulk->last, cursor->key.data, cursor->key.size)); - cbulk->rle = 1; - - WT_ERR(__wt_bulk_insert_row(session, cbulk)); - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + ret = __wt_bulk_insert_row(session, cbulk); err: API_END_RET(session, ret); } @@ -208,13 +289,12 @@ __curbulk_insert_row_skip_check(WT_CURSOR *cursor) * until the bulk cursor is closed. 
*/ CURSOR_API_CALL(cursor, session, insert, btree); + WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); - - WT_ERR(__wt_bulk_insert_row(session, cbulk)); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); + ret = __wt_bulk_insert_row(session, cbulk); err: API_END_RET(session, ret); } @@ -237,18 +317,25 @@ __wt_curbulk_init(WT_SESSION_IMPL *session, __wt_cursor_set_notsup(c); switch (cbt->btree->type) { case BTREE_COL_FIX: - c->insert = __curbulk_insert_fix; + c->insert = bitmap ? + __curbulk_insert_fix_bitmap : __curbulk_insert_fix; break; case BTREE_COL_VAR: c->insert = __curbulk_insert_var; break; case BTREE_ROW: + /* + * Row-store order comparisons are expensive, so we optionally + * skip them when we know the input is correct. + */ c->insert = skip_sort_check ? __curbulk_insert_row_skip_check : __curbulk_insert_row; break; WT_ILLEGAL_VALUE(session); } + cbulk->first_insert = true; + cbulk->recno = 0; cbulk->bitmap = bitmap; if (bitmap) F_SET(c, WT_CURSTD_RAW); diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c index 8f858a5012f..3270be07de4 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_json.c +++ b/src/third_party/wiredtiger/src/cursor/cur_json.c @@ -313,7 +313,6 @@ size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) { char abbrev; - u_char h; if (!force_unicode) { if (isprint(ch) && ch != '\\' && ch != '"') { @@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode) *buf++ = 'u'; *buf++ = '0'; *buf++ = '0'; - h = (((u_char)ch) >> 4) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; - h = ((u_char)ch) & 0xF; - if (h >= 10) - *buf++ = 'A' + (h - 10); - else - *buf++ = '0' + h; + *buf++ = __wt_hex[(ch & 0xf0) >> 4]; + *buf++ = __wt_hex[ch & 0x0f]; } return (6); } diff 
--git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index e1d5b8eb91a..652dec364fb 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + wt_off_t size; const char *filename; /* @@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session, if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); - WT_RET(__wt_block_manager_size( - session, filename, &cst->u.dsrc_stats)); + WT_RET(__wt_block_manager_named_size(session, filename, &size)); + cst->u.dsrc_stats.block_size = size; __wt_curstat_dsrc_final(cst); return (0); } @@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, /* * We return the statistics field's offset as the key, and a string - * description, a string value, and a uint64_t value as the value + * description, a string value, and a uint64_t value as the value * columns. 
*/ cursor->key_format = "i"; diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c index dca72a16ee5..e746ccd5871 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_table.c +++ b/src/third_party/wiredtiger/src/cursor/cur_table.c @@ -968,8 +968,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session, WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1])); if (0) { -err: WT_TRET(__curtable_close(cursor)); - *cursorp = NULL; +err: if (*cursorp != NULL) { + WT_TRET(__wt_cursor_close(*cursorp)); + *cursorp = NULL; + } + WT_TRET(__curtable_close(cursor)); } __wt_scr_free(session, &tmp); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index ac481581c23..0e2b33c35ec 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -727,6 +727,10 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) F_CLR(session, WT_SESSION_CLEAR_EVICT_WALK); + /* An error is unexpected - flag the failure. */ + if (ret != 0) + __wt_err(session, ret, "Failed to clear eviction walk point"); + return (ret); } @@ -760,20 +764,18 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) { WT_BTREE *btree; WT_CACHE *cache; + WT_DECL_RET; WT_EVICT_ENTRY *evict; u_int i, elem; + *evict_resetp = false; + btree = S2BT(session); cache = S2C(session)->cache; - /* - * If the file isn't evictable, there's no work to do. - */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - *evict_resetp = false; + /* If the file wasn't evictable, there's no work to do. */ + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) return (0); - } - *evict_resetp = true; /* * Hold the walk lock to set the "no eviction" flag: no new pages from @@ -784,7 +786,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) __wt_spin_unlock(session, &cache->evict_walk_lock); /* Clear any existing LRU eviction walk for the file. 
*/ - WT_RET(__evict_request_walk_clear(session)); + WT_ERR(__evict_request_walk_clear(session)); /* Hold the evict lock to remove any queued pages from this file. */ __wt_spin_lock(session, &cache->evict_lock); @@ -806,7 +808,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) while (btree->evict_busy > 0) __wt_yield(); + *evict_resetp = true; return (0); + +err: F_CLR(btree, WT_BTREE_NO_EVICTION); + return (ret); } /* diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index 4bff6c82783..804eec24874 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -173,6 +173,7 @@ struct __wt_bm { int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *); int (*compact_start)(WT_BM *, WT_SESSION_IMPL *); int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); + bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *); int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); int (*read) (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t); @@ -182,6 +183,7 @@ struct __wt_bm { int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *); int (*salvage_valid) (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool); + int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *); int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats); int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool); int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); @@ -244,7 +246,10 @@ struct __wt_block { bool ckpt_inprogress;/* Live checkpoint in progress */ /* Compaction support */ - int compact_pct_tenths; /* Percent to compact */ + int compact_pct_tenths; /* Percent to compact */ + uint64_t compact_pages_reviewed;/* Pages reviewed */ + uint64_t compact_pages_skipped; /* Pages skipped */ + uint64_t compact_pages_written; /* Pages rewritten */ /* Salvage support */ wt_off_t slvg_off; /* Salvage file offset */ diff --git 
a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 6ee74c61a38..12a736c56a2 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -478,7 +478,7 @@ struct __wt_page { #define pg_row_ins u.row.ins #undef pg_row_upd #define pg_row_upd u.row.upd -#define pg_row_entries u.row.entries +#undef pg_row_entries #define pg_row_entries u.row.entries /* Fixed-length column-store leaf page. */ @@ -1049,7 +1049,7 @@ struct __wt_insert_head { uint64_t __prev_split_gen = (session)->split_gen; \ if (__prev_split_gen == 0) \ do { \ - WT_PUBLISH((session)->split_gen, \ + WT_PUBLISH((session)->split_gen, \ S2C(session)->split_gen); \ } while ((session)->split_gen != S2C(session)->split_gen) diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i index fc1f372b2a9..9388e07d0d8 100644 --- a/src/third_party/wiredtiger/src/include/column.i +++ b/src/third_party/wiredtiger/src/include/column.i @@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead, continue; } + /* + * When no exact match is found, the search returns the smallest + * key larger than the searched-for key, or the largest key + * smaller than the searched-for key, if there is no larger key. + * Our callers depend on that: specifically, the fixed-length + * column store cursor code interprets returning a key smaller + * than the searched-for key to mean the searched-for key is + * larger than any key on the page. Don't change that behavior, + * things will break. + */ ins_recno = WT_INSERT_RECNO(ret_ins); cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; @@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop) start_recno = repeat->recno + repeat->rle; } - if (recno >= start_recno + (page->pg_var_entries - start_indx)) + /* + * !!! 
+ * The test could be written more simply as: + * + * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * + * It's split into two parts because the simpler test will overflow if + * searching for large record numbers. + */ + if (recno >= start_recno && + recno - start_recno >= page->pg_var_entries - start_indx) return (NULL); return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 2367f5a0035..1c1cb9b8987 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -415,6 +415,7 @@ struct __wt_connection_impl { uint32_t direct_io; uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */ bool mmap; /* mmap configuration */ + int page_size; /* OS page size for mmap alignment */ uint32_t verbose; uint32_t flags; diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 275e2f2db46..4f232ce4fd0 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -200,18 +200,23 @@ struct __wt_cursor_btree { uint8_t append_tree; /* Cursor appended to the tree */ +#ifdef HAVE_DIAGNOSTIC + /* Check that cursor next/prev never returns keys out-of-order. */ + WT_ITEM *lastkey, _lastkey; + uint64_t lastrecno; +#endif + #define WT_CBT_ACTIVE 0x01 /* Active in the tree */ #define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */ #define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ #define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ -#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ -#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor +#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor (e.g. 
on a checkpoint) */ -#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */ +#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST) + WT_CBT_SEARCH_SMALLEST) uint8_t flags; }; @@ -219,33 +224,32 @@ struct __wt_cursor_btree { struct __wt_cursor_bulk { WT_CURSOR_BTREE cbt; - WT_REF *ref; /* The leaf page */ - WT_PAGE *leaf; - /* * Variable-length column store compares values during bulk load as * part of RLE compression, row-store compares keys during bulk load * to avoid corruption. */ - WT_ITEM last; /* Last key/value seen */ + bool first_insert; /* First insert */ + WT_ITEM last; /* Last key/value inserted */ /* - * Variable-length column-store RLE counter (also overloaded to mean - * the first time through the bulk-load insert routine, when set to 0). + * Additional column-store bulk load support. */ - uint64_t rle; + uint64_t recno; /* Record number */ + uint64_t rle; /* Variable-length RLE counter */ /* - * Fixed-length column-store current entry in memory chunk count, and - * the maximum number of records per chunk. + * Additional fixed-length column store bitmap bulk load support: + * current entry in memory chunk count, and the maximum number of + * records per chunk. */ + bool bitmap; /* Bitmap bulk load */ uint32_t entry; /* Entry count */ uint32_t nrecs; /* Max records per chunk */ - /* Special bitmap bulk load for fixed-length column stores. 
*/ - bool bitmap; - - void *reconcile; /* Reconciliation information */ + void *reconcile; /* Reconciliation support */ + WT_REF *ref; /* The leaf page */ + WT_PAGE *leaf; }; struct __wt_cursor_config { diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d84403cc16d..7338f8dae3b 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -14,6 +14,7 @@ extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t extern int __wt_block_addr_invalid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live); extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci); +extern int __wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci); extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci); extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name); extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint); @@ -50,7 +51,8 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); -extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats); +extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep); 
+extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep); extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset); @@ -90,6 +92,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config); extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); +extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt); @@ -170,7 +173,7 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flag extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove); -extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); +extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, 
bool instantiate); @@ -362,23 +365,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep); extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value); extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep); -extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno); extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop); -extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop); extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp); -extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value); extern int 
__wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep); -extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key); extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp); -extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode); extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep); -extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); -extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); +extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced); extern int __wt_log_slot_new(WT_SESSION_IMPL *session); @@ -468,7 +471,7 @@ extern int 
__wt_meta_track_init(WT_SESSION_IMPL *session); extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session); extern int __wt_turtle_init(WT_SESSION_IMPL *session); extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep); -extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value); +extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value); extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); @@ -514,6 +517,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); extern int __wt_once(void (*init_routine)(void)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp); extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp); +extern int __wt_get_vm_pagesize(void); extern bool __wt_absolute_path(const char *path); extern const char *__wt_path_separator(void); extern bool __wt_has_priv(void); @@ -558,8 +562,9 @@ extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); -extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); -extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); +extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); +extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, 
WT_CURSOR_BULK *cbulk, bool deleted); extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); @@ -654,6 +659,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp ); extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page); extern void __wt_hazard_close(WT_SESSION_IMPL *session); +extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp); extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern int __wt_hex2byte(const u_char *from, u_char *to); @@ -671,6 +677,7 @@ extern uint32_t __wt_log2_int(uint32_t n); extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); +extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))); @@ -732,7 +739,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp); extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); -extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); +extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags); extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval); diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index 01e33792d73..bb80f8b738b 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #if defined(x86_64) || defined(__x86_64__) /* Pause instruction to prevent excess processor bus usage */ -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") - +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("mfence" ::: "memory"); \ } while (0) @@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) } while (0) #elif defined(i386) || defined(__i386__) -#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") +#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory") #define WT_FULL_BARRIER() do { \ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \ } while (0) @@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new) #define WT_WRITE_BARRIER() WT_FULL_BARRIER() #elif defined(__PPC64__) || defined(PPC64) +/* ori 
0,0,0 is the PPC64 noop instruction */ #define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory") -#define WT_FULL_BARRIER() do { +#define WT_FULL_BARRIER() do { \ __asm__ volatile ("sync" ::: "memory"); \ } while (0) -#define WT_READ_BARRIER() WT_FULL_BARRIER() -#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +/* TODO: ISA 2.07 Elemental Memory Barriers would be better, + specifically mbll, and mbss, but they are not supported by POWER 8 */ +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("lwsync" ::: "memory"); \ +} while (0) #elif defined(__aarch64__) #define WT_PAUSE() __asm__ volatile("yield" ::: "memory") #define WT_FULL_BARRIER() do { \ - __asm__ volatile ("dsb sy" ::: "memory"); \ + __asm__ volatile ("dsb sy" ::: "memory"); \ +} while (0) +#define WT_READ_BARRIER() do { \ + __asm__ volatile ("dsb ld" ::: "memory"); \ +} while (0) +#define WT_WRITE_BARRIER() do { \ + __asm__ volatile ("dsb st" ::: "memory"); \ +} while (0) + +#elif defined(__s390x__) +#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory") +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("bcr 15,0\n" ::: "memory"); \ } while (0) +#define WT_READ_BARRIER() WT_FULL_BARRIER() +#define WT_WRITE_BARRIER() WT_FULL_BARRIER() + +#elif defined(__sparc__) +#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory") + +#define WT_FULL_BARRIER() do { \ + __asm__ volatile ("membar #StoreLoad" ::: "memory"); \ +} while (0) + +/* + * On UltraSparc machines, TSO is used, and so there is no need for membar. + * READ_BARRIER = #LoadLoad, and WRITE_BARRIER = #StoreStore are noop. 
+ */ #define WT_READ_BARRIER() do { \ - __asm__ volatile ("dsb ld" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) + #define WT_WRITE_BARRIER() do { \ - __asm__ volatile ("dsb st" ::: "memory"); \ + __asm__ volatile ("" ::: "memory"); \ } while (0) #else diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h index 521de567fc0..e7737e12663 100644 --- a/src/third_party/wiredtiger/src/include/log.h +++ b/src/third_party/wiredtiger/src/include/log.h @@ -266,6 +266,11 @@ struct __wt_log_desc { uint64_t log_size; /* 08-15: Log file size */ }; +/* + * Flags for __wt_txn_op_printlog. + */ +#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */ + /* * WT_LOG_REC_DESC -- * A descriptor for a log record type. diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index e542baec642..898e44eb8e0 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -268,3 +268,6 @@ union __wt_rand_state { uint32_t w, z; } x; }; + +/* Shared array for converting to hex */ +extern const u_char __wt_hex[]; diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 5c3bcfb8ed0..1eca49f2c40 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -74,7 +74,10 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { TAILQ_HEAD(__cursors, __wt_cursor) cursors; WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ - WT_COMPACT *compact; /* Compact state */ + + WT_COMPACT *compact; /* Compaction information */ + enum { WT_COMPACT_NONE=0, + WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state; /* * Lookaside table cursor, sweep and eviction worker threads only. 
@@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); - bool compaction; /* Compaction did some work */ - uint32_t flags; /* diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index dfe7ee5c6cd..a554607b7d5 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot) */ #define WT_STAT_READ(stats, fld) \ __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)) -#define WT_STAT_WRITE(session, stats, fld) \ - ((stats)[WT_STATS_SLOT_ID(session)]->fld); +#define WT_STAT_WRITE(stats, fld, v) \ + (stats)->fld = (int64_t)(v) #define WT_STAT_DECRV(session, stats, fld, value) \ (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value) diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c index 5a1d03b1976..54df01d01ab 100644 --- a/src/third_party/wiredtiger/src/log/log_auto.c +++ b/src/third_party/wiredtiger/src/log/log_auto.c @@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src, } static int -__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) { size_t needed; @@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) return (0); } +static int +__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item) +{ + size_t needed; + + needed = item->size * 2 + 1; + WT_RET(__wt_realloc(session, NULL, needed, destp)); + __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL); + return (0); +} + int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, @@ -121,7 +132,8 @@ __wt_logop_col_put_unpack( int 
__wt_logop_col_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -138,9 +150,14 @@ __wt_logop_col_put_print( " \"fileid\": \"%" PRIu32 "\",\n", fileid)); WT_ERR(__wt_fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack( int __wt_logop_col_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t recno; + WT_UNUSED(flags); WT_RET(__wt_logop_col_remove_unpack( session, pp, end, &fileid, &recno)); @@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack( int __wt_logop_col_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t fileid; uint64_t start; uint64_t stop; + WT_UNUSED(flags); WT_RET(__wt_logop_col_truncate_unpack( session, pp, end, &fileid, &start, &stop)); @@ -307,7 +328,8 @@ __wt_logop_row_put_unpack( int __wt_logop_row_put_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -322,12 +344,22 @@ __wt_logop_row_put_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n")); 
WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &value)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + " \"key-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &value)); WT_ERR(__wt_fprintf(out, " \"value\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &value)); + WT_ERR(__wt_fprintf(out, + ",\n \"value-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -375,7 +407,8 @@ __wt_logop_row_remove_unpack( int __wt_logop_row_remove_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -389,9 +422,14 @@ __wt_logop_row_remove_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &key)); + WT_ERR(__logrec_make_json_str(session, &escaped, &key)); WT_ERR(__wt_fprintf(out, " \"key\": \"%s\"", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &key)); + WT_ERR(__wt_fprintf(out, + ",\n \"key-hex\": \"%s\"", escaped)); + } err: __wt_free(session, escaped); return (ret); @@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack( int __wt_logop_row_truncate_print( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { WT_DECL_RET; uint32_t fileid; @@ -455,12 +494,22 @@ 
__wt_logop_row_truncate_print( WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n")); WT_ERR(__wt_fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &start)); + WT_ERR(__logrec_make_json_str(session, &escaped, &start)); WT_ERR(__wt_fprintf(out, " \"start\": \"%s\",\n", escaped)); - WT_ERR(__logrec_jsonify_str(session, &escaped, &stop)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &start)); + WT_ERR(__wt_fprintf(out, + " \"start-hex\": \"%s\",\n", escaped)); + } + WT_ERR(__logrec_make_json_str(session, &escaped, &stop)); WT_ERR(__wt_fprintf(out, " \"stop\": \"%s\",\n", escaped)); + if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) { + WT_ERR(__logrec_make_hex_str(session, &escaped, &stop)); + WT_ERR(__wt_fprintf(out, + " \"stop-hex\": \"%s\",\n", escaped)); + } WT_ERR(__wt_fprintf(out, " \"mode\": \"%" PRIu32 "\"", mode)); @@ -470,7 +519,8 @@ err: __wt_free(session, escaped); int __wt_txn_op_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, + FILE *out, uint32_t flags) { uint32_t optype, opsize; @@ -480,27 +530,33 @@ __wt_txn_op_printlog( switch (optype) { case WT_LOGOP_COL_PUT: - WT_RET(__wt_logop_col_put_print(session, pp, end, out)); + WT_RET(__wt_logop_col_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_REMOVE: - WT_RET(__wt_logop_col_remove_print(session, pp, end, out)); + WT_RET(__wt_logop_col_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_COL_TRUNCATE: - WT_RET(__wt_logop_col_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_col_truncate_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_PUT: - WT_RET(__wt_logop_row_put_print(session, pp, end, out)); + WT_RET(__wt_logop_row_put_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_REMOVE: - WT_RET(__wt_logop_row_remove_print(session, 
pp, end, out)); + WT_RET(__wt_logop_row_remove_print(session, pp, end, out, + flags)); break; case WT_LOGOP_ROW_TRUNCATE: - WT_RET(__wt_logop_row_truncate_print(session, pp, end, out)); + WT_RET(__wt_logop_row_truncate_print(session, pp, end, out, + flags)); break; WT_ILLEGAL_VALUE(session); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index c1eb7a2a389..7c53990a2a2 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -91,7 +91,7 @@ __curstat_lsm_init( * top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->lsm_generation_max = chunk->generation; + WT_STAT_WRITE(new, lsm_generation_max, chunk->generation); /* Aggregate statistics from each new chunk. */ __wt_stat_dsrc_aggregate_single(new, stats); @@ -115,37 +115,40 @@ __curstat_lsm_init( * into the top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - new->bloom_size = - (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8); - new->bloom_page_evict = - new->cache_eviction_clean + new->cache_eviction_dirty; - new->bloom_page_read = new->cache_read; + WT_STAT_WRITE(new, bloom_size, + (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8)); + WT_STAT_WRITE(new, bloom_page_evict, + new->cache_eviction_clean + new->cache_eviction_dirty); + WT_STAT_WRITE(new, bloom_page_read, new->cache_read); __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } /* Set statistics that aren't aggregated directly into the cursor */ - stats->bloom_count = bloom_count; - stats->lsm_chunk_count = lsm_tree->nchunks; + WT_STAT_WRITE(stats, bloom_count, bloom_count); + WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks); /* Include, and optionally clear, LSM-level specific information. 
*/ - stats->bloom_miss = lsm_tree->bloom_miss; + WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_miss = 0; - stats->bloom_hit = lsm_tree->bloom_hit; + WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_hit = 0; - stats->bloom_false_positive = lsm_tree->bloom_false_positive; + WT_STAT_WRITE( + stats, bloom_false_positive, lsm_tree->bloom_false_positive); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->bloom_false_positive = 0; - stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom; + WT_STAT_WRITE( + stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_lookup_no_bloom = 0; - stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle; + WT_STAT_WRITE( + stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle; + WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) lsm_tree->lsm_merge_throttle = 0; diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 13e8b31916f..3bd57846862 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -271,8 +271,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ)); * Update the turtle file. 
*/ int -__wt_turtle_update( - WT_SESSION_IMPL *session, const char *key, const char *value) +__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) { WT_FH *fh; WT_DECL_ITEM(buf); diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c index e95ccb0ade2..4276c89dbcf 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_map.c +++ b/src/third_party/wiredtiger/src/os_posix/os_map.c @@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session, return (0); } -#define WT_VM_PAGESIZE 4096 - /* * __wt_mmap_preload -- * Cause a section of a memory map to be faulted in. @@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ + WT_CONNECTION_IMPL *conn = S2C(session); WT_BM *bm = S2BT(session)->bm; WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ @@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) * Manual pages aren't clear on whether alignment is required for the * size, so we will be conservative. */ - size &= ~(size_t)(WT_VM_PAGESIZE - 1); + size &= ~(size_t)(conn->page_size - 1); - if (size > WT_VM_PAGESIZE && + if (size > (size_t)conn->page_size && (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0) WT_RET_MSG(session, ret, "posix_madvise will need"); #else @@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. 
*/ + WT_CONNECTION_IMPL *conn = S2C(session); WT_DECL_RET; - void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); + void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1)); size += WT_PTRDIFF(p, blk); if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0) diff --git a/src/third_party/wiredtiger/src/os_posix/os_pagesize.c b/src/third_party/wiredtiger/src/os_posix/os_pagesize.c new file mode 100644 index 00000000000..e7c7b4fdf15 --- /dev/null +++ b/src/third_party/wiredtiger/src/os_posix/os_pagesize.c @@ -0,0 +1,19 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. + */ +int +__wt_get_vm_pagesize(void) +{ + return (getpagesize()); +} diff --git a/src/third_party/wiredtiger/src/os_win/os_pagesize.c b/src/third_party/wiredtiger/src/os_win/os_pagesize.c new file mode 100644 index 00000000000..55cd6a694ec --- /dev/null +++ b/src/third_party/wiredtiger/src/os_win/os_pagesize.c @@ -0,0 +1,23 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_get_vm_pagesize -- + * Return the default page size of a virtual memory page. 
+ */ +int +__wt_get_vm_pagesize(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + + return (system_info.dwPageSize); +} diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 21cc68ed119..2b07117f9d5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, for (upd = upd_list; upd->next != NULL; upd = upd->next) ; upd->next = append; + __wt_cache_page_inmem_incr( + session, page, WT_UPDATE_MEMSIZE(append)); } /* @@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key) * Figure out the maximum leaf page size for the reconciliation. */ static inline uint32_t -__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) +__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r) { WT_BTREE *btree; WT_PAGE *page; @@ -3263,7 +3265,14 @@ supd_check_complete: memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header); bnd->cksum = __wt_cksum(buf->data, buf->size); - if (mod->rec_result == WT_PM_REC_MULTIBLOCK && + /* + * One last check: don't reuse blocks if compacting, the reason + * for compaction is to move blocks to different locations. We + * do this check after calculating the checksums, hopefully the + * next write can be skipped. 
+ */ + if (session->compact_state == WT_COMPACT_NONE && + mod->rec_result == WT_PM_REC_MULTIBLOCK && mod->mod_multi_entries > bnd_slot) { multi = &mod->mod_multi[bnd_slot]; if (multi->size == bnd->size && @@ -3502,7 +3511,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) break; case BTREE_COL_VAR: if (cbulk->rle != 0) - WT_RET(__wt_bulk_insert_var(session, cbulk)); + WT_RET(__wt_bulk_insert_var(session, cbulk, false)); break; case BTREE_ROW: break; @@ -3625,55 +3634,69 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * Fixed-length column-store bulk insert. */ int -__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +__wt_bulk_insert_fix( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) { WT_BTREE *btree; WT_CURSOR *cursor; WT_RECONCILE *r; - uint32_t entries, offset, page_entries, page_size; - const uint8_t *data; r = cbulk->reconcile; btree = S2BT(session); cursor = &cbulk->cbt.iface; - if (cbulk->bitmap) { - if (((r->recno - 1) * btree->bitcnt) & 0x7) - WT_RET_MSG(session, EINVAL, - "Bulk bitmap load not aligned on a byte boundary"); - for (data = cursor->value.data, - entries = (uint32_t)cursor->value.size; - entries > 0; - entries -= page_entries, data += page_size) { - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - - page_entries = - WT_MIN(entries, cbulk->nrecs - cbulk->entry); - page_size = __bitstr_size(page_entries * btree->bitcnt); - offset = __bitstr_size(cbulk->entry * btree->bitcnt); - memcpy(r->first_free + offset, data, page_size); - cbulk->entry += page_entries; - r->recno += page_entries; - } - return (0); - } - WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); - - __bit_setv(r->first_free, - cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]); + __bit_setv(r->first_free, cbulk->entry, + btree->bitcnt, deleted ? 
0 : ((uint8_t *)cursor->value.data)[0]); ++cbulk->entry; ++r->recno; return (0); } +/* + * __wt_bulk_insert_fix_bitmap -- + * Fixed-length column-store bulk insert. + */ +int +__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_RECONCILE *r; + uint32_t entries, offset, page_entries, page_size; + const uint8_t *data; + + r = cbulk->reconcile; + btree = S2BT(session); + cursor = &cbulk->cbt.iface; + + if (((r->recno - 1) * btree->bitcnt) & 0x7) + WT_RET_MSG(session, EINVAL, + "Bulk bitmap load not aligned on a byte boundary"); + for (data = cursor->value.data, + entries = (uint32_t)cursor->value.size; + entries > 0; + entries -= page_entries, data += page_size) { + WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk)); + + page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry); + page_size = __bitstr_size(page_entries * btree->bitcnt); + offset = __bitstr_size(cbulk->entry * btree->bitcnt); + memcpy(r->first_free + offset, data, page_size); + cbulk->entry += page_entries; + r->recno += page_entries; + } + return (0); +} + /* * __wt_bulk_insert_var -- * Variable-length column-store bulk insert. */ int -__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) +__wt_bulk_insert_var( + WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) { WT_BTREE *btree; WT_KV *val; @@ -3682,14 +3705,20 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) r = cbulk->reconcile; btree = S2BT(session); - /* - * Store the bulk cursor's last buffer, not the current value, we're - * creating a duplicate count, which means we want the previous value - * seen, not the current value. 
- */ val = &r->v; - WT_RET(__rec_cell_build_val( - session, r, cbulk->last.data, cbulk->last.size, cbulk->rle)); + if (deleted) { + val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle); + val->buf.data = NULL; + val->buf.size = 0; + val->len = val->cell_len; + } else + /* + * Store the bulk cursor's last buffer, not the current value, + * we're tracking duplicates, which means we want the previous + * value seen, not the current value. + */ + WT_RET(__rec_cell_build_val(session, + r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ if (val->len > r->space_avail) @@ -4445,7 +4474,7 @@ compare: /* WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); if (upd == NULL) continue; - for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { + for (n = WT_INSERT_RECNO(ins); src_recno <= n;) { /* * The application may have inserted records which left * gaps in the name space, and these gaps can be huge. @@ -4485,7 +4514,7 @@ compare: /* last->size == size && memcmp(last->data, data, size) == 0)) { ++rle; - continue; + goto next; } WT_ERR(__rec_col_var_helper(session, r, salvage, last, last_deleted, 0, rle)); @@ -4504,6 +4533,15 @@ compare: /* } last_deleted = deleted; rle = 1; + + /* + * Move to the next record. It's not a simple increment + * because if it's the maximum record, incrementing it + * wraps to 0 and this turns into an infinite loop. + */ +next: if (src_recno == UINT64_MAX) + break; + ++src_recno; } } diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 053f69ee7f8..f0d0f26db54 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config) * via the registered close callback. 
*/ if (session->event_handler->handle_close != NULL && - !WT_STREQ(cursor->uri, WT_LAS_URI)) + !WT_STREQ(cursor->internal_uri, WT_LAS_URI)) WT_TRET(session->event_handler->handle_close( session->event_handler, wt_session, cursor)); WT_TRET(cursor->close(cursor)); diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c index 456fcd3ce03..8a5b741c0c5 100644 --- a/src/third_party/wiredtiger/src/session/session_compact.c +++ b/src/third_party/wiredtiger/src/session/session_compact.c @@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) for (i = 0; i < 100; ++i) { WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); - session->compaction = false; + session->compact_state = WT_COMPACT_RUNNING; WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker( session, uri, __wt_compact, NULL, cfg, 0)); WT_ERR(ret); - if (!session->compaction) + if (session->compact_state != WT_COMPACT_SUCCESS) break; WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); @@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_ERR(__session_compact_check_timeout(session, start_time)); } -err: __wt_scr_free(session, &t); +err: session->compact_state = WT_COMPACT_NONE; + + __wt_scr_free(session, &t); return (ret); } diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c index 1e32f5b4453..2330a65a707 100644 --- a/src/third_party/wiredtiger/src/support/global.c +++ b/src/third_party/wiredtiger/src/support/global.c @@ -11,28 +11,6 @@ WT_PROCESS __wt_process; /* Per-process structure */ static int __wt_pthread_once_failed; /* If initialization failed */ -/* - * __system_is_little_endian -- - * Check if the system is little endian. 
- */ -static int -__system_is_little_endian(void) -{ - uint64_t v; - bool little; - - v = 1; - little = *((uint8_t *)&v) != 0; - - if (little) - return (0); - - fprintf(stderr, - "This release of the WiredTiger data engine does not support " - "big-endian systems; contact WiredTiger for more information.\n"); - return (EINVAL); -} - /* * __wt_global_once -- * Global initialization, run once. @@ -42,11 +20,6 @@ __wt_global_once(void) { WT_DECL_RET; - if ((ret = __system_is_little_endian()) != 0) { - __wt_pthread_once_failed = ret; - return; - } - if ((ret = __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) { __wt_pthread_once_failed = ret; @@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session) /* Sleep forever, the debugger will interrupt us when it attaches. */ for (;;) - __wt_sleep(100, 0); + __wt_sleep(10, 0); #else WT_UNUSED(session); #endif diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c index 9a4a6464f40..33f4113c004 100644 --- a/src/third_party/wiredtiger/src/support/hash_city.c +++ b/src/third_party/wiredtiger/src/support/hash_city.c @@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) { #define bswap_32(x) OSSwapInt32(x) #define bswap_64(x) OSSwapInt64(x) +#elif defined(__sun) + +#include +#define bswap_32 BSWAP_32 +#define bswap_64 BSWAP_64 + #else #include #endif diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c index eb9f420911a..5fb8d4bc190 100644 --- a/src/third_party/wiredtiger/src/support/hex.c +++ b/src/third_party/wiredtiger/src/support/hex.c @@ -8,7 +8,7 @@ #include "wt_internal.h" -static const u_char hex[] = "0123456789abcdef"; +const u_char __wt_hex[] = "0123456789abcdef"; /* * __fill_hex -- @@ -25,14 +25,25 @@ __fill_hex(const uint8_t *src, size_t src_max, --dest_max; for (; src_max > 0 && dest_max > 1; src_max -= 1, dest_max -= 2, ++src) { - *dest++ = hex[(*src & 0xf0) >> 4]; - *dest++ = 
hex[*src & 0x0f]; + *dest++ = __wt_hex[(*src & 0xf0) >> 4]; + *dest++ = __wt_hex[*src & 0x0f]; } *dest++ = '\0'; if (lenp != NULL) *lenp = WT_PTRDIFF(dest, dest_orig); } +/* + * __wt_fill_hex -- + * In-memory conversion of raw bytes to a hexadecimal representation. + */ +void +__wt_fill_hex(const uint8_t *src, size_t src_max, + uint8_t *dest, size_t dest_max, size_t *lenp) +{ + __fill_hex(src, src_max, dest, dest_max, lenp); +} + /* * __wt_raw_to_hex -- * Convert a chunk of data to a nul-terminated printable hex string. @@ -83,8 +94,8 @@ __wt_raw_to_esc_hex( *t++ = *p; } else { *t++ = '\\'; - *t++ = hex[(*p & 0xf0) >> 4]; - *t++ = hex[*p & 0x0f]; + *t++ = __wt_hex[(*p & 0xf0) >> 4]; + *t++ = __wt_hex[*p & 0x0f]; } *t++ = '\0'; to->size = WT_PTRDIFF(t, to->mem); diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c index 4bda365cb10..9488dbf14fe 100644 --- a/src/third_party/wiredtiger/src/support/huffman.c +++ b/src/third_party/wiredtiger/src/support/huffman.c @@ -1,9 +1,31 @@ -/*- +/* * Copyright (c) 2014-2015 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * - * See the file LICENSE for redistribution information. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name MongoDB or the name WiredTiger + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MONGODB INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include "wt_internal.h" diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c index f5ecb12633e..3adcb801f03 100644 --- a/src/third_party/wiredtiger/src/support/rand.c +++ b/src/third_party/wiredtiger/src/support/rand.c @@ -59,6 +59,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state) *rnd_state = rnd; } +/* + * __wt_random_init_seed -- + * Initialize the state of a 32-bit pseudo-random number. + * Use this, instead of __wt_random_init if we are running with multiple + * threads and we want each thread to initialize its own random state based + * on a different random seed. + */ +int +__wt_random_init_seed( + WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state) +{ + struct timespec ts; + WT_RAND_STATE rnd; + + WT_RET(__wt_epoch(session, &ts)); + M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629); + M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069); + + *rnd_state = rnd; + + return (0); +} + /* * __wt_random -- * Return a 32-bit pseudo-random number. 
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 4d7cd65fd18..7a615131628 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single( to->block_alloc += from->block_alloc; to->block_free += from->block_free; to->block_checkpoint_size += from->block_checkpoint_size; - to->allocation_size = from->allocation_size; + if (from->allocation_size > to->allocation_size) + to->allocation_size = from->allocation_size; to->block_reuse_bytes += from->block_reuse_bytes; - to->block_magic = from->block_magic; - to->block_major = from->block_major; + if (from->block_magic > to->block_magic) + to->block_magic = from->block_magic; + if (from->block_major > to->block_major) + to->block_major = from->block_major; to->block_size += from->block_size; - to->block_minor = from->block_minor; + if (from->block_minor > to->block_minor) + to->block_minor = from->block_minor; to->btree_checkpoint_generation += from->btree_checkpoint_generation; to->btree_column_fix += from->btree_column_fix; to->btree_column_internal += from->btree_column_internal; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; to->btree_column_rle += from->btree_column_rle; - to->btree_fixed_len = from->btree_fixed_len; + if (from->btree_fixed_len > to->btree_fixed_len) + to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) to->btree_maxintlkey = from->btree_maxintlkey; if (from->btree_maxintlpage > to->btree_maxintlpage) @@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate( to->block_free += WT_STAT_READ(from, block_free); to->block_checkpoint_size += WT_STAT_READ(from, block_checkpoint_size); - to->allocation_size = from[0]->allocation_size; + if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size) + to->allocation_size = v; 
to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes); - to->block_magic = from[0]->block_magic; - to->block_major = from[0]->block_major; + if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic) + to->block_magic = v; + if ((v = WT_STAT_READ(from, block_major)) > to->block_major) + to->block_major = v; to->block_size += WT_STAT_READ(from, block_size); - to->block_minor = from[0]->block_minor; + if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor) + to->block_minor = v; to->btree_checkpoint_generation += WT_STAT_READ(from, btree_checkpoint_generation); to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); @@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate( to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); - to->btree_fixed_len = from[0]->btree_fixed_len; - if ((v = WT_STAT_READ(from, btree_maxintlkey)) > - to->btree_maxintlkey) + if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len) + to->btree_fixed_len = v; + if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) to->btree_maxintlkey = v; if ((v = WT_STAT_READ(from, btree_maxintlpage)) > to->btree_maxintlpage) to->btree_maxintlpage = v; - if ((v = WT_STAT_READ(from, btree_maxleafkey)) > - to->btree_maxleafkey) + if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey) to->btree_maxleafkey = v; if ((v = WT_STAT_READ(from, btree_maxleafpage)) > to->btree_maxleafpage) diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index f835fea8f67..0a3e4a7a7db 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) conn = S2C(session); txn_global = &conn->txn_global; +retry: current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; @@ -287,43 +288,60 @@ 
__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ - if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && - __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, last_running)) - last_running = id; - if ((id = s->snap_min) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; - } - - if (WT_TXNID_LT(last_running, oldest_id)) - oldest_id = last_running; - -#ifdef HAVE_DIAGNOSTIC + if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) { /* - * Make sure the ID doesn't move past any named snapshots. - * - * Don't include the read/assignment in the assert statement. - * Coverity complains if there are assignments only done in - * diagnostic builds, and when the read is from a volatile. + * We know we want to update. Check if we're racing. */ - id = txn_global->nsnap_oldest_id; - WT_ASSERT(session, - id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; + i < session_cnt; i++, s++) { + if ((id = s->id) != WT_TXN_NONE && + WT_TXNID_LT(id, last_running)) + last_running = id; + if ((id = s->snap_min) != WT_TXN_NONE && + WT_TXNID_LT(id, oldest_id)) + oldest_id = id; + } + + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + +#ifdef HAVE_DIAGNOSTIC + /* + * Make sure the ID doesn't move past any named + * snapshots. + * + * Don't include the read/assignment in the assert + * statement. Coverity complains if there are + * assignments only done in diagnostic builds, and + * when the read is from a volatile. 
+ */ + id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, + id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif - if (WT_TXNID_LT(txn_global->last_running, last_running)) - txn_global->last_running = last_running; - if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) - txn_global->oldest_id = oldest_id; - WT_ASSERT(session, txn_global->scan_count == -1); - txn_global->scan_count = 0; + if (WT_TXNID_LT(txn_global->last_running, last_running)) + txn_global->last_running = last_running; + if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) + txn_global->oldest_id = oldest_id; + WT_ASSERT(session, txn_global->scan_count == -1); + txn_global->scan_count = 0; + } else { + /* + * We wanted to update the oldest ID but we're racing + * another thread. Retry if this is a forced update. + */ + WT_ASSERT(session, txn_global->scan_count > 0); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); + if (force) { + __wt_yield(); + goto retry; + } + } } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && last_running_moved && - oldest_session != NULL) { + current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index c5fa52dea6a..148ed868792 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -8,6 +8,12 @@ #include "wt_internal.h" +/* Cookie passed to __txn_printlog. */ +typedef struct { + FILE *out; + uint32_t flags; +} WT_TXN_PRINTLOG_ARGS; + /* * __txn_op_log -- * Log an operation for the current transaction. 
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key); */ static int __txn_commit_printlog( - WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out) + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, + uint32_t flags) { bool firstrecord; @@ -79,7 +86,7 @@ __txn_commit_printlog( firstrecord = false; - WT_RET(__wt_txn_op_printlog(session, pp, end, out)); + WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags)); WT_RET(__wt_fprintf(out, "\n }")); } @@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session, FILE *out; WT_LOG_RECORD *logrec; WT_LSN ckpt_lsn; + WT_TXN_PRINTLOG_ARGS *args; const uint8_t *end, *p; const char *msg; uint64_t txnid; @@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session, bool compressed; WT_UNUSED(next_lsnp); - out = cookie; + args = cookie; + out = args->out; p = WT_LOG_SKIP_HEADER(rawrec->data); end = (const uint8_t *)rawrec->data + rawrec->size; @@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session, WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n")); WT_RET(__wt_fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid)); - WT_RET(__txn_commit_printlog(session, &p, end, out)); + WT_RET(__txn_commit_printlog(session, &p, end, out, + args->flags)); break; case WT_LOGREC_FILE_SYNC: @@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session, * Print the log in a human-readable format. 
*/ int -__wt_txn_printlog(WT_SESSION *wt_session, FILE *out) +__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags) { WT_SESSION_IMPL *session; + WT_TXN_PRINTLOG_ARGS args; session = (WT_SESSION_IMPL *)wt_session; + args.out = out; + args.flags = flags; WT_RET(__wt_fprintf(out, "[\n")); WT_RET(__wt_log_scan( - session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out)); + session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args)); WT_RET(__wt_fprintf(out, "\n]\n")); return (0); diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c index 99a1455a74e..135a8bab225 100644 --- a/src/third_party/wiredtiger/src/utilities/util_list.c +++ b/src/third_party/wiredtiger/src/utilities/util_list.c @@ -8,6 +8,7 @@ #include "util.h" +static int list_get_allocsize(WT_SESSION *, const char *, size_t *); static int list_print(WT_SESSION *, const char *, bool, bool); static int list_print_checkpoint(WT_SESSION *, const char *); static int usage(void); @@ -55,6 +56,48 @@ util_list(WT_SESSION *session, int argc, char *argv[]) return (ret); } +/* + * list_get_allocsize -- + * Get the allocation size for this file from the metadata. 
+ */ +static int +list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) +{ + WT_CONFIG_ITEM szvalue; + WT_CONFIG_PARSER *parser; + WT_DECL_RET; + WT_EXTENSION_API *wt_api; + char *config; + + wt_api = session->connection->get_extension_api(session->connection); + if ((ret = + wt_api->metadata_search(wt_api, session, key, &config)) != 0) { + fprintf(stderr, "%s: %s: extension_api.metadata_search: %s\n", + progname, key, session->strerror(session, ret)); + return (ret); + } + if ((ret = wt_api->config_parser_open(wt_api, session, config, + strlen(config), &parser)) != 0) { + fprintf(stderr, "%s: extension_api.config_parser_open: %s\n", + progname, session->strerror(session, ret)); + return (ret); + } + if ((ret = parser->get(parser, "allocation_size", &szvalue)) != 0) { + if (ret != WT_NOTFOUND) + fprintf(stderr, "%s: config_parser.get: %s\n", + progname, session->strerror(session, ret)); + (void)parser->close(parser); + return (ret); + } + if ((ret = parser->close(parser)) != 0) { + fprintf(stderr, "%s: config_parser.close: %s\n", + progname, session->strerror(session, ret)); + return (ret); + } + *allocsize = (size_t)szvalue.val; + return (0); +} + /* * list_print -- * List the high-level objects in the database. @@ -137,9 +180,10 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) static int list_print_checkpoint(WT_SESSION *session, const char *key) { + WT_BLOCK_CKPT ci; WT_DECL_RET; WT_CKPT *ckpt, *ckptbase; - size_t len; + size_t allocsize, len; time_t t; uint64_t v; @@ -151,6 +195,14 @@ list_print_checkpoint(WT_SESSION *session, const char *key) if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); + /* We need the allocation size for decoding the checkpoint addr */ + if ((ret = list_get_allocsize(session, key, &allocsize)) != 0) { + if (ret == WT_NOTFOUND) + allocsize = 0; + else + return (ret); + } + /* Find the longest name, so we can pretty-print. 
*/ len = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -158,7 +210,15 @@ list_print_checkpoint(WT_SESSION *session, const char *key) len = strlen(ckpt->name); ++len; + memset(&ci, 0, sizeof(ci)); WT_CKPT_FOREACH(ckptbase, ckpt) { + if (allocsize != 0 && (ret = __wt_block_ckpt_decode( + session, allocsize, ckpt->raw.data, &ci)) != 0) { + fprintf(stderr, "%s: __wt_block_buffer_to_ckpt: %s\n", + progname, session->strerror(session, ret)); + /* continue if damaged */ + ci.root_size = 0; + } /* * Call ctime, not ctime_r; ctime_r has portability problems, * the Solaris version is different from the POSIX standard. @@ -179,6 +239,17 @@ list_print_checkpoint(WT_SESSION *session, const char *key) printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE); else printf(" (%" PRIu64 " B)\n", v); + if (ci.root_size != 0) { + printf("\t\t" "root offset: %" PRIuMAX + " (0x%" PRIxMAX ")\n", + (intmax_t)ci.root_offset, (intmax_t)ci.root_offset); + printf("\t\t" "root size: %" PRIu32 + " (0x%" PRIx32 ")\n", + ci.root_size, ci.root_size); + printf("\t\t" "root checksum: %" PRIu32 + " (0x%" PRIx32 ")\n", + ci.root_cksum, ci.root_cksum); + } } __wt_metadata_free_ckptlist(session, ckptbase); diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c index 9cbda08690e..3b7187bd0de 100644 --- a/src/third_party/wiredtiger/src/utilities/util_main.c +++ b/src/third_party/wiredtiger/src/utilities/util_main.c @@ -226,7 +226,6 @@ main(int argc, char *argv[]) ret = func(session, argc, argv); /* Close the database. 
*/ - err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) ret = tret; diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c index d202b09b228..3a665c1c657 100644 --- a/src/third_party/wiredtiger/src/utilities/util_printlog.c +++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c @@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - bool printable; + uint32_t flags; - printable = false; - while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF) + flags = 0; + while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) switch (ch) { case 'f': /* output file */ if (freopen(__wt_optarg, "w", stdout) == NULL) { @@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) return (1); } break; - case 'p': - printable = true; + case 'x': /* hex output */ + LF_SET(WT_TXN_PRINTLOG_HEX); break; case '?': default: @@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - WT_UNUSED(printable); - ret = __wt_txn_printlog(session, stdout); + ret = __wt_txn_printlog(session, stdout, flags); if (ret != 0) { fprintf(stderr, "%s: printlog failed: %s\n", @@ -61,7 +60,7 @@ usage(void) { (void)fprintf(stderr, "usage: %s %s " - "printlog [-p] [-f output-file]\n", + "printlog [-x] [-f output-file]\n", progname, usage_prefix); return (1); } -- cgit v1.2.1