summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src
diff options
context:
space:
mode:
authorRamon Fernandez <ramon@mongodb.com>2016-01-07 16:31:22 -0500
committerRamon Fernandez <ramon@mongodb.com>2016-01-07 16:31:22 -0500
commitd845b75e5f0837f801bdf371babd985308a1ad80 (patch)
tree080cd08b3d8fd1a04ff1787667a2f7b7546e371e /src/third_party/wiredtiger/src
parent6b95d2ee131a2b8e40f69106ac3f46921269ab46 (diff)
downloadmongo-d845b75e5f0837f801bdf371babd985308a1ad80.tar.gz
Import wiredtiger-wiredtiger-2.7.0-269-g44463c5.tar.gz from wiredtiger branch mongodb-3.4
ref: 3c2ad56..44463c5 SERVER-21833 Compact does not release space to the system with WiredTiger WT-2060 Simplify aggregation of statistics WT-2099 Seeing memory underflow messages WT-2113 truncate01 sometimes fails WT-2177 Add a per-thread seed to random number generator WT-2198 bulk load and column store appends WT-2231 pinned page cursor searches could check parent keys WT-2235 wt printlog option without unicode WT-2245 WTPERF Truncate has no ability to catch up when it falls behind WT-2246 column-store append searches the leaf page; the maximum record number fails CRUD operations WT-2256 WTPERFs throttle option fires in bursts WT-2257 wtperf doesn't handle overriding workload config WT-2259 __wt_evict_file_exclusive_on() should clear WT_BTREE_NO_EVICTION on error WT-2260 Workloads evict internal pages unexpectedly WT-2262 Random sampling is skewed by tree shape WT-2265 Wiredtiger related change in ppc64le specific code block in gcc.h WT-2266 Add wtperf config to set if perf thresholds are fatal WT-2269 wtperf should dump its config everytime it runs WT-2272 Stress test assertion in the sweep server WT-2275 broken DB after application crash WT-2276 tool to decode checkpoint addr WT-2277 Remove WT check against big-endian systems WT-2279 Define WT_PAUSE(), WT_FULL_BARRIER(), etc when s390x is defined WT-2281 wtperf smoke.sh fails on ppc64le WT-2282 error in wt_txn_update_oldest verbose message test WT-2283 retry in txn_update_oldest results in a hang WT-2285 configure should set BUFFER_ALIGNMENT_DEFAULT to 4kb on linux WT-2289 failure in fast key check WT-2290 WT_SESSION.compact could be more effective. WT-2291 Random cursor walk inefficient in skip list only trees WT-2297 Fix off-by-one error in Huffman config file parsing WT-2299 upper-level WiredTiger code is reaching into the block manager WT-2301 Add reading a range to wtperf WT-2303 Build warning in wtperf WT-2304 wtperf crash dumping config WT-2307 Internal page splits can corrupt cursor iteration WT-2311 Support Sparc
Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r--src/third_party/wiredtiger/src/block/block_addr.c51
-rw-r--r--src/third_party/wiredtiger/src/block/block_compact.c92
-rw-r--r--src/third_party/wiredtiger/src/block/block_mgr.c16
-rw-r--r--src/third_party/wiredtiger/src/block/block_open.c38
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_compact.c42
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curnext.c111
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_curprev.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c40
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_huffman.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c133
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_stat.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c117
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c221
-rw-r--r--src/third_party/wiredtiger/src/cache/cache_las.c9
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c3
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c4
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_bulk.c179
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_json.c13
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_stat.c7
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_table.c7
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c22
-rw-r--r--src/third_party/wiredtiger/src/include/block.h7
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h4
-rw-r--r--src/third_party/wiredtiger/src/include/column.i22
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h1
-rw-r--r--src/third_party/wiredtiger/src/include/cursor.h38
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h33
-rw-r--r--src/third_party/wiredtiger/src/include/gcc.h52
-rw-r--r--src/third_party/wiredtiger/src/include/log.h5
-rw-r--r--src/third_party/wiredtiger/src/include/misc.h3
-rw-r--r--src/third_party/wiredtiger/src/include/session.h7
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h4
-rw-r--r--src/third_party/wiredtiger/src/log/log_auto.c96
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_stat.c31
-rw-r--r--src/third_party/wiredtiger/src/meta/meta_turtle.c3
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_map.c12
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_pagesize.c19
-rw-r--r--src/third_party/wiredtiger/src/os_win/os_pagesize.c23
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c118
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c2
-rw-r--r--src/third_party/wiredtiger/src/session/session_compact.c8
-rw-r--r--src/third_party/wiredtiger/src/support/global.c29
-rw-r--r--src/third_party/wiredtiger/src/support/hash_city.c6
-rw-r--r--src/third_party/wiredtiger/src/support/hex.c21
-rw-r--r--src/third_party/wiredtiger/src/support/huffman.c26
-rw-r--r--src/third_party/wiredtiger/src/support/rand.c23
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c36
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c82
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c25
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_list.c73
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_main.c1
-rw-r--r--src/third_party/wiredtiger/src/utilities/util_printlog.c15
55 files changed, 1466 insertions, 496 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
index 6d50e5f0f4e..9ba4ec4a8b2 100644
--- a/src/third_party/wiredtiger/src/block/block_addr.c
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -14,7 +14,7 @@
* caller's buffer reference so it can be called repeatedly to load a buffer.
*/
static int
-__block_buffer_to_addr(WT_BLOCK *block,
+__block_buffer_to_addr(uint32_t allocsize,
const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
{
uint64_t o, s, c;
@@ -39,8 +39,8 @@ __block_buffer_to_addr(WT_BLOCK *block,
*offsetp = 0;
*sizep = *cksump = 0;
} else {
- *offsetp = (wt_off_t)(o + 1) * block->allocsize;
- *sizep = (uint32_t)s * block->allocsize;
+ *offsetp = (wt_off_t)(o + 1) * allocsize;
+ *sizep = (uint32_t)s * allocsize;
*cksump = (uint32_t)c;
}
return (0);
@@ -80,7 +80,8 @@ int
__wt_block_buffer_to_addr(WT_BLOCK *block,
const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
{
- return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+ return (__block_buffer_to_addr(
+ block->allocsize, &p, offsetp, sizep, cksump));
}
/*
@@ -139,12 +140,12 @@ __wt_block_addr_string(WT_SESSION_IMPL *session,
}
/*
- * __wt_block_buffer_to_ckpt --
+ * __block_buffer_to_ckpt --
* Convert a checkpoint cookie into its components.
*/
-int
-__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
- WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+static int
+__block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ uint32_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci)
{
uint64_t a;
const uint8_t **pp;
@@ -154,13 +155,13 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
pp = &p;
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->root_offset, &ci->root_size, &ci->root_cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
- WT_RET(__block_buffer_to_addr(block, pp,
+ WT_RET(__block_buffer_to_addr(allocsize, pp,
&ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
WT_RET(__wt_vunpack_uint(pp, 0, &a));
ci->file_size = (wt_off_t)a;
@@ -171,6 +172,32 @@ __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
}
/*
+ * __wt_block_buffer_to_ckpt --
+ * Convert a checkpoint cookie into its components, block manager version.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ return (__block_buffer_to_ckpt(session, block->allocsize, p, ci));
+}
+
+/*
+ * __wt_block_ckpt_decode --
+ * Convert a checkpoint cookie into its components, external utility
+ * version.
+ */
+int
+__wt_block_ckpt_decode(WT_SESSION *wt_session,
+ size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ return (__block_buffer_to_ckpt(session, (uint32_t)allocsize, p, ci));
+}
+
+/*
* __wt_block_ckpt_to_buffer --
* Convert the components into its checkpoint cookie.
*/
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
index d45d0a96da7..cd304b848d4 100644
--- a/src/third_party/wiredtiger/src/block/block_compact.c
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *, bool);
/*
* __wt_block_compact_start --
@@ -22,8 +22,6 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
/* Switch to first-fit allocation. */
__wt_block_configure_first_fit(block, true);
- block->compact_pct_tenths = 0;
-
return (0);
}
@@ -34,14 +32,21 @@ __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
int
__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
+ WT_DECL_RET;
+
WT_UNUSED(session);
/* Restore the original allocation plan. */
__wt_block_configure_first_fit(block, false);
- block->compact_pct_tenths = 0;
+ /* Dump the results of the compaction pass. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __block_dump_avail(session, block, false);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
- return (0);
+ return (ret);
}
/*
@@ -70,12 +75,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
if (fh->size <= WT_MEGABYTE)
return (0);
+ /*
+ * Reset the compaction state information. This is done here, not in the
+ * compaction "start" routine, because this function is called first to
+ * determine if compaction is useful.
+ */
+ block->compact_pct_tenths = 0;
+ block->compact_pages_reviewed = 0;
+ block->compact_pages_skipped = 0;
+ block->compact_pages_written = 0;
+
__wt_spin_lock(session, &block->live_lock);
+ /* Dump the current state of the file. */
if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
- WT_ERR(__block_dump_avail(session, block));
+ WT_ERR(__block_dump_avail(session, block, true));
- /* Sum the available bytes in the first 80% and 90% of the file. */
+ /* Sum the available bytes in the initial 80% and 90% of the file. */
avail_eighty = avail_ninety = 0;
ninety = fh->size - fh->size / 10;
eighty = fh->size - ((fh->size / 10) * 2);
@@ -88,23 +104,6 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
avail_eighty += ext->size;
}
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "80%% of the file",
- block->name,
- (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
- "90%% of the file",
- block->name,
- (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
- WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
- "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
- "90%% of the file to perform compaction, compaction %s",
- block->name,
- (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
- *skipp ? "skipped" : "proceeding"));
-
/*
* Skip files where we can't recover at least 1MB.
*
@@ -127,6 +126,23 @@ __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, bool *skipp)
block->compact_pct_tenths = 1;
}
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "80%% of the file",
+ block->name,
+ (uintmax_t)avail_eighty / WT_MEGABYTE, (uintmax_t)avail_eighty));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file",
+ block->name,
+ (uintmax_t)avail_ninety / WT_MEGABYTE, (uintmax_t)avail_ninety));
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") in the first "
+ "90%% of the file to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
err: __wt_spin_unlock(session, &block->live_lock);
return (ret);
@@ -177,6 +193,14 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
}
__wt_spin_unlock(session, &block->live_lock);
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT)) {
+ ++block->compact_pages_reviewed;
+ if (*skipp)
+ ++block->compact_pages_skipped;
+ else
+ ++block->compact_pages_written;
+ }
+
return (ret);
}
@@ -185,7 +209,7 @@ __wt_block_compact_page_skip(WT_SESSION_IMPL *session,
* Dump out the avail list so we can see what compaction will look like.
*/
static int
-__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, bool start)
{
WT_EXTLIST *el;
WT_EXT *ext;
@@ -196,6 +220,20 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
size = block->fh->size;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "============ %s",
+ start ? "testing for compaction" : "ending compaction pass"));
+
+ if (!start) {
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages reviewed: %" PRIuMAX,
+ block->compact_pages_reviewed));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages skipped: %" PRIuMAX, block->compact_pages_skipped));
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
+ "pages written: %" PRIuMAX, block->compact_pages_written));
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
"file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
"%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
(uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
@@ -219,6 +257,10 @@ __block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
}
#ifdef __VERBOSE_OUTPUT_PERCENTILE
+ /*
+ * The verbose output always displays 10% buckets, running this code
+ * as well also displays 1% buckets.
+ */
for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
v = percentile[i] * 512;
WT_RET(__wt_verbose(session, WT_VERB_COMPACT,
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
index 7260cab75d9..f9f66e05d7f 100644
--- a/src/third_party/wiredtiger/src/block/block_mgr.c
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -221,6 +221,18 @@ __bm_free(WT_BM *bm,
}
/*
+ * __bm_is_mapped --
+ * Return if the file is mapped into memory.
+ */
+static bool
+__bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_UNUSED(session);
+
+ return (bm->map == NULL ? false : true);
+}
+
+/*
* __bm_stat --
* Block-manager statistics.
*/
@@ -357,6 +369,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->free = (int (*)(WT_BM *,
WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = (int (*)
@@ -367,6 +380,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
bm->salvage_valid = (int (*)(WT_BM *,
WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync =
(int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly;
@@ -391,12 +405,14 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->compact_skip = __bm_compact_skip;
bm->compact_start = __bm_compact_start;
bm->free = __bm_free;
+ bm->is_mapped = __bm_is_mapped;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = __bm_salvage_end;
bm->salvage_next = __bm_salvage_next;
bm->salvage_start = __bm_salvage_start;
bm->salvage_valid = __bm_salvage_valid;
+ bm->size = __wt_block_manager_size;
bm->stat = __bm_stat;
bm->sync = __bm_sync;
bm->verify_addr = __bm_verify_addr;
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index 7cf12d36066..ff70b765d1f 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -405,27 +405,37 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
* Reading from the live system's structure normally requires locking,
* but it's an 8B statistics read, there's no need.
*/
- stats->allocation_size = block->allocsize;
- stats->block_checkpoint_size = (int64_t)block->live.ckpt_size;
- stats->block_magic = WT_BLOCK_MAGIC;
- stats->block_major = WT_BLOCK_MAJOR_VERSION;
- stats->block_minor = WT_BLOCK_MINOR_VERSION;
- stats->block_reuse_bytes = (int64_t)block->live.avail.bytes;
- stats->block_size = block->fh->size;
+ WT_STAT_WRITE(stats, allocation_size, block->allocsize);
+ WT_STAT_WRITE(
+ stats, block_checkpoint_size, (int64_t)block->live.ckpt_size);
+ WT_STAT_WRITE(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_WRITE(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_WRITE(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_WRITE(
+ stats, block_reuse_bytes, (int64_t)block->live.avail.bytes);
+ WT_STAT_WRITE(stats, block_size, block->fh->size);
}
/*
* __wt_block_manager_size --
- * Set the size statistic for a file.
+ * Return the size of a live block handle.
*/
int
-__wt_block_manager_size(
- WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats)
+__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep)
{
- wt_off_t filesize;
-
- WT_RET(__wt_filesize_name(session, filename, false, &filesize));
- stats->block_size = filesize;
+ WT_UNUSED(session);
+ *sizep = bm->block->fh == NULL ? 0 : bm->block->fh->size;
return (0);
}
+
+/*
+ * __wt_block_manager_named_size --
+ * Return the size of a named file.
+ */
+int
+__wt_block_manager_named_size(
+ WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
+{
+ return (__wt_filesize_name(session, name, false, sizep));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
index 8044d4f852d..8935d39b696 100644
--- a/src/third_party/wiredtiger/src/btree/bt_compact.c
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -17,9 +17,11 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
WT_BM *bm;
WT_DECL_RET;
+ WT_MULTI *multi;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
size_t addr_size;
+ uint32_t i;
const uint8_t *addr;
*skipp = true; /* Default skip. */
@@ -41,29 +43,46 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* If the page is clean, test the original addresses.
- * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * If the page is a replacement, test the replacement addresses.
* Ignore empty pages, they get merged into the parent.
*/
if (mod == NULL || mod->rec_result == 0) {
__wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL)
return (0);
- WT_RET(
+ return (
bm->compact_page_skip(bm, session, addr, addr_size, skipp));
- } else if (mod->rec_result == WT_PM_REC_REPLACE) {
- /*
- * The page's modification information can change underfoot if
- * the page is being reconciled, serialize with reconciliation.
- */
+ }
+
+ /*
+ * The page's modification information can change underfoot if the page
+ * is being reconciled, serialize with reconciliation.
+ */
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_RET(__wt_fair_lock(session, &page->page_lock));
+ if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ if (multi->disk_image != NULL)
+ continue;
+ if ((ret = bm->compact_page_skip(bm, session,
+ multi->addr.addr, multi->addr.size, skipp)) != 0)
+ break;
+ if (!*skipp)
+ break;
+ }
+
+ if (mod->rec_result == WT_PM_REC_REPLACE ||
+ mod->rec_result == WT_PM_REC_MULTIBLOCK)
WT_TRET(__wt_fair_unlock(session, &page->page_lock));
- WT_RET(ret);
- }
- return (0);
+
+ return (ret);
}
/*
@@ -139,7 +158,8 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
if (skip)
continue;
- session->compaction = true;
+ session->compact_state = WT_COMPACT_SUCCESS;
+
/* Rewrite the page: mark the page and tree dirty. */
WT_ERR(__wt_page_modify_init(session, ref->page));
__wt_page_modify_set(session, ref->page);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 55843d1cae5..6573bc60165 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -389,6 +389,14 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
*/
cbt->page_deleted_count = 0;
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * If starting a new iteration, clear the last-key returned, it doesn't
+ * apply.
+ */
+ cbt->lastkey->size = 0;
+ cbt->lastrecno = WT_RECNO_OOB;
+#endif
/*
* If we don't have a search page, then we're done, we're starting at
* the beginning or end of the tree, not as a result of a search.
@@ -430,6 +438,104 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
}
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __cursor_key_order_check_col --
+ * Check key ordering for column-store cursor movements.
+ */
+static int
+__cursor_key_order_check_col(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ int cmp;
+
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastrecno != WT_RECNO_OOB) {
+ if (cbt->lastrecno < cbt->recno)
+ cmp = -1;
+ if (cbt->lastrecno > cbt->recno)
+ cmp = 1;
+ }
+
+ if (cbt->lastrecno == WT_RECNO_OOB ||
+ (next && cmp < 0) || (!next && cmp > 0)) {
+ cbt->lastrecno = cbt->recno;
+ return (0);
+ }
+
+ WT_PANIC_RET(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then "
+ "key %" PRIu64,
+ next ? "next" : "prev", cbt->lastrecno, cbt->recno);
+}
+
+/*
+ * __cursor_key_order_check_row --
+ * Check key ordering for row-store cursor movements.
+ */
+static int
+__cursor_key_order_check_row(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ WT_BTREE *btree;
+ WT_ITEM *key;
+ WT_DECL_RET;
+ WT_DECL_ITEM(a);
+ WT_DECL_ITEM(b);
+ int cmp;
+
+ btree = S2BT(session);
+ key = &cbt->iface.key;
+ cmp = 0; /* -Werror=maybe-uninitialized */
+
+ if (cbt->lastkey->size != 0)
+ WT_RET(__wt_compare(
+ session, btree->collator, cbt->lastkey, key, &cmp));
+
+ if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0))
+ return (__wt_buf_set(session, cbt->lastkey,
+ cbt->iface.key.data, cbt->iface.key.size));
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_buf_set_printable(
+ session, a, cbt->lastkey->data, cbt->lastkey->size));
+
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+ WT_ERR(__wt_buf_set_printable(session, b, key->data, key->size));
+
+ WT_PANIC_ERR(session, EINVAL,
+ "WT_CURSOR.%s out-of-order returns: returned key %.*s then "
+ "key %.*s",
+ next ? "next" : "prev",
+ (int)a->size, (const char *)a->data,
+ (int)b->size, (const char *)b->data);
+
+err: __wt_scr_free(session, &a);
+ __wt_scr_free(session, &b);
+
+ return (ret);
+}
+
+/*
+ * __wt_cursor_key_order_check --
+ * Check key ordering for cursor movements.
+ */
+int
+__wt_cursor_key_order_check(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
+{
+ switch (cbt->ref->page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ return (__cursor_key_order_check_col(session, cbt, next));
+ case WT_PAGE_ROW_LEAF:
+ return (__cursor_key_order_check_row(session, cbt, next));
+ WT_ILLEGAL_VALUE(session);
+ }
+}
+#endif
+
/*
* __wt_btcur_next --
* Move to the next record in the tree.
@@ -531,6 +637,11 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, true));
+#endif
+
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 1d23b976edd..1e4b1daa090 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -618,6 +618,10 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0)
+ WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
+#endif
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index f2bf2978320..28b51fd2865 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -62,8 +62,18 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
static inline int
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
- return (btree->type == BTREE_COL_FIX &&
- !F_ISSET(cbt, WT_CBT_MAX_RECORD));
+ /*
+ * When there's no exact match, column-store search returns the key
+ * nearest the searched-for key (continuing past keys smaller than the
+ * searched-for key to return the next-largest key). Therefore, if the
+ * returned comparison is -1, the searched-for key was larger than any
+ * row on the page's standard information or column-store insert list.
+ *
+ * If the returned comparison is NOT -1, there was a row equal to or
+ * larger than the searched-for key, and we implicitly create missing
+ * rows.
+ */
+ return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}
/*
@@ -502,19 +512,14 @@ retry: WT_RET(__cursor_func_init(cbt, true));
case BTREE_COL_VAR:
/*
* If WT_CURSTD_APPEND is set, insert a new record (ignoring
- * the application's record number). First we search for the
- * maximum possible record number so the search ends on the
- * last page. The real record number is assigned by the
- * serialized append operation.
+ * the application's record number). The real record number
+ * is assigned by the serialized append operation.
*/
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = UINT64_MAX;
+ cbt->iface.recno = WT_RECNO_OOB;
WT_ERR(__cursor_col_search(session, cbt, NULL));
- if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = WT_RECNO_OOB;
-
/*
* If not overwriting, fail if the key exists. Creating a
* record past the end of the tree in a fixed-length
@@ -830,6 +835,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
+ wt_off_t size;
uint64_t skip;
session = (WT_SESSION_IMPL *)cbt->iface.session;
@@ -866,10 +872,12 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
* !!!
* Ideally, the number would be prime to avoid restart issues.
*/
- if (cbt->next_random_sample_size != 0)
+ if (cbt->next_random_sample_size != 0) {
+ WT_ERR(btree->bm->size(btree->bm, session, &size));
cbt->next_random_leaf_skip = (uint64_t)
- ((btree->bm->block->fh->size / btree->allocsize) /
+ ((size / btree->allocsize) /
cbt->next_random_sample_size) + 1;
+ }
/*
* Choose a leaf page from the tree.
@@ -1225,6 +1233,11 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
{
cbt->row_key = &cbt->_row_key;
cbt->tmp = &cbt->_tmp;
+
+#ifdef HAVE_DIAGNOSTIC
+ cbt->lastkey = &cbt->_lastkey;
+ cbt->lastrecno = WT_RECNO_OOB;
+#endif
}
/*
@@ -1250,6 +1263,9 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel)
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_buf_free(session, &cbt->_lastkey);
+#endif
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index d52a94a6da2..393f869ece9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -74,9 +74,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
static inline void
__debug_hex_byte(WT_DBG *ds, uint8_t v)
{
- static const char hex[] = "0123456789abcdef";
-
- __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
+ __dmsg(ds, "#%c%c", __wt_hex[(v & 0xf0) >> 4], __wt_hex[v & 0x0f]);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
index d9ff9616072..a34e57796a8 100644
--- a/src/third_party/wiredtiger/src/btree/bt_huffman.c
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -332,11 +332,17 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
for (tp = table, lineno = 1; (ret =
fscanf(fp, "%" SCNi64 " %" SCNi64, &symbol, &frequency)) != EOF;
++tp, ++lineno) {
- if (lineno > entries)
+ /*
+ * Entries is 0-based, that is, there are (entries +1) possible
+ * values that can be configured. The line number is 1-based, so
+ * adjust the test for too many entries, and report (entries +1)
+ * in the error as the maximum possible number of entries.
+ */
+ if (lineno > entries + 1)
WT_ERR_MSG(session, EINVAL,
"Huffman table file %.*s is corrupted, "
"more than %" PRIu32 " entries",
- (int)ip->len, ip->str, entries);
+ (int)ip->len, ip->str, entries + 1);
if (ret != 2)
WT_ERR_MSG(session, EINVAL,
"line %u of Huffman table file %.*s is corrupted: "
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 8808f0b1a85..fdccf033828 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -272,7 +272,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF **refp, *ref;
- uint32_t i;
+ uint32_t hint, i;
btree = S2BT(session);
dsk = page->dsk;
@@ -284,9 +284,11 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp++;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
ref->addr = cell;
@@ -404,7 +406,7 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
const WT_PAGE_HEADER *dsk;
WT_PAGE_INDEX *pindex;
WT_REF *ref, **refp;
- uint32_t i;
+ uint32_t hint, i;
bool overflow_keys;
btree = S2BT(session);
@@ -421,9 +423,11 @@ __inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
pindex = WT_INTL_INDEX_GET_SAFE(page);
refp = pindex->index;
overflow_keys = false;
+ hint = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
ref = *refp;
ref->home = page;
+ ref->pindex_hint = hint++;
__wt_cell_unpack(cell, unpack);
switch (unpack->type) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 756ffd98f3a..b5c299b9ea9 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -1807,7 +1807,7 @@ err: if (page != NULL)
*/
static int
__slvg_row_build_internal(
- WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
{
WT_ADDR *addr;
WT_DECL_RET;
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 12f4197e9e7..69c787c9385 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -15,6 +15,22 @@
} while (0)
/*
+ * A note on error handling: main split functions first allocate/initialize new
+ * structures; failures during that period are handled by discarding the memory
+ * and returning an error code, the caller knows the split didn't happen and
+ * proceeds accordingly. Second, split functions update the tree, and a failure
+ * in that period is catastrophic, any partial update to the tree requires a
+ * panic, we can't recover. Third, once the split is complete and the tree has
+ * been fully updated, we have to ignore most errors, the split is complete and
+ * correct, callers have to proceed accordingly.
+ */
+typedef enum {
+ WT_ERR_IGNORE, /* Ignore minor errors */
+ WT_ERR_PANIC, /* Panic on all errors */
+ WT_ERR_RETURN /* Clean up and return error */
+} WT_SPLIT_ERROR_PHASE;
+
+/*
* __split_oldest_gen --
* Calculate the oldest active split generation.
*/
@@ -512,25 +528,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
WT_REF **child_refp, *ref, **root_refp;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
- /*
- * A note on error handling: this function first allocates/initializes
- * new structures; failures during that period are handled by discarding
- * the memory and returning an error code, our caller knows the split
- * didn't happen and proceeds accordingly. Second, this function updates
- * the tree, and a failure in that period is catastrophic, any partial
- * update to the tree requires a panic, we can't recover. Third, once
- * the split is complete and the tree has been fully updated, we have to
- * ignore most errors because the split is complete and correct, callers
- * have to proceed accordingly.
- */
- enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@@ -539,7 +543,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -623,7 +627,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
- * the page that has an page index entry for the WT_REF is about
+ * the page that has a page index entry for the WT_REF is about
* to change.
*/
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
@@ -641,7 +645,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, false);
@@ -661,7 +665,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__split_ref_step2(session, alloc_index, false));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -687,15 +691,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_page_modify_set(session, root);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during root page split to deepen the tree");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during root page split "
@@ -721,19 +725,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t parent_decr, size;
uint64_t split_gen;
- uint32_t i, j;
+ uint32_t hint, i, j;
uint32_t deleted_entries, parent_entries, result_entries;
uint32_t *deleted_refs;
- bool complete, empty_parent;
+ bool empty_parent;
parent = ref->home;
alloc_index = pindex = NULL;
parent_decr = 0;
parent_entries = 0;
- complete = empty_parent = false;
+ empty_parent = false;
+ complete = WT_ERR_RETURN;
/* The parent page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, parent));
@@ -751,7 +757,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* array anyway. Switch them to the special split state, so that any
* reading thread will restart.
*/
- WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
@@ -791,28 +797,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
+ *
+ * Update the WT_REF's page-index hint as we go. This can race with a
+ * thread setting the hint based on an older page-index, and the change
+ * isn't backed out in the case of an error, so there ways for the hint
+ * to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ for (alloc_refp = alloc_index->index,
+ hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
+ ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- else if (next_ref->state != WT_REF_SPLIT)
+ else if (next_ref->state != WT_REF_SPLIT) {
/* Skip refs we have marked for deletion. */
+ next_ref->pindex_hint = hint++;
*alloc_refp++ = next_ref;
+ }
}
/* Check that we filled in all the entries. */
WT_ASSERT(session,
alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -853,16 +871,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_FULL_BARRIER();
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -946,7 +956,8 @@ err: __wt_scr_free(session, &scr);
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete) {
+ switch (complete) {
+ case WT_ERR_RETURN:
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
@@ -954,20 +965,28 @@ err: __wt_scr_free(session, &scr);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
-
/*
* The split couldn't proceed because the parent would be empty,
* return EBUSY so our caller knows to unlock the WT_REF that's
* being deleted, but don't be noisy, there's nothing wrong.
*/
if (empty_parent)
- return (EBUSY);
+ ret = EBUSY;
+ break;
+ case WT_ERR_PANIC:
+ __wt_err(session, ret, "fatal error during parent page split");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page "
+ "split");
+ ret = 0;
+ }
+ break;
}
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ return (ret);
}
/*
@@ -983,25 +1002,13 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
WT_REF **alloc_refp;
WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
- /*
- * A note on error handling: this function first allocates/initializes
- * new structures; failures during that period are handled by discarding
- * the memory and returning an error code, our caller knows the split
- * didn't happen and proceeds accordingly. Second, this function updates
- * the tree, and a failure in that period is catastrophic, any partial
- * update to the tree requires a panic, we can't recover. Third, once
- * the split is complete and the tree has been fully updated, we have to
- * ignore most errors because the split is complete and correct, callers
- * have to proceed accordingly.
- */
- enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
@@ -1012,7 +1019,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1133,7 +1140,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, true);
@@ -1157,7 +1164,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__split_ref_step2(session, alloc_index, true));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/*
* Push out the changes: not required for correctness, but no reason
@@ -1193,16 +1200,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_page_modify_set(session, page);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during internal page split");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during internal page "
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 5dd75835b0b..ef70160aa72 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -35,10 +35,10 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
- WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index e9fa570f97b..c5e2abbe440 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -9,12 +9,60 @@
#include "wt_internal.h"
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ */
+ if (recno < leaf->key.recno) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ *
+ * !!!
+ * Check that "indx + 1" is a valid page-index entry first, because it
+ * also checks that "indx" is a valid page-index entry, and we have to
+ * do that latter check before looking at the indx slot of the array
+ * for a match to leaf (in other words, our page hint might be wrong).
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
+ if (recno >= pindex->index[indx + 1]->key.recno) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+
+ return (0);
+}
+
+/*
* __wt_col_search --
* Search a column-store tree for a specific record-based key.
*/
int
__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+ uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -24,6 +72,7 @@ __wt_col_search(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_INDEX *pindex, *parent_pindex;
WT_REF *current, *descent;
+ uint64_t recno;
uint32_t base, indx, limit;
int depth;
@@ -31,8 +80,38 @@ __wt_col_search(WT_SESSION_IMPL *session,
__cursor_pos_clear(cbt);
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * When appending a new record, the search record number will be an
+ * out-of-band value, search for the largest key in the table instead.
+ */
+ if ((recno = search_recno) == WT_RECNO_OOB)
+ recno = UINT64_MAX;
+
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ WT_ASSERT(session, search_recno != WT_RECNO_OOB);
+
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, recno, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -120,7 +199,17 @@ leaf_only:
page = current->page;
cbt->ref = current;
cbt->recno = recno;
- cbt->compare = 0;
+
+ /*
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the record number and position when we
+ * do the work.
+ */
+ if (search_recno == WT_RECNO_OOB) {
+ cbt->compare = -1;
+ return (0);
+ }
/*
* Set the on-page slot to an impossible value larger than any possible
@@ -142,6 +231,7 @@ leaf_only:
* that's impossibly large for the page. We do have additional setup to
* do in that case, the record may be appended to the page.
*/
+ cbt->compare = 0;
if (page->type == WT_PAGE_COL_FIX) {
if (recno < page->pg_fix_recno) {
cbt->compare = 1;
@@ -190,18 +280,10 @@ past_end:
* This is a rarely used path: we normally find exact matches, because
* column-store files are dense, but in this case the caller searched
* past the end of the table.
- *
- * Don't bother searching if the caller is appending a new record where
- * we'll allocate the record number; we're not going to find a match by
- * definition, and we figure out the position when we do the work.
*/
cbt->ins_head = WT_COL_APPEND(page);
- if (recno == UINT64_MAX)
- cbt->ins = NULL;
- else
- cbt->ins = __col_insert_search(
- cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
- if (cbt->ins == NULL)
+ if ((cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno)) == NULL)
cbt->compare = -1;
else {
cbt->recno = WT_INSERT_RECNO(cbt->ins);
@@ -212,14 +294,5 @@ past_end:
else
cbt->compare = -1;
}
-
- /*
- * Note if the record is past the maximum record in the tree, the cursor
- * search functions need to know for fixed-length column-stores because
- * appended records implicitly create any skipped records, and cursor
- * search functions have to handle that case.
- */
- if (cbt->compare == -1)
- F_SET(cbt, WT_CBT_MAX_RECORD);
return (0);
}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 079f9d3bad1..e98d30152ab 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -132,6 +132,76 @@ __wt_search_insert(
}
/*
+ * __check_leaf_key_range --
+ * Check the search key is in the leaf page's key range.
+ */
+static inline int
+__check_leaf_key_range(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_ITEM *item;
+ WT_PAGE_INDEX *pindex;
+ uint32_t indx;
+ int cmp;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ item = cbt->tmp;
+
+ /*
+ * There are reasons we can't do the fast checks, and we continue with
+ * the leaf page search in those cases, only skipping the complete leaf
+ * page search if we know it's not going to work.
+ */
+ cbt->compare = 0;
+
+ /*
+ * First, confirm we have the right parent page-index slot, and quit if
+ * we don't. We don't search for the correct slot, that would make this
+ * cheap test expensive.
+ */
+ WT_INTL_INDEX_GET(session, leaf->home, pindex);
+ indx = leaf->pindex_hint;
+ if (indx >= pindex->entries || pindex->index[indx] != leaf)
+ return (0);
+
+ /*
+ * Check if the search key is smaller than the parent's starting key for
+ * this page.
+ *
+ * We can't compare against slot 0 on a row-store internal page because
+ * reconciliation doesn't build it, it may not be a valid key.
+ */
+ if (indx != 0) {
+ __wt_ref_key(leaf->home, leaf, &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp < 0) {
+ cbt->compare = 1; /* page keys > search key */
+ return (0);
+ }
+ }
+
+ /*
+ * Check if the search key is greater than or equal to the starting key
+ * for the parent's next page.
+ */
+ ++indx;
+ if (indx < pindex->entries) {
+ __wt_ref_key(
+ leaf->home, pindex->index[indx], &item->data, &item->size);
+ WT_RET(__wt_compare(session, collator, srch_key, item, &cmp));
+ if (cmp >= 0) {
+ cbt->compare = -1; /* page keys < search key */
+ return (0);
+ }
+ }
+
+ return (0);
+}
+
+/*
* __wt_row_search --
* Search a row-store tree for a specific key.
*/
@@ -179,8 +249,29 @@ __wt_row_search(WT_SESSION_IMPL *session,
append_check = insert && cbt->append_tree;
descend_right = true;
- /* We may only be searching a single leaf page, not the full tree. */
+ /*
+ * We may be searching only a single leaf page, not the full tree. In
+ * the normal case where the page links to a parent, check the page's
+ * parent keys before doing the full search, it's faster when the
+ * cursor is being re-positioned. (One case where the page doesn't
+ * have a parent is if it is being re-instantiated in memory as part
+ * of a split).
+ */
if (leaf != NULL) {
+ if (leaf->home != NULL) {
+ WT_RET(__check_leaf_key_range(
+ session, srch_key, leaf, cbt));
+ if (cbt->compare != 0) {
+ /*
+ * !!!
+ * WT_CURSOR.search_near uses the slot value to
+ * decide if there was an on-page match.
+ */
+ cbt->slot = 0;
+ return (0);
+ }
+ }
+
current = leaf;
goto leaf_only;
}
@@ -196,15 +287,6 @@ restart_page: page = current->page;
WT_INTL_INDEX_GET(session, page, pindex);
- /*
- * Fast-path internal pages with one child, a common case for
- * the root page in new trees.
- */
- if (pindex->entries == 1) {
- descent = pindex->index[0];
- goto descend;
- }
-
/* Fast-path appends. */
if (append_check) {
descent = pindex->index[pindex->entries - 1];
@@ -542,12 +624,18 @@ err: /*
int
__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
- WT_INSERT *p, *t;
+ WT_INSERT *ins, **start, **stop;
+ WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
- uint32_t cnt;
+ uint32_t choice, entries, i;
+ int level;
page = cbt->ref->page;
+ start = stop = NULL; /* [-Wconditional-uninitialized] */
+ entries = 0; /* [-Wconditional-uninitialized] */
+
+ /* If the page has disk-based entries, select from them. */
if (page->pg_row_entries != 0) {
cbt->compare = 0;
cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries;
@@ -562,24 +650,115 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* If the tree is new (and not empty), it might have a large insert
- * list. Count how many records are in the list.
+ * list.
*/
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
return (WT_NOTFOUND);
- for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
- if ((p = WT_SKIP_NEXT(p)) == NULL)
- break;
/*
- * Select a random number from 0 to (N - 1), return that record.
+ * Walk down the list until we find a level with at least 50 entries,
+ * that's where we'll start rolling random numbers. The value 50 is
+ * used to ignore levels with only a few entries, that is, levels which
+ * are potentially badly skewed.
*/
- cnt = __wt_random(&session->rnd) % cnt;
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
- if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
+ for (ins_head = cbt->ins_head,
+ level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) {
+ start = &ins_head->head[level];
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+
+ if (entries > 50)
break;
+ }
+
+ /*
+ * If it's a tiny list and we went all the way to level 0, correct the
+ * level; entries is correctly set.
+ */
+ if (level < 0)
+ level = 0;
+
+ /*
+ * Step down the skip list levels, selecting a random chunk of the name
+ * space at each level.
+ */
+ while (level > 0) {
+ /*
+ * There are (entries) or (entries + 1) chunks of the name space
+ * considered at each level. They are: between start and the 1st
+ * element, between the 1st and 2nd elements, and so on to the
+ * last chunk which is the name space after the stop element on
+ * the current level. This last chunk of name space may or may
+ * not be there: as we descend the levels of the skip list, this
+ * chunk may appear, depending if the next level down has
+ * entries logically after the stop point in the current level.
+ * We can't ignore those entries: because of the algorithm used
+ * to determine the depth of a skiplist, there may be a large
+ * number of entries "revealed" by descending a level.
+ *
+ * If the next level down has more items after the current stop
+ * point, there are (entries + 1) chunks to consider, else there
+ * are (entries) chunks.
+ */
+ if (*(stop - 1) == NULL)
+ choice = __wt_random(&session->rnd) % entries;
+ else
+ choice = __wt_random(&session->rnd) % (entries + 1);
+
+ if (choice == entries) {
+ /*
+ * We selected the name space after the stop element on
+ * this level. Set the start point to the current stop
+ * point, descend a level and move the stop element to
+ * the end of the list, that is, the end of the newly
+ * discovered name space, counting entries as we go.
+ */
+ start = stop;
+ --start;
+ --level;
+ for (entries = 0, stop = start;
+ *stop != NULL; stop = &(*stop)->next[level])
+ ++entries;
+ } else {
+ /*
+ * We selected another name space on the level. Move the
+ * start pointer the selected number of entries forward
+ * to the start of the selected chunk (if the selected
+ * number is 0, start won't move). Set the stop pointer
+ * to the next element in the list and drop both start
+ * and stop down a level.
+ */
+ for (i = 0; i < choice; ++i)
+ start = &(*start)->next[level];
+ stop = &(*start)->next[level];
+
+ --start;
+ --stop;
+ --level;
+
+ /* Count the entries in the selected name space. */
+ for (entries = 0,
+ ins = *start; ins != *stop; ins = ins->next[level])
+ ++entries;
+ }
+ }
+
+ /*
+ * When we reach the bottom level, entries will already be set. Select
+ * a random entry from the name space and return it.
+ *
+ * It should be impossible for the entries count to be 0 at this point,
+ * but check for it out of paranoia and to quiet static testing tools.
+ */
+ if (entries > 0)
+ entries = __wt_random(&session->rnd) % entries;
+ for (ins = *start; entries > 0; --entries)
+ ins = ins->next[0];
+
+ cbt->ins = ins;
cbt->compare = 0;
- cbt->ins = t;
return (0);
}
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index d3a0265c13a..e943f01236e 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -18,6 +18,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS **cstats;
WT_DSRC_STATS **dstats;
+ int64_t v;
conn = S2C(session);
@@ -37,10 +38,10 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
dstats = ((WT_CURSOR_BTREE *)
conn->las_session->las_cursor)->btree->dhandle->stats;
- WT_STAT_SET(session, cstats,
- cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
- WT_STAT_SET(session, cstats,
- cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+ v = WT_STAT_READ(dstats, cursor_insert);
+ WT_STAT_SET(session, cstats, cache_lookaside_insert, v);
+ v = WT_STAT_READ(dstats, cursor_remove);
+ WT_STAT_SET(session, cstats, cache_lookaside_remove, v);
}
/*
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index bd14e1bf4fd..ee9935828e2 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -2003,6 +2003,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_sweep_config(session, cfg));
WT_ERR(__wt_verbose_config(session, cfg));
+ /* Initialize the OS page size for mmap */
+ conn->page_size = __wt_get_vm_pagesize();
+
/* Now that we know if verbose is configured, output the version. */
WT_ERR(__wt_verbose(
session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index c6d5b535b86..0821238fbd7 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -147,12 +147,14 @@ __conn_dhandle_mark_dead(WT_SESSION_IMPL *session)
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
{
+ WT_BM *bm;
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
bool marked_dead, no_schema_lock;
btree = S2BT(session);
+ bm = btree->bm;
dhandle = session->dhandle;
marked_dead = false;
@@ -191,7 +193,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force)
*/
if (!F_ISSET(btree,
WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
- if (force && (btree->bm == NULL || btree->bm->map == NULL)) {
+ if (force && (bm == NULL || !bm->is_mapped(bm, session))) {
WT_ERR(__conn_dhandle_mark_dead(session));
marked_dead = true;
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
index b996b934464..db64d2ad498 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_bulk.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
@@ -9,6 +9,25 @@
#include "wt_internal.h"
/*
+ * __bulk_col_keycmp_err --
+ * Error routine when column-store keys inserted out-of-order.
+ */
+static int
+__bulk_col_keycmp_err(WT_CURSOR_BULK *cbulk)
+{
+ WT_CURSOR *cursor;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ cursor = &cbulk->cbt.iface;
+
+ WT_RET_MSG(session, EINVAL,
+ "bulk-load presented with out-of-order keys: %" PRIu64 " is less "
+ "than previously inserted key %" PRIu64,
+ cursor->recno, cbulk->recno);
+}
+
+/*
* __curbulk_insert_fix --
* Fixed-length column-store bulk cursor insert.
*/
@@ -19,6 +38,7 @@ __curbulk_insert_fix(WT_CURSOR *cursor)
WT_CURSOR_BULK *cbulk;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ uint64_t recno;
cbulk = (WT_CURSOR_BULK *)cursor;
btree = cbulk->cbt.btree;
@@ -29,13 +49,63 @@ __curbulk_insert_fix(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
- WT_CURSOR_NEEDVALUE(cursor);
+ /*
+ * If the "append" flag was configured, the application doesn't have to
+ * supply a key, else require a key.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ recno = cbulk->recno + 1;
+ else {
+ WT_CURSOR_CHECKKEY(cursor);
+ if ((recno = cursor->recno) <= cbulk->recno)
+ WT_ERR(__bulk_col_keycmp_err(cbulk));
+ }
+ WT_CURSOR_CHECKVALUE(cursor);
- WT_ERR(__wt_bulk_insert_fix(session, cbulk));
+ /*
+ * Insert any skipped records as deleted records, update the current
+ * record count.
+ */
+ for (; recno != cbulk->recno + 1; ++cbulk->recno)
+ WT_ERR(__wt_bulk_insert_fix(session, cbulk, true));
+ cbulk->recno = recno;
+
+ /* Insert the current record. */
+ ret = __wt_bulk_insert_fix(session, cbulk, false);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_fix_bitmap --
+ * Fixed-length column-store bulk cursor insert for bitmaps.
+ */
+static int
+__curbulk_insert_fix_bitmap(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /* Insert the current record. */
+ ret = __wt_bulk_insert_fix_bitmap(session, cbulk);
+
err: API_END_RET(session, ret);
}
@@ -50,7 +120,7 @@ __curbulk_insert_var(WT_CURSOR *cursor)
WT_CURSOR_BULK *cbulk;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- bool duplicate;
+ uint64_t recno;
cbulk = (WT_CURSOR_BULK *)cursor;
btree = cbulk->cbt.btree;
@@ -61,45 +131,63 @@ __curbulk_insert_var(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
-
- WT_CURSOR_NEEDVALUE(cursor);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
/*
- * If this isn't the first value inserted, compare it against the last
- * value and increment the RLE count.
- *
- * Instead of a "first time" variable, I'm using the RLE count, because
- * it is only zero before the first row is inserted.
+ * If the "append" flag was configured, the application doesn't have to
+ * supply a key, else require a key.
*/
- duplicate = false;
- if (cbulk->rle != 0) {
- if (cbulk->last.size == cursor->value.size &&
- memcmp(cbulk->last.data, cursor->value.data,
- cursor->value.size) == 0) {
- ++cbulk->rle;
- duplicate = true;
- } else
- WT_ERR(__wt_bulk_insert_var(session, cbulk));
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ recno = cbulk->recno + 1;
+ else {
+ WT_CURSOR_CHECKKEY(cursor);
+ if ((recno = cursor->recno) <= cbulk->recno)
+ WT_ERR(__bulk_col_keycmp_err(cbulk));
}
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ if (!cbulk->first_insert) {
+ /*
+ * If not the first insert and the key space is sequential,
+ * compare the current value against the last value; if the
+ * same, just increment the RLE count.
+ */
+ if (recno == cbulk->recno + 1 &&
+ cbulk->last.size == cursor->value.size &&
+ memcmp(cbulk->last.data,
+ cursor->value.data, cursor->value.size) == 0) {
+ ++cbulk->rle;
+ ++cbulk->recno;
+ goto duplicate;
+ }
+
+ /* Insert the previous key/value pair. */
+ WT_ERR(__wt_bulk_insert_var(session, cbulk, false));
+ } else
+ cbulk->first_insert = false;
/*
- * Save a copy of the value for the next comparison and reset the RLE
- * counter.
+ * Insert any skipped records as deleted records, update the current
+ * record count and RLE counter.
*/
- if (!duplicate) {
- WT_ERR(__wt_buf_set(session,
- &cbulk->last, cursor->value.data, cursor->value.size));
- cbulk->rle = 1;
+ if (recno != cbulk->recno + 1) {
+ cbulk->rle = (recno - cbulk->recno) - 1;
+ WT_ERR(__wt_bulk_insert_var(session, cbulk, true));
}
+ cbulk->rle = 1;
+ cbulk->recno = recno;
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ /* Save a copy of the value for the next comparison. */
+ ret = __wt_buf_set(session,
+ &cbulk->last, cursor->value.data, cursor->value.size);
+duplicate:
err: API_END_RET(session, ret);
}
/*
* __bulk_row_keycmp_err --
- * Error routine when keys inserted out-of-order.
+ * Error routine when row-store keys inserted out-of-order.
*/
static int
__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
@@ -154,6 +242,7 @@ __curbulk_insert_row(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
WT_CURSOR_CHECKKEY(cursor);
WT_CURSOR_CHECKVALUE(cursor);
@@ -161,28 +250,20 @@ __curbulk_insert_row(WT_CURSOR *cursor)
/*
* If this isn't the first key inserted, compare it against the last key
* to ensure the application doesn't accidentally corrupt the table.
- *
- * Instead of a "first time" variable, I'm using the RLE count, because
- * it is only zero before the first row is inserted.
*/
- if (cbulk->rle != 0) {
+ if (!cbulk->first_insert) {
WT_ERR(__wt_compare(session,
btree->collator, &cursor->key, &cbulk->last, &cmp));
if (cmp <= 0)
WT_ERR(__bulk_row_keycmp_err(cbulk));
- }
+ } else
+ cbulk->first_insert = false;
- /*
- * Save a copy of the key for the next comparison and set the RLE
- * counter.
- */
+ /* Save a copy of the key for the next comparison. */
WT_ERR(__wt_buf_set(session,
&cbulk->last, cursor->key.data, cursor->key.size));
- cbulk->rle = 1;
-
- WT_ERR(__wt_bulk_insert_row(session, cbulk));
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ ret = __wt_bulk_insert_row(session, cbulk);
err: API_END_RET(session, ret);
}
@@ -208,13 +289,12 @@ __curbulk_insert_row_skip_check(WT_CURSOR *cursor)
* until the bulk cursor is closed.
*/
CURSOR_API_CALL(cursor, session, insert, btree);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
- WT_CURSOR_NEEDKEY(cursor);
- WT_CURSOR_NEEDVALUE(cursor);
-
- WT_ERR(__wt_bulk_insert_row(session, cbulk));
+ WT_CURSOR_CHECKKEY(cursor);
+ WT_CURSOR_CHECKVALUE(cursor);
- WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+ ret = __wt_bulk_insert_row(session, cbulk);
err: API_END_RET(session, ret);
}
@@ -237,18 +317,25 @@ __wt_curbulk_init(WT_SESSION_IMPL *session,
__wt_cursor_set_notsup(c);
switch (cbt->btree->type) {
case BTREE_COL_FIX:
- c->insert = __curbulk_insert_fix;
+ c->insert = bitmap ?
+ __curbulk_insert_fix_bitmap : __curbulk_insert_fix;
break;
case BTREE_COL_VAR:
c->insert = __curbulk_insert_var;
break;
case BTREE_ROW:
+ /*
+ * Row-store order comparisons are expensive, so we optionally
+ * skip them when we know the input is correct.
+ */
c->insert = skip_sort_check ?
__curbulk_insert_row_skip_check : __curbulk_insert_row;
break;
WT_ILLEGAL_VALUE(session);
}
+ cbulk->first_insert = true;
+ cbulk->recno = 0;
cbulk->bitmap = bitmap;
if (bitmap)
F_SET(c, WT_CURSTD_RAW);
diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c
index 8f858a5012f..3270be07de4 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_json.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_json.c
@@ -313,7 +313,6 @@ size_t
__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
{
char abbrev;
- u_char h;
if (!force_unicode) {
if (isprint(ch) && ch != '\\' && ch != '"') {
@@ -354,16 +353,8 @@ __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, bool force_unicode)
*buf++ = 'u';
*buf++ = '0';
*buf++ = '0';
- h = (((u_char)ch) >> 4) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
- h = ((u_char)ch) & 0xF;
- if (h >= 10)
- *buf++ = 'A' + (h - 10);
- else
- *buf++ = '0' + h;
+ *buf++ = __wt_hex[(ch & 0xf0) >> 4];
+ *buf++ = __wt_hex[ch & 0x0f];
}
return (6);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index e1d5b8eb91a..652dec364fb 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -384,6 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ wt_off_t size;
const char *filename;
/*
@@ -395,8 +396,8 @@ __curstat_file_init(WT_SESSION_IMPL *session,
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
__wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
- WT_RET(__wt_block_manager_size(
- session, filename, &cst->u.dsrc_stats));
+ WT_RET(__wt_block_manager_named_size(session, filename, &size));
+ cst->u.dsrc_stats.block_size = size;
__wt_curstat_dsrc_final(cst);
return (0);
}
@@ -662,7 +663,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
/*
* We return the statistics field's offset as the key, and a string
- * description, a string value, and a uint64_t value as the value
+ * description, a string value, and a uint64_t value as the value
* columns.
*/
cursor->key_format = "i";
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
index dca72a16ee5..e746ccd5871 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_table.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -968,8 +968,11 @@ __wt_curtable_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_strdup(session, tmp->data, &ctable->cfg[1]));
if (0) {
-err: WT_TRET(__curtable_close(cursor));
- *cursorp = NULL;
+err: if (*cursorp != NULL) {
+ WT_TRET(__wt_cursor_close(*cursorp));
+ *cursorp = NULL;
+ }
+ WT_TRET(__curtable_close(cursor));
}
__wt_scr_free(session, &tmp);
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index ac481581c23..0e2b33c35ec 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -727,6 +727,10 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
F_CLR(session, WT_SESSION_CLEAR_EVICT_WALK);
+ /* An error is unexpected - flag the failure. */
+ if (ret != 0)
+ __wt_err(session, ret, "Failed to clear eviction walk point");
+
return (ret);
}
@@ -760,20 +764,18 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_DECL_RET;
WT_EVICT_ENTRY *evict;
u_int i, elem;
+ *evict_resetp = false;
+
btree = S2BT(session);
cache = S2C(session)->cache;
- /*
- * If the file isn't evictable, there's no work to do.
- */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
- *evict_resetp = false;
+ /* If the file wasn't evictable, there's no work to do. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
return (0);
- }
- *evict_resetp = true;
/*
* Hold the walk lock to set the "no eviction" flag: no new pages from
@@ -784,7 +786,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
__wt_spin_unlock(session, &cache->evict_walk_lock);
/* Clear any existing LRU eviction walk for the file. */
- WT_RET(__evict_request_walk_clear(session));
+ WT_ERR(__evict_request_walk_clear(session));
/* Hold the evict lock to remove any queued pages from this file. */
__wt_spin_lock(session, &cache->evict_lock);
@@ -806,7 +808,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp)
while (btree->evict_busy > 0)
__wt_yield();
+ *evict_resetp = true;
return (0);
+
+err: F_CLR(btree, WT_BTREE_NO_EVICTION);
+ return (ret);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
index 4bff6c82783..804eec24874 100644
--- a/src/third_party/wiredtiger/src/include/block.h
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -173,6 +173,7 @@ struct __wt_bm {
int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, bool *);
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*read)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
@@ -182,6 +183,7 @@ struct __wt_bm {
int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_valid)
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
+ int (*size)(WT_BM *, WT_SESSION_IMPL *, wt_off_t *);
int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
@@ -244,7 +246,10 @@ struct __wt_block {
bool ckpt_inprogress;/* Live checkpoint in progress */
/* Compaction support */
- int compact_pct_tenths; /* Percent to compact */
+ int compact_pct_tenths; /* Percent to compact */
+ uint64_t compact_pages_reviewed;/* Pages reviewed */
+ uint64_t compact_pages_skipped; /* Pages skipped */
+ uint64_t compact_pages_written; /* Pages rewritten */
/* Salvage support */
wt_off_t slvg_off; /* Salvage file offset */
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 6ee74c61a38..12a736c56a2 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -478,7 +478,7 @@ struct __wt_page {
#define pg_row_ins u.row.ins
#undef pg_row_upd
#define pg_row_upd u.row.upd
-#define pg_row_entries u.row.entries
+#undef pg_row_entries
#define pg_row_entries u.row.entries
/* Fixed-length column-store leaf page. */
@@ -1049,7 +1049,7 @@ struct __wt_insert_head {
uint64_t __prev_split_gen = (session)->split_gen; \
if (__prev_split_gen == 0) \
do { \
- WT_PUBLISH((session)->split_gen, \
+ WT_PUBLISH((session)->split_gen, \
S2C(session)->split_gen); \
} while ((session)->split_gen != S2C(session)->split_gen)
diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i
index fc1f372b2a9..9388e07d0d8 100644
--- a/src/third_party/wiredtiger/src/include/column.i
+++ b/src/third_party/wiredtiger/src/include/column.i
@@ -176,6 +176,16 @@ __col_insert_search(WT_INSERT_HEAD *inshead,
continue;
}
+ /*
+ * When no exact match is found, the search returns the smallest
+ * key larger than the searched-for key, or the largest key
+ * smaller than the searched-for key, if there is no larger key.
+ * Our callers depend on that: specifically, the fixed-length
+ * column store cursor code interprets returning a key smaller
+ * than the searched-for key to mean the searched-for key is
+ * larger than any key on the page. Don't change that behavior,
+ * things will break.
+ */
ins_recno = WT_INSERT_RECNO(ret_ins);
cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
@@ -282,7 +292,17 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop)
start_recno = repeat->recno + repeat->rle;
}
- if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ /*
+ * !!!
+ * The test could be written more simply as:
+ *
+ * (recno >= start_recno + (page->pg_var_entries - start_indx))
+ *
+ * It's split into two parts because the simpler test will overflow if
+ * searching for large record numbers.
+ */
+ if (recno >= start_recno &&
+ recno - start_recno >= page->pg_var_entries - start_indx)
return (NULL);
return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 2367f5a0035..1c1cb9b8987 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -415,6 +415,7 @@ struct __wt_connection_impl {
uint32_t direct_io;
uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */
bool mmap; /* mmap configuration */
+ int page_size; /* OS page size for mmap alignment */
uint32_t verbose;
uint32_t flags;
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
index 275e2f2db46..4f232ce4fd0 100644
--- a/src/third_party/wiredtiger/src/include/cursor.h
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -200,18 +200,23 @@ struct __wt_cursor_btree {
uint8_t append_tree; /* Cursor appended to the tree */
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that cursor next/prev never returns keys out-of-order. */
+ WT_ITEM *lastkey, _lastkey;
+ uint64_t lastrecno;
+#endif
+
#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
-#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
-#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor
+#define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor
(e.g. on a checkpoint) */
-#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
- WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)
+ WT_CBT_SEARCH_SMALLEST)
uint8_t flags;
};
@@ -219,33 +224,32 @@ struct __wt_cursor_btree {
struct __wt_cursor_bulk {
WT_CURSOR_BTREE cbt;
- WT_REF *ref; /* The leaf page */
- WT_PAGE *leaf;
-
/*
* Variable-length column store compares values during bulk load as
* part of RLE compression, row-store compares keys during bulk load
* to avoid corruption.
*/
- WT_ITEM last; /* Last key/value seen */
+ bool first_insert; /* First insert */
+ WT_ITEM last; /* Last key/value inserted */
/*
- * Variable-length column-store RLE counter (also overloaded to mean
- * the first time through the bulk-load insert routine, when set to 0).
+ * Additional column-store bulk load support.
*/
- uint64_t rle;
+ uint64_t recno; /* Record number */
+ uint64_t rle; /* Variable-length RLE counter */
/*
- * Fixed-length column-store current entry in memory chunk count, and
- * the maximum number of records per chunk.
+ * Additional fixed-length column store bitmap bulk load support:
+ * current entry in memory chunk count, and the maximum number of
+ * records per chunk.
*/
+ bool bitmap; /* Bitmap bulk load */
uint32_t entry; /* Entry count */
uint32_t nrecs; /* Max records per chunk */
- /* Special bitmap bulk load for fixed-length column stores. */
- bool bitmap;
-
- void *reconcile; /* Reconciliation information */
+ void *reconcile; /* Reconciliation support */
+ WT_REF *ref; /* The leaf page */
+ WT_PAGE *leaf;
};
struct __wt_cursor_config {
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index d84403cc16d..7338f8dae3b 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -14,6 +14,7 @@ extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t
extern int __wt_block_addr_invalid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live);
extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_decode(WT_SESSION *wt_session, size_t allocsize, const uint8_t *p, WT_BLOCK_CKPT *ci);
extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci);
extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name);
extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint);
@@ -50,7 +51,8 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
-extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats);
+extern int __wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep);
+extern int __wt_block_manager_named_size( WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep);
extern int __wt_bm_preload( WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
@@ -90,6 +92,7 @@ extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt);
+extern int __wt_cursor_key_order_check( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next);
extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating);
extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);
@@ -170,7 +173,7 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flag
extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
extern int __wt_tree_walk_skip(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags);
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove);
-extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate);
@@ -362,23 +365,23 @@ extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const
extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
-extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
-extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
-extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
-extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
-extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
-extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
-extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out, uint32_t flags);
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced);
extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
@@ -468,7 +471,7 @@ extern int __wt_meta_track_init(WT_SESSION_IMPL *session);
extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session);
extern int __wt_turtle_init(WT_SESSION_IMPL *session);
extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep);
-extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value);
extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
@@ -514,6 +517,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, bool ok_create, bool exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
+extern int __wt_get_vm_pagesize(void);
extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern bool __wt_has_priv(void);
@@ -558,8 +562,9 @@ extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
-extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
-extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
+extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted);
extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret);
extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
@@ -654,6 +659,7 @@ __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
);
extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern void __wt_fill_hex(const uint8_t *src, size_t src_max, uint8_t *dest, size_t dest_max, size_t *lenp);
extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
extern int __wt_hex2byte(const u_char *from, u_char *to);
@@ -671,6 +677,7 @@ extern uint32_t __wt_log2_int(uint32_t n);
extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
+extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4)));
@@ -732,7 +739,7 @@ extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t
extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp);
extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
-extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags);
extern int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval);
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
index 01e33792d73..bb80f8b738b 100644
--- a/src/third_party/wiredtiger/src/include/gcc.h
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -156,8 +156,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#if defined(x86_64) || defined(__x86_64__)
/* Pause instruction to prevent excess processor bus usage */
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
-
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("mfence" ::: "memory"); \
} while (0)
@@ -169,7 +168,7 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
} while (0)
#elif defined(i386) || defined(__i386__)
-#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
#define WT_FULL_BARRIER() do { \
__asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
} while (0)
@@ -177,23 +176,58 @@ __wt_atomic_cas_ptr(void *vp, void *old, void *new)
#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
#elif defined(__PPC64__) || defined(PPC64)
+/* ori 0,0,0 is the PPC64 noop instruction */
#define WT_PAUSE() __asm__ volatile("ori 0,0,0" ::: "memory")
-#define WT_FULL_BARRIER() do {
+#define WT_FULL_BARRIER() do { \
__asm__ volatile ("sync" ::: "memory"); \
} while (0)
-#define WT_READ_BARRIER() WT_FULL_BARRIER()
-#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+/* TODO: ISA 2.07 Elemental Memory Barriers would be better,
+ specifically mbll, and mbss, but they are not supported by POWER 8 */
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("lwsync" ::: "memory"); \
+} while (0)
#elif defined(__aarch64__)
#define WT_PAUSE() __asm__ volatile("yield" ::: "memory")
#define WT_FULL_BARRIER() do { \
- __asm__ volatile ("dsb sy" ::: "memory"); \
+ __asm__ volatile ("dsb sy" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("dsb ld" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("dsb st" ::: "memory"); \
+} while (0)
+
+#elif defined(__s390x__)
+#define WT_PAUSE() __asm__ volatile("lr 0,0" ::: "memory")
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("bcr 15,0\n" ::: "memory"); \
} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#elif defined(__sparc__)
+#define WT_PAUSE() __asm__ volatile("rd %%ccr, %%g0" ::: "memory")
+
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("membar #StoreLoad" ::: "memory"); \
+} while (0)
+
+/*
+ * On UltraSparc machines, TSO is used, and so there is no need for membar.
+ * READ_BARRIER = #LoadLoad, and WRITE_BARRIER = #StoreStore are noop.
+ */
#define WT_READ_BARRIER() do { \
- __asm__ volatile ("dsb ld" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
+
#define WT_WRITE_BARRIER() do { \
- __asm__ volatile ("dsb st" ::: "memory"); \
+ __asm__ volatile ("" ::: "memory"); \
} while (0)
#else
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 521de567fc0..e7737e12663 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -267,6 +267,11 @@ struct __wt_log_desc {
};
/*
+ * Flags for __wt_txn_op_printlog.
+ */
+#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */
+
+/*
* WT_LOG_REC_DESC --
* A descriptor for a log record type.
*/
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index e542baec642..898e44eb8e0 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -268,3 +268,6 @@ union __wt_rand_state {
uint32_t w, z;
} x;
};
+
+/* Shared array for converting to hex */
+extern const u_char __wt_hex[];
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index 5c3bcfb8ed0..1eca49f2c40 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -74,7 +74,10 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
TAILQ_HEAD(__cursors, __wt_cursor) cursors;
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
- WT_COMPACT *compact; /* Compact state */
+
+ WT_COMPACT *compact; /* Compaction information */
+ enum { WT_COMPACT_NONE=0,
+ WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
/*
* Lookaside table cursor, sweep and eviction worker threads only.
@@ -134,8 +137,6 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
void *reconcile; /* Reconciliation support */
int (*reconcile_cleanup)(WT_SESSION_IMPL *);
- bool compaction; /* Compaction did some work */
-
uint32_t flags;
/*
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index dfe7ee5c6cd..a554607b7d5 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -139,8 +139,8 @@ __wt_stats_clear(void *stats_arg, int slot)
*/
#define WT_STAT_READ(stats, fld) \
__wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
-#define WT_STAT_WRITE(session, stats, fld) \
- ((stats)[WT_STATS_SLOT_ID(session)]->fld);
+#define WT_STAT_WRITE(stats, fld, v) \
+ (stats)->fld = (int64_t)(v)
#define WT_STAT_DECRV(session, stats, fld, value) \
(stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
index 5a1d03b1976..54df01d01ab 100644
--- a/src/third_party/wiredtiger/src/log/log_auto.c
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -69,7 +69,7 @@ __logrec_json_unpack_str(char *dest, size_t destlen, const char *src,
}
static int
-__logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+__logrec_make_json_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
{
size_t needed;
@@ -79,6 +79,17 @@ __logrec_jsonify_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
return (0);
}
+static int
+__logrec_make_hex_str(WT_SESSION_IMPL *session, char **destp, WT_ITEM *item)
+{
+ size_t needed;
+
+ needed = item->size * 2 + 1;
+ WT_RET(__wt_realloc(session, NULL, needed, destp));
+ __wt_fill_hex(item->data, item->size, (uint8_t *)*destp, needed, NULL);
+ return (0);
+}
+
int
__wt_logop_col_put_pack(
WT_SESSION_IMPL *session, WT_ITEM *logrec,
@@ -121,7 +132,8 @@ __wt_logop_col_put_unpack(
int
__wt_logop_col_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -138,9 +150,14 @@ __wt_logop_col_put_print(
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
WT_ERR(__wt_fprintf(out,
" \"recno\": \"%" PRIu64 "\",\n", recno));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -188,11 +205,13 @@ __wt_logop_col_remove_unpack(
int
__wt_logop_col_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t recno;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_remove_unpack(
session, pp, end, &fileid, &recno));
@@ -246,12 +265,14 @@ __wt_logop_col_truncate_unpack(
int
__wt_logop_col_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t fileid;
uint64_t start;
uint64_t stop;
+ WT_UNUSED(flags);
WT_RET(__wt_logop_col_truncate_unpack(
session, pp, end, &fileid, &start, &stop));
@@ -307,7 +328,8 @@ __wt_logop_row_put_unpack(
int
__wt_logop_row_put_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -322,12 +344,22 @@ __wt_logop_row_put_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_put\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &value));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ " \"key-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &value));
WT_ERR(__wt_fprintf(out,
" \"value\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &value));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"value-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -375,7 +407,8 @@ __wt_logop_row_remove_unpack(
int
__wt_logop_row_remove_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -389,9 +422,14 @@ __wt_logop_row_remove_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_remove\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &key));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(out,
" \"key\": \"%s\"", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &key));
+ WT_ERR(__wt_fprintf(out,
+ ",\n \"key-hex\": \"%s\"", escaped));
+ }
err: __wt_free(session, escaped);
return (ret);
@@ -439,7 +477,8 @@ __wt_logop_row_truncate_unpack(
int
__wt_logop_row_truncate_print(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
WT_DECL_RET;
uint32_t fileid;
@@ -455,12 +494,22 @@ __wt_logop_row_truncate_print(
WT_RET(__wt_fprintf(out, " \"optype\": \"row_truncate\",\n"));
WT_ERR(__wt_fprintf(out,
" \"fileid\": \"%" PRIu32 "\",\n", fileid));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &start));
+ WT_ERR(__logrec_make_json_str(session, &escaped, &start));
WT_ERR(__wt_fprintf(out,
" \"start\": \"%s\",\n", escaped));
- WT_ERR(__logrec_jsonify_str(session, &escaped, &stop));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &start));
+ WT_ERR(__wt_fprintf(out,
+ " \"start-hex\": \"%s\",\n", escaped));
+ }
+ WT_ERR(__logrec_make_json_str(session, &escaped, &stop));
WT_ERR(__wt_fprintf(out,
" \"stop\": \"%s\",\n", escaped));
+ if (LF_ISSET(WT_TXN_PRINTLOG_HEX)) {
+ WT_ERR(__logrec_make_hex_str(session, &escaped, &stop));
+ WT_ERR(__wt_fprintf(out,
+ " \"stop-hex\": \"%s\",\n", escaped));
+ }
WT_ERR(__wt_fprintf(out,
" \"mode\": \"%" PRIu32 "\"", mode));
@@ -470,7 +519,8 @@ err: __wt_free(session, escaped);
int
__wt_txn_op_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ FILE *out, uint32_t flags)
{
uint32_t optype, opsize;
@@ -480,27 +530,33 @@ __wt_txn_op_printlog(
switch (optype) {
case WT_LOGOP_COL_PUT:
- WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_REMOVE:
- WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_COL_TRUNCATE:
- WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_PUT:
- WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_REMOVE:
- WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out,
+ flags));
break;
case WT_LOGOP_ROW_TRUNCATE:
- WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out,
+ flags));
break;
WT_ILLEGAL_VALUE(session);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
index c1eb7a2a389..7c53990a2a2 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -91,7 +91,7 @@ __curstat_lsm_init(
* top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->lsm_generation_max = chunk->generation;
+ WT_STAT_WRITE(new, lsm_generation_max, chunk->generation);
/* Aggregate statistics from each new chunk. */
__wt_stat_dsrc_aggregate_single(new, stats);
@@ -115,37 +115,40 @@ __curstat_lsm_init(
* into the top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- new->bloom_size =
- (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8);
- new->bloom_page_evict =
- new->cache_eviction_clean + new->cache_eviction_dirty;
- new->bloom_page_read = new->cache_read;
+ WT_STAT_WRITE(new, bloom_size,
+ (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8));
+ WT_STAT_WRITE(new, bloom_page_evict,
+ new->cache_eviction_clean + new->cache_eviction_dirty);
+ WT_STAT_WRITE(new, bloom_page_read, new->cache_read);
__wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
/* Set statistics that aren't aggregated directly into the cursor */
- stats->bloom_count = bloom_count;
- stats->lsm_chunk_count = lsm_tree->nchunks;
+ WT_STAT_WRITE(stats, bloom_count, bloom_count);
+ WT_STAT_WRITE(stats, lsm_chunk_count, lsm_tree->nchunks);
/* Include, and optionally clear, LSM-level specific information. */
- stats->bloom_miss = lsm_tree->bloom_miss;
+ WT_STAT_WRITE(stats, bloom_miss, lsm_tree->bloom_miss);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_miss = 0;
- stats->bloom_hit = lsm_tree->bloom_hit;
+ WT_STAT_WRITE(stats, bloom_hit, lsm_tree->bloom_hit);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_hit = 0;
- stats->bloom_false_positive = lsm_tree->bloom_false_positive;
+ WT_STAT_WRITE(
+ stats, bloom_false_positive, lsm_tree->bloom_false_positive);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->bloom_false_positive = 0;
- stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom;
+ WT_STAT_WRITE(
+ stats, lsm_lookup_no_bloom, lsm_tree->lsm_lookup_no_bloom);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_lookup_no_bloom = 0;
- stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle;
+ WT_STAT_WRITE(
+ stats, lsm_checkpoint_throttle, lsm_tree->lsm_checkpoint_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_checkpoint_throttle = 0;
- stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle;
+ WT_STAT_WRITE(stats, lsm_merge_throttle, lsm_tree->lsm_merge_throttle);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
lsm_tree->lsm_merge_throttle = 0;
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
index 13e8b31916f..3bd57846862 100644
--- a/src/third_party/wiredtiger/src/meta/meta_turtle.c
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -271,8 +271,7 @@ err: WT_TRET(__wt_fclose(&fp, WT_FHANDLE_READ));
* Update the turtle file.
*/
int
-__wt_turtle_update(
- WT_SESSION_IMPL *session, const char *key, const char *value)
+__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
{
WT_FH *fh;
WT_DECL_ITEM(buf);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c
index e95ccb0ade2..4276c89dbcf 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_map.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_map.c
@@ -48,8 +48,6 @@ __wt_mmap(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_VM_PAGESIZE 4096
-
/*
* __wt_mmap_preload --
* Cause a section of a memory map to be faulted in.
@@ -59,9 +57,10 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_BM *bm = S2BT(session)->bm;
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
@@ -78,9 +77,9 @@ __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
* Manual pages aren't clear on whether alignment is required for the
* size, so we will be conservative.
*/
- size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+ size &= ~(size_t)(conn->page_size - 1);
- if (size > WT_VM_PAGESIZE &&
+ if (size > (size_t)conn->page_size &&
(ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
WT_RET_MSG(session, ret, "posix_madvise will need");
#else
@@ -101,8 +100,9 @@ __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
/* Linux requires the address be aligned to a 4KB boundary. */
+ WT_CONNECTION_IMPL *conn = S2C(session);
WT_DECL_RET;
- void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
size += WT_PTRDIFF(p, blk);
if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_pagesize.c b/src/third_party/wiredtiger/src/os_posix/os_pagesize.c
new file mode 100644
index 00000000000..e7c7b4fdf15
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_pagesize.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ return (getpagesize());
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_pagesize.c b/src/third_party/wiredtiger/src/os_win/os_pagesize.c
new file mode 100644
index 00000000000..55cd6a694ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_pagesize.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_get_vm_pagesize --
+ * Return the default page size of a virtual memory page.
+ */
+int
+__wt_get_vm_pagesize(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+
+ return (system_info.dwPageSize);
+}
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 21cc68ed119..2b07117f9d5 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1276,6 +1276,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
upd->next = append;
+ __wt_cache_page_inmem_incr(
+ session, page, WT_UPDATE_MEMSIZE(append));
}
/*
@@ -1756,7 +1758,7 @@ __rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
* Figure out the maximum leaf page size for the reconciliation.
*/
static inline uint32_t
-__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -3263,7 +3265,14 @@ supd_check_complete:
memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
bnd->cksum = __wt_cksum(buf->data, buf->size);
- if (mod->rec_result == WT_PM_REC_MULTIBLOCK &&
+ /*
+ * One last check: don't reuse blocks if compacting, the reason
+ * for compaction is to move blocks to different locations. We
+ * do this check after calculating the checksums, hopefully the
+ * next write can be skipped.
+ */
+ if (session->compact_state == WT_COMPACT_NONE &&
+ mod->rec_result == WT_PM_REC_MULTIBLOCK &&
mod->mod_multi_entries > bnd_slot) {
multi = &mod->mod_multi[bnd_slot];
if (multi->size == bnd->size &&
@@ -3502,7 +3511,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
break;
case BTREE_COL_VAR:
if (cbulk->rle != 0)
- WT_RET(__wt_bulk_insert_var(session, cbulk));
+ WT_RET(__wt_bulk_insert_var(session, cbulk, false));
break;
case BTREE_ROW:
break;
@@ -3625,43 +3634,20 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
* Fixed-length column-store bulk insert.
*/
int
-__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+__wt_bulk_insert_fix(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
{
WT_BTREE *btree;
WT_CURSOR *cursor;
WT_RECONCILE *r;
- uint32_t entries, offset, page_entries, page_size;
- const uint8_t *data;
r = cbulk->reconcile;
btree = S2BT(session);
cursor = &cbulk->cbt.iface;
- if (cbulk->bitmap) {
- if (((r->recno - 1) * btree->bitcnt) & 0x7)
- WT_RET_MSG(session, EINVAL,
- "Bulk bitmap load not aligned on a byte boundary");
- for (data = cursor->value.data,
- entries = (uint32_t)cursor->value.size;
- entries > 0;
- entries -= page_entries, data += page_size) {
- WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
-
- page_entries =
- WT_MIN(entries, cbulk->nrecs - cbulk->entry);
- page_size = __bitstr_size(page_entries * btree->bitcnt);
- offset = __bitstr_size(cbulk->entry * btree->bitcnt);
- memcpy(r->first_free + offset, data, page_size);
- cbulk->entry += page_entries;
- r->recno += page_entries;
- }
- return (0);
- }
-
WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
-
- __bit_setv(r->first_free,
- cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ __bit_setv(r->first_free, cbulk->entry,
+ btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]);
++cbulk->entry;
++r->recno;
@@ -3669,11 +3655,48 @@ __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
}
/*
+ * __wt_bulk_insert_fix_bitmap --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+}
+
+/*
* __wt_bulk_insert_var --
* Variable-length column-store bulk insert.
*/
int
-__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+__wt_bulk_insert_var(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
{
WT_BTREE *btree;
WT_KV *val;
@@ -3682,14 +3705,20 @@ __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
r = cbulk->reconcile;
btree = S2BT(session);
- /*
- * Store the bulk cursor's last buffer, not the current value, we're
- * creating a duplicate count, which means we want the previous value
- * seen, not the current value.
- */
val = &r->v;
- WT_RET(__rec_cell_build_val(
- session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else
+ /*
+ * Store the bulk cursor's last buffer, not the current value,
+ * we're tracking duplicates, which means we want the previous
+ * value seen, not the current value.
+ */
+ WT_RET(__rec_cell_build_val(session,
+ r, cbulk->last.data, cbulk->last.size, cbulk->rle));
/* Boundary: split or write the page. */
if (val->len > r->space_avail)
@@ -4445,7 +4474,7 @@ compare: /*
WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
if (upd == NULL)
continue;
- for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n;) {
/*
* The application may have inserted records which left
* gaps in the name space, and these gaps can be huge.
@@ -4485,7 +4514,7 @@ compare: /*
last->size == size &&
memcmp(last->data, data, size) == 0)) {
++rle;
- continue;
+ goto next;
}
WT_ERR(__rec_col_var_helper(session, r,
salvage, last, last_deleted, 0, rle));
@@ -4504,6 +4533,15 @@ compare: /*
}
last_deleted = deleted;
rle = 1;
+
+ /*
+ * Move to the next record. It's not a simple increment
+ * because if it's the maximum record, incrementing it
+ * wraps to 0 and this turns into an infinite loop.
+ */
+next: if (src_recno == UINT64_MAX)
+ break;
+ ++src_recno;
}
}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index 053f69ee7f8..f0d0f26db54 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -148,7 +148,7 @@ __session_close(WT_SESSION *wt_session, const char *config)
* via the registered close callback.
*/
if (session->event_handler->handle_close != NULL &&
- !WT_STREQ(cursor->uri, WT_LAS_URI))
+ !WT_STREQ(cursor->internal_uri, WT_LAS_URI))
WT_TRET(session->event_handler->handle_close(
session->event_handler, wt_session, cursor));
WT_TRET(cursor->close(cursor));
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
index 456fcd3ce03..8a5b741c0c5 100644
--- a/src/third_party/wiredtiger/src/session/session_compact.c
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -172,12 +172,12 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
for (i = 0; i < 100; ++i) {
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
- session->compaction = false;
+ session->compact_state = WT_COMPACT_RUNNING;
WT_WITH_SCHEMA_LOCK(session,
ret = __wt_schema_worker(
session, uri, __wt_compact, NULL, cfg, 0));
WT_ERR(ret);
- if (!session->compaction)
+ if (session->compact_state != WT_COMPACT_SUCCESS)
break;
WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg));
@@ -185,7 +185,9 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
WT_ERR(__session_compact_check_timeout(session, start_time));
}
-err: __wt_scr_free(session, &t);
+err: session->compact_state = WT_COMPACT_NONE;
+
+ __wt_scr_free(session, &t);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
index 1e32f5b4453..2330a65a707 100644
--- a/src/third_party/wiredtiger/src/support/global.c
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -12,28 +12,6 @@ WT_PROCESS __wt_process; /* Per-process structure */
static int __wt_pthread_once_failed; /* If initialization failed */
/*
- * __system_is_little_endian --
- * Check if the system is little endian.
- */
-static int
-__system_is_little_endian(void)
-{
- uint64_t v;
- bool little;
-
- v = 1;
- little = *((uint8_t *)&v) != 0;
-
- if (little)
- return (0);
-
- fprintf(stderr,
- "This release of the WiredTiger data engine does not support "
- "big-endian systems; contact WiredTiger for more information.\n");
- return (EINVAL);
-}
-
-/*
* __wt_global_once --
* Global initialization, run once.
*/
@@ -42,11 +20,6 @@ __wt_global_once(void)
{
WT_DECL_RET;
- if ((ret = __system_is_little_endian()) != 0) {
- __wt_pthread_once_failed = ret;
- return;
- }
-
if ((ret =
__wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
__wt_pthread_once_failed = ret;
@@ -115,7 +88,7 @@ __wt_attach(WT_SESSION_IMPL *session)
/* Sleep forever, the debugger will interrupt us when it attaches. */
for (;;)
- __wt_sleep(100, 0);
+ __wt_sleep(10, 0);
#else
WT_UNUSED(session);
#endif
diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c
index 9a4a6464f40..33f4113c004 100644
--- a/src/third_party/wiredtiger/src/support/hash_city.c
+++ b/src/third_party/wiredtiger/src/support/hash_city.c
@@ -99,6 +99,12 @@ static uint32_t UNALIGNED_LOAD32(const char *p) {
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)
+#elif defined(__sun)
+
+#include <sys/byteorder.h>
+#define bswap_32 BSWAP_32
+#define bswap_64 BSWAP_64
+
#else
#include <byteswap.h>
#endif
diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c
index eb9f420911a..5fb8d4bc190 100644
--- a/src/third_party/wiredtiger/src/support/hex.c
+++ b/src/third_party/wiredtiger/src/support/hex.c
@@ -8,7 +8,7 @@
#include "wt_internal.h"
-static const u_char hex[] = "0123456789abcdef";
+const u_char __wt_hex[] = "0123456789abcdef";
/*
* __fill_hex --
@@ -25,8 +25,8 @@ __fill_hex(const uint8_t *src, size_t src_max,
--dest_max;
for (; src_max > 0 && dest_max > 1;
src_max -= 1, dest_max -= 2, ++src) {
- *dest++ = hex[(*src & 0xf0) >> 4];
- *dest++ = hex[*src & 0x0f];
+ *dest++ = __wt_hex[(*src & 0xf0) >> 4];
+ *dest++ = __wt_hex[*src & 0x0f];
}
*dest++ = '\0';
if (lenp != NULL)
@@ -34,6 +34,17 @@ __fill_hex(const uint8_t *src, size_t src_max,
}
/*
+ * __wt_fill_hex --
+ * In-memory conversion of raw bytes to a hexadecimal representation.
+ */
+void
+__wt_fill_hex(const uint8_t *src, size_t src_max,
+ uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+ __fill_hex(src, src_max, dest, dest_max, lenp);
+}
+
+/*
* __wt_raw_to_hex --
* Convert a chunk of data to a nul-terminated printable hex string.
*/
@@ -83,8 +94,8 @@ __wt_raw_to_esc_hex(
*t++ = *p;
} else {
*t++ = '\\';
- *t++ = hex[(*p & 0xf0) >> 4];
- *t++ = hex[*p & 0x0f];
+ *t++ = __wt_hex[(*p & 0xf0) >> 4];
+ *t++ = __wt_hex[*p & 0x0f];
}
*t++ = '\0';
to->size = WT_PTRDIFF(t, to->mem);
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
index 4bda365cb10..9488dbf14fe 100644
--- a/src/third_party/wiredtiger/src/support/huffman.c
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -1,9 +1,31 @@
-/*-
+/*
* Copyright (c) 2014-2015 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
- * See the file LICENSE for redistribution information.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name MongoDB or the name WiredTiger
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MONGODB INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include "wt_internal.h"
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
index f5ecb12633e..3adcb801f03 100644
--- a/src/third_party/wiredtiger/src/support/rand.c
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -60,6 +60,29 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state)
}
/*
+ * __wt_random_init_seed --
+ * Initialize the state of a 32-bit pseudo-random number.
+ * Use this, instead of __wt_random_init if we are running with multiple
+ * threads and we want each thread to initialize its own random state based
+ * on a different random seed.
+ */
+int
+__wt_random_init_seed(
+ WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state)
+{
+ struct timespec ts;
+ WT_RAND_STATE rnd;
+
+ WT_RET(__wt_epoch(session, &ts));
+ M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629);
+ M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069);
+
+ *rnd_state = rnd;
+
+ return (0);
+}
+
+/*
* __wt_random --
* Return a 32-bit pseudo-random number.
*/
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 4d7cd65fd18..7a615131628 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -250,19 +250,24 @@ __wt_stat_dsrc_aggregate_single(
to->block_alloc += from->block_alloc;
to->block_free += from->block_free;
to->block_checkpoint_size += from->block_checkpoint_size;
- to->allocation_size = from->allocation_size;
+ if (from->allocation_size > to->allocation_size)
+ to->allocation_size = from->allocation_size;
to->block_reuse_bytes += from->block_reuse_bytes;
- to->block_magic = from->block_magic;
- to->block_major = from->block_major;
+ if (from->block_magic > to->block_magic)
+ to->block_magic = from->block_magic;
+ if (from->block_major > to->block_major)
+ to->block_major = from->block_major;
to->block_size += from->block_size;
- to->block_minor = from->block_minor;
+ if (from->block_minor > to->block_minor)
+ to->block_minor = from->block_minor;
to->btree_checkpoint_generation += from->btree_checkpoint_generation;
to->btree_column_fix += from->btree_column_fix;
to->btree_column_internal += from->btree_column_internal;
to->btree_column_deleted += from->btree_column_deleted;
to->btree_column_variable += from->btree_column_variable;
to->btree_column_rle += from->btree_column_rle;
- to->btree_fixed_len = from->btree_fixed_len;
+ if (from->btree_fixed_len > to->btree_fixed_len)
+ to->btree_fixed_len = from->btree_fixed_len;
if (from->btree_maxintlkey > to->btree_maxintlkey)
to->btree_maxintlkey = from->btree_maxintlkey;
if (from->btree_maxintlpage > to->btree_maxintlpage)
@@ -367,12 +372,16 @@ __wt_stat_dsrc_aggregate(
to->block_free += WT_STAT_READ(from, block_free);
to->block_checkpoint_size +=
WT_STAT_READ(from, block_checkpoint_size);
- to->allocation_size = from[0]->allocation_size;
+ if ((v = WT_STAT_READ(from, allocation_size)) > to->allocation_size)
+ to->allocation_size = v;
to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes);
- to->block_magic = from[0]->block_magic;
- to->block_major = from[0]->block_major;
+ if ((v = WT_STAT_READ(from, block_magic)) > to->block_magic)
+ to->block_magic = v;
+ if ((v = WT_STAT_READ(from, block_major)) > to->block_major)
+ to->block_major = v;
to->block_size += WT_STAT_READ(from, block_size);
- to->block_minor = from[0]->block_minor;
+ if ((v = WT_STAT_READ(from, block_minor)) > to->block_minor)
+ to->block_minor = v;
to->btree_checkpoint_generation +=
WT_STAT_READ(from, btree_checkpoint_generation);
to->btree_column_fix += WT_STAT_READ(from, btree_column_fix);
@@ -382,15 +391,14 @@ __wt_stat_dsrc_aggregate(
to->btree_column_variable +=
WT_STAT_READ(from, btree_column_variable);
to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
- to->btree_fixed_len = from[0]->btree_fixed_len;
- if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
- to->btree_maxintlkey)
+ if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len)
+ to->btree_fixed_len = v;
+ if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey)
to->btree_maxintlkey = v;
if ((v = WT_STAT_READ(from, btree_maxintlpage)) >
to->btree_maxintlpage)
to->btree_maxintlpage = v;
- if ((v = WT_STAT_READ(from, btree_maxleafkey)) >
- to->btree_maxleafkey)
+ if ((v = WT_STAT_READ(from, btree_maxleafkey)) > to->btree_maxleafkey)
to->btree_maxleafkey = v;
if ((v = WT_STAT_READ(from, btree_maxleafpage)) >
to->btree_maxleafpage)
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index f835fea8f67..0a3e4a7a7db 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -216,6 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
conn = S2C(session);
txn_global = &conn->txn_global;
+retry:
current_id = last_running = txn_global->current;
oldest_session = NULL;
prev_oldest_id = txn_global->oldest_id;
@@ -287,43 +288,60 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
WT_TXNID_LT(txn_global->last_running, last_running);
/* Update the oldest ID. */
- if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
- __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, last_running))
- last_running = id;
- if ((id = s->snap_min) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
- }
-
- if (WT_TXNID_LT(last_running, oldest_id))
- oldest_id = last_running;
-
-#ifdef HAVE_DIAGNOSTIC
+ if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
/*
- * Make sure the ID doesn't move past any named snapshots.
- *
- * Don't include the read/assignment in the assert statement.
- * Coverity complains if there are assignments only done in
- * diagnostic builds, and when the read is from a volatile.
+ * We know we want to update. Check if we're racing.
*/
- id = txn_global->nsnap_oldest_id;
- WT_ASSERT(session,
- id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states;
+ i < session_cnt; i++, s++) {
+ if ((id = s->id) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ }
+
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Make sure the ID doesn't move past any named
+ * snapshots.
+ *
+ * Don't include the read/assignment in the assert
+ * statement. Coverity complains if there are
+ * assignments only done in diagnostic builds, and
+ * when the read is from a volatile.
+ */
+ id = txn_global->nsnap_oldest_id;
+ WT_ASSERT(session,
+ id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
- if (WT_TXNID_LT(txn_global->last_running, last_running))
- txn_global->last_running = last_running;
- if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
- txn_global->oldest_id = oldest_id;
- WT_ASSERT(session, txn_global->scan_count == -1);
- txn_global->scan_count = 0;
+ if (WT_TXNID_LT(txn_global->last_running, last_running))
+ txn_global->last_running = last_running;
+ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ WT_ASSERT(session, txn_global->scan_count == -1);
+ txn_global->scan_count = 0;
+ } else {
+ /*
+ * We wanted to update the oldest ID but we're racing
+ * another thread. Retry if this is a forced update.
+ */
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
+ if (force) {
+ __wt_yield();
+ goto retry;
+ }
+ }
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 && last_running_moved &&
- oldest_session != NULL) {
+ current_id - oldest_id > 10000 && oldest_session != NULL) {
(void)__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %d [%s]"
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index c5fa52dea6a..148ed868792 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -8,6 +8,12 @@
#include "wt_internal.h"
+/* Cookie passed to __txn_printlog. */
+typedef struct {
+ FILE *out;
+ uint32_t flags;
+} WT_TXN_PRINTLOG_ARGS;
+
/*
* __txn_op_log --
* Log an operation for the current transaction.
@@ -64,7 +70,8 @@ err: __wt_buf_free(session, &key);
*/
static int
__txn_commit_printlog(
- WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out,
+ uint32_t flags)
{
bool firstrecord;
@@ -79,7 +86,7 @@ __txn_commit_printlog(
firstrecord = false;
- WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out, flags));
WT_RET(__wt_fprintf(out, "\n }"));
}
@@ -459,6 +466,7 @@ __txn_printlog(WT_SESSION_IMPL *session,
FILE *out;
WT_LOG_RECORD *logrec;
WT_LSN ckpt_lsn;
+ WT_TXN_PRINTLOG_ARGS *args;
const uint8_t *end, *p;
const char *msg;
uint64_t txnid;
@@ -467,7 +475,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
bool compressed;
WT_UNUSED(next_lsnp);
- out = cookie;
+ args = cookie;
+ out = args->out;
p = WT_LOG_SKIP_HEADER(rawrec->data);
end = (const uint8_t *)rawrec->data + rawrec->size;
@@ -506,7 +515,8 @@ __txn_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(out, " \"type\" : \"commit\",\n"));
WT_RET(__wt_fprintf(out,
" \"txnid\" : %" PRIu64 ",\n", txnid));
- WT_RET(__txn_commit_printlog(session, &p, end, out));
+ WT_RET(__txn_commit_printlog(session, &p, end, out,
+ args->flags));
break;
case WT_LOGREC_FILE_SYNC:
@@ -537,15 +547,18 @@ __txn_printlog(WT_SESSION_IMPL *session,
* Print the log in a human-readable format.
*/
int
-__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out, uint32_t flags)
{
WT_SESSION_IMPL *session;
+ WT_TXN_PRINTLOG_ARGS args;
session = (WT_SESSION_IMPL *)wt_session;
+ args.out = out;
+ args.flags = flags;
WT_RET(__wt_fprintf(out, "[\n"));
WT_RET(__wt_log_scan(
- session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args));
WT_RET(__wt_fprintf(out, "\n]\n"));
return (0);
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
index 99a1455a74e..135a8bab225 100644
--- a/src/third_party/wiredtiger/src/utilities/util_list.c
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -8,6 +8,7 @@
#include "util.h"
+static int list_get_allocsize(WT_SESSION *, const char *, size_t *);
static int list_print(WT_SESSION *, const char *, bool, bool);
static int list_print_checkpoint(WT_SESSION *, const char *);
static int usage(void);
@@ -56,6 +57,48 @@ util_list(WT_SESSION *session, int argc, char *argv[])
}
/*
+ * list_get_allocsize --
+ * Get the allocation size for this file from the metadata.
+ */
+static int
+list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize)
+{
+ WT_CONFIG_ITEM szvalue;
+ WT_CONFIG_PARSER *parser;
+ WT_DECL_RET;
+ WT_EXTENSION_API *wt_api;
+ char *config;
+
+ wt_api = session->connection->get_extension_api(session->connection);
+ if ((ret =
+ wt_api->metadata_search(wt_api, session, key, &config)) != 0) {
+ fprintf(stderr, "%s: %s: extension_api.metadata_search: %s\n",
+ progname, key, session->strerror(session, ret));
+ return (ret);
+ }
+ if ((ret = wt_api->config_parser_open(wt_api, session, config,
+ strlen(config), &parser)) != 0) {
+ fprintf(stderr, "%s: extension_api.config_parser_open: %s\n",
+ progname, session->strerror(session, ret));
+ return (ret);
+ }
+ if ((ret = parser->get(parser, "allocation_size", &szvalue)) != 0) {
+ if (ret != WT_NOTFOUND)
+ fprintf(stderr, "%s: config_parser.get: %s\n",
+ progname, session->strerror(session, ret));
+ (void)parser->close(parser);
+ return (ret);
+ }
+ if ((ret = parser->close(parser)) != 0) {
+ fprintf(stderr, "%s: config_parser.close: %s\n",
+ progname, session->strerror(session, ret));
+ return (ret);
+ }
+ *allocsize = (size_t)szvalue.val;
+ return (0);
+}
+
+/*
* list_print --
* List the high-level objects in the database.
*/
@@ -137,9 +180,10 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag)
static int
list_print_checkpoint(WT_SESSION *session, const char *key)
{
+ WT_BLOCK_CKPT ci;
WT_DECL_RET;
WT_CKPT *ckpt, *ckptbase;
- size_t len;
+ size_t allocsize, len;
time_t t;
uint64_t v;
@@ -151,6 +195,14 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
return (ret == WT_NOTFOUND ? 0 : ret);
+ /* We need the allocation size for decoding the checkpoint addr */
+ if ((ret = list_get_allocsize(session, key, &allocsize)) != 0) {
+ if (ret == WT_NOTFOUND)
+ allocsize = 0;
+ else
+ return (ret);
+ }
+
/* Find the longest name, so we can pretty-print. */
len = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -158,7 +210,15 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
len = strlen(ckpt->name);
++len;
+ memset(&ci, 0, sizeof(ci));
WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (allocsize != 0 && (ret = __wt_block_ckpt_decode(
+ session, allocsize, ckpt->raw.data, &ci)) != 0) {
+ fprintf(stderr, "%s: __wt_block_buffer_to_ckpt: %s\n",
+ progname, session->strerror(session, ret));
+ /* continue if damaged */
+ ci.root_size = 0;
+ }
/*
* Call ctime, not ctime_r; ctime_r has portability problems,
* the Solaris version is different from the POSIX standard.
@@ -179,6 +239,17 @@ list_print_checkpoint(WT_SESSION *session, const char *key)
printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
else
printf(" (%" PRIu64 " B)\n", v);
+ if (ci.root_size != 0) {
+ printf("\t\t" "root offset: %" PRIuMAX
+ " (0x%" PRIxMAX ")\n",
+ (intmax_t)ci.root_offset, (intmax_t)ci.root_offset);
+ printf("\t\t" "root size: %" PRIu32
+ " (0x%" PRIx32 ")\n",
+ ci.root_size, ci.root_size);
+ printf("\t\t" "root checksum: %" PRIu32
+ " (0x%" PRIx32 ")\n",
+ ci.root_cksum, ci.root_cksum);
+ }
}
__wt_metadata_free_ckptlist(session, ckptbase);
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
index 9cbda08690e..3b7187bd0de 100644
--- a/src/third_party/wiredtiger/src/utilities/util_main.c
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -226,7 +226,6 @@ main(int argc, char *argv[])
ret = func(session, argc, argv);
/* Close the database. */
-
err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
ret = tret;
diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c
index d202b09b228..3a665c1c657 100644
--- a/src/third_party/wiredtiger/src/utilities/util_printlog.c
+++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c
@@ -15,10 +15,10 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
{
WT_DECL_RET;
int ch;
- bool printable;
+ uint32_t flags;
- printable = false;
- while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ flags = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF)
switch (ch) {
case 'f': /* output file */
if (freopen(__wt_optarg, "w", stdout) == NULL) {
@@ -27,8 +27,8 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
return (1);
}
break;
- case 'p':
- printable = true;
+ case 'x': /* hex output */
+ LF_SET(WT_TXN_PRINTLOG_HEX);
break;
case '?':
default:
@@ -41,8 +41,7 @@ util_printlog(WT_SESSION *session, int argc, char *argv[])
if (argc != 0)
return (usage());
- WT_UNUSED(printable);
- ret = __wt_txn_printlog(session, stdout);
+ ret = __wt_txn_printlog(session, stdout, flags);
if (ret != 0) {
fprintf(stderr, "%s: printlog failed: %s\n",
@@ -61,7 +60,7 @@ usage(void)
{
(void)fprintf(stderr,
"usage: %s %s "
- "printlog [-p] [-f output-file]\n",
+ "printlog [-x] [-f output-file]\n",
progname, usage_prefix);
return (1);
}