-rw-r--r-- src/third_party/wiredtiger/dist/api_data.py | 37
-rw-r--r-- src/third_party/wiredtiger/dist/filelist | 5
-rw-r--r-- src/third_party/wiredtiger/dist/log.py | 3
-rw-r--r-- src/third_party/wiredtiger/dist/log_data.py | 32
-rw-r--r-- src/third_party/wiredtiger/dist/s_define.list | 1
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok | 1
-rwxr-xr-x src/third_party/wiredtiger/dist/s_void | 1
-rw-r--r-- src/third_party/wiredtiger/dist/stat_data.py | 2
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_cursor.c | 9
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_delete.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_ret.c | 7
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_slvg.c | 9
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c | 27
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_sync.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/cache/cache_las.c | 66
-rw-r--r-- src/third_party/wiredtiger/src/config/config_def.c | 137
-rw-r--r-- src/third_party/wiredtiger/src/conn/api_calc_modify.c | 25
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_api.c | 57
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_handle.c | 3
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_log.c | 16
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_open.c | 2
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_reconfig.c | 4
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_file.c | 3
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c | 42
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c | 72
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h | 4
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.h | 6
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i | 6
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.h | 20
-rw-r--r-- src/third_party/wiredtiger/src/include/cell.h | 162
-rw-r--r-- src/third_party/wiredtiger/src/include/cell.i | 155
-rw-r--r-- src/third_party/wiredtiger/src/include/connection.h | 22
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 41
-rw-r--r-- src/third_party/wiredtiger/src/include/log.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/include/meta.h | 1
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile.h | 294
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile.i | 257
-rw-r--r-- src/third_party/wiredtiger/src/include/stat.h | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.h | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i | 46
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in | 740
-rw-r--r-- src/third_party/wiredtiger/src/include/wt_internal.h | 9
-rw-r--r-- src/third_party/wiredtiger/src/log/log.c | 14
-rw-r--r-- src/third_party/wiredtiger/src/log/log_auto.c | 96
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_child.c | 329
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c | 1077
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_dictionary.c | 200
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_row.c | 1025
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 405
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 3647
-rw-r--r-- src/third_party/wiredtiger/src/support/stat.c | 8
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c | 10
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_log.c | 78
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_recover.c | 28
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_timestamp.c | 13
-rw-r--r-- src/third_party/wiredtiger/test/csuite/Makefile.am | 4
-rw-r--r-- src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c | 63
-rw-r--r-- src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c | 239
-rw-r--r-- src/third_party/wiredtiger/test/evergreen.yml | 33
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_debug_mode01.py | 94
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_debug_mode02.py | 108
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_debug_mode03.py | 79
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_debug_mode04.py | 58
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_las04.py | 110
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_timestamp17.py | 173
-rw-r--r-- src/third_party/wiredtiger/test/suite/wttest.py | 21
68 files changed, 5987 insertions, 4266 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 6908a52f5e0..3e5f2806de0 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -430,6 +430,18 @@ connection_runtime_config = [
for space to be available in cache before giving up. Default will
wait forever''',
min=0),
+ Config('cache_overflow', '', r'''
+ cache overflow configuration options''',
+ type='category', subconfig=[
+ Config('file_max', '0', r'''
+ The maximum number of bytes that WiredTiger is allowed to use for
+ its cache overflow mechanism. If the cache overflow file exceeds
+ this size, a panic will be triggered. The default value means that
+ the cache overflow file is unbounded and may use as much space as
+ the filesystem will accommodate. The minimum non-zero setting is
+ 100MB.''', # !!! Must match WT_LAS_FILE_MIN
+ min='0')
+ ]),
Config('cache_overhead', '8', r'''
assume the heap allocator overhead is the specified percentage, and
adjust the cache usage by that amount (for example, if there is 10GB
@@ -456,6 +468,31 @@ connection_runtime_config = [
above 0 configures periodic checkpoints''',
min='0', max='100000'),
]),
+ Config('debug_mode', '', r'''
+ control the settings of various extended debugging features''',
+ type='category', subconfig=[
+ Config('checkpoint_retention', '0', r'''
+ adjust log archiving to retain the log records of this number
+ of checkpoints. Zero or one means perform normal archiving.''',
+ min='0', max='1024'),
+ Config('eviction', 'false', r'''
+ if true, modify internal algorithms to change skew to force
+ lookaside eviction to happen more aggressively. This includes but
+ is not limited to not skewing newest, not favoring leaf pages,
+ and modifying the eviction score mechanism.''',
+ type='boolean'),
+ Config('rollback_error', '0', r'''
+ return a WT_ROLLBACK error from a transaction operation about
+ every Nth operation to simulate a collision''',
+ min='0', max='10M'),
+ Config('table_logging', 'false', r'''
+ if true, write transaction related information to the log for all
+ operations, even operations for tables with logging turned off.
+ This setting introduces a log format change that may break older
+ versions of WiredTiger. These operations are informational and
+ skipped in recovery.''',
+ type='boolean'),
+ ]),
Config('error_prefix', '', r'''
prefix string for error messages'''),
Config('eviction', '', r'''
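
The two configuration categories added above are reached through the ordinary connection configuration string. A minimal sketch of how an application might use them; the home directory name, the 1GB cap and the chosen debug settings are examples only (zero means the cache overflow file is unbounded, and the minimum non-zero setting is 100MB):

#include <wiredtiger.h>

int
open_with_debug_options(WT_CONNECTION **connp)
{
    /*
     * Cap the cache overflow (lookaside) file and turn on the
     * aggressive-eviction and table-logging debug settings defined
     * in api_data.py above.
     */
    return (wiredtiger_open("WT_HOME", NULL,
        "create,"
        "cache_overflow=(file_max=1GB),"
        "debug_mode=(eviction=true,table_logging=true)",
        connp));
}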
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index 73fa6819e94..036b1a8b1a9 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -163,7 +163,12 @@ src/os_win/os_yield.c WINDOWS_HOST
src/packing/pack_api.c
src/packing/pack_impl.c
src/packing/pack_stream.c
+src/reconcile/rec_child.c
+src/reconcile/rec_col.c
+src/reconcile/rec_dictionary.c
+src/reconcile/rec_row.c
src/reconcile/rec_track.c
+src/reconcile/rec_visibility.c
src/reconcile/rec_write.c
src/schema/schema_alter.c
src/schema/schema_create.c
diff --git a/src/third_party/wiredtiger/dist/log.py b/src/third_party/wiredtiger/dist/log.py
index 4669b6bcc73..b3e6a71b63a 100644
--- a/src/third_party/wiredtiger/dist/log.py
+++ b/src/third_party/wiredtiger/dist/log.py
@@ -18,6 +18,9 @@ field_types = {
'WT_ERR(__logrec_make_hex_str(session, &escaped, &arg));']),
'recno' : ('uint64_t', 'r', '%" PRIu64 "', 'arg', [ '' ]),
'uint32' : ('uint32_t', 'I', '%" PRIu32 "', 'arg', [ '' ]),
+ # The fileid may have the high bit set. Print in both decimal and hex.
+ 'uint32_id' : ('uint32_t', 'I',
+ '%" PRIu32 " 0x%" PRIx32 "', 'arg, arg', [ '' ]),
'uint64' : ('uint64_t', 'Q', '%" PRIu64 "', 'arg', [ '' ]),
}
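
The new uint32_id type prints a file ID twice, in decimal and in hex, because an ID with the high bit set is hard to recognize in decimal alone. A standalone illustration of the output format (plain C, not generated printlog code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* A file ID with the high bit set. */
    uint32_t fileid = 0x80000002;

    /* Mirrors the '%" PRIu32 " 0x%" PRIx32 "' format added above. */
    printf("fileid %" PRIu32 " 0x%" PRIx32 "\n", fileid, fileid);
    return (0);
}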
diff --git a/src/third_party/wiredtiger/dist/log_data.py b/src/third_party/wiredtiger/dist/log_data.py
index 9e1538ccf04..18f368eaad0 100644
--- a/src/third_party/wiredtiger/dist/log_data.py
+++ b/src/third_party/wiredtiger/dist/log_data.py
@@ -36,7 +36,7 @@ rectypes = [
# the allocated LSN to reduce the amount of work recovery has to do, and
# they are useful for debugging recovery.
LogRecordType('file_sync', 'file sync', [
- ('uint32', 'fileid'), ('int', 'start')]),
+ ('uint32_id', 'fileid'), ('int', 'start')]),
# Debugging message in the log
LogRecordType('message', 'message', [('string', 'message')]),
@@ -62,25 +62,39 @@ class LogOperationType:
optypes = [
# commit operations
LogOperationType('col_modify', 'column modify',
- [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
+ [('uint32_id', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
LogOperationType('col_put', 'column put',
- [('uint32', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
+ [('uint32_id', 'fileid'), ('recno', 'recno'), ('item', 'value')]),
LogOperationType('col_remove', 'column remove',
- [('uint32', 'fileid'), ('recno', 'recno')]),
+ [('uint32_id', 'fileid'), ('recno', 'recno')]),
LogOperationType('col_truncate', 'column truncate',
- [('uint32', 'fileid'), ('recno', 'start'), ('recno', 'stop')]),
+ [('uint32_id', 'fileid'), ('recno', 'start'), ('recno', 'stop')]),
LogOperationType('row_modify', 'row modify',
- [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
+ [('uint32_id', 'fileid'), ('item', 'key'), ('item', 'value')]),
LogOperationType('row_put', 'row put',
- [('uint32', 'fileid'), ('item', 'key'), ('item', 'value')]),
+ [('uint32_id', 'fileid'), ('item', 'key'), ('item', 'value')]),
LogOperationType('row_remove', 'row remove',
- [('uint32', 'fileid'), ('item', 'key')]),
+ [('uint32_id', 'fileid'), ('item', 'key')]),
LogOperationType('row_truncate', 'row truncate',
- [('uint32', 'fileid'), ('item', 'start'), ('item', 'stop'),
+ [('uint32_id', 'fileid'), ('item', 'start'), ('item', 'stop'),
('uint32', 'mode')]),
# system operations
LogOperationType('checkpoint_start', 'checkpoint start', []),
LogOperationType('prev_lsn', 'previous LSN',
[('WT_LSN', 'prev_lsn')]),
+
+# diagnostic operations
+# Operations used only for diagnostic purposes should have their type
+# values in the diagnostic range in src/include/wiredtiger.in so that they
+# are always ignored by recovery.
+ #
+ # We need to know the base size/type of a 'struct timespec'. Cast its
+ # parts to uint64_t and split it into seconds and nanoseconds.
+ #
+ LogOperationType('txn_timestamp', 'txn_timestamp',
+ [('uint64', 'time_sec'), ('uint64', 'time_nsec'),
+ ('uint64', 'commit_ts'), ('uint64', 'durable_ts'),
+ ('uint64', 'first_ts'), ('uint64', 'prepare_ts'),
+ ('uint64', 'read_ts')]),
]
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index f199900e860..4ed32778cbb 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -20,6 +20,7 @@ WT_BLOCK_HEADER_SIZE
WT_CACHE_LINE_ALIGNMENT
WT_CACHE_LINE_PAD_BEGIN
WT_CACHE_LINE_PAD_END
+WT_CELL_UNUSED_BIT4
WT_CLOCKDIFF_NS
WT_CONN_CHECK_PANIC
WT_DEADLOCK
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index c251c99f2fe..3f336d0443b 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -103,6 +103,7 @@ DbEnv
Decrement
Decrypt
DeleteFileW
+Dh
EACCES
EAGAIN
EB
diff --git a/src/third_party/wiredtiger/dist/s_void b/src/third_party/wiredtiger/dist/s_void
index 6c2b8b34040..2bb8b7abf0a 100755
--- a/src/third_party/wiredtiger/dist/s_void
+++ b/src/third_party/wiredtiger/dist/s_void
@@ -119,6 +119,7 @@ func_ok()
-e '/int snappy_pre_size$/d' \
-e '/int snappy_terminate$/d' \
-e '/int subtest_error_handler$/d' \
+ -e '/int test_las_workload$/d' \
-e '/int uri2name$/d' \
-e '/int usage$/d' \
-e '/int util_err$/d' \
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 8b26fa2e9af..34d957a75ec 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -274,6 +274,8 @@ connection_stats = [
CacheStat('cache_lookaside_cursor_wait_internal', 'cache overflow cursor internal thread wait time (usecs)'),
CacheStat('cache_lookaside_entries', 'cache overflow table entries', 'no_clear,no_scale'),
CacheStat('cache_lookaside_insert', 'cache overflow table insert calls'),
+ CacheStat('cache_lookaside_ondisk', 'cache overflow table on-disk size', 'no_clear,no_scale,size'),
+ CacheStat('cache_lookaside_ondisk_max', 'cache overflow table max on-disk size', 'no_clear,no_scale,size'),
CacheStat('cache_lookaside_remove', 'cache overflow table remove calls'),
CacheStat('cache_lookaside_score', 'cache overflow score', 'no_clear,no_scale'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
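
The two new statistics are visible to applications through a regular statistics cursor. A hedged sketch, assuming the dist scripts generate the key name WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK for cache_lookaside_ondisk (error handling trimmed):

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Print the "cache overflow table on-disk size" statistic added above. */
static int
print_las_ondisk_size(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    const char *desc, *pvalue;
    int64_t value;
    int ret;

    if ((ret = session->open_cursor(
        session, "statistics:", NULL, NULL, &cursor)) != 0)
        return (ret);
    cursor->set_key(cursor, WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK);
    if ((ret = cursor->search(cursor)) == 0 &&
        (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
        printf("%s: %" PRId64 "\n", desc, value);
    return (cursor->close(cursor));
}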
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 43dc53c86e3..a6cf0bc879e 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "4051e4941c894655cdb7d3dec97a7e32e7defbe6",
+ "commit": "4a3194b043b8cffb5339c12e1554d0bd42ed1b1f",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-4.0"
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 6047be0be14..55b41ad4b21 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -1423,12 +1423,13 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
upd != NULL && upd->type == WT_UPDATE_MODIFY;
++i, upd = upd->next) {
upd_size += WT_UPDATE_MEMSIZE(upd);
- if (upd_size >= WT_MODIFY_MEM_FACTOR * cursor->value.size)
+ if (i >= WT_MAX_MODIFY_UPDATE &&
+ upd_size * WT_MODIFY_MEM_FRACTION >= cursor->value.size)
return (true);
}
- if (upd != NULL && upd->type == WT_UPDATE_STANDARD &&
- __wt_txn_upd_visible_all(session, upd) &&
- i >= WT_MAX_MODIFY_UPDATE)
+ if (i >= WT_MAX_MODIFY_UPDATE && upd != NULL &&
+ upd->type == WT_UPDATE_STANDARD &&
+ __wt_txn_upd_visible_all(session, upd))
return (true);
return (false);
}
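
The reworked check only treats the modify chain as exceeded once it is both long and heavy: at least WT_MAX_MODIFY_UPDATE entries whose combined memory is at least 1/WT_MODIFY_MEM_FRACTION of the full value. A minimal sketch of that condition; the constants (10 and 10 in this tree) are copied for illustration, not a definitive restatement of the function:

#include <stdbool.h>
#include <stddef.h>

#define MAX_MODIFY_UPDATE   10  /* stand-in for WT_MAX_MODIFY_UPDATE */
#define MODIFY_MEM_FRACTION 10  /* stand-in for WT_MODIFY_MEM_FRACTION */

/*
 * For a 10KB value this fires once there are 10 or more modifies
 * totalling at least 1KB of update memory.
 */
static bool
chain_exceeded(size_t chain_entries, size_t chain_bytes, size_t value_size)
{
    return (chain_entries >= MAX_MODIFY_UPDATE &&
        chain_bytes * MODIFY_MEM_FRACTION >= value_size);
}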
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index b0fd6a58edf..63ee4a3bc7c 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -81,7 +81,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
- ret = __wt_evict(session, ref, false, previous_state);
+ ret = __wt_evict(session, ref, previous_state, 0);
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
ret = 0;
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 8dd918e8011..87f47f20aeb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -776,7 +776,7 @@ read: /*
if (force_attempts < 10 &&
__evict_force_check(session, ref)) {
++force_attempts;
- ret = __wt_page_release_evict(session, ref);
+ ret = __wt_page_release_evict(session, ref, 0);
/* If forced eviction fails, stall. */
if (ret == EBUSY) {
WT_NOT_READ(ret, 0);
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index bc85dcee4f5..4b42221865e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -201,6 +201,13 @@ __wt_value_return_upd(WT_SESSION_IMPL *session,
memcpy(listp, list, sizeof(list));
}
listp[i++] = upd;
+
+ /*
+ * Once a modify is found, all previously committed
+ * modifications should be applied regardless of
+ * visibility.
+ */
+ ignore_visibility = true;
}
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index f8f2552dc0a..5b0f2a5569a 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -328,7 +328,8 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
*/
if (ss->root_ref.page != NULL) {
btree->ckpt = ckptbase;
- ret = __wt_evict(session, &ss->root_ref, true, WT_REF_MEM);
+ ret = __wt_evict(session, &ss->root_ref, WT_REF_MEM,
+ WT_EVICT_CALL_CLOSING);
ss->root_ref.page = NULL;
btree->ckpt = NULL;
}
@@ -1300,7 +1301,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, true, WT_REF_MEM);
+ ret = __wt_evict(session, ref, WT_REF_MEM,
+ WT_EVICT_CALL_CLOSING);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
@@ -2019,7 +2021,8 @@ __slvg_row_build_leaf(
*/
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
- ret = __wt_evict(session, ref, true, WT_REF_MEM);
+ ret = __wt_evict(session, ref, WT_REF_MEM,
+ WT_EVICT_CALL_CLOSING);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 113b95e6ff9..9321cc88282 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1406,6 +1406,25 @@ err: if (parent != NULL)
return (0);
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __check_upd_list --
+ * Sanity check an update list.
+ * In particular, make sure there is at most one birthmark.
+ */
+static void
+__check_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ int birthmark_count;
+
+ for (birthmark_count = 0; upd != NULL; upd = upd->next)
+ if (upd->type == WT_UPDATE_BIRTHMARK)
+ ++birthmark_count;
+
+ WT_ASSERT(session, birthmark_count <= 1);
+}
+#endif
+
/*
* __split_multi_inmem --
* Instantiate a page from a disk image.
@@ -1501,6 +1520,10 @@ __split_multi_inmem(
key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
+#ifdef HAVE_DIAGNOSTIC
+ __check_upd_list(session, upd);
+#endif
+
/* Search the page. */
WT_ERR(__wt_row_search(
session, key, ref, &cbt, true, true));
@@ -1802,9 +1825,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- } else
+ } else {
+ WT_ASSERT(session, page->entries > 0);
WT_ERR(__wt_row_leaf_key(
session, page, &page->pg_row[0], key, true));
+ }
WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
parent_incr += sizeof(WT_IKEY) + key->size;
__wt_scr_free(session, &key);
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index c7d17abd202..7113f4d9724 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -143,7 +143,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* checkpoint, the on-disk version is correct. If the truncate is
* visible, we skip over the child page when writing its parent. We
* check whether a truncate is visible in the checkpoint as part of
- * reconciling internal pages (specifically in __rec_child_modify).
+ * reconciling internal pages (specifically in __wt_rec_child_modify).
*/
LF_SET(WT_READ_DELETED_SKIP);
@@ -326,7 +326,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
page->read_gen == WT_READGEN_WONT_NEED &&
!tried_eviction) {
WT_ERR_BUSY_OK(
- __wt_page_release_evict(session, walk));
+ __wt_page_release_evict(session, walk, 0));
walk = prev;
prev = NULL;
tried_eviction = true;
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index 77614e9c9e4..0e9f4f04f46 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -57,6 +57,46 @@ __las_entry_count(WT_CACHE *cache)
}
/*
+ * __wt_las_config --
+ * Configure the lookaside table.
+ */
+int
+__wt_las_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR_BTREE *las_cursor;
+ WT_SESSION_IMPL *las_session;
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "cache_overflow.file_max", &cval));
+
+ if (cval.val != 0 && cval.val < WT_LAS_FILE_MIN)
+ WT_RET_MSG(session, EINVAL,
+ "max cache overflow size %" PRId64 " below minimum %d",
+ cval.val, WT_LAS_FILE_MIN);
+
+ /* This is expected for in-memory configurations. */
+ las_session = S2C(session)->cache->las_session[0];
+ WT_ASSERT(session,
+ las_session != NULL || F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
+
+ if (las_session == NULL)
+ return (0);
+
+ /*
+ * We need to set file_max on the btree associated with one of the
+ * lookaside sessions.
+ */
+ las_cursor = (WT_CURSOR_BTREE *)las_session->las_cursor;
+ las_cursor->btree->file_max = (uint64_t)cval.val;
+
+ WT_STAT_CONN_SET(
+ session, cache_lookaside_ondisk_max, las_cursor->btree->file_max);
+
+ return (0);
+}
+
+/*
* __wt_las_empty --
* Return when there are entries in the lookaside table.
*/
@@ -126,7 +166,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session)
* Initialize the database's lookaside store.
*/
int
-__wt_las_create(WT_SESSION_IMPL *session)
+__wt_las_create(WT_SESSION_IMPL *session, const char **cfg)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -166,6 +206,8 @@ __wt_las_create(WT_SESSION_IMPL *session)
WT_RET(__wt_las_cursor_open(cache->las_session[i]));
}
+ WT_RET(__wt_las_config(session, cfg));
+
/* The statistics server is already running, make sure we don't race. */
WT_WRITE_BARRIER();
F_SET(conn, WT_CONN_LOOKASIDE_OPEN);
@@ -609,8 +651,10 @@ __wt_las_insert_block(WT_CURSOR *cursor,
WT_SAVE_UPD *list;
WT_SESSION_IMPL *session;
WT_TXN_ISOLATION saved_isolation;
- WT_UPDATE *upd;
- uint64_t insert_cnt, las_counter, las_pageid, prepared_insert_cnt;
+ WT_UPDATE *first_upd, *upd;
+ wt_off_t las_size;
+ uint64_t insert_cnt, las_counter, las_pageid, max_las_size;
+ uint64_t prepared_insert_cnt;
uint32_t btree_id, i, slot;
uint8_t *p;
bool local_txn;
@@ -688,7 +732,7 @@ __wt_las_insert_block(WT_CURSOR *cursor,
slot = page->type == WT_PAGE_ROW_LEAF ?
WT_ROW_SLOT(page, list->ripcip) :
WT_COL_SLOT(page, list->ripcip);
- upd = list->ins == NULL ?
+ first_upd = upd = list->ins == NULL ?
page->modify->mod_row_update[slot] : list->ins->upd;
/*
@@ -707,6 +751,9 @@ __wt_las_insert_block(WT_CURSOR *cursor,
las_value.size = upd->size;
break;
case WT_UPDATE_BIRTHMARK:
+ WT_ASSERT(session, upd != first_upd ||
+ multi->page_las.skew_newest);
+ /* FALLTHROUGH */
case WT_UPDATE_TOMBSTONE:
las_value.size = 0;
break;
@@ -727,6 +774,8 @@ __wt_las_insert_block(WT_CURSOR *cursor,
(upd->type == WT_UPDATE_STANDARD ||
upd->type == WT_UPDATE_MODIFY)) {
las_value.size = 0;
+ WT_ASSERT(session, upd != first_upd ||
+ multi->page_las.skew_newest);
cursor->set_value(cursor, upd->txnid,
upd->timestamp, upd->prepare_state,
WT_UPDATE_BIRTHMARK, &las_value);
@@ -748,6 +797,14 @@ __wt_las_insert_block(WT_CURSOR *cursor,
} while ((upd = upd->next) != NULL);
}
+ WT_ERR(__wt_block_manager_named_size(session, WT_LAS_FILE, &las_size));
+ WT_STAT_CONN_SET(session, cache_lookaside_ondisk, las_size);
+ max_las_size = ((WT_CURSOR_BTREE *)cursor)->btree->file_max;
+ if (max_las_size != 0 && (uint64_t)las_size > max_las_size)
+ WT_PANIC_MSG(session, WT_PANIC,
+ "WiredTigerLAS: file size of %" PRIu64 " exceeds maximum "
+ "size %" PRIu64, (uint64_t)las_size, max_las_size);
+
err: /* Resolve the transaction. */
if (local_txn) {
if (ret == 0)
@@ -773,6 +830,7 @@ err: /* Resolve the transaction. */
__las_insert_block_verbose(session, btree, multi);
}
+ WT_UNUSED(first_upd);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index 521f3d4bdc8..9e78e669cbb 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -61,6 +61,12 @@ static const WT_CONFIG_CHECK
};
static const WT_CONFIG_CHECK
+ confchk_wiredtiger_open_cache_overflow_subconfigs[] = {
+ { "file_max", "int", NULL, "min=0", NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
+static const WT_CONFIG_CHECK
confchk_wiredtiger_open_checkpoint_subconfigs[] = {
{ "log_size", "int", NULL, "min=0,max=2GB", NULL, 0 },
{ "wait", "int", NULL, "min=0,max=100000", NULL, 0 },
@@ -74,6 +80,17 @@ static const WT_CONFIG_CHECK
};
static const WT_CONFIG_CHECK
+ confchk_wiredtiger_open_debug_mode_subconfigs[] = {
+ { "checkpoint_retention", "int",
+ NULL, "min=0,max=1024",
+ NULL, 0 },
+ { "eviction", "boolean", NULL, NULL, NULL, 0 },
+ { "rollback_error", "int", NULL, "min=0,max=10M", NULL, 0 },
+ { "table_logging", "boolean", NULL, NULL, NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
+static const WT_CONFIG_CHECK
confchk_wiredtiger_open_eviction_subconfigs[] = {
{ "threads_max", "int", NULL, "min=1,max=20", NULL, 0 },
{ "threads_min", "int", NULL, "min=1,max=20", NULL, 0 },
@@ -148,6 +165,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
NULL, NULL,
confchk_wiredtiger_open_async_subconfigs, 3 },
{ "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
+ { "cache_overflow", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_cache_overflow_subconfigs, 1 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -156,6 +176,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_WT_CONNECTION_reconfigure_compatibility_subconfigs, 1 },
+ { "debug_mode", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_debug_mode_subconfigs, 4 },
{ "error_prefix", "string", NULL, NULL, NULL, 0 },
{ "eviction", "category",
NULL, NULL,
@@ -839,6 +862,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
{ "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
+ { "cache_overflow", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_cache_overflow_subconfigs, 1 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -850,6 +876,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
confchk_wiredtiger_open_compatibility_subconfigs, 3 },
{ "config_base", "boolean", NULL, NULL, NULL, 0 },
{ "create", "boolean", NULL, NULL, NULL, 0 },
+ { "debug_mode", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_debug_mode_subconfigs, 4 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -948,6 +977,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
{ "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
+ { "cache_overflow", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_cache_overflow_subconfigs, 1 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -959,6 +991,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
confchk_wiredtiger_open_compatibility_subconfigs, 3 },
{ "config_base", "boolean", NULL, NULL, NULL, 0 },
{ "create", "boolean", NULL, NULL, NULL, 0 },
+ { "debug_mode", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_debug_mode_subconfigs, 4 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -1058,6 +1093,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
{ "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
+ { "cache_overflow", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_cache_overflow_subconfigs, 1 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -1067,6 +1105,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_wiredtiger_open_compatibility_subconfigs, 3 },
+ { "debug_mode", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_debug_mode_subconfigs, 4 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -1162,6 +1203,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "builtin_extension_config", "string", NULL, NULL, NULL, 0 },
{ "cache_cursors", "boolean", NULL, NULL, NULL, 0 },
{ "cache_max_wait_ms", "int", NULL, "min=0", NULL, 0 },
+ { "cache_overflow", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_cache_overflow_subconfigs, 1 },
{ "cache_overhead", "int", NULL, "min=0,max=30", NULL, 0 },
{ "cache_size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "checkpoint", "category",
@@ -1171,6 +1215,9 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "compatibility", "category",
NULL, NULL,
confchk_wiredtiger_open_compatibility_subconfigs, 3 },
+ { "debug_mode", "category",
+ NULL, NULL,
+ confchk_wiredtiger_open_debug_mode_subconfigs, 4 },
{ "direct_io", "list",
NULL, "choices=[\"checkpoint\",\"data\",\"log\"]",
NULL, 0 },
@@ -1307,8 +1354,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "WT_CONNECTION.reconfigure",
"async=(enabled=false,ops_max=1024,threads=2),cache_max_wait_ms=0"
- ",cache_overhead=8,cache_size=100MB,checkpoint=(log_size=0,"
- "wait=0),compatibility=(release=),error_prefix=,"
+ ",cache_overflow=(file_max=0),cache_overhead=8,cache_size=100MB,"
+ "checkpoint=(log_size=0,wait=0),compatibility=(release=),"
+ "debug_mode=(checkpoint_retention=0,eviction=false,"
+ "rollback_error=0,table_logging=false),error_prefix=,"
"eviction=(threads_max=8,threads_min=1),"
"eviction_checkpoint_target=1,eviction_dirty_target=5,"
"eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
@@ -1321,7 +1370,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"statistics=none,statistics_log=(json=false,on_close=false,"
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,verbose=",
- confchk_WT_CONNECTION_reconfigure, 24
+ confchk_WT_CONNECTION_reconfigure, 26
},
{ "WT_CONNECTION.rollback_to_stable",
"",
@@ -1556,19 +1605,22 @@ static const WT_CONFIG_ENTRY config_entries[] = {
{ "wiredtiger_open",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
- "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
- "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=,require_max=,require_min=),"
- "config_base=true,create=false,direct_io=,encryption=(keyid=,"
- "name=,secretkey=),error_prefix=,eviction=(threads_max=8,"
- "threads_min=1),eviction_checkpoint_target=1,"
- "eviction_dirty_target=5,eviction_dirty_trigger=20,"
- "eviction_target=80,eviction_trigger=95,exclusive=false,"
- "extensions=,file_extend=,file_manager=(close_handle_minimum=250,"
- "close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
- "in_memory=false,io_capacity=(total=0),log=(archive=true,"
- "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0,"
- "path=\".\",prealloc=true,recover=on,zero_fill=false),"
+ "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8"
+ ",cache_size=100MB,checkpoint=(log_size=0,wait=0),"
+ "checkpoint_sync=true,compatibility=(release=,require_max=,"
+ "require_min=),config_base=true,create=false,"
+ "debug_mode=(checkpoint_retention=0,eviction=false,"
+ "rollback_error=0,table_logging=false),direct_io=,"
+ "encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=1,eviction_dirty_target=5,"
+ "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
+ ",exclusive=false,extensions=,file_extend=,"
+ "file_manager=(close_handle_minimum=250,close_idle_time=30,"
+ "close_scan_interval=10),hazard_max=1000,in_memory=false,"
+ "io_capacity=(total=0),log=(archive=true,compressor=,"
+ "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
+ "prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
"mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
@@ -1579,24 +1631,27 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,write_through=",
- confchk_wiredtiger_open, 48
+ confchk_wiredtiger_open, 50
},
{ "wiredtiger_open_all",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
- "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
- "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=,require_max=,require_min=),"
- "config_base=true,create=false,direct_io=,encryption=(keyid=,"
- "name=,secretkey=),error_prefix=,eviction=(threads_max=8,"
- "threads_min=1),eviction_checkpoint_target=1,"
- "eviction_dirty_target=5,eviction_dirty_trigger=20,"
- "eviction_target=80,eviction_trigger=95,exclusive=false,"
- "extensions=,file_extend=,file_manager=(close_handle_minimum=250,"
- "close_idle_time=30,close_scan_interval=10),hazard_max=1000,"
- "in_memory=false,io_capacity=(total=0),log=(archive=true,"
- "compressor=,enabled=false,file_max=100MB,os_cache_dirty_pct=0,"
- "path=\".\",prealloc=true,recover=on,zero_fill=false),"
+ "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8"
+ ",cache_size=100MB,checkpoint=(log_size=0,wait=0),"
+ "checkpoint_sync=true,compatibility=(release=,require_max=,"
+ "require_min=),config_base=true,create=false,"
+ "debug_mode=(checkpoint_retention=0,eviction=false,"
+ "rollback_error=0,table_logging=false),direct_io=,"
+ "encryption=(keyid=,name=,secretkey=),error_prefix=,"
+ "eviction=(threads_max=8,threads_min=1),"
+ "eviction_checkpoint_target=1,eviction_dirty_target=5,"
+ "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95"
+ ",exclusive=false,extensions=,file_extend=,"
+ "file_manager=(close_handle_minimum=250,close_idle_time=30,"
+ "close_scan_interval=10),hazard_max=1000,in_memory=false,"
+ "io_capacity=(total=0),log=(archive=true,compressor=,"
+ "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\","
+ "prealloc=true,recover=on,zero_fill=false),"
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true,"
"mmap=true,multiprocess=false,operation_tracking=(enabled=false,"
"path=\".\"),readonly=false,salvage=false,session_max=100,"
@@ -1607,14 +1662,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),use_environment=true,use_environment_priv=false,"
"verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_all, 49
+ confchk_wiredtiger_open_all, 51
},
{ "wiredtiger_open_basecfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
- "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
- "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=,require_max=,require_min=),direct_io=,"
+ "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8"
+ ",cache_size=100MB,checkpoint=(log_size=0,wait=0),"
+ "checkpoint_sync=true,compatibility=(release=,require_max=,"
+ "require_min=),debug_mode=(checkpoint_retention=0,eviction=false,"
+ "rollback_error=0,table_logging=false),direct_io=,"
"encryption=(keyid=,name=,secretkey=),error_prefix=,"
"eviction=(threads_max=8,threads_min=1),"
"eviction_checkpoint_target=1,eviction_dirty_target=5,"
@@ -1633,14 +1690,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,version=(major=0,minor=0),write_through=",
- confchk_wiredtiger_open_basecfg, 43
+ confchk_wiredtiger_open_basecfg, 45
},
{ "wiredtiger_open_usercfg",
"async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1"
",builtin_extension_config=,cache_cursors=true,"
- "cache_max_wait_ms=0,cache_overhead=8,cache_size=100MB,"
- "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,"
- "compatibility=(release=,require_max=,require_min=),direct_io=,"
+ "cache_max_wait_ms=0,cache_overflow=(file_max=0),cache_overhead=8"
+ ",cache_size=100MB,checkpoint=(log_size=0,wait=0),"
+ "checkpoint_sync=true,compatibility=(release=,require_max=,"
+ "require_min=),debug_mode=(checkpoint_retention=0,eviction=false,"
+ "rollback_error=0,table_logging=false),direct_io=,"
"encryption=(keyid=,name=,secretkey=),error_prefix=,"
"eviction=(threads_max=8,threads_min=1),"
"eviction_checkpoint_target=1,eviction_dirty_target=5,"
@@ -1659,7 +1718,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0),"
"timing_stress_for_test=,transaction_sync=(enabled=false,"
"method=fsync),verbose=,write_through=",
- confchk_wiredtiger_open_usercfg, 42
+ confchk_wiredtiger_open_usercfg, 44
},
{ NULL, NULL, NULL, 0 }
};
diff --git a/src/third_party/wiredtiger/src/conn/api_calc_modify.c b/src/third_party/wiredtiger/src/conn/api_calc_modify.c
index 4a435a85ef1..a8091498ee6 100644
--- a/src/third_party/wiredtiger/src/conn/api_calc_modify.c
+++ b/src/third_party/wiredtiger/src/conn/api_calc_modify.c
@@ -69,16 +69,31 @@ static void
__cm_extend(WT_CM_STATE *cms,
const uint8_t *m1, const uint8_t *m2, WT_CM_MATCH *match)
{
+ ptrdiff_t n;
const uint8_t *p1, *p2;
- /* Step past the end and before the beginning of the matching block. */
+ p1 = m1;
+ p2 = m2;
+
+ /*
+ * Keep skipping half of the remaining bytes while they compare equal.
+ * This is significantly faster than our byte-at-a-time loop below.
+ */
for (p1 = m1, p2 = m2;
- p1 < cms->e1 && p2 < cms->e2 && *p1 == *p2;
- p1++, p2++)
+ (n = WT_MIN(cms->e1 - p1, cms->e2 - p2) / 2) > 8 &&
+ memcmp(p1, p2, (size_t)n) == 0;
+ p1 += n, p2 += n)
+ ;
+
+ /* Step past the end and before the beginning of the matching block. */
+ for (n = WT_MIN(cms->e1 - p1, cms->e2 - p2);
+ n > 0 && *p1 == *p2;
+ n--, p1++, p2++)
;
- for (; m1 >= cms->used1 && m2 >= cms->used2 && *m1 == *m2;
- m1--, m2--)
+ for (n = WT_MIN(m1 - cms->used1, m2 - cms->used2);
+ n > 0 && *m1 == *m2;
+ n--, m1--, m2--)
;
match->m1 = m1 + 1;
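
The rewritten extension loop gallops over large equal regions with memcmp, halving the remaining range each pass, and only falls back to the byte-at-a-time comparison near the end or after a mismatch. A self-contained sketch of the same technique (not the WiredTiger code itself):

#include <stddef.h>
#include <string.h>

/*
 * Extend a match forward from p1/p2 toward the buffer ends e1/e2:
 * first skip half of the smaller remaining range while the halves
 * compare equal, then finish a byte at a time.
 */
static size_t
extend_match(const unsigned char *p1, const unsigned char *e1,
    const unsigned char *p2, const unsigned char *e2)
{
    const unsigned char *start;
    size_t n;

    start = p1;

    /* Big steps: compare half of the smaller remaining range. */
    for (;;) {
        n = (size_t)(e1 - p1 < e2 - p2 ? e1 - p1 : e2 - p2) / 2;
        if (n <= 8 || memcmp(p1, p2, n) != 0)
            break;
        p1 += n;
        p2 += n;
    }

    /* Small steps: byte at a time until a mismatch or either end. */
    while (p1 < e1 && p2 < e2 && *p1 == *p2) {
        ++p1;
        ++p2;
    }
    return ((size_t)(p1 - start));
}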
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index ef0072c45ac..54199fd38ad 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1816,6 +1816,57 @@ err: /*
return (ret);
}
+/*
+ * __wt_debug_mode_config --
+ * Set debugging configuration.
+ */
+int
+__wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CACHE *cache;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ txn_global = &conn->txn_global;
+
+ WT_RET(__wt_config_gets(session,
+ cfg, "debug_mode.checkpoint_retention", &cval));
+ conn->debug_ckpt_cnt = (uint32_t)cval.val;
+ if (cval.val == 0) {
+ if (conn->debug_ckpt != NULL)
+ __wt_free(session, conn->debug_ckpt);
+ conn->debug_ckpt = NULL;
+ } else if (conn->debug_ckpt != NULL)
+ WT_RET(__wt_realloc(session, NULL,
+ conn->debug_ckpt_cnt, &conn->debug_ckpt));
+ else
+ WT_RET(__wt_calloc_def(session,
+ conn->debug_ckpt_cnt, &conn->debug_ckpt));
+
+ WT_RET(__wt_config_gets(session,
+ cfg, "debug_mode.eviction", &cval));
+ if (cval.val)
+ F_SET(cache, WT_CACHE_EVICT_DEBUG_MODE);
+ else
+ F_CLR(cache, WT_CACHE_EVICT_DEBUG_MODE);
+
+ WT_RET(__wt_config_gets(session,
+ cfg, "debug_mode.rollback_error", &cval));
+ txn_global->debug_rollback = (uint64_t)cval.val;
+
+ WT_RET(__wt_config_gets(session,
+ cfg, "debug_mode.table_logging", &cval));
+ if (cval.val)
+ FLD_SET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE);
+ else
+ FLD_CLR(conn->log_flags, WT_CONN_LOG_DEBUG_MODE);
+
+ return (0);
+}
+
/* Simple structure for name and flag configuration searches. */
typedef struct {
const char *name;
@@ -2707,6 +2758,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
session = conn->default_session;
/*
+ * This function expects the cache to be created so parse this after
+ * the rest of the connection is set up.
+ */
+ WT_ERR(__wt_debug_mode_config(session, cfg));
+
+ /*
* Load the extensions after initialization completes; extensions expect
* everything else to be in place, and the extensions call back into the
* library.
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
index a3818b3c914..faee6216ed7 100644
--- a/src/third_party/wiredtiger/src/conn/conn_handle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -132,8 +132,9 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
/* Free allocated memory. */
__wt_free(session, conn->cfg);
- __wt_free(session, conn->home);
+ __wt_free(session, conn->debug_ckpt);
__wt_free(session, conn->error_prefix);
+ __wt_free(session, conn->home);
__wt_free(session, conn->sessions);
__wt_stat_connection_discard(session, conn);
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 8bc111346c5..cd93e459e0a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -372,9 +372,19 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
*/
if (backup_file != 0)
min_lognum = WT_MIN(log->ckpt_lsn.l.file, backup_file);
- else
- min_lognum = WT_MIN(
- log->ckpt_lsn.l.file, log->sync_lsn.l.file);
+ else {
+ /*
+ * Figure out the minimum log file to archive. Use the
+ * LSN in the debugging array if necessary.
+ */
+ if (conn->debug_ckpt_cnt == 0)
+ min_lognum = WT_MIN(
+ log->ckpt_lsn.l.file, log->sync_lsn.l.file);
+ else
+ min_lognum = WT_MIN(
+ conn->debug_ckpt[conn->debug_ckpt_cnt - 1].l.file,
+ log->sync_lsn.l.file);
+ }
__wt_verbose(session, WT_VERB_LOG,
"log_archive: archive to log number %" PRIu32, min_lognum);
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 7a2b52f40f9..fba1132ecb7 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -238,7 +238,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_meta_track_init(session));
/* Create the lookaside table. */
- WT_RET(__wt_las_create(session));
+ WT_RET(__wt_las_create(session, cfg));
/*
* Start eviction threads.
diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
index 1cd589c32c9..fa0726a1306 100644
--- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c
+++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
@@ -488,12 +488,14 @@ __wt_conn_reconfig(WT_SESSION_IMPL *session, const char **cfg)
WT_ERR(__wt_cache_config(session, true, cfg));
WT_ERR(__wt_capacity_server_create(session, cfg));
WT_ERR(__wt_checkpoint_server_create(session, cfg));
+ WT_ERR(__wt_debug_mode_config(session, cfg));
+ WT_ERR(__wt_las_config(session, cfg));
WT_ERR(__wt_logmgr_reconfig(session, cfg));
WT_ERR(__wt_lsm_manager_reconfig(session, cfg));
WT_ERR(__wt_statlog_create(session, cfg));
WT_ERR(__wt_sweep_config(session, cfg));
- WT_ERR(__wt_verbose_config(session, cfg));
WT_ERR(__wt_timing_stress_config(session, cfg));
+ WT_ERR(__wt_verbose_config(session, cfg));
/* Third, merge everything together, creating a new connection state. */
WT_ERR(__wt_config_merge(session, cfg, NULL, &p));
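
Because both new hooks are wired into the reconfigure path here, the settings can also be changed on a live connection. A usage sketch; the particular values (5 retained checkpoints, a forced rollback roughly every 1000th operation, a 200MB cap) are arbitrary examples within the ranges defined above:

#include <wiredtiger.h>

static int
reconfigure_debug(WT_CONNECTION *conn)
{
    /* Adjust debug_mode and cache_overflow without reopening. */
    return (conn->reconfigure(conn,
        "debug_mode=(checkpoint_retention=5,rollback_error=1000),"
        "cache_overflow=(file_max=200MB)"));
}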
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index b9747d1b681..0e806f20608 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -95,7 +95,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* Ensure the ref state is restored to the previous
* value if eviction fails.
*/
- WT_ERR(__wt_evict(session, ref, true, ref->state));
+ WT_ERR(__wt_evict(session, ref, ref->state,
+ WT_EVICT_CALL_CLOSING));
break;
case WT_SYNC_DISCARD:
/*
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 3001f3d23da..f40ed758a19 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -107,6 +107,25 @@ __evict_entry_priority(WT_SESSION_IMPL *session, WT_REF *ref)
}
/*
+ * __evict_lru_cmp_debug --
+ * Qsort function: sort the eviction array.
+ * Version for eviction debug mode.
+ */
+static int WT_CDECL
+__evict_lru_cmp_debug(const void *a_arg, const void *b_arg)
+{
+ const WT_EVICT_ENTRY *a, *b;
+ uint64_t a_score, b_score;
+
+ a = a_arg;
+ b = b_arg;
+ a_score = (a->ref == NULL ? UINT64_MAX : 0);
+ b_score = (b->ref == NULL ? UINT64_MAX : 0);
+
+ return ((a_score < b_score) ? -1 : (a_score == b_score) ? 0 : 1);
+}
+
+/*
* __evict_lru_cmp --
* Qsort function: sort the eviction array.
*/
@@ -1257,8 +1276,17 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
queue->evict_current = NULL;
entries = queue->evict_entries;
- __wt_qsort(queue->evict_queue,
- entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
+ /*
+ * Style note: __wt_qsort is a macro that can leave a dangling
+ * else. Full curly braces are needed here for the compiler.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE)) {
+ __wt_qsort(queue->evict_queue,
+ entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp_debug);
+ } else {
+ __wt_qsort(queue->evict_queue,
+ entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
+ }
/* Trim empty entries from the end. */
while (entries > 0 && queue->evict_queue[entries - 1].ref == NULL)
@@ -1975,12 +2003,14 @@ __evict_walk_tree(WT_SESSION_IMPL *session,
* cache (indicated by seeing an internal page that is the
* parent of the last page we saw).
*
- * Also skip internal page unless we get aggressive or the tree
- * is idle (indicated by the tree being skipped for walks).
+ * Also skip internal page unless we get aggressive, the tree
+ * is idle (indicated by the tree being skipped for walks),
+ * or we are in eviction debug mode.
* The goal here is that if trees become completely idle, we
* eventually push them out of cache completely.
*/
- if (WT_PAGE_IS_INTERNAL(page)) {
+ if (!F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) &&
+ WT_PAGE_IS_INTERNAL(page)) {
if (page == last_parent)
continue;
if (btree->evict_walk_period == 0 &&
@@ -2320,7 +2350,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
__wt_cache_read_gen_bump(session, ref->page);
WT_WITH_BTREE(session, btree,
- ret = __wt_evict(session, ref, false, previous_state));
+ ret = __wt_evict(session, ref, previous_state, 0));
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index e75f0ef1bed..2510815401f 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -8,9 +8,9 @@
#include "wt_internal.h"
-static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool);
-static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool);
-static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, bool *);
+static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
+static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, uint32_t);
+static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t, bool *);
/*
* __evict_exclusive_clear --
@@ -51,19 +51,20 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref)
* Release a reference to a page, and attempt to immediately evict it.
*/
int
-__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
uint64_t time_start, time_stop;
- uint32_t previous_state;
+ uint32_t evict_flags, previous_state;
bool locked, too_big;
btree = S2BT(session);
locked = false;
page = ref->page;
time_start = __wt_clock(session);
+ evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0;
/*
* This function always releases the hazard pointer - ensure that's
@@ -89,7 +90,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
* Track how long the call to evict took. If eviction is successful then
* we have one of two pairs of stats to increment.
*/
- ret = __wt_evict(session, ref, false, previous_state);
+ ret = __wt_evict(session, ref, previous_state, evict_flags);
time_stop = __wt_clock(session);
if (ret == 0) {
if (too_big) {
@@ -124,20 +125,25 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
*/
int
__wt_evict(WT_SESSION_IMPL *session,
- WT_REF *ref, bool closing, uint32_t previous_state)
+ WT_REF *ref, uint32_t previous_state, uint32_t flags)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
- bool clean_page, inmem_split, local_gen, tree_dead;
+ bool clean_page, closing, inmem_split, local_gen, tree_dead;
conn = S2C(session);
page = ref->page;
+ closing = LF_ISSET(WT_EVICT_CALL_CLOSING);
local_gen = false;
__wt_verbose(session, WT_VERB_EVICT,
"page %p (%s)", (void *)page, __wt_page_type_string(page->type));
+ tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
+ if (tree_dead)
+ LF_SET(WT_EVICT_CALL_NO_SPLIT);
+
/*
* Enter the eviction generation. If we re-enter eviction, leave the
* previous eviction generation (which must be as low as the current
@@ -171,7 +177,7 @@ __wt_evict(WT_SESSION_IMPL *session,
* Make this check for clean pages, too: while unlikely eviction would
* choose an internal page with children, it's not disallowed.
*/
- WT_ERR(__evict_review(session, ref, closing, &inmem_split));
+ WT_ERR(__evict_review(session, ref, flags, &inmem_split));
/*
* If there was an in-memory split, the tree has been left in the state
@@ -208,7 +214,6 @@ __wt_evict(WT_SESSION_IMPL *session,
}
/* Update the reference and discard the page. */
- tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else if ((clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY)) || tree_dead)
@@ -216,10 +221,9 @@ __wt_evict(WT_SESSION_IMPL *session,
* Pages that belong to dead trees never write back to disk
* and can't support page splits.
*/
- WT_ERR(__evict_page_clean_update(
- session, ref, tree_dead || closing));
+ WT_ERR(__evict_page_clean_update(session, ref, flags));
else
- WT_ERR(__evict_page_dirty_update(session, ref, closing));
+ WT_ERR(__evict_page_dirty_update(session, ref, flags));
if (clean_page) {
WT_STAT_CONN_INCR(session, cache_eviction_clean);
@@ -250,7 +254,7 @@ done: /* Leave any local eviction generation. */
* split.
*/
static int
-__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_DECL_RET;
WT_PAGE *parent;
@@ -264,7 +268,7 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Avoid doing reverse splits when closing the file, it is wasted work
* and some structures may have already been freed.
*/
- if (!closing) {
+ if (!LF_ISSET(WT_EVICT_CALL_NO_SPLIT | WT_EVICT_CALL_CLOSING)) {
parent = ref->home;
WT_INTL_INDEX_GET(session, parent, pindex);
ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
@@ -302,9 +306,12 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Update a clean page's reference on eviction.
*/
static int
-__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_DECL_RET;
+ bool closing;
+
+ closing = LF_ISSET(WT_EVICT_CALL_CLOSING);
/*
* Before discarding a page, assert that all updates are globally
@@ -334,7 +341,7 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
WT_REF_SET_STATE(ref, WT_REF_LOOKASIDE);
} else if (ref->addr == NULL) {
WT_WITH_PAGE_INDEX(session,
- ret = __evict_delete_ref(session, ref, closing));
+ ret = __evict_delete_ref(session, ref, flags));
WT_RET_BUSY_OK(ret);
} else
WT_REF_SET_STATE(ref, WT_REF_DISK);
@@ -347,14 +354,17 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Update a dirty page's reference on eviction.
*/
static int
-__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
+__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref,
+ uint32_t evict_flags)
{
WT_ADDR *addr;
WT_DECL_RET;
WT_MULTI multi;
WT_PAGE_MODIFY *mod;
+ bool closing;
mod = ref->page->modify;
+ closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
WT_ASSERT(session, ref->addr == NULL);
@@ -370,7 +380,7 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
__wt_ref_out(session, ref);
WT_WITH_PAGE_INDEX(session,
- ret = __evict_delete_ref(session, ref, closing));
+ ret = __evict_delete_ref(session, ref, evict_flags));
WT_RET_BUSY_OK(ret);
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
@@ -511,20 +521,22 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
*/
static int
__evict_review(
- WT_SESSION_IMPL *session, WT_REF *ref, bool closing, bool *inmem_splitp)
+ WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags,
+ bool *inmem_splitp)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
uint32_t flags;
- bool lookaside_retry, *lookaside_retryp, modified;
+ bool closing, lookaside_retry, *lookaside_retryp, modified;
*inmem_splitp = false;
conn = S2C(session);
page = ref->page;
flags = WT_REC_EVICT;
+ closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
if (!WT_SESSION_BTREE_SYNC(session))
LF_SET(WT_REC_VISIBLE_ALL);
@@ -644,7 +656,13 @@ __evict_review(
else if (!WT_IS_METADATA(session->dhandle)) {
LF_SET(WT_REC_UPDATE_RESTORE);
- if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
+ /*
+ * Scrub if we're supposed to or toss it in sometimes
+ * if we are in debugging mode.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB) ||
+ (F_ISSET(cache, WT_CACHE_EVICT_DEBUG_MODE) &&
+ __wt_random(&session->rnd) % 3 == 0))
LF_SET(WT_REC_SCRUB);
/*
@@ -653,8 +671,16 @@ __evict_review(
* suggests trying the lookaside table.
*/
if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE) &&
- !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE))
+ !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)) {
+ if (F_ISSET(cache,
+ WT_CACHE_EVICT_DEBUG_MODE) &&
+ __wt_random(&session->rnd) % 10 == 0) {
+ LF_CLR(WT_REC_SCRUB |
+ WT_REC_UPDATE_RESTORE);
+ LF_SET(WT_REC_LOOKASIDE);
+ }
lookaside_retryp = &lookaside_retry;
+ }
}
}
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 46f507ebedf..9859b3b607a 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -1114,9 +1114,9 @@ struct __wt_update {
/*
* WT_MODIFY_MEM_FACTOR --
- * Limit update chains to a factor of the base document size.
+ * Limit update chains to a fraction of the base document size.
*/
-#define WT_MODIFY_MEM_FACTOR 1
+#define WT_MODIFY_MEM_FRACTION 10
/*
* WT_INSERT --
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index f7ff274cfb8..17722a806e5 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -194,6 +194,12 @@ struct __wt_btree {
uint64_t bytes_dirty_total; /* Bytes ever dirtied in cache. */
/*
+ * The maximum bytes allowed to be used for the table on disk. This is
+ * currently only used for the lookaside table.
+ */
+ uint64_t file_max;
+
+ /*
* We flush pages from the tree (in order to make checkpoint faster),
* without a high-level lock. To avoid multiple threads flushing at
* the same time, lock the tree.
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 5e0f0521ded..e728790b02c 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -918,6 +918,7 @@ __wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
*/
v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
WT_CELL_FLAG;
+ WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
WT_ROW_KEY_SET(rip, v);
}
@@ -937,6 +938,7 @@ __wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
v = WT_K_ENCODE_KEY_LEN(unpack->size) |
WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
WT_K_FLAG;
+ WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
WT_ROW_KEY_SET(rip, v);
}
@@ -975,6 +977,7 @@ __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
WT_KV_ENCODE_VALUE_LEN(unpack->size) |
WT_KV_ENCODE_KEY_OFFSET(key_offset) |
WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
+ WT_ASSERT(NULL, WT_ROW_SLOT(page, rip) < page->entries);
WT_ROW_KEY_SET(rip, v);
}
@@ -1516,7 +1519,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
WT_IGNORE_RET(
__wt_page_evict_urgent(session, ref));
} else {
- WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));
+ WT_RET_BUSY_OK(__wt_page_release_evict(session, ref,
+ flags));
return (0);
}
}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index 7966d9802b3..c4c0ee5d5d4 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -54,6 +54,7 @@ typedef enum __wt_cache_op {
WT_SYNC_WRITE_LEAVES
} WT_CACHE_OP;
+#define WT_LAS_FILE_MIN (100 * WT_MEGABYTE)
#define WT_LAS_NUM_SESSIONS 5
#define WT_LAS_SWEEP_ENTRIES (20 * WT_THOUSAND)
#define WT_LAS_SWEEP_SEC 2
@@ -171,7 +172,7 @@ struct __wt_cache {
* Score of how aggressive eviction should be about selecting eviction
* candidates. If eviction is struggling to make progress, this score
* rises (up to a maximum of 100), at which point the cache is "stuck"
- * and transaction will be rolled back.
+ * and transactions will be rolled back.
*/
uint32_t evict_aggressive_score;
@@ -251,11 +252,12 @@ struct __wt_cache {
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_CACHE_EVICT_CLEAN 0x01u /* Evict clean pages */
#define WT_CACHE_EVICT_CLEAN_HARD 0x02u /* Clean % blocking app threads */
-#define WT_CACHE_EVICT_DIRTY 0x04u /* Evict dirty pages */
-#define WT_CACHE_EVICT_DIRTY_HARD 0x08u /* Dirty % blocking app threads */
-#define WT_CACHE_EVICT_LOOKASIDE 0x10u /* Try lookaside eviction */
-#define WT_CACHE_EVICT_SCRUB 0x20u /* Scrub dirty pages */
-#define WT_CACHE_EVICT_URGENT 0x40u /* Pages are in the urgent queue */
+#define WT_CACHE_EVICT_DEBUG_MODE 0x04u /* Aggressive debugging mode */
+#define WT_CACHE_EVICT_DIRTY 0x08u /* Evict dirty pages */
+#define WT_CACHE_EVICT_DIRTY_HARD 0x10u /* Dirty % blocking app threads */
+#define WT_CACHE_EVICT_LOOKASIDE 0x20u /* Try lookaside eviction */
+#define WT_CACHE_EVICT_SCRUB 0x40u /* Scrub dirty pages */
+#define WT_CACHE_EVICT_URGENT 0x80u /* Pages are in the urgent queue */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
#define WT_CACHE_EVICT_ALL (WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_DIRTY)
uint32_t flags;
@@ -290,3 +292,9 @@ struct __wt_cache_pool {
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint8_t flags;
};
+
+/* Flags used with __wt_evict */
+/* AUTOMATIC FLAG VALUE GENERATION START */
+#define WT_EVICT_CALL_CLOSING 0x1u /* Closing connection or tree */
+#define WT_EVICT_CALL_NO_SPLIT 0x2u /* Splits not allowed */
+/* AUTOMATIC FLAG VALUE GENERATION STOP */
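The new WT_EVICT_CALL flags replace the bare closing boolean that __wt_evict and __wt_page_release_evict used to take (see the extern.h prototype changes later in this diff). Below is a minimal standalone sketch of how such a flag word is decoded; the FLD_ISSET stand-in is redefined locally for illustration only and is not the WiredTiger macro itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Copied from the cache.h hunk above. */
#define WT_EVICT_CALL_CLOSING  0x1u /* Closing connection or tree */
#define WT_EVICT_CALL_NO_SPLIT 0x2u /* Splits not allowed */

/* Local stand-in for WiredTiger's FLD_ISSET flag-test macro. */
#define FLD_ISSET(field, mask) (((field) & (uint32_t)(mask)) != 0)

int
main(void)
{
    uint32_t evict_flags;
    bool closing, no_split;

    /* A caller evicting a page while closing the tree. */
    evict_flags = WT_EVICT_CALL_CLOSING;

    closing = FLD_ISSET(evict_flags, WT_EVICT_CALL_CLOSING);
    no_split = FLD_ISSET(evict_flags, WT_EVICT_CALL_NO_SPLIT);

    printf("closing=%d no_split=%d\n", closing, no_split);
    return (0);
}

This mirrors the derivation shown in the evict_page.c hunk earlier in the diff, where __evict_review now computes closing from WT_EVICT_CALL_CLOSING instead of receiving it as a parameter.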
diff --git a/src/third_party/wiredtiger/src/include/cell.h b/src/third_party/wiredtiger/src/include/cell.h
new file mode 100644
index 00000000000..5e079a613ad
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cell.h
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_CELL --
+ * Variable-length cell type.
+ *
+ * Pages containing variable-length keys or value data (the WT_PAGE_ROW_INT,
+ * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types) have
+ * cells after the page header.
+ *
+ * There are 4 basic cell types: keys and data (each of which has an overflow
+ * form), deleted cells and off-page references. The cell is usually followed
+ * by additional data, varying by type: a key or data cell is followed by a set
+ * of bytes, an address cookie follows overflow or off-page cells.
+ *
+ * Deleted cells are place-holders for column-store files, where entries cannot
+ * be removed in order to preserve the record count.
+ *
+ * Here's the cell use by page type:
+ *
+ * WT_PAGE_ROW_INT (row-store internal page):
+ * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
+ * cell followed by a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_ROW_LEAF (row-store leaf page):
+ * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
+ * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
+ *
+ * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
+ * byte count immediately following the cell.
+ *
+ * WT_PAGE_COL_INT (Column-store internal page):
+ * Off-page references (a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
+ * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
+ * cells (a WT_CELL_DEL cell).
+ *
+ * Each cell starts with a descriptor byte:
+ *
+ * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
+ * carrying data less than 64B, where we can store the data length in the cell
+ * descriptor byte); the two low-order bits are, in binary:
+ *	00	Not a short key/data cell
+ *	01	Short key cell
+ *	10	Short key cell, with a following prefix-compression byte
+ *	11	Short value cell
+ * In these cases, the other 6 bits of the descriptor byte are the data length.
+ *
+ * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
+ * (A run-length counter or a record number for variable-length column store.)
+ *
+ * Bit 4 is unused.
+ *
+ * Bits 5-8 are cell "types".
+ */
+#define WT_CELL_KEY_SHORT 0x01 /* Short key */
+#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
+#define WT_CELL_VALUE_SHORT 0x03 /* Short data */
+#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
+
+#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
+#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
+
+#define WT_CELL_64V 0x04 /* Associated value */
+
+/*
+ * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
+ * backward compatible way by adding bit 4 to the type mask and adding new types
+ * that incorporate it.
+ */
+#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
+
+/*
+ * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
+ * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
+ * page has no overflow items. (The goal is to speed up truncation as we don't
+ * have to read pages without overflow items in order to delete them. Note,
+ * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
+ * overflow items, the only guarantee is that if set, the page has no overflow
+ * items.)
+ *
+ * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
+ * value dictionaries: if the two values are the same, we only store them once
+ * and have the second and subsequent use reference the original.
+ */
+#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
+#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
+#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
+#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
+#define WT_CELL_DEL (4 << 4) /* Deleted value */
+#define WT_CELL_KEY (5 << 4) /* Key */
+#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
+#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
+#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
+#define WT_CELL_VALUE (8 << 4) /* Value */
+#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
+#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
+#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
+
+#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
+#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
+
+/*
+ * When we aren't able to create a short key or value (and, in the case of a
+ * value, there's no associated RLE), the key or value is at least 64B, else
+ * we'd have been able to store it as a short cell. Decrement/Increment the
+ * size before storing it, in the hopes that relatively small key/value sizes
+ * will pack into a single byte instead of two bytes.
+ */
+#define WT_CELL_SIZE_ADJUST 64
+
+/*
+ * WT_CELL --
+ * Variable-length, on-page cell header.
+ */
+struct __wt_cell {
+ /*
+ * Maximum of 16 bytes:
+ * 1: cell descriptor byte
+ * 1: prefix compression count
+ * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
+ * 5: data length (uint32_t encoding, max 5 bytes)
+ *
+ * This calculation is pessimistic: the prefix compression count and
+ * 64V value overlap, the 64V value and data length are optional.
+ */
+ uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
+};
+
+/*
+ * WT_CELL_UNPACK --
+ * Unpacked cell.
+ */
+struct __wt_cell_unpack {
+ WT_CELL *cell; /* Cell's disk image address */
+
+ uint64_t v; /* RLE count or recno */
+
+ /*
+ * !!!
+	 * The size and __len fields would more naturally be type size_t; don't
+	 * change the type, performance drops significantly if they're made size_t.
+ */
+ const void *data; /* Data */
+ uint32_t size; /* Data size */
+
+ uint32_t __len; /* Cell + data length (usually) */
+
+ uint8_t prefix; /* Cell prefix length */
+
+ uint8_t raw; /* Raw cell type (include "shorts") */
+ uint8_t type; /* Cell type */
+
+ uint8_t ovfl; /* boolean: cell is an overflow */
+};
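As a standalone sketch of how the descriptor-byte layout described above is meant to be read, the snippet below decodes two invented bytes using only the macros defined in this header; real cells are built and parsed by the cell packing/unpacking code in cell.i.

#include <stdint.h>
#include <stdio.h>

/* Copied from the cell.h hunk above. */
#define WT_CELL_KEY_SHORT     0x01
#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
#define WT_CELL_SHORT_SHIFT   2
#define WT_CELL_64V           0x04
#define WT_CELL_VALUE         (8 << 4)
#define WT_CELL_TYPE_MASK     (0x0fU << 4)
#define WT_CELL_TYPE(v)       ((v) & WT_CELL_TYPE_MASK)

int
main(void)
{
    /* A short key cell carrying 10 bytes: the length sits above bits 1-2. */
    uint8_t desc = (uint8_t)((10 << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT);

    if (WT_CELL_SHORT_TYPE(desc) == WT_CELL_KEY_SHORT)
        printf("short key, data length %u\n",
            (unsigned)(desc >> WT_CELL_SHORT_SHIFT));

    /* A full value cell with an associated 64V (RLE or recno) value. */
    desc = (uint8_t)(WT_CELL_VALUE | WT_CELL_64V);
    printf("cell type 0x%x, has 64V: %s\n",
        WT_CELL_TYPE(desc), (desc & WT_CELL_64V) ? "yes" : "no");
    return (0);
}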
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
index f518acfcbb0..c807737c494 100644
--- a/src/third_party/wiredtiger/src/include/cell.i
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -7,161 +7,6 @@
*/
/*
- * WT_CELL --
- * Variable-length cell type.
- *
- * Pages containing variable-length keys or values data (the WT_PAGE_ROW_INT,
- * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types), have
- * cells after the page header.
- *
- * There are 4 basic cell types: keys and data (each of which has an overflow
- * form), deleted cells and off-page references. The cell is usually followed
- * by additional data, varying by type: a key or data cell is followed by a set
- * of bytes, an address cookie follows overflow or off-page cells.
- *
- * Deleted cells are place-holders for column-store files, where entries cannot
- * be removed in order to preserve the record count.
- *
- * Here's the cell use by page type:
- *
- * WT_PAGE_ROW_INT (row-store internal page):
- * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
- * cell followed by a WT_CELL_ADDR_XXX cell).
- *
- * WT_PAGE_ROW_LEAF (row-store leaf page):
- * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
- * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
- *
- * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
- * byte count immediately following the cell.
- *
- * WT_PAGE_COL_INT (Column-store internal page):
- * Off-page references (a WT_CELL_ADDR_XXX cell).
- *
- * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
- * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
- * cells (a WT_CELL_DEL cell).
- *
- * Each cell starts with a descriptor byte:
- *
- * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
- * carrying data less than 64B, where we can store the data length in the cell
- * descriptor byte):
- * 0x00 Not a short key/data cell
- * 0x01 Short key cell
- * 0x10 Short key cell, with a following prefix-compression byte
- * 0x11 Short value cell
- * In these cases, the other 6 bits of the descriptor byte are the data length.
- *
- * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
- * (A run-length counter or a record number for variable-length column store.)
- *
- * Bit 4 is unused.
- *
- * Bits 5-8 are cell "types".
- */
-#define WT_CELL_KEY_SHORT 0x01 /* Short key */
-#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
-#define WT_CELL_VALUE_SHORT 0x03 /* Short data */
-#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
-
-#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
-#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
-
-#define WT_CELL_64V 0x04 /* Associated value */
-
-/*
- * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
- * backward compatible way by adding bit 4 to the type mask and adding new types
- * that incorporate it.
- */
-#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
-
-/*
- * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
- * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
- * page has no overflow items. (The goal is to speed up truncation as we don't
- * have to read pages without overflow items in order to delete them. Note,
- * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
- * overflow items, the only guarantee is that if set, the page has no overflow
- * items.)
- *
- * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
- * value dictionaries: if the two values are the same, we only store them once
- * and have the second and subsequent use reference the original.
- */
-#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
-#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
-#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
-#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
-#define WT_CELL_DEL (4 << 4) /* Deleted value */
-#define WT_CELL_KEY (5 << 4) /* Key */
-#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
-#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
-#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
-#define WT_CELL_VALUE (8 << 4) /* Value */
-#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
-#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
-#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
-
-#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
-#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
-
-/*
- * When we aren't able to create a short key or value (and, in the case of a
- * value, there's no associated RLE), the key or value is at least 64B, else
- * we'd have been able to store it as a short cell. Decrement/Increment the
- * size before storing it, in the hopes that relatively small key/value sizes
- * will pack into a single byte instead of two bytes.
- */
-#define WT_CELL_SIZE_ADJUST 64
-
-/*
- * WT_CELL --
- * Variable-length, on-page cell header.
- */
-struct __wt_cell {
- /*
- * Maximum of 16 bytes:
- * 1: cell descriptor byte
- * 1: prefix compression count
- * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
- * 5: data length (uint32_t encoding, max 5 bytes)
- *
- * This calculation is pessimistic: the prefix compression count and
- * 64V value overlap, the 64V value and data length are optional.
- */
- uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
-};
-
-/*
- * WT_CELL_UNPACK --
- * Unpacked cell.
- */
-struct __wt_cell_unpack {
- WT_CELL *cell; /* Cell's disk image address */
-
- uint64_t v; /* RLE count or recno */
-
- /*
- * !!!
- * The size and __len fields are reasonably type size_t; don't change
- * the type, performance drops significantly if they're type size_t.
- */
- const void *data; /* Data */
- uint32_t size; /* Data size */
-
- uint32_t __len; /* Cell + data length (usually) */
-
- uint8_t prefix; /* Cell prefix length */
-
- uint8_t raw; /* Raw cell type (include "shorts") */
- uint8_t type; /* Cell type */
-
- uint8_t ovfl; /* boolean: cell is an overflow */
-};
-
-/*
* WT_CELL_FOREACH --
* Walk the cells on a page.
*/
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index a23434ea9e2..73ac6c85522 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -193,6 +193,9 @@ struct __wt_connection_impl {
WT_SPINLOCK optrack_map_spinlock; /* Translation file spinlock. */
uintmax_t optrack_pid; /* Cache the process ID. */
+ WT_LSN *debug_ckpt; /* Debug mode checkpoint LSNs. */
+ uint32_t debug_ckpt_cnt;/* Checkpoint retention number */
+
void **foc; /* Free-on-close array */
size_t foc_cnt; /* Array entries */
size_t foc_size; /* Array size */
@@ -321,15 +324,16 @@ struct __wt_connection_impl {
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_CONN_LOG_ARCHIVE 0x001u /* Archive is enabled */
-#define WT_CONN_LOG_DOWNGRADED 0x002u /* Running older version */
-#define WT_CONN_LOG_ENABLED 0x004u /* Logging is enabled */
-#define WT_CONN_LOG_EXISTED 0x008u /* Log files found */
-#define WT_CONN_LOG_FORCE_DOWNGRADE 0x010u /* Force downgrade */
-#define WT_CONN_LOG_RECOVER_DIRTY 0x020u /* Recovering unclean */
-#define WT_CONN_LOG_RECOVER_DONE 0x040u /* Recovery completed */
-#define WT_CONN_LOG_RECOVER_ERR 0x080u /* Error if recovery required */
-#define WT_CONN_LOG_RECOVER_FAILED 0x100u /* Recovery failed */
-#define WT_CONN_LOG_ZERO_FILL 0x200u /* Manually zero files */
+#define WT_CONN_LOG_DEBUG_MODE 0x002u /* Debug-mode logging enabled */
+#define WT_CONN_LOG_DOWNGRADED 0x004u /* Running older version */
+#define WT_CONN_LOG_ENABLED 0x008u /* Logging is enabled */
+#define WT_CONN_LOG_EXISTED 0x010u /* Log files found */
+#define WT_CONN_LOG_FORCE_DOWNGRADE 0x020u /* Force downgrade */
+#define WT_CONN_LOG_RECOVER_DIRTY 0x040u /* Recovering unclean */
+#define WT_CONN_LOG_RECOVER_DONE 0x080u /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x100u /* Error if recovery required */
+#define WT_CONN_LOG_RECOVER_FAILED 0x200u /* Recovery failed */
+#define WT_CONN_LOG_ZERO_FILL 0x400u /* Manually zero files */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index aa313fa2caf..1ecfaf6eef6 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -202,9 +202,10 @@ extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_
extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_las_empty(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
-extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_create(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
@@ -254,6 +255,7 @@ extern int __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
extern int __wt_conn_remove_encryptor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_extractor_config(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_EXTRACTOR **extractorp, int *ownp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_conn_remove_extractor(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_verbose_dump_sessions(WT_SESSION_IMPL *session, bool show_cursors) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_timing_stress_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -386,11 +388,11 @@ extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC
extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v);
extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session);
extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t previous_state) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t previous_state, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session);
extern int __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn);
extern int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -446,6 +448,9 @@ extern int __wt_logop_checkpoint_start_print(WT_SESSION_IMPL *session, const uin
extern int __wt_logop_prev_lsn_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *prev_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_prev_lsn_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *prev_lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_logop_prev_lsn_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_txn_timestamp_pack(WT_SESSION_IMPL *session, WT_ITEM *logrec, uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, uint64_t prepare_ts, uint64_t read_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_txn_timestamp_unpack(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_logop_txn_timestamp_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -597,6 +602,21 @@ extern int __wt_ext_unpack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT
extern int __wt_ext_unpack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t *ip) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ext_unpack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char **sp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_child_modify(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_col_fix_slvg(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_col_var(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r);
+extern void __wt_rec_dictionary_reset(WT_RECONCILE *r);
+extern int __wt_rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_row_leaf(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
@@ -605,14 +625,16 @@ extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const ui
extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, bool *upd_savedp, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize);
+extern int __wt_rec_split_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_split_crossing_bnd(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type, uint64_t rle) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_schema_alter(WT_SESSION_IMPL *session, const char *uri, const char *newcfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -840,6 +862,7 @@ extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_ts_log(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_truncate_log(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_txn_truncate_end(WT_SESSION_IMPL *session);
@@ -861,7 +884,7 @@ extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *c
extern int __wt_timestamp_validate(WT_SESSION_IMPL *session, const char *name, wt_timestamp_t ts, WT_CONFIG_ITEM *cval) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_txn_parse_prepare_timestamp(WT_SESSION_IMPL *session, const char *cfg[], wt_timestamp_t *timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[], bool *set_tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_clear_commit_timestamp(WT_SESSION_IMPL *session);
extern void __wt_txn_set_read_timestamp(WT_SESSION_IMPL *session);
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
index 463f92a34a3..e7e49b8b0ce 100644
--- a/src/third_party/wiredtiger/src/include/log.h
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -21,6 +21,9 @@
#define WT_LOG_SYNC_ENABLED 0x10u
/* AUTOMATIC FLAG VALUE GENERATION STOP */
+#define WT_LOGOP_IGNORE 0x80000000
+#define WT_LOGOP_IS_IGNORED(val) (val & WT_LOGOP_IGNORE)
+
/*
* WT_LSN --
* A log sequence number, representing a position in the transaction log.
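WT_LOGOP_IGNORE sets the high bit of a log operation type, apparently so that readers can recognize and skip operations they do not need (the debug_mode table_logging text later in this diff notes that such records are informational and skipped in recovery). A standalone sketch of the bit arithmetic, with an invented operation-type value:

#include <stdint.h>
#include <stdio.h>

/* Copied from the log.h hunk above. */
#define WT_LOGOP_IGNORE          0x80000000
#define WT_LOGOP_IS_IGNORED(val) (val & WT_LOGOP_IGNORE)

int
main(void)
{
    uint32_t optype = 12;    /* Hypothetical operation type. */

    /* Writer: flag the record as ignorable. */
    optype |= WT_LOGOP_IGNORE;

    /* Reader: detect the flag, strip it, and skip the operation. */
    if (WT_LOGOP_IS_IGNORED(optype))
        printf("skipping optype %u\n",
            (unsigned)(optype & ~(uint32_t)WT_LOGOP_IGNORE));
    return (0);
}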
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
index e221cad1481..e4b369f736d 100644
--- a/src/third_party/wiredtiger/src/include/meta.h
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -27,6 +27,7 @@
#define WT_METAFILE_SLVG "WiredTiger.wt.orig" /* Metadata copy */
#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */
+#define WT_LAS_FILE "WiredTigerLAS.wt" /* Lookaside table */
#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/
#define WT_SYSTEM_PREFIX "system:" /* System URI prefix */
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
new file mode 100644
index 00000000000..fdb47f3d3d9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -0,0 +1,294 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Reconciliation is the process of taking an in-memory page, walking each entry
+ * in the page, building a backing disk image in a temporary buffer representing
+ * that information, and writing that buffer to disk. What could be simpler?
+ *
+ * WT_RECONCILE --
+ * Information tracking a single page reconciliation.
+ */
+typedef struct {
+ WT_REF *ref; /* Page being reconciled */
+ WT_PAGE *page;
+ uint32_t flags; /* Caller's configuration */
+
+ /*
+ * Track start/stop write generation to decide if all changes to the
+ * page are written.
+ */
+ uint32_t orig_write_gen;
+
+ /*
+ * Track start/stop checkpoint generations to decide if lookaside table
+ * records are correct.
+ */
+ uint64_t orig_btree_checkpoint_gen;
+ uint64_t orig_txn_checkpoint_gen;
+
+ /*
+ * Track the oldest running transaction and whether to skew lookaside
+ * to the newest update.
+ */
+ bool las_skew_newest;
+ uint64_t last_running;
+
+ /* Track the page's min/maximum transactions. */
+ uint64_t max_txn;
+ wt_timestamp_t max_timestamp;
+
+ /* Lookaside boundary tracking. */
+ uint64_t unstable_txn;
+ wt_timestamp_t unstable_timestamp;
+
+ u_int updates_seen; /* Count of updates seen. */
+ u_int updates_unstable; /* Count of updates not visible_all. */
+
+ bool update_uncommitted; /* An update was uncommitted */
+ bool update_used; /* An update could be used */
+
+ /*
+ * When we can't mark the page clean (for example, checkpoint found some
+ * uncommitted updates), there's a leave-dirty flag.
+ */
+ bool leave_dirty;
+
+ /*
+ * Track if reconciliation has seen any overflow items. If a leaf page
+ * with no overflow items is written, the parent page's address cell is
+ * set to the leaf-no-overflow type. This means we can delete the leaf
+ * page without reading it because we don't have to discard any overflow
+ * items it might reference.
+ *
+	 * The test is per-page reconciliation, that is, once we see an
+ * overflow item on the page, all subsequent leaf pages written for the
+ * page will not be leaf-no-overflow type, regardless of whether or not
+ * they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow
+ * item, only that if it is set, the page contains no overflow items.
+ * XXX
+	 * This was originally done because raw compression couldn't do better;
+ * now that raw compression has been removed, we should do better.
+ */
+ bool ovfl_items;
+
+ /*
+ * Track if reconciliation of a row-store leaf page has seen empty (zero
+ * length) values. We don't write out anything for empty values, so if
+ * there are empty values on a page, we have to make two passes over the
+ * page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise,
+ * a page with only empty values is another common data set, and keys on
+ * that page will be equal to the number of entries. In both cases, set
+ * a flag in the page's on-disk header.
+ *
+ * The test is per-page reconciliation as described above for the
+ * overflow-item test.
+ */
+ bool all_empty_value, any_empty_value;
+
+ /*
+ * Reconciliation gets tricky if we have to split a page, which happens
+ * when the disk image we create exceeds the page type's maximum disk
+ * image size.
+ *
+ * First, the target size of the page we're building.
+ */
+ uint32_t page_size; /* Page size */
+
+ /*
+ * Second, the split size: if we're doing the page layout, split to a
+ * smaller-than-maximum page size when a split is required so we don't
+ * repeatedly split a packed page.
+ */
+ uint32_t split_size; /* Split page size */
+ uint32_t min_split_size; /* Minimum split page size */
+
+ /*
+ * We maintain two split chunks in the memory during reconciliation to
+ * be written out as pages. As we get to the end of the data, if the
+ * last one turns out to be smaller than the minimum split size, we go
+ * back into the penultimate chunk and split at this minimum split size
+ * boundary. This moves some data from the penultimate chunk to the last
+ * chunk, hence increasing the size of the last page written without
+ * decreasing the penultimate page size beyond the minimum split size.
+ * For this reason, we maintain an expected split percentage boundary
+ * and a minimum split percentage boundary.
+ *
+ * Chunks are referenced by current and previous pointers. In case of a
+ * split, previous references the first chunk and current switches to
+ * the second chunk. If reconciliation generates more split chunks, the
+ * the previous chunk is written to the disk and current and previous
+ * swap.
+ */
+ struct __wt_rec_chunk {
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk.
+ *
+ * The key for a row-store page; no column-store key is needed
+ * because the page's recno, stored in the recno field, is the
+ * column-store key.
+ */
+ uint32_t entries;
+ uint64_t recno;
+ WT_ITEM key;
+
+ uint32_t min_entries;
+ uint64_t min_recno;
+ WT_ITEM min_key;
+
+ /* Minimum split-size boundary buffer offset. */
+ size_t min_offset;
+
+ WT_ITEM image; /* disk-image */
+ } chunkA, chunkB, *cur_ptr, *prev_ptr;
+
+ /*
+ * We track current information about the current record number, the
+ * number of entries copied into the disk image buffer, where we are
+ * in the buffer, and how much memory remains. Those values are
+ * packaged here rather than passing pointers to stack locations
+ * around the code.
+ */
+ uint64_t recno; /* Current record number */
+ uint32_t entries; /* Current number of entries */
+ uint8_t *first_free; /* Current first free byte */
+ size_t space_avail; /* Remaining space in this chunk */
+ /* Remaining space in this chunk to put a minimum size boundary */
+ size_t min_space_avail;
+
+ /*
+ * Saved update list, supporting the WT_REC_UPDATE_RESTORE and
+ * WT_REC_LOOKASIDE configurations. While reviewing updates for each
+ * page, we save WT_UPDATE lists here, and then move them to per-block
+ * areas as the blocks are defined.
+ */
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
+ size_t supd_memsize; /* Size of saved update structures */
+
+ /* List of pages we've written so far. */
+ WT_MULTI *multi;
+ uint32_t multi_next;
+ size_t multi_allocated;
+
+ /*
+ * Root pages are written when wrapping up the reconciliation, remember
+ * the image we're going to write.
+ */
+ WT_ITEM *wrapup_checkpoint;
+ bool wrapup_checkpoint_compressed;
+
+ /*
+ * We don't need to keep the 0th key around on internal pages, the
+ * search code ignores them as nothing can sort less by definition.
+ * There's some trickiness here, see the code for comments on how
+ * these fields work.
+ */
+ bool cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * We calculate checksums to find previously written identical blocks,
+ * but once a match fails during an eviction, there's no point trying
+ * again.
+ */
+ bool evict_matching_checksum_failed;
+
+ /*
+ * WT_REC_DICTIONARY --
+ * We optionally build a dictionary of values for leaf pages. Where
+ * two value cells are identical, only write the value once, the second
+ * and subsequent copies point to the original cell. The dictionary is
+ * fixed size, but organized in a skip-list to make searches faster.
+ */
+ struct __wt_rec_dictionary {
+ uint64_t hash; /* Hash value */
+ uint32_t offset; /* Matching cell */
+
+ u_int depth; /* Skiplist */
+ WT_REC_DICTIONARY *next[0];
+ } **dictionary; /* Dictionary */
+ u_int dictionary_next, dictionary_slots; /* Next, max entries */
+ /* Skiplist head. */
+ WT_REC_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
+
+ /*
+ * WT_REC_KV--
+ * An on-page key/value item we're building.
+ */
+ struct __wt_rec_kv {
+ WT_ITEM buf; /* Data */
+ WT_CELL cell; /* Cell and cell's length */
+ size_t cell_len;
+ size_t len; /* Total length of cell + data */
+ } k, v; /* Key/Value being built */
+
+ WT_ITEM *cur, _cur; /* Key/Value being built */
+ WT_ITEM *last, _last; /* Last key/value built */
+
+ bool key_pfx_compress; /* If can prefix-compress next key */
+ bool key_pfx_compress_conf; /* If prefix compression configured */
+ bool key_sfx_compress; /* If can suffix-compress next key */
+ bool key_sfx_compress_conf; /* If suffix compression configured */
+
+ bool is_bulk_load; /* If it's a bulk load */
+
+ WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+
+ bool cache_write_lookaside; /* Used the lookaside table */
+ bool cache_write_restore; /* Used update/restoration */
+
+ uint32_t tested_ref_state; /* Debugging information */
+
+ /*
+ * XXX
+ * In the case of a modified update, we may need a copy of the current
+ * value as a set of bytes. We call back into the btree code using a
+	 * fake cursor to do that work. This is a layering violation and fragile;
+ * we need a better solution.
+ */
+ WT_CURSOR_BTREE update_modify_cbt;
+} WT_RECONCILE;
+
+/*
+ * WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR --
+ * Macros to clean up during internal-page reconciliation, releasing the
+ * hazard pointer we're holding on child pages.
+ */
+#define WT_CHILD_RELEASE(session, hazard, ref) do { \
+ if (hazard) { \
+ (hazard) = false; \
+ WT_TRET( \
+ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
+ } \
+} while (0)
+#define WT_CHILD_RELEASE_ERR(session, hazard, ref) do { \
+ WT_CHILD_RELEASE(session, hazard, ref); \
+ WT_ERR(ret); \
+} while (0)
+
+typedef enum {
+ WT_CHILD_IGNORE, /* Ignored child */
+ WT_CHILD_MODIFIED, /* Modified child */
+ WT_CHILD_ORIGINAL, /* Original child */
+ WT_CHILD_PROXY /* Deleted child: proxy */
+} WT_CHILD_STATE;
+
+/*
+ * Macros from fixed-length entries to/from bytes.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
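To make the relationship between __wt_rec_child_modify (now declared in extern.h earlier in this diff), WT_CHILD_STATE and the WT_CHILD_RELEASE macros concrete, here is a minimal sketch of a caller. The function name and control flow are invented for illustration, the sketch assumes the WiredTiger internal headers and error macros (WT_DECL_RET, WT_ERR), and it is not code from this change-set.

#include "wt_internal.h"

static int
__example_rec_child(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref)
{
    WT_CHILD_STATE state;
    WT_DECL_RET;
    bool hazard;

    hazard = false;
    WT_ERR(__wt_rec_child_modify(session, r, ref, &hazard, &state));

    switch (state) {
    case WT_CHILD_IGNORE:        /* Child can be skipped entirely */
        WT_CHILD_RELEASE_ERR(session, hazard, ref);
        return (0);
    case WT_CHILD_MODIFIED:      /* Write the child's new address */
    case WT_CHILD_ORIGINAL:      /* Copy the original address cell */
    case WT_CHILD_PROXY:         /* Write a deleted-page proxy cell */
        break;
    }

    /* ... build and copy the appropriate address cell here ... */

err:    WT_CHILD_RELEASE(session, hazard, ref);
    return (ret);
}

The point of the macros is visible on every path: whatever state comes back, the hazard pointer acquired by __wt_rec_child_modify is released exactly once.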
diff --git a/src/third_party/wiredtiger/src/include/reconcile.i b/src/third_party/wiredtiger/src/include/reconcile.i
new file mode 100644
index 00000000000..b56b8dc1404
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/reconcile.i
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_CROSSING_MIN_BND(r, next_len) \
+ ((r)->cur_ptr->min_offset == 0 && \
+ (next_len) > (r)->min_space_avail)
+#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail)
+#define WT_CHECK_CROSSING_BND(r, next_len) \
+ (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len))
+
+/*
+ * __wt_rec_vtype --
+ * Return a value cell's address type.
+ */
+static inline u_int
+__wt_rec_vtype(WT_ADDR *addr)
+{
+ if (addr->type == WT_ADDR_INT)
+ return (WT_CELL_ADDR_INT);
+ if (addr->type == WT_ADDR_LEAF)
+ return (WT_CELL_ADDR_LEAF);
+ return (WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __wt_rec_need_split --
+ * Check whether adding some bytes to the page requires a split.
+ */
+static inline bool
+__wt_rec_need_split(WT_RECONCILE *r, size_t len)
+{
+ /*
+ * In the case of a row-store leaf page, trigger a split if a threshold
+ * number of saved updates is reached. This allows pages to split for
+ * update/restore and lookaside eviction when there is no visible data
+ * causing the disk image to grow.
+ *
+ * In the case of small pages or large keys, we might try to split when
+ * a page has no updates or entries, which isn't possible. To consider
+ * update/restore or lookaside information, require either page entries
+ * or updates that will be attached to the image. The limit is one of
+ * either, but it doesn't make sense to create pages or images with few
+ * entries or updates, even where page sizes are small (especially as
+ * updates that will eventually become overflow items can throw off our
+ * calculations). Bound the combination at something reasonable.
+ */
+ if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10)
+ len += r->supd_memsize;
+
+ /* Check for the disk image crossing a boundary. */
+ return (WT_CHECK_CROSSING_BND(r, len));
+}
+
+/*
+ * __wt_rec_incr --
+ * Update the memory tracking structure for a set of new entries.
+ */
+static inline void
+__wt_rec_incr(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
+{
+ /*
+ * The buffer code is fragile and prone to off-by-one errors -- check
+ * for overflow in diagnostic mode.
+ */
+ WT_ASSERT(session, r->space_avail >= size);
+ WT_ASSERT(session, WT_BLOCK_FITS(r->first_free, size,
+ r->cur_ptr->image.mem, r->cur_ptr->image.memsize));
+
+ r->entries += v;
+ r->space_avail -= size;
+ r->first_free += size;
+
+ /*
+ * If offset for the minimum split size boundary is not set, we have not
+ * yet reached the minimum boundary, reduce the space available for it.
+ */
+ if (r->cur_ptr->min_offset == 0) {
+ if (r->min_space_avail >= size)
+ r->min_space_avail -= size;
+ else
+ r->min_space_avail = 0;
+ }
+}
+
+/*
+ * __wt_rec_copy_incr --
+ * Copy a key/value cell and buffer pair into the new image.
+ */
+static inline void
+__wt_rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *kv)
+{
+ size_t len;
+ uint8_t *p, *t;
+
+ /*
+ * If there's only one chunk of data to copy (because the cell and data
+ * are being copied from the original disk page), the cell length won't
+ * be set, the WT_ITEM data/length will reference the data to be copied.
+ *
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
+ * the copy in-line.
+ */
+ for (p = r->first_free,
+ t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
+ *p++ = *t++;
+
+ /* The data can be quite large -- call memcpy. */
+ if (kv->buf.size != 0)
+ memcpy(p, kv->buf.data, kv->buf.size);
+
+ WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+ __wt_rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __wt_rec_cell_build_addr --
+ * Process an address reference and return a cell structure to be stored
+ * on the page.
+ */
+static inline void
+__wt_rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ const void *addr, size_t size, u_int cell_type, uint64_t recno)
+{
+ WT_REC_KV *val;
+
+ val = &r->v;
+
+ WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL);
+
+ /*
+ * We don't check the address size because we can't store an address on
+ * an overflow page: if the address won't fit, the overflow page's
+ * address won't fit either. This possibility must be handled by Btree
+ * configuration, we have to disallow internal page sizes that are too
+ * small with respect to the largest address cookie the underlying block
+ * manager might return.
+ */
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = addr;
+ val->buf.size = size;
+ val->cell_len =
+ __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+}
+
+/*
+ * __wt_rec_cell_build_val --
+ * Process a data item and return a WT_CELL structure and byte string to
+ * be stored on the page.
+ */
+static inline int
+__wt_rec_cell_build_val(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_REC_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = data;
+ val->buf.size = size;
+
+ /* Handle zero-length cells quickly. */
+ if (size != 0) {
+ /* Optionally compress the data using the Huffman engine. */
+ if (btree->huffman_value != NULL)
+ WT_RET(__wt_huffman_encode(
+ session, btree->huffman_value,
+ val->buf.data, (uint32_t)val->buf.size, &val->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (val->buf.size > btree->maxleafvalue) {
+ WT_STAT_DATA_INCR(session, rec_overflow_value);
+
+ return (__wt_rec_cell_build_ovfl(
+ session, r, val, WT_CELL_VALUE_OVFL, rle));
+ }
+ }
+ val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+
+ return (0);
+}
+
+/*
+ * __wt_rec_dict_replace --
+ * Check for a dictionary match.
+ */
+static inline int
+__wt_rec_dict_replace(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_REC_KV *val)
+{
+ WT_REC_DICTIONARY *dp;
+ uint64_t offset;
+
+ /*
+ * We optionally create a dictionary of values and only write a unique
+ * value once per page, using a special "copy" cell for all subsequent
+ * copies of the value. We have to do the cell build and resolution at
+ * this low level because we need physical cell offsets for the page.
+ *
+ * Sanity check: short-data cells can be smaller than dictionary-copy
+ * cells. If the data is already small, don't bother doing the work.
+ * This isn't just work avoidance: on-page cells can't grow as a result
+ * of writing a dictionary-copy cell, the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell;
+ * if we grow the cell after that test we'll potentially write off the
+ * end of the buffer's memory.
+ */
+ if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+ return (0);
+ WT_RET(__wt_rec_dictionary_lookup(session, r, val, &dp));
+ if (dp == NULL)
+ return (0);
+
+ /*
+ * If the dictionary offset isn't set, we're creating a new entry in the
+ * dictionary, set its location.
+ *
+ * If the dictionary offset is set, we have a matching value. Create a
+ * copy cell instead.
+ */
+ if (dp->offset == 0)
+ dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem);
+ else {
+ /*
+ * The offset is the byte offset from this cell to the previous,
+ * matching cell, NOT the byte offset from the beginning of the
+ * page.
+ */
+ offset = (uint64_t)WT_PTRDIFF(r->first_free,
+ (uint8_t *)r->cur_ptr->image.mem + dp->offset);
+ val->len = val->cell_len =
+ __wt_cell_pack_copy(&val->cell, rle, offset);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index a1fc065d263..b0e66d69743 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -371,6 +371,8 @@ struct __wt_connection_stats {
int64_t cache_lookaside_score;
int64_t cache_lookaside_entries;
int64_t cache_lookaside_insert;
+ int64_t cache_lookaside_ondisk_max;
+ int64_t cache_lookaside_ondisk;
int64_t cache_lookaside_remove;
int64_t cache_eviction_checkpoint;
int64_t cache_eviction_get_ref;
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index a1f6634922a..928c3c13ad5 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -153,6 +153,8 @@ struct __wt_txn_global {
WT_TXN_STATE checkpoint_state; /* Checkpoint's txn state */
wt_timestamp_t checkpoint_timestamp; /* Checkpoint's timestamp */
+ volatile uint64_t debug_ops; /* Debug mode op counter */
+ uint64_t debug_rollback; /* Debug mode rollback */
volatile uint64_t metadata_pinned; /* Oldest ID for metadata */
/* Named snapshot state. */
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 1cd615fa3bd..de10e8c44b9 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -425,6 +425,42 @@ __wt_txn_op_apply_prepare_state(
}
/*
+ * __wt_txn_op_delete_commit_apply_timestamps --
+ * Apply the correct start and durable timestamps to any
+ * updates in the page del update list.
+ */
+static inline void
+__wt_txn_op_delete_commit_apply_timestamps(
+ WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_TXN *txn;
+ WT_UPDATE **updp;
+ uint32_t previous_state;
+
+ txn = &session->txn;
+
+ /*
+ * Lock the ref to ensure we don't race with eviction freeing the page
+ * deleted update list or with a page instantiate.
+ */
+ for (;; __wt_yield()) {
+ previous_state = ref->state;
+ WT_ASSERT(session, previous_state != WT_REF_READING);
+ if (previous_state != WT_REF_LOCKED && WT_REF_CAS_STATE(
+ session, ref, previous_state, WT_REF_LOCKED))
+ break;
+ }
+
+ for (updp = ref->page_del->update_list;
+ updp != NULL && *updp != NULL; ++updp) {
+ (*updp)->timestamp = txn->commit_timestamp;
+ }
+
+	/* Unlock the page by setting it back to its previous state. */
+ WT_REF_SET_STATE(ref, previous_state);
+}
+
+/*
* __wt_txn_op_set_timestamp --
* Decide whether to copy a commit timestamp into an update. If the op
* structure doesn't have a populated update or ref field or in prepared
@@ -471,6 +507,10 @@ __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
&op->u.ref->page_del->timestamp : &op->u.op_upd->timestamp;
if (*timestamp == 0)
*timestamp = txn->commit_timestamp;
+
+ if (op->type == WT_TXN_OP_REF_DELETE)
+ __wt_txn_op_delete_commit_apply_timestamps(
+ session, op->u.ref);
}
}
@@ -1075,13 +1115,19 @@ static inline int
__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
bool ignore_prepare_set;
txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
if (txn->isolation != WT_ISO_SNAPSHOT)
return (0);
+ if (txn_global->debug_rollback != 0 &&
+ ++txn_global->debug_ops % txn_global->debug_rollback == 0)
+ return (__wt_txn_rollback_required(session,
+ "debug mode simulated conflict"));
/*
* Always include prepared transactions in this check: they are not
* supposed to affect visibility for update operations.
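A standalone sketch of the cadence the new check produces: with debug_rollback set to N (the debug_mode rollback_error option documented later in this diff), roughly every Nth update check reports a simulated conflict. The counters here are local stand-ins for the shared txn_global fields above.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t debug_ops = 0, debug_rollback = 10;    /* rollback_error=10 */
    int i, rollbacks = 0;

    for (i = 0; i < 100; ++i)
        if (debug_rollback != 0 &&
            ++debug_ops % debug_rollback == 0)
            ++rollbacks;    /* Would return a simulated WT_ROLLBACK. */

    printf("%d simulated rollbacks in 100 update checks\n", rollbacks);
    return (0);
}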
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 17bfb813151..2fe91e312e4 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -2196,6 +2196,16 @@ struct __wt_connection {
* application thread will wait for space to be available in cache
* before giving up. Default will wait forever., an integer greater
* than or equal to 0; default \c 0.}
+ * @config{cache_overflow = (, cache overflow configuration options., a
+ * set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, The maximum number of bytes
+ * that WiredTiger is allowed to use for its cache overflow mechanism.
+ * If the cache overflow file exceeds this size\, a panic will be
+ * triggered. The default value means that the cache overflow file is
+ * unbounded and may use as much space as the filesystem will
+ * accommodate. The minimum non-zero setting is 100MB., an integer
+ * greater than or equal to 0; default \c 0.}
+ * @config{ ),,}
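A minimal usage sketch for this option; error handling is omitted, the home directory is the caller's, and the 1GB cap is arbitrary (byte-count options accept the usual WiredTiger size suffixes).

#include <wiredtiger.h>

int
cap_cache_overflow(const char *home, WT_CONNECTION **connp)
{
    /*
     * Cap the cache overflow (lookaside) file at 1GB; if the file
     * exceeds the cap, WiredTiger panics, per the text above.
     */
    return (wiredtiger_open(home, NULL,
        "create,cache_overflow=(file_max=1GB)", connp));
}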
* @config{cache_overhead, assume the heap allocator overhead is the
* specified percentage\, and adjust the cache usage by that amount (for
* example\, if there is 10GB of data in cache\, a percentage of 10
@@ -2228,6 +2238,28 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;release, compatibility release
* version string., a string; default empty.}
* @config{ ),,}
+ * @config{debug_mode = (, control the settings of various extended
+ * debugging features., a set of related configuration options defined
+ * below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;checkpoint_retention, adjust
+ * log archiving to retain the log records of this number of
+ * checkpoints. Zero or one means perform normal archiving., an integer
+ * between 0 and 1024; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction, if true\, modify internal
+ * algorithms to change skew to force lookaside eviction to happen more
+ * aggressively. This includes but is not limited to not skewing
+ * newest\, not favoring leaf pages\, and modifying the eviction score
+ * mechanism., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;rollback_error, return a WT_ROLLBACK
+ * error from a transaction operation about every Nth operation to
+ * simulate a collision., an integer between 0 and 10M; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;table_logging, if true\, write
+ * transaction related information to the log for all operations\, even
+ * operations for tables with logging turned off. This setting
+ * introduces a log format change that may break older versions of
+ * WiredTiger. These operations are informational and skipped in
+ * recovery., a boolean flag; default \c false.}
+ * @config{ ),,}
* @config{error_prefix, prefix string for error messages., a string;
* default empty.}
* @config{eviction = (, eviction configuration options., a set of
@@ -2795,6 +2827,15 @@ struct __wt_connection {
* thread will wait for space to be available in cache before giving up.
* Default will wait forever., an integer greater than or equal to 0; default \c
* 0.}
+ * @config{cache_overflow = (, cache overflow configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, The maximum number of bytes that
+ * WiredTiger is allowed to use for its cache overflow mechanism. If the cache
+ * overflow file exceeds this size\, a panic will be triggered. The default
+ * value means that the cache overflow file is unbounded and may use as much
+ * space as the filesystem will accommodate. The minimum non-zero setting is
+ * 100MB., an integer greater than or equal to 0; default \c 0.}
+ * @config{ ),,}
* @config{cache_overhead, assume the heap allocator overhead is the specified
* percentage\, and adjust the cache usage by that amount (for example\, if
* there is 10GB of data in cache\, a percentage of 10 means WiredTiger treats
@@ -2843,6 +2884,27 @@ struct __wt_connection {
* true.}
* @config{create, create the database if it does not exist., a boolean flag;
* default \c false.}
+ * @config{debug_mode = (, control the settings of various extended debugging
+ * features., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;checkpoint_retention, adjust log archiving to
+ * retain the log records of this number of checkpoints. Zero or one means
+ * perform normal archiving., an integer between 0 and 1024; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction, if true\, modify internal
+ * algorithms to change skew to force lookaside eviction to happen more
+ * aggressively. This includes but is not limited to not skewing newest\, not
+ * favoring leaf pages\, and modifying the eviction score mechanism., a boolean
+ * flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;rollback_error,
+ * return a WT_ROLLBACK error from a transaction operation about every Nth
+ * operation to simulate a collision., an integer between 0 and 10M; default \c
+ * 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;table_logging, if true\, write
+ * transaction related information to the log for all operations\, even
+ * operations for tables with logging turned off. This setting introduces a log
+ * format change that may break older versions of WiredTiger. These operations
+ * are informational and skipped in recovery., a boolean flag; default \c
+ * false.}
+ * @config{ ),,}
* @config{direct_io, Use \c O_DIRECT on POSIX systems\, and \c
* FILE_FLAG_NO_BUFFERING on Windows to access files. Options are given as a
* list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io
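
Both new option groups documented above (cache_overflow and debug_mode) are ordinary wiredtiger_open configuration strings. A hedged sketch of enabling them together; the home path and every numeric value are placeholders, not recommendations:

#include <stddef.h>
#include <wiredtiger.h>

/*
 * Hedged sketch: open a connection with a bounded cache overflow file
 * and a few of the new debug_mode settings enabled. All values are
 * illustrative (file_max must be 0 or at least 100MB).
 */
static int
open_with_debug_options(WT_CONNECTION **connp)
{
    return (wiredtiger_open("/path/to/home", NULL,
        "create,"
        "cache_overflow=(file_max=1GB),"
        "debug_mode=(checkpoint_retention=5,rollback_error=50,"
        "table_logging=true)",
        connp));
}
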
@@ -4921,6 +4983,12 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_LOGOP_COL_MODIFY 9
/*! row-store modify */
#define WT_LOGOP_ROW_MODIFY 10
+/*
+ * NOTE: Diagnostic-only log operations should have values in
+ * the ignore range.
+ */
+/*! Diagnostic: transaction timestamps */
+#define WT_LOGOP_TXN_TIMESTAMP (WT_LOGOP_IGNORE | 11)
/*! @} */
/*******************************************
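
The note above reserves the "ignore range" for diagnostic-only operations such as WT_LOGOP_TXN_TIMESTAMP, so readers that do not understand such a record can detect and skip it. A hedged sketch of that test; the mask value and helper name are assumptions for illustration, not WiredTiger's actual recovery code:

#include <stdbool.h>
#include <stdint.h>

/* Assumed flag bit marking the diagnostic "ignore range". */
#define SKETCH_LOGOP_IGNORE 0x80u

/*
 * Hedged sketch: report whether a log operation type is diagnostic-only
 * and therefore safe for an older reader to skip.
 */
static bool
logop_is_diagnostic(uint32_t optype)
{
    return ((optype & SKETCH_LOGOP_IGNORE) != 0);
}
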
@@ -5034,737 +5102,741 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1045
/*! cache: cache overflow table insert calls */
#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1046
+/*! cache: cache overflow table max on-disk size */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK_MAX 1047
+/*! cache: cache overflow table on-disk size */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_ONDISK 1048
/*! cache: cache overflow table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1047
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1049
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1048
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1050
/*! cache: eviction calls to get a page */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1049
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1051
/*! cache: eviction calls to get a page found queue empty */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1050
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1052
/*! cache: eviction calls to get a page found queue empty after locking */
-#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1051
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1053
/*! cache: eviction currently operating in aggressive mode */
-#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1052
+#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1054
/*! cache: eviction empty score */
-#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1053
+#define WT_STAT_CONN_CACHE_EVICTION_EMPTY_SCORE 1055
/*! cache: eviction passes of a file */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1054
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_PASSES 1056
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1055
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1057
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1056
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1058
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1057
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1059
/*!
* cache: eviction server slept, because we did not make progress with
* eviction
*/
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1058
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1060
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1059
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1061
/*! cache: eviction state */
-#define WT_STAT_CONN_CACHE_EVICTION_STATE 1060
+#define WT_STAT_CONN_CACHE_EVICTION_STATE 1062
/*! cache: eviction walk target pages histogram - 0-9 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1061
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT10 1063
/*! cache: eviction walk target pages histogram - 10-31 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1062
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT32 1064
/*! cache: eviction walk target pages histogram - 128 and higher */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1063
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_GE128 1065
/*! cache: eviction walk target pages histogram - 32-63 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1064
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT64 1066
/*! cache: eviction walk target pages histogram - 64-128 */
-#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1065
+#define WT_STAT_CONN_CACHE_EVICTION_TARGET_PAGE_LT128 1067
/*! cache: eviction walks abandoned */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1066
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1068
/*! cache: eviction walks gave up because they restarted their walk twice */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1067
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STOPPED 1069
/*!
* cache: eviction walks gave up because they saw too many pages and
* found no candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1068
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_NO_TARGETS 1070
/*!
* cache: eviction walks gave up because they saw too many pages and
* found too few candidates
*/
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1069
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_GAVE_UP_RATIO 1071
/*! cache: eviction walks reached end of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1070
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ENDED 1072
/*! cache: eviction walks started from root of tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1071
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_FROM_ROOT 1073
/*! cache: eviction walks started from saved location in tree */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1072
+#define WT_STAT_CONN_CACHE_EVICTION_WALK_SAVED_POS 1074
/*! cache: eviction worker thread active */
-#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1073
+#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1075
/*! cache: eviction worker thread created */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1074
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1076
/*! cache: eviction worker thread evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1075
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1077
/*! cache: eviction worker thread removed */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1076
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1078
/*! cache: eviction worker thread stable number */
-#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1077
+#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1079
/*!
* cache: failed eviction of pages that exceeded the in-memory maximum
* count
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1078
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1080
/*!
* cache: failed eviction of pages that exceeded the in-memory maximum
* time (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1079
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL_TIME 1081
/*! cache: files with active eviction walks */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1080
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1082
/*! cache: files with new eviction walks started */
-#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1081
+#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1083
/*! cache: force re-tuning of eviction workers once in a while */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1082
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1084
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1083
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1085
/*! cache: hazard pointer check calls */
-#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1084
+#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1086
/*! cache: hazard pointer check entries walked */
-#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1085
+#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1087
/*! cache: hazard pointer maximum array length */
-#define WT_STAT_CONN_CACHE_HAZARD_MAX 1086
+#define WT_STAT_CONN_CACHE_HAZARD_MAX 1088
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1087
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1089
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1088
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1090
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1089
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1091
/*! cache: internal pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1090
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1092
/*! cache: leaf pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1091
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1093
/*! cache: maximum bytes configured */
-#define WT_STAT_CONN_CACHE_BYTES_MAX 1092
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1094
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1093
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1095
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1094
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1096
/*! cache: modified pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1095
+#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1097
/*! cache: operations timed out waiting for space in cache */
-#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1096
+#define WT_STAT_CONN_CACHE_TIMED_OUT_OPS 1098
/*! cache: overflow pages read into cache */
-#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1097
+#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1099
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1098
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1100
/*! cache: page written requiring cache overflow records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1099
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1101
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1100
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1102
/*! cache: pages evicted because they exceeded the in-memory maximum count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1101
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1103
/*!
* cache: pages evicted because they exceeded the in-memory maximum time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1102
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1104
/*! cache: pages evicted because they had chains of deleted items count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1103
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1105
/*!
* cache: pages evicted because they had chains of deleted items time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1104
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1106
/*! cache: pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP 1105
+#define WT_STAT_CONN_CACHE_EVICTION_APP 1107
/*! cache: pages queued for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1106
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1108
/*! cache: pages queued for urgent eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1107
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1109
/*! cache: pages queued for urgent eviction during walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1108
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1110
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1109
+#define WT_STAT_CONN_CACHE_READ 1111
/*! cache: pages read into cache after truncate */
-#define WT_STAT_CONN_CACHE_READ_DELETED 1110
+#define WT_STAT_CONN_CACHE_READ_DELETED 1112
/*! cache: pages read into cache after truncate in prepare state */
-#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1111
+#define WT_STAT_CONN_CACHE_READ_DELETED_PREPARED 1113
/*! cache: pages read into cache requiring cache overflow entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1112
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1114
/*! cache: pages read into cache requiring cache overflow for checkpoint */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1113
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_CHECKPOINT 1115
/*! cache: pages read into cache skipping older cache overflow entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1114
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_SKIPPED 1116
/*!
* cache: pages read into cache with skipped cache overflow entries
* needed later
*/
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1115
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY 1117
/*!
* cache: pages read into cache with skipped cache overflow entries
* needed later by checkpoint
*/
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1116
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE_DELAY_CHECKPOINT 1118
/*! cache: pages requested from the cache */
-#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1117
+#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1119
/*! cache: pages seen by eviction walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1118
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1120
/*! cache: pages selected for eviction unable to be evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1119
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1121
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1120
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1122
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1121
+#define WT_STAT_CONN_CACHE_WRITE 1123
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1122
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1124
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1123
+#define WT_STAT_CONN_CACHE_OVERHEAD 1125
/*! cache: tracked bytes belonging to internal pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1124
+#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1126
/*! cache: tracked bytes belonging to leaf pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_LEAF 1125
+#define WT_STAT_CONN_CACHE_BYTES_LEAF 1127
/*! cache: tracked dirty bytes in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1126
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1128
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1127
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1129
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1128
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1130
/*! capacity: background fsync file handles considered */
-#define WT_STAT_CONN_FSYNC_ALL_FH_TOTAL 1129
+#define WT_STAT_CONN_FSYNC_ALL_FH_TOTAL 1131
/*! capacity: background fsync file handles synced */
-#define WT_STAT_CONN_FSYNC_ALL_FH 1130
+#define WT_STAT_CONN_FSYNC_ALL_FH 1132
/*! capacity: background fsync time (msecs) */
-#define WT_STAT_CONN_FSYNC_ALL_TIME 1131
+#define WT_STAT_CONN_FSYNC_ALL_TIME 1133
/*! capacity: threshold to call fsync */
-#define WT_STAT_CONN_CAPACITY_THRESHOLD 1132
+#define WT_STAT_CONN_CAPACITY_THRESHOLD 1134
/*! capacity: throttled bytes read */
-#define WT_STAT_CONN_CAPACITY_BYTES_READ 1133
+#define WT_STAT_CONN_CAPACITY_BYTES_READ 1135
/*! capacity: throttled bytes written for checkpoint */
-#define WT_STAT_CONN_CAPACITY_BYTES_CKPT 1134
+#define WT_STAT_CONN_CAPACITY_BYTES_CKPT 1136
/*! capacity: throttled bytes written for eviction */
-#define WT_STAT_CONN_CAPACITY_BYTES_EVICT 1135
+#define WT_STAT_CONN_CAPACITY_BYTES_EVICT 1137
/*! capacity: throttled bytes written for log */
-#define WT_STAT_CONN_CAPACITY_BYTES_LOG 1136
+#define WT_STAT_CONN_CAPACITY_BYTES_LOG 1138
/*! capacity: throttled bytes written total */
-#define WT_STAT_CONN_CAPACITY_BYTES_WRITTEN 1137
+#define WT_STAT_CONN_CAPACITY_BYTES_WRITTEN 1139
/*! capacity: time waiting due to total capacity (usecs) */
-#define WT_STAT_CONN_CAPACITY_TIME_TOTAL 1138
+#define WT_STAT_CONN_CAPACITY_TIME_TOTAL 1140
/*! capacity: time waiting during checkpoint (usecs) */
-#define WT_STAT_CONN_CAPACITY_TIME_CKPT 1139
+#define WT_STAT_CONN_CAPACITY_TIME_CKPT 1141
/*! capacity: time waiting during eviction (usecs) */
-#define WT_STAT_CONN_CAPACITY_TIME_EVICT 1140
+#define WT_STAT_CONN_CAPACITY_TIME_EVICT 1142
/*! capacity: time waiting during logging (usecs) */
-#define WT_STAT_CONN_CAPACITY_TIME_LOG 1141
+#define WT_STAT_CONN_CAPACITY_TIME_LOG 1143
/*! capacity: time waiting during read (usecs) */
-#define WT_STAT_CONN_CAPACITY_TIME_READ 1142
+#define WT_STAT_CONN_CAPACITY_TIME_READ 1144
/*! connection: auto adjusting condition resets */
-#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1143
+#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1145
/*! connection: auto adjusting condition wait calls */
-#define WT_STAT_CONN_COND_AUTO_WAIT 1144
+#define WT_STAT_CONN_COND_AUTO_WAIT 1146
/*! connection: detected system time went backwards */
-#define WT_STAT_CONN_TIME_TRAVEL 1145
+#define WT_STAT_CONN_TIME_TRAVEL 1147
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1146
+#define WT_STAT_CONN_FILE_OPEN 1148
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1147
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1149
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1148
+#define WT_STAT_CONN_MEMORY_FREE 1150
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1149
+#define WT_STAT_CONN_MEMORY_GROW 1151
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1150
+#define WT_STAT_CONN_COND_WAIT 1152
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1151
+#define WT_STAT_CONN_RWLOCK_READ 1153
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1152
+#define WT_STAT_CONN_RWLOCK_WRITE 1154
/*! connection: total fsync I/Os */
-#define WT_STAT_CONN_FSYNC_IO 1153
+#define WT_STAT_CONN_FSYNC_IO 1155
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1154
+#define WT_STAT_CONN_READ_IO 1156
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1155
+#define WT_STAT_CONN_WRITE_IO 1157
/*! cursor: cached cursor count */
-#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1156
+#define WT_STAT_CONN_CURSOR_CACHED_COUNT 1158
/*! cursor: cursor close calls that result in cache */
-#define WT_STAT_CONN_CURSOR_CACHE 1157
+#define WT_STAT_CONN_CURSOR_CACHE 1159
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1158
+#define WT_STAT_CONN_CURSOR_CREATE 1160
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1159
+#define WT_STAT_CONN_CURSOR_INSERT 1161
/*! cursor: cursor modify calls */
-#define WT_STAT_CONN_CURSOR_MODIFY 1160
+#define WT_STAT_CONN_CURSOR_MODIFY 1162
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1161
+#define WT_STAT_CONN_CURSOR_NEXT 1163
/*! cursor: cursor operation restarted */
-#define WT_STAT_CONN_CURSOR_RESTART 1162
+#define WT_STAT_CONN_CURSOR_RESTART 1164
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1163
+#define WT_STAT_CONN_CURSOR_PREV 1165
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1164
+#define WT_STAT_CONN_CURSOR_REMOVE 1166
/*! cursor: cursor reserve calls */
-#define WT_STAT_CONN_CURSOR_RESERVE 1165
+#define WT_STAT_CONN_CURSOR_RESERVE 1167
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1166
+#define WT_STAT_CONN_CURSOR_RESET 1168
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1167
+#define WT_STAT_CONN_CURSOR_SEARCH 1169
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1168
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1170
/*! cursor: cursor sweep buckets */
-#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1169
+#define WT_STAT_CONN_CURSOR_SWEEP_BUCKETS 1171
/*! cursor: cursor sweep cursors closed */
-#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1170
+#define WT_STAT_CONN_CURSOR_SWEEP_CLOSED 1172
/*! cursor: cursor sweep cursors examined */
-#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1171
+#define WT_STAT_CONN_CURSOR_SWEEP_EXAMINED 1173
/*! cursor: cursor sweeps */
-#define WT_STAT_CONN_CURSOR_SWEEP 1172
+#define WT_STAT_CONN_CURSOR_SWEEP 1174
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1173
+#define WT_STAT_CONN_CURSOR_UPDATE 1175
/*! cursor: cursors reused from cache */
-#define WT_STAT_CONN_CURSOR_REOPEN 1174
+#define WT_STAT_CONN_CURSOR_REOPEN 1176
/*! cursor: open cursor count */
-#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1175
+#define WT_STAT_CONN_CURSOR_OPEN_COUNT 1177
/*! cursor: truncate calls */
-#define WT_STAT_CONN_CURSOR_TRUNCATE 1176
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1178
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1177
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1179
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1178
+#define WT_STAT_CONN_DH_SWEEP_REF 1180
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1179
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1181
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1180
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1182
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1181
+#define WT_STAT_CONN_DH_SWEEP_TOD 1183
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1182
+#define WT_STAT_CONN_DH_SWEEPS 1184
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1183
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1185
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1184
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1186
/*! lock: checkpoint lock acquisitions */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1185
+#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1187
/*! lock: checkpoint lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1186
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1188
/*! lock: checkpoint lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1187
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1189
/*!
* lock: commit timestamp queue lock application thread time waiting
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1188
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_APPLICATION 1190
/*! lock: commit timestamp queue lock internal thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1189
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WAIT_INTERNAL 1191
/*! lock: commit timestamp queue read lock acquisitions */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1190
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_READ_COUNT 1192
/*! lock: commit timestamp queue write lock acquisitions */
-#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1191
+#define WT_STAT_CONN_LOCK_COMMIT_TIMESTAMP_WRITE_COUNT 1193
/*! lock: dhandle lock application thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1192
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1194
/*! lock: dhandle lock internal thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1193
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1195
/*! lock: dhandle read lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1194
+#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1196
/*! lock: dhandle write lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1195
+#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1197
/*! lock: metadata lock acquisitions */
-#define WT_STAT_CONN_LOCK_METADATA_COUNT 1196
+#define WT_STAT_CONN_LOCK_METADATA_COUNT 1198
/*! lock: metadata lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1197
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1199
/*! lock: metadata lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1198
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1200
/*!
* lock: read timestamp queue lock application thread time waiting
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1199
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_APPLICATION 1201
/*! lock: read timestamp queue lock internal thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1200
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WAIT_INTERNAL 1202
/*! lock: read timestamp queue read lock acquisitions */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1201
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_READ_COUNT 1203
/*! lock: read timestamp queue write lock acquisitions */
-#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1202
+#define WT_STAT_CONN_LOCK_READ_TIMESTAMP_WRITE_COUNT 1204
/*! lock: schema lock acquisitions */
-#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1203
+#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1205
/*! lock: schema lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1204
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1206
/*! lock: schema lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1205
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1207
/*!
* lock: table lock application thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1206
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1208
/*!
* lock: table lock internal thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1207
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1209
/*! lock: table read lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1208
+#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1210
/*! lock: table write lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1209
+#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1211
/*! lock: txn global lock application thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1210
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_APPLICATION 1212
/*! lock: txn global lock internal thread time waiting (usecs) */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1211
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WAIT_INTERNAL 1213
/*! lock: txn global read lock acquisitions */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1212
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_READ_COUNT 1214
/*! lock: txn global write lock acquisitions */
-#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1213
+#define WT_STAT_CONN_LOCK_TXN_GLOBAL_WRITE_COUNT 1215
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1214
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1216
/*! log: force archive time sleeping (usecs) */
-#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1215
+#define WT_STAT_CONN_LOG_FORCE_ARCHIVE_SLEEP 1217
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1216
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1218
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1217
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1219
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1218
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1220
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1219
+#define WT_STAT_CONN_LOG_FLUSH 1221
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1220
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1222
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1221
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1223
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1222
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1224
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1223
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1225
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1224
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1226
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1225
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1227
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1226
+#define WT_STAT_CONN_LOG_SCANS 1228
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1227
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1229
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1228
+#define WT_STAT_CONN_LOG_WRITE_LSN 1230
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1229
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1231
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1230
+#define WT_STAT_CONN_LOG_SYNC 1232
/*! log: log sync time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DURATION 1231
+#define WT_STAT_CONN_LOG_SYNC_DURATION 1233
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1232
+#define WT_STAT_CONN_LOG_SYNC_DIR 1234
/*! log: log sync_dir time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1233
+#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1235
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1234
+#define WT_STAT_CONN_LOG_WRITES 1236
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1235
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1237
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1236
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1238
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1237
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1239
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1238
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1240
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1239
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1241
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1240
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1242
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1241
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1243
/*! log: slot close lost race */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1242
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1244
/*! log: slot close unbuffered waits */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1243
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1245
/*! log: slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1244
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1246
/*! log: slot join atomic update races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1245
+#define WT_STAT_CONN_LOG_SLOT_RACES 1247
/*! log: slot join calls atomic updates raced */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1246
+#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1248
/*! log: slot join calls did not yield */
-#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1247
+#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1249
/*! log: slot join calls found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1248
+#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1250
/*! log: slot join calls slept */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1249
+#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1251
/*! log: slot join calls yielded */
-#define WT_STAT_CONN_LOG_SLOT_YIELD 1250
+#define WT_STAT_CONN_LOG_SLOT_YIELD 1252
/*! log: slot join found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1251
+#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1253
/*! log: slot joins yield time (usecs) */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1252
+#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1254
/*! log: slot transitions unable to find free slot */
-#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1253
+#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1255
/*! log: slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1254
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1256
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1255
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1257
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1256
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1258
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1257
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1259
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1258
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1260
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1259
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1261
/*! perf: file system read latency histogram (bucket 1) - 10-49ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1260
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT50 1262
/*! perf: file system read latency histogram (bucket 2) - 50-99ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1261
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT100 1263
/*! perf: file system read latency histogram (bucket 3) - 100-249ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1262
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT250 1264
/*! perf: file system read latency histogram (bucket 4) - 250-499ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1263
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT500 1265
/*! perf: file system read latency histogram (bucket 5) - 500-999ms */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1264
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_LT1000 1266
/*! perf: file system read latency histogram (bucket 6) - 1000ms+ */
-#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1265
+#define WT_STAT_CONN_PERF_HIST_FSREAD_LATENCY_GT1000 1267
/*! perf: file system write latency histogram (bucket 1) - 10-49ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1266
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT50 1268
/*! perf: file system write latency histogram (bucket 2) - 50-99ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1267
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT100 1269
/*! perf: file system write latency histogram (bucket 3) - 100-249ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1268
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT250 1270
/*! perf: file system write latency histogram (bucket 4) - 250-499ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1269
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT500 1271
/*! perf: file system write latency histogram (bucket 5) - 500-999ms */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1270
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_LT1000 1272
/*! perf: file system write latency histogram (bucket 6) - 1000ms+ */
-#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1271
+#define WT_STAT_CONN_PERF_HIST_FSWRITE_LATENCY_GT1000 1273
/*! perf: operation read latency histogram (bucket 1) - 100-249us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1272
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT250 1274
/*! perf: operation read latency histogram (bucket 2) - 250-499us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1273
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT500 1275
/*! perf: operation read latency histogram (bucket 3) - 500-999us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1274
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT1000 1276
/*! perf: operation read latency histogram (bucket 4) - 1000-9999us */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1275
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_LT10000 1277
/*! perf: operation read latency histogram (bucket 5) - 10000us+ */
-#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1276
+#define WT_STAT_CONN_PERF_HIST_OPREAD_LATENCY_GT10000 1278
/*! perf: operation write latency histogram (bucket 1) - 100-249us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1277
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT250 1279
/*! perf: operation write latency histogram (bucket 2) - 250-499us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1278
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT500 1280
/*! perf: operation write latency histogram (bucket 3) - 500-999us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1279
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT1000 1281
/*! perf: operation write latency histogram (bucket 4) - 1000-9999us */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1280
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_LT10000 1282
/*! perf: operation write latency histogram (bucket 5) - 10000us+ */
-#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1281
+#define WT_STAT_CONN_PERF_HIST_OPWRITE_LATENCY_GT10000 1283
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1282
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1284
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1283
+#define WT_STAT_CONN_REC_PAGES 1285
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1284
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1286
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1285
+#define WT_STAT_CONN_REC_PAGE_DELETE 1287
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1286
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1288
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1287
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1289
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1288
+#define WT_STAT_CONN_SESSION_OPEN 1290
/*! session: session query timestamp calls */
-#define WT_STAT_CONN_SESSION_QUERY_TS 1289
+#define WT_STAT_CONN_SESSION_QUERY_TS 1291
/*! session: table alter failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1290
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1292
/*! session: table alter successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1291
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1293
/*! session: table alter unchanged and skipped */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1292
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1294
/*! session: table compact failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1293
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1295
/*! session: table compact successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1294
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1296
/*! session: table create failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1295
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1297
/*! session: table create successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1296
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1298
/*! session: table drop failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1297
+#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1299
/*! session: table drop successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1298
+#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1300
/*! session: table rebalance failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1299
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1301
/*! session: table rebalance successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1300
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1302
/*! session: table rename failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1301
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1303
/*! session: table rename successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1302
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1304
/*! session: table salvage failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1303
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1305
/*! session: table salvage successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1304
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1306
/*! session: table truncate failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1305
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1307
/*! session: table truncate successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1306
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1308
/*! session: table verify failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1307
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1309
/*! session: table verify successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1308
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1310
/*! thread-state: active filesystem fsync calls */
-#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1309
+#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1311
/*! thread-state: active filesystem read calls */
-#define WT_STAT_CONN_THREAD_READ_ACTIVE 1310
+#define WT_STAT_CONN_THREAD_READ_ACTIVE 1312
/*! thread-state: active filesystem write calls */
-#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1311
+#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1313
/*! thread-yield: application thread time evicting (usecs) */
-#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1312
+#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1314
/*! thread-yield: application thread time waiting for cache (usecs) */
-#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1313
+#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1315
/*!
* thread-yield: connection close blocked waiting for transaction state
* stabilization
*/
-#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1314
+#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1316
/*! thread-yield: connection close yielded for lsm manager shutdown */
-#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1315
+#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1317
/*! thread-yield: data handle lock yielded */
-#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1316
+#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1318
/*!
* thread-yield: get reference for page index and slot time sleeping
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1317
+#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1319
/*! thread-yield: log server sync yielded for log write */
-#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1318
+#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1320
/*! thread-yield: page access yielded due to prepare state change */
-#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1319
+#define WT_STAT_CONN_PREPARED_TRANSITION_BLOCKED_PAGE 1321
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1320
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1322
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1321
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1323
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1322
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1324
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1323
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1325
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1324
+#define WT_STAT_CONN_PAGE_SLEEP 1326
/*!
* thread-yield: page delete rollback time sleeping for state change
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1325
+#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1327
/*! thread-yield: page reconciliation yielded due to child modification */
-#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1326
+#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1328
/*! transaction: Number of prepared updates */
-#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1327
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_COUNT 1329
/*! transaction: Number of prepared updates added to cache overflow */
-#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1328
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_LOOKASIDE_INSERTS 1330
/*! transaction: Number of prepared updates resolved */
-#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1329
+#define WT_STAT_CONN_TXN_PREPARED_UPDATES_RESOLVED 1331
/*! transaction: commit timestamp queue entries walked */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1330
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_WALKED 1332
/*! transaction: commit timestamp queue insert to empty */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1331
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_EMPTY 1333
/*! transaction: commit timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1332
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1334
/*! transaction: commit timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1333
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1335
/*! transaction: commit timestamp queue length */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1334
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1336
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1335
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1337
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1336
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1338
/*! transaction: prepared transactions */
-#define WT_STAT_CONN_TXN_PREPARE 1337
+#define WT_STAT_CONN_TXN_PREPARE 1339
/*! transaction: prepared transactions committed */
-#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1338
+#define WT_STAT_CONN_TXN_PREPARE_COMMIT 1340
/*! transaction: prepared transactions currently active */
-#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1339
+#define WT_STAT_CONN_TXN_PREPARE_ACTIVE 1341
/*! transaction: prepared transactions rolled back */
-#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1340
+#define WT_STAT_CONN_TXN_PREPARE_ROLLBACK 1342
/*! transaction: query timestamp calls */
-#define WT_STAT_CONN_TXN_QUERY_TS 1341
+#define WT_STAT_CONN_TXN_QUERY_TS 1343
/*! transaction: read timestamp queue entries walked */
-#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1342
+#define WT_STAT_CONN_TXN_READ_QUEUE_WALKED 1344
/*! transaction: read timestamp queue insert to empty */
-#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1343
+#define WT_STAT_CONN_TXN_READ_QUEUE_EMPTY 1345
/*! transaction: read timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1344
+#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1346
/*! transaction: read timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1345
+#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1347
/*! transaction: read timestamp queue length */
-#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1346
+#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1348
/*! transaction: rollback to stable calls */
-#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1347
+#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1349
/*! transaction: rollback to stable updates aborted */
-#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1348
+#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1350
/*! transaction: rollback to stable updates removed from cache overflow */
-#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1349
+#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1351
/*! transaction: set timestamp calls */
-#define WT_STAT_CONN_TXN_SET_TS 1350
+#define WT_STAT_CONN_TXN_SET_TS 1352
/*! transaction: set timestamp commit calls */
-#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1351
+#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1353
/*! transaction: set timestamp commit updates */
-#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1352
+#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1354
/*! transaction: set timestamp oldest calls */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1353
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1355
/*! transaction: set timestamp oldest updates */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1354
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1356
/*! transaction: set timestamp stable calls */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE 1355
+#define WT_STAT_CONN_TXN_SET_TS_STABLE 1357
/*! transaction: set timestamp stable updates */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1356
+#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1358
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1357
+#define WT_STAT_CONN_TXN_BEGIN 1359
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1358
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1360
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1359
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1361
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1360
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1362
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1361
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1363
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1362
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1364
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1363
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1365
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1364
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1366
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1365
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1367
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1366
+#define WT_STAT_CONN_TXN_CHECKPOINT 1368
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1367
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1369
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1368
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1370
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1369
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1371
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1370
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1372
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1371
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1373
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1372
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1374
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1373
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1375
/*! transaction: transaction range of timestamps currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1374
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1376
/*! transaction: transaction range of timestamps pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1375
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1377
/*!
* transaction: transaction range of timestamps pinned by the oldest
* timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1376
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1378
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1377
+#define WT_STAT_CONN_TXN_SYNC 1379
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1378
+#define WT_STAT_CONN_TXN_COMMIT 1380
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1379
+#define WT_STAT_CONN_TXN_ROLLBACK 1381
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1380
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1382
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index d93f6a3be7f..9e31180dbb1 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -271,6 +271,12 @@ struct __wt_page_modify;
typedef struct __wt_page_modify WT_PAGE_MODIFY;
struct __wt_process;
typedef struct __wt_process WT_PROCESS;
+struct __wt_rec_chunk;
+ typedef struct __wt_rec_chunk WT_REC_CHUNK;
+struct __wt_rec_dictionary;
+ typedef struct __wt_rec_dictionary WT_REC_DICTIONARY;
+struct __wt_rec_kv;
+ typedef struct __wt_rec_kv WT_REC_KV;
struct __wt_ref;
typedef struct __wt_ref WT_REF;
struct __wt_ref_hist;
@@ -362,6 +368,7 @@ typedef uint64_t wt_timestamp_t;
#include "btree.h"
#include "cache.h"
#include "capacity.h"
+#include "cell.h"
#include "compact.h"
#include "config.h"
#include "cursor.h"
@@ -372,6 +379,7 @@ typedef uint64_t wt_timestamp_t;
#include "meta.h"
#include "optrack.h"
#include "os.h"
+#include "reconcile.h"
#include "schema.h"
#include "thread_group.h"
#include "txn.h"
@@ -407,6 +415,7 @@ typedef uint64_t wt_timestamp_t;
#include "os_fs.i"
#include "os_fstream.i"
#include "packing.i"
+#include "reconcile.i"
#include "serial.i"
#if defined(__cplusplus)
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 9e27a996251..1963a3770fc 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -236,16 +236,26 @@ __log_fs_write(WT_SESSION_IMPL *session,
* thread as needed.
*/
void
-__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
+ int i;
conn = S2C(session);
log = conn->log;
- log->ckpt_lsn = *ckp_lsn;
+ log->ckpt_lsn = *ckpt_lsn;
if (conn->log_cond != NULL)
__wt_cond_signal(session, conn->log_cond);
+ /*
+ * If we are storing debugging LSNs to keep additional log files
+ * from being archived, rotate the newest LSN into the array.
+ */
+ if (conn->debug_ckpt_cnt != 0) {
+ for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i)
+ conn->debug_ckpt[i] = conn->debug_ckpt[i - 1];
+ conn->debug_ckpt[0] = *ckpt_lsn;
+ }
}
/*
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
index f6d7afed0c2..d7f59fd920e 100644
--- a/src/third_party/wiredtiger/src/log/log_auto.c
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -149,7 +149,7 @@ __wt_logop_col_modify_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"col_modify\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__wt_fprintf(session, args->fs,
" \"recno\": %" PRIu64 ",\n", recno));
WT_ERR(__logrec_make_json_str(session, &escaped, &value));
@@ -224,7 +224,7 @@ __wt_logop_col_put_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"col_put\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__wt_fprintf(session, args->fs,
" \"recno\": %" PRIu64 ",\n", recno));
WT_ERR(__logrec_make_json_str(session, &escaped, &value));
@@ -295,7 +295,7 @@ __wt_logop_col_remove_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"col_remove\",\n"));
WT_RET(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_RET(__wt_fprintf(session, args->fs,
" \"recno\": %" PRIu64 "", recno));
return (0);
@@ -357,7 +357,7 @@ __wt_logop_col_truncate_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"col_truncate\",\n"));
WT_RET(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_RET(__wt_fprintf(session, args->fs,
" \"start\": %" PRIu64 ",\n", start));
WT_RET(__wt_fprintf(session, args->fs,
@@ -424,7 +424,7 @@ __wt_logop_row_modify_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"row_modify\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(session, args->fs,
" \"key\": \"%s\",\n", escaped));
@@ -505,7 +505,7 @@ __wt_logop_row_put_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"row_put\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(session, args->fs,
" \"key\": \"%s\",\n", escaped));
@@ -585,7 +585,7 @@ __wt_logop_row_remove_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"row_remove\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &key));
WT_ERR(__wt_fprintf(session, args->fs,
" \"key\": \"%s\"", escaped));
@@ -659,7 +659,7 @@ __wt_logop_row_truncate_print(WT_SESSION_IMPL *session,
WT_RET(__wt_fprintf(session, args->fs,
" \"optype\": \"row_truncate\",\n"));
WT_ERR(__wt_fprintf(session, args->fs,
- " \"fileid\": %" PRIu32 ",\n", fileid));
+ " \"fileid\": %" PRIu32 " 0x%" PRIx32 ",\n", fileid, fileid));
WT_ERR(__logrec_make_json_str(session, &escaped, &start));
WT_ERR(__wt_fprintf(session, args->fs,
" \"start\": \"%s\",\n", escaped));
@@ -798,6 +798,82 @@ __wt_logop_prev_lsn_print(WT_SESSION_IMPL *session,
}
int
+__wt_logop_txn_timestamp_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint64_t time_sec, uint64_t time_nsec, uint64_t commit_ts, uint64_t durable_ts, uint64_t first_ts, uint64_t prepare_ts, uint64_t read_ts)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_TXN_TIMESTAMP;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, time_sec, time_nsec, commit_ts, durable_ts, first_ts, prepare_ts, read_ts));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_txn_timestamp_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint64_t *time_secp, uint64_t *time_nsecp, uint64_t *commit_tsp, uint64_t *durable_tsp, uint64_t *first_tsp, uint64_t *prepare_tsp, uint64_t *read_tsp)
+{
+ WT_DECL_RET;
+ const char *fmt = WT_UNCHECKED_STRING(IIQQQQQQQ);
+ uint32_t optype, size;
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, time_secp, time_nsecp, commit_tsp, durable_tsp, first_tsp, prepare_tsp, read_tsp)) != 0)
+ WT_RET_MSG(session, ret, "logop_txn_timestamp: unpack failure");
+ WT_ASSERT(session, optype == WT_LOGOP_TXN_TIMESTAMP);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_txn_timestamp_print(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args)
+{
+ uint64_t time_sec;
+ uint64_t time_nsec;
+ uint64_t commit_ts;
+ uint64_t durable_ts;
+ uint64_t first_ts;
+ uint64_t prepare_ts;
+ uint64_t read_ts;
+
+ WT_RET(__wt_logop_txn_timestamp_unpack(
+ session, pp, end, &time_sec, &time_nsec, &commit_ts, &durable_ts, &first_ts, &prepare_ts, &read_ts));
+
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"optype\": \"txn_timestamp\",\n"));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"time_sec\": %" PRIu64 ",\n", time_sec));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"time_nsec\": %" PRIu64 ",\n", time_nsec));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"commit_ts\": %" PRIu64 ",\n", commit_ts));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"durable_ts\": %" PRIu64 ",\n", durable_ts));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"first_ts\": %" PRIu64 ",\n", first_ts));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"prepare_ts\": %" PRIu64 ",\n", prepare_ts));
+ WT_RET(__wt_fprintf(session, args->fs,
+ " \"read_ts\": %" PRIu64 "", read_ts));
+ return (0);
+}
+
+int
__wt_txn_op_printlog(WT_SESSION_IMPL *session,
const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args)
{
@@ -848,6 +924,10 @@ __wt_txn_op_printlog(WT_SESSION_IMPL *session,
WT_RET(__wt_logop_prev_lsn_print(session, pp, end, args));
break;
+ case WT_LOGOP_TXN_TIMESTAMP:
+ WT_RET(__wt_logop_txn_timestamp_print(session, pp, end, args));
+ break;
+
WT_ILLEGAL_VALUE(session, optype);
}
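The generated txn_timestamp pack/unpack routines above follow the usual WiredTiger struct-packing pattern: size the record from a format string, grow the buffer, then pack the operation type, the record size and the payload. A rough sketch of the same pattern using the public wiredtiger_struct_* API; the operation code is a placeholder, the buffer handling is simplified, and the size-field adjustment the generated code performs is skipped here:

#include <errno.h>
#include <stdint.h>
#include <wiredtiger.h>

static int
pack_timestamp_record(WT_SESSION *session,
    void *buf, size_t buf_size, uint64_t time_sec, uint64_t commit_ts)
{
	size_t len;
	int ret;

	/* Size the record: optype, record size, then the 64-bit payload. */
	if ((ret = wiredtiger_struct_size(session, &len, "IIQQ",
	    (uint32_t)11, (uint32_t)0, time_sec, commit_ts)) != 0)
		return (ret);
	if (len > buf_size)
		return (ENOMEM);

	/* Pack it; the second field records the packed size itself. */
	return (wiredtiger_struct_pack(session, buf, buf_size, "IIQQ",
	    (uint32_t)11, (uint32_t)len, time_sec, commit_ts));
}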
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c
new file mode 100644
index 00000000000..f1d261c8f42
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rec_child_deleted --
+ * Handle pages with leaf pages in the WT_REF_DELETED state.
+ */
+static int
+__rec_child_deleted(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep)
+{
+ WT_PAGE_DELETED *page_del;
+
+ page_del = ref->page_del;
+
+ /*
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are
+ * a special case during reconciliation. First, if the deletion was a
+ * result of a session truncate call, the deletion may not be visible to
+ * us. In that case, we proceed as with any change not visible during
+ * reconciliation by ignoring the change for the purposes of writing the
+ * internal page.
+ *
+ * In this case, there must be an associated page-deleted structure, and
+ * it holds the transaction ID we care about.
+ *
+ * In some cases, there had better not be any updates we can't see.
+ *
+ * A visible update must be in READY state (i.e. not in LOCKED or
+ * PREPARED state) to be truly visible to others.
+ */
+ if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL &&
+ __wt_page_del_active(session, ref, false))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /*
+ * Deal with any underlying disk blocks.
+ *
+ * First, check to see if there is an address associated with this leaf:
+ * if there isn't, we're done, the underlying page is already gone. If
+ * the page still exists, check for any transactions in the system that
+ * might want to see the page's state before it's deleted.
+ *
+ * If any such transactions exist, we cannot discard the underlying leaf
+ * page to the block manager because the transaction may eventually read
+ * it. However, this write might be part of a checkpoint, and should we
+ * recover to that checkpoint, we'll need to delete the leaf page, else
+ * we'd leak it. The solution is to write a proxy cell on the internal
+ * page ensuring the leaf page is eventually discarded.
+ *
+ * If no such transactions exist, we can discard the leaf page to the
+ * block manager and no cell needs to be written at all. We do this
+ * outside of the underlying tracking routines because this action is
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function
+ * instantiates an entirely new page.)
+ */
+ if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) {
+ /*
+ * Minor memory cleanup: if a truncate call deleted this page
+ * and we were ever forced to instantiate the page in memory,
+ * we would have built a list of updates in the page reference
+ * in order to be able to commit/rollback the truncate. We just
+ * passed a visibility test, discard the update list.
+ */
+ if (page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ WT_RET(__wt_ref_block_free(session, ref));
+ }
+
+ /*
+ * If the original page is gone, we can skip the slot on the internal
+ * page.
+ */
+ if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ return (0);
+ }
+
+ /*
+ * Internal pages with deletes that aren't stable cannot be evicted: we
+ * don't have sufficient information to restore the page's state if it
+ * is subsequently read (we wouldn't know which transactions should see
+ * the original page and which should see the deleted page).
+ */
+ if (F_ISSET(r, WT_REC_EVICT))
+ return (__wt_set_return(session, EBUSY));
+
+ /*
+ * If there are deleted child pages we can't discard immediately, keep
+ * the page dirty so they are eventually freed.
+ */
+ r->leave_dirty = true;
+
+ /*
+ * If the original page cannot be freed, we need to keep a slot on the
+ * page to reference it from the parent page.
+ *
+ * If the delete is not visible in this checkpoint, write the original
+ * address normally. Otherwise, we have to write a proxy record.
+ * If the delete state is not ready, the delete is not visible, as it
+ * is still in the prepared state.
+ */
+ if (!__wt_page_del_active(session, ref, false))
+ *statep = WT_CHILD_PROXY;
+
+ return (0);
+}
+
+/*
+ * __wt_rec_child_modify --
+ * Return whether the internal page's child references any modifications.
+ */
+int
+__wt_rec_child_modify(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep)
+{
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+
+ /* We may acquire a hazard pointer our caller must release. */
+ *hazardp = false;
+
+ /* Default to using the original child address. */
+ *statep = WT_CHILD_ORIGINAL;
+
+ /*
+ * This function is called when walking an internal page to decide how
+ * to handle child pages referenced by the internal page.
+ *
+ * Internal pages are reconciled for two reasons: first, when evicting
+ * an internal page; second, by the checkpoint code when writing internal
+ * pages. During eviction, all pages should be in the WT_REF_DISK or
+ * WT_REF_DELETED state. During checkpoint, eviction that might affect
+ * review of an internal page is prohibited; however, as the subtree is
+ * not reserved for our exclusive use, there are other page states that
+ * must be considered.
+ */
+ for (;; __wt_yield()) {
+ switch (r->tested_ref_state = ref->state) {
+ case WT_REF_DISK:
+ /* On disk, not modified by definition. */
+ goto done;
+
+ case WT_REF_DELETED:
+ /*
+ * The child is in a deleted state.
+ *
+ * It's possible the state could change underneath us as
+ * the page is read in, and we can race between checking
+ * for a deleted state and looking at the transaction ID
+ * to see if the delete is visible to us. Lock down the
+ * structure.
+ */
+ if (!WT_REF_CAS_STATE(
+ session, ref, WT_REF_DELETED, WT_REF_LOCKED))
+ break;
+ ret = __rec_child_deleted(session, r, ref, statep);
+ WT_REF_SET_STATE(ref, WT_REF_DELETED);
+ goto done;
+
+ case WT_REF_LOCKED:
+ /*
+ * Locked.
+ *
+ * We should never be here during eviction: active child
+ * pages in an evicted page's subtree fail the eviction
+ * attempt.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
+ return (__wt_set_return(session, EBUSY));
+
+ /*
+ * If called during checkpoint, the child is being
+ * considered by the eviction server or the child is a
+ * truncated page being read. The eviction may have
+ * started before the checkpoint and so we must wait
+ * for the eviction to be resolved. I suspect we could
+ * handle reads of truncated pages, but we can't
+ * distinguish between the two and reads of truncated
+ * pages aren't expected to be common.
+ */
+ break;
+
+ case WT_REF_LIMBO:
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ /* FALLTHROUGH */
+ case WT_REF_LOOKASIDE:
+ /*
+ * On disk or in cache with lookaside updates.
+ *
+ * We should never be here during eviction: active
+ * child pages in an evicted page's subtree fail the
+ * eviction attempt.
+ */
+ if (F_ISSET(r, WT_REC_EVICT) &&
+ __wt_page_las_active(session, ref)) {
+ WT_ASSERT(session, false);
+ return (__wt_set_return(session, EBUSY));
+ }
+
+ /*
+ * A page evicted with lookaside entries may not have
+ * an address, if no updates were visible to
+ * reconciliation. Any child pages in that state
+ * should be ignored.
+ */
+ if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ WT_CHILD_RELEASE(session, *hazardp, ref);
+ }
+ goto done;
+
+ case WT_REF_MEM:
+ /*
+ * In memory.
+ *
+ * We should never be here during eviction: active child
+ * pages in an evicted page's subtree fail the eviction
+ * attempt.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
+ return (__wt_set_return(session, EBUSY));
+
+ /*
+ * If called during checkpoint, acquire a hazard pointer
+ * so the child isn't evicted, it's an in-memory case.
+ *
+ * This call cannot return split/restart, we have a lock
+ * on the parent which prevents a child page split.
+ *
+ * Set WT_READ_NO_WAIT because we're only interested in
+ * the WT_REF's final state. Pages in transition might
+ * change WT_REF state during our read, and then return
+ * WT_NOTFOUND to us. In that case, loop and look again.
+ */
+ ret = __wt_page_in(session, ref,
+ WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ WT_RET(ret);
+ *hazardp = true;
+ goto in_memory;
+
+ case WT_REF_READING:
+ /*
+ * Being read, not modified by definition.
+ *
+ * We should never be here during eviction: active child
+ * pages in an evicted page's subtree fail the eviction
+ * attempt.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
+ return (__wt_set_return(session, EBUSY));
+ goto done;
+
+ case WT_REF_SPLIT:
+ /*
+ * The page was split out from under us.
+ *
+ * We should never be here during eviction: active child
+ * pages in an evicted page's subtree fail the eviction
+ * attempt.
+ *
+ * We should never be here during checkpoint, dirty page
+ * eviction is shutout during checkpoint, all splits in
+ * process will have completed before we walk any pages
+ * for checkpoint.
+ */
+ WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT);
+ return (__wt_set_return(session, EBUSY));
+
+ WT_ILLEGAL_VALUE(session, r->tested_ref_state);
+ }
+ WT_STAT_CONN_INCR(session, child_modify_blocked_page);
+ }
+
+in_memory:
+ /*
+ * In-memory states: the child is potentially modified if the page's
+ * modify structure has been instantiated. If the modify structure
+ * exists and the page has actually been modified, set that state.
+ * If that's not the case, we would normally use the original cell's
+ * disk address as our reference; however, there are two special cases,
+ * both flagged by a missing block address.
+ *
+ * First, if forced to instantiate a deleted child page and it's never
+ * modified, we end up here with a page that has a modify structure, no
+ * modifications, and no disk address. Ignore those pages, they're not
+ * modified and there is no reason to write the cell.
+ *
+ * Second, insert splits are permitted during checkpoint. When doing the
+ * final checkpoint pass, we first walk the internal page's page-index
+ * and write out any dirty pages we find, then we write out the internal
+ * page in post-order traversal. If we found the split page in the first
+ * step, it will have an address; if we didn't find the split page in
+ * the first step, it won't have an address and we ignore it, it's not
+ * part of the checkpoint.
+ */
+ mod = ref->page->modify;
+ if (mod != NULL && mod->rec_result != 0)
+ *statep = WT_CHILD_MODIFIED;
+ else if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ WT_CHILD_RELEASE(session, *hazardp, ref);
+ }
+
+done: WT_DIAGNOSTIC_YIELD;
+ return (ret);
+}
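The WT_REF_DELETED handling above relies on briefly locking the reference by compare-and-swapping its state, doing the work, then restoring the state. Illustrative only: the same lock-by-CAS shape sketched with C11 atomics rather than the internal WT_REF_CAS_STATE/WT_REF_SET_STATE macros; the enum and function names are invented.

#include <stdatomic.h>
#include <stdbool.h>

enum ref_state { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

/* Atomically move DELETED -> LOCKED; fail if the state changed under us. */
static bool
lock_deleted_ref(_Atomic enum ref_state *state)
{
	enum ref_state expected = REF_DELETED;

	return (atomic_compare_exchange_strong(state, &expected, REF_LOCKED));
}

/* Publish the DELETED state again once the locked work is done. */
static void
unlock_ref(_Atomic enum ref_state *state)
{
	atomic_store(state, REF_DELETED);
}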
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
new file mode 100644
index 00000000000..6a57a9c26d6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -0,0 +1,1077 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ * Check if a bulk-loaded fixed-length column store page needs to split.
+ */
+static inline int
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_RECONCILE *r;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ if (cbulk->entry == cbulk->nrecs) {
+ if (cbulk->entry != 0) {
+ /*
+ * If everything didn't fit, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ *
+ * No need to have a minimum split size boundary, all
+ * pages are filled 100% except the last, allowing it to
+ * grow in the future.
+ */
+ __wt_rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ WT_RET(__wt_rec_split(session, r, 0));
+ }
+ cbulk->entry = 0;
+ cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_fix --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+ __bit_setv(r->first_free, cbulk->entry,
+ btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]);
+ ++cbulk->entry;
+ ++r->recno;
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_fix_bitmap --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_var --
+ * Variable-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_var(
+ WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
+{
+ WT_BTREE *btree;
+ WT_RECONCILE *r;
+ WT_REC_KV *val;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ val = &r->v;
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else
+ /*
+ * Store the bulk cursor's last buffer, not the current value:
+ * we're tracking duplicates, which means we want the previous
+ * value seen, not the current value.
+ */
+ WT_RET(__wt_rec_cell_build_val(session,
+ r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+
+ /* Boundary: split or write the page. */
+ if (WT_CROSSING_SPLIT_BND(r, val->len))
+ WT_RET(__wt_rec_split_crossing_bnd(session, r, val->len));
+
+ /* Copy the value onto the page. */
+ if (btree->dictionary)
+ WT_RET(__wt_rec_dict_replace(session, r, cbulk->rle, val));
+ __wt_rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += cbulk->rle;
+
+ return (0);
+}
+
+/*
+ * __rec_col_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REC_KV *val;
+ uint32_t i;
+
+ mod = page->modify;
+
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Update the starting record number in case we split. */
+ r->recno = multi->key.recno;
+
+ /* Build the value cell. */
+ addr = &multi->addr;
+ __wt_rec_cell_build_addr(session, r,
+ addr->addr, addr->size, __wt_rec_vtype(addr), r->recno);
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, val->len))
+ WT_RET(__wt_rec_split_crossing_bnd(
+ session, r, val->len));
+
+ /* Copy the value onto the page. */
+ __wt_rec_copy_incr(session, r, val);
+ }
+ return (0);
+}
+
+/*
+ * __wt_rec_col_int --
+ * Reconcile a column-store internal page.
+ */
+int
+__wt_rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_CHILD_STATE state;
+ WT_DECL_RET;
+ WT_PAGE *child, *page;
+ WT_REC_KV *val;
+ WT_REF *ref;
+ bool hazard;
+
+ btree = S2BT(session);
+ page = pageref->page;
+ child = NULL;
+ hazard = false;
+
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__wt_rec_split_init(session,
+ r, page, pageref->ref_recno, btree->maxintlpage_precomp));
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /* Update the starting record number in case we split. */
+ r->recno = ref->ref_recno;
+
+ /*
+ * Modified child.
+ * The page may be emptied or internally created during a split.
+ * Deleted/split pages are merged into the parent and discarded.
+ */
+ WT_ERR(__wt_rec_child_modify(session, r, ref, &hazard, &state));
+ addr = NULL;
+ child = ref->page;
+
+ switch (state) {
+ case WT_CHILD_IGNORE:
+ /* Ignored child. */
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+
+ case WT_CHILD_MODIFIED:
+ /*
+ * Modified child. Empty pages are merged into the
+ * parent and discarded.
+ */
+ switch (child->modify->rec_result) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Column-store pages are almost never empty, as
+ * discarding a page would remove a chunk of the
+ * name space. The exceptions are pages created
+ * when the tree is created, and never filled.
+ */
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ WT_ERR(__rec_col_merge(session, r, child));
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(
+ session, child->modify->rec_result);
+ }
+ break;
+ case WT_CHILD_ORIGINAL:
+ /* Original child. */
+ break;
+ case WT_CHILD_PROXY:
+ /*
+ * Deleted child where we write a proxy cell, not yet
+ * supported for column-store.
+ */
+ WT_ERR(__wt_illegal_value(session, state));
+ }
+
+ /*
+ * Build the value cell. The child page address is in one of 3
+ * places: if the page was replaced, the page's modify structure
+ * references it and we built the value cell just above in the
+ * switch statement. Else, the WT_REF->addr reference points to
+ * an on-page cell or an off-page WT_ADDR structure: if it's an
+ * on-page cell, we copy it from the page; otherwise, we build
+ * a new cell.
+ */
+ if (addr == NULL && __wt_off_page(page, ref->addr))
+ addr = ref->addr;
+ if (addr == NULL) {
+ __wt_cell_unpack(ref->addr, vpack);
+ val->buf.data = ref->addr;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+ } else
+ __wt_rec_cell_build_addr(session, r,
+ addr->addr, addr->size,
+ __wt_rec_vtype(addr), ref->ref_recno);
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, val->len))
+ WT_ERR(__wt_rec_split_crossing_bnd(
+ session, r, val->len));
+
+ /* Copy the value onto the page. */
+ __wt_rec_copy_incr(session, r, val);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__wt_rec_split_finish(session, r));
+
+err: WT_CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __wt_rec_col_fix --
+ * Reconcile a fixed-width, column-store leaf page.
+ */
+int
+__wt_rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ uint64_t recno;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+ page = pageref->page;
+
+ WT_RET(__wt_rec_split_init(
+ session, r, page, pageref->ref_recno, btree->maxleafpage));
+
+ /* Copy the original, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->entries * btree->bitcnt));
+
+ /* Update any changes to the original on-page data items. */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+ WT_RET(__wt_rec_txn_read(
+ session, r, ins, NULL, NULL, NULL, &upd));
+ if (upd != NULL)
+ __bit_setv(r->first_free,
+ WT_INSERT_RECNO(ins) - pageref->ref_recno,
+ btree->bitcnt, *upd->data);
+ }
+
+ /* Calculate the entries on the page and the remaining per-page capacity. */
+ entry = page->entries;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries;
+ r->recno += entry;
+
+ /* Walk any append list. */
+ for (ins =
+ WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
+ if (ins == NULL) {
+ /*
+ * If the page split, instantiate any missing records in
+ * the page's name space. (Imagine record 98 is
+ * transactionally visible, 99 wasn't created or is not
+ * yet visible, 100 is visible. Then the page splits and
+ * record 100 moves to another page. When we reconcile
+ * the original page, we write record 98, then we don't
+ * see record 99 for whatever reason. If we've moved
+ * record 100, we don't know to write a deleted record
+ * 99 on the page.)
+ *
+ * The record number recorded during the split is the
+ * first key on the split page, that is, one larger than
+ * the last key on this page, we have to decrement it.
+ */
+ if ((recno =
+ page->modify->mod_col_split_recno) == WT_RECNO_OOB)
+ break;
+ recno -= 1;
+
+ /*
+ * The following loop assumes records to write, and the
+ * previous key might have been visible.
+ */
+ if (r->recno > recno)
+ break;
+ upd = NULL;
+ } else {
+ WT_RET(__wt_rec_txn_read(
+ session, r, ins, NULL, NULL, NULL, &upd));
+ recno = WT_INSERT_RECNO(ins);
+ }
+ for (;;) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ for (;
+ nrecs > 0 && r->recno < recno;
+ --nrecs, ++entry, ++r->recno)
+ __bit_setv(
+ r->first_free, entry, btree->bitcnt, 0);
+
+ if (nrecs > 0) {
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ upd == NULL ? 0 : *upd->data);
+ --nrecs;
+ ++entry;
+ ++r->recno;
+ break;
+ }
+
+ /*
+ * If everything didn't fit, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ *
+ * No need to have a minimum split size boundary, all
+ * pages are filled 100% except the last, allowing it to
+ * grow in the future.
+ */
+ __wt_rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+ WT_RET(__wt_rec_split(session, r, 0));
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+
+ /*
+ * Execute this loop once without an insert item to catch any
+ * missing records due to a split, then quit.
+ */
+ if (ins == NULL)
+ break;
+ }
+
+ /* Update the counters. */
+ __wt_rec_incr(
+ session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /* Write the remnant page. */
+ return (__wt_rec_split_finish(session, r));
+}
+
+/*
+ * __wt_rec_col_fix_slvg --
+ * Reconcile a fixed-width, column-store leaf page created during salvage.
+ */
+int
+__wt_rec_col_fix_slvg(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ uint64_t page_start, page_take;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+ page = pageref->page;
+
+ /*
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it during salvage and clean up after it here because it doesn't
+ * cost much and future column-store formats or operations might allow
+ * for fixed-length format ranges to overlap during salvage, and I
+ * don't want to have to retrofit the code later.
+ */
+ WT_RET(__wt_rec_split_init(
+ session, r, page, pageref->ref_recno, btree->maxleafpage));
+
+ /* We may not be taking all of the entries on the original page. */
+ page_take = salvage->take == 0 ? page->entries : salvage->take;
+ page_start = salvage->skip == 0 ? 0 : salvage->skip;
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+
+ for (; nrecs > 0 && salvage->missing > 0;
+ --nrecs, --salvage->missing, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt, 0);
+
+ for (; nrecs > 0 && page_take > 0;
+ --nrecs, --page_take, ++page_start, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ __bit_getv(page->pg_fix_bitf,
+ (uint32_t)page_start, btree->bitcnt));
+
+ r->recno += entry;
+ __wt_rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /*
+ * We can't split during salvage -- if everything didn't fit, it's
+ * all gone wrong.
+ */
+ if (salvage->missing != 0 || page_take != 0)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(page->type));
+
+ /* Write the page. */
+ return (__wt_rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_var_helper --
+ * Create a column-store variable length record cell and write it onto a
+ * page.
+ */
+static int
+__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_SALVAGE_COOKIE *salvage,
+ WT_ITEM *value, bool deleted, uint8_t overflow_type, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_REC_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * Occasionally, salvage needs to discard records from the beginning or
+ * end of the page, and because the items may be part of a RLE cell, do
+ * the adjustments here. It's not a mistake that we don't bother telling
+ * our caller once we've handled all the records from the page we care
+ * about and it could quit processing the page: salvage is a rare
+ * operation and I don't want to complicate our caller's loop.
+ */
+ if (salvage != NULL) {
+ if (salvage->done)
+ return (0);
+ if (salvage->skip != 0) {
+ if (rle <= salvage->skip) {
+ salvage->skip -= rle;
+ return (0);
+ }
+ rle -= salvage->skip;
+ salvage->skip = 0;
+ }
+ if (salvage->take != 0) {
+ if (rle <= salvage->take)
+ salvage->take -= rle;
+ else {
+ rle = salvage->take;
+ salvage->take = 0;
+ }
+ if (salvage->take == 0)
+ salvage->done = true;
+ }
+ }
+
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else if (overflow_type) {
+ val->cell_len = __wt_cell_pack_ovfl(
+ &val->cell, overflow_type, rle, value->size);
+ val->buf.data = value->data;
+ val->buf.size = value->size;
+ val->len = val->cell_len + value->size;
+ } else
+ WT_RET(__wt_rec_cell_build_val(
+ session, r, value->data, value->size, rle));
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, val->len))
+ WT_RET(__wt_rec_split_crossing_bnd(session, r, val->len));
+
+ /* Copy the value onto the page. */
+ if (!deleted && !overflow_type && btree->dictionary)
+ WT_RET(__wt_rec_dict_replace(session, r, rle, val));
+ __wt_rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += rle;
+
+ return (0);
+}
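The salvage skip/take adjustment in the helper above can be read in isolation: "skip" drops leading records from a run, "take" caps the total written, and exhausting "take" marks salvage done. A standalone sketch with plain integers, assuming the same semantics; for example, skip=3 and take=5 applied to a run of 10 writes 5 records and sets done.

#include <stdbool.h>
#include <stdint.h>

static uint64_t
salvage_adjust_rle(uint64_t rle, uint64_t *skipp, uint64_t *takep, bool *donep)
{
	if (*skipp != 0) {
		if (rle <= *skipp) {		/* Entire run is skipped. */
			*skipp -= rle;
			return (0);
		}
		rle -= *skipp;
		*skipp = 0;
	}
	if (*takep != 0) {
		if (rle <= *takep)
			*takep -= rle;
		else {				/* Truncate the run to the cap. */
			rle = *takep;
			*takep = 0;
		}
		if (*takep == 0)
			*donep = true;		/* Nothing more to salvage. */
	}
	return (rle);				/* Records actually written. */
}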
+
+/*
+ * __wt_rec_col_var --
+ * Reconcile a variable-width column-store leaf page.
+ */
+int
+__wt_rec_col_var(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
+{
+ enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_COL *cip;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(orig);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_ITEM *last;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ uint64_t n, nrepeat, repeat_count, rle, skip, src_recno;
+ uint32_t i, size;
+ bool deleted, last_deleted, orig_deleted, update_no_copy;
+ const void *data;
+
+ btree = S2BT(session);
+ page = pageref->page;
+ last = r->last;
+ vpack = &_vpack;
+ cbt = &r->update_modify_cbt;
+
+ WT_RET(__wt_rec_split_init(session,
+ r, page, pageref->ref_recno, btree->maxleafpage_precomp));
+
+ WT_RET(__wt_scr_alloc(session, 0, &orig));
+ data = NULL;
+ size = 0;
+ upd = NULL;
+
+ /*
+ * The salvage code may be calling us to reconcile a page where there
+ * were missing records in the column-store name space. If taking the
+ * first record from the page, it might be a deleted record, so we
+ * have to give the RLE code a chance to figure that out. Else, if
+ * not taking the first record from the page, write a single element
+ * representing the missing records onto a new page. (Don't pass the
+ * salvage cookie to our helper function in this case, we're handling
+ * one of the salvage cookie fields on our own, and we don't need the
+ * helper function's assistance.)
+ */
+ rle = 0;
+ last_deleted = false;
+ if (salvage != NULL && salvage->missing != 0) {
+ if (salvage->skip == 0) {
+ rle = salvage->missing;
+ last_deleted = true;
+
+ /*
+ * Correct the number of records we're going to "take",
+ * pretending the missing records were on the page.
+ */
+ salvage->take += salvage->missing;
+ } else
+ WT_ERR(__rec_col_var_helper(session,
+ r, NULL, NULL, true, false, salvage->missing));
+ }
+
+ /*
+ * We track two data items through this loop: the previous (last) item
+ * and the current item: if the last item is the same as the current
+ * item, we increment the RLE count for the last item; if the last item
+ * is different from the current item, we write the last item onto the
+ * page, and replace it with the current item. The r->recno counter
+ * tracks records written to the page, and is incremented by the helper
+ * function immediately after writing records to the page. The record
+ * number of our source record, that is, the current item, is maintained
+ * in src_recno.
+ */
+ src_recno = r->recno + rle;
+
+ /* For each entry in the in-memory page... */
+ WT_COL_FOREACH(page, cip, i) {
+ ovfl_state = OVFL_IGNORE;
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ nrepeat = 1;
+ ins = NULL;
+ orig_deleted = true;
+ } else {
+ __wt_cell_unpack(cell, vpack);
+ nrepeat = __wt_cell_rle(vpack);
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+
+ /*
+ * If the original value is "deleted", there's no value
+ * to compare, we're done.
+ */
+ orig_deleted = vpack->type == WT_CELL_DEL;
+ if (orig_deleted)
+ goto record_loop;
+
+ /*
+ * Overflow items are tricky: we don't know until we're
+ * finished processing the set of values if we need the
+ * overflow value or not. If we don't use the overflow
+ * item at all, we have to discard it from the backing
+ * file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is
+ * still needed by any running transaction, we'll cache
+ * a copy in the update list.
+ *
+ * Regardless, we avoid copying in overflow records: if
+ * there's a WT_INSERT entry that modifies a reference
+ * counted overflow record, we may have to write copies
+ * of the overflow record, and in that case we'll do the
+ * comparisons, but we don't read overflow items just to
+ * see if they match records on either side.
+ */
+ if (vpack->ovfl) {
+ ovfl_state = OVFL_UNUSED;
+ goto record_loop;
+ }
+
+ /*
+ * If data is Huffman encoded, we have to decode it in
+ * order to compare it with the last item we saw, which
+ * may have been an update string. This guarantees we
+ * find every single pair of objects we can RLE encode,
+ * including applications updating an existing record
+ * where the new value happens to match a Huffman-
+ * encoded value in a previous or next record.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_COL_VAR, vpack, orig));
+ }
+
+record_loop: /*
+ * Generate on-page entries: loop repeat records, looking for
+ * WT_INSERT entries matching the record number. The WT_INSERT
+ * lists are in sorted order, so only need check the next one.
+ */
+ for (n = 0;
+ n < nrepeat; n += repeat_count, src_recno += repeat_count) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
+ WT_ERR(__wt_rec_txn_read(
+ session, r, ins, cip, vpack, NULL, &upd));
+ ins = WT_SKIP_NEXT(ins);
+ }
+
+ update_no_copy = true; /* No data copy */
+ repeat_count = 1; /* Single record */
+ deleted = false;
+
+ if (upd != NULL) {
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ cbt->slot = WT_COL_SLOT(page, cip);
+ WT_ERR(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
+ data = cbt->iface.value.data;
+ size = (uint32_t)cbt->iface.value.size;
+ update_no_copy = false;
+ break;
+ case WT_UPDATE_STANDARD:
+ data = upd->data;
+ size = upd->size;
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ deleted = true;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
+ }
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing an update save and restore, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * If necessary, when the overflow value was
+ * originally removed, reconciliation appended
+ * a globally visible copy of the value to the
+ * key's update list, meaning the on-page item
+ * isn't accessed after page re-instantiation.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_REC_UPDATE_RESTORE));
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ data = "ovfl-unused";
+ size = WT_STORE_SIZE(strlen("ovfl-unused"));
+ } else {
+ update_no_copy = false; /* Maybe data copy */
+
+ /*
+ * The repeat count is the number of records up
+ * to the next WT_INSERT record, or up to the
+ * end of the entry if we have no more WT_INSERT
+ * records.
+ */
+ if (ins == NULL)
+ repeat_count = nrepeat - n;
+ else
+ repeat_count =
+ WT_INSERT_RECNO(ins) - src_recno;
+
+ deleted = orig_deleted;
+ if (deleted)
+ goto compare;
+
+ /*
+ * If we are handling overflow items, use the
+ * overflow item itself exactly once, after
+ * which we have to copy it into a buffer and
+ * from then on use a complete copy because we
+ * are re-creating a new overflow record each
+ * time.
+ */
+ switch (ovfl_state) {
+ case OVFL_UNUSED:
+ /*
+ * An as-yet-unused overflow item.
+ *
+ * We're going to copy the on-page cell,
+ * write out any record we're tracking.
+ */
+ if (rle != 0) {
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last,
+ last_deleted, 0, rle));
+ rle = 0;
+ }
+
+ last->data = vpack->data;
+ last->size = vpack->size;
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, false,
+ WT_CELL_VALUE_OVFL, repeat_count));
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = true;
+
+ ovfl_state = OVFL_USED;
+ continue;
+ case OVFL_USED:
+ /*
+ * Original is an overflow item; we used
+ * it for a key and now we need another
+ * copy; read it into memory.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_COL_VAR, vpack, orig));
+
+ ovfl_state = OVFL_IGNORE;
+ /* FALLTHROUGH */
+ case OVFL_IGNORE:
+ /*
+ * Original is an overflow item and we
+ * were forced to copy it into memory,
+ * or the original wasn't an overflow
+ * item; use the data copied into orig.
+ */
+ data = orig->data;
+ size = (uint32_t)orig->size;
+ break;
+ }
+ }
+
+compare: /*
+ * If we have a record against which to compare, and
+ * the records compare equal, increment the rle counter
+ * and continue. If the records don't compare equal,
+ * output the last record and swap the last and current
+ * buffers: do NOT update the starting record number,
+ * we've been doing that all along.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ rle += repeat_count;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ /*
+ * We can't simply assign the data values into
+ * the last buffer because they may have come
+ * from a copy built from an encoded/overflow
+ * cell and creating the next record is going
+ * to overwrite that memory. Check, because
+ * encoded/overflow cells aren't that common
+ * and we'd like to avoid the copy. If data
+ * was taken from the current unpack structure
+ * (which points into the page), or was taken
+ * from an update structure, we can just use
+ * the pointers, they're not moving.
+ */
+ if (data == vpack->data || update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+ last_deleted = deleted;
+ rle = repeat_count;
+ }
+
+ /*
+ * The first time we find an overflow record we never used,
+ * discard the underlying blocks, they're no longer useful.
+ */
+ if (ovfl_state == OVFL_UNUSED &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_remove(
+ session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
+ }
+
+ /* Walk any append list. */
+ for (ins =
+ WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
+ if (ins == NULL) {
+ /*
+ * If the page split, instantiate any missing records in
+ * the page's name space. (Imagine record 98 is
+ * transactionally visible, 99 wasn't created or is not
+ * yet visible, 100 is visible. Then the page splits and
+ * record 100 moves to another page. When we reconcile
+ * the original page, we write record 98, then we don't
+ * see record 99 for whatever reason. If we've moved
+ * record 100, we don't know to write a deleted record
+ * 99 on the page.)
+ *
+ * Assert the recorded record number is past the end of
+ * the page.
+ *
+ * The record number recorded during the split is the
+ * first key on the split page, that is, one larger than
+ * the last key on this page, we have to decrement it.
+ */
+ if ((n = page->
+ modify->mod_col_split_recno) == WT_RECNO_OOB)
+ break;
+ WT_ASSERT(session, n >= src_recno);
+ n -= 1;
+
+ upd = NULL;
+ } else {
+ WT_ERR(__wt_rec_txn_read(
+ session, r, ins, NULL, NULL, NULL, &upd));
+ n = WT_INSERT_RECNO(ins);
+ }
+ while (src_recno <= n) {
+ deleted = false;
+ update_no_copy = true;
+
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space, and these gaps can be huge.
+ * If we're in a set of deleted records, skip the boring
+ * part.
+ */
+ if (src_recno < n) {
+ deleted = true;
+ if (last_deleted) {
+ /*
+ * The record adjustment is decremented
+ * by one so we can naturally fall into
+ * the RLE accounting below, where we
+ * increment rle by one, then continue
+ * in the outer loop, where we increment
+ * src_recno by one.
+ */
+ skip = (n - src_recno) - 1;
+ rle += skip;
+ src_recno += skip;
+ }
+ } else if (upd == NULL)
+ deleted = true;
+ else
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ /*
+ * Impossible slot, there's no backing
+ * on-page item.
+ */
+ cbt->slot = UINT32_MAX;
+ WT_ERR(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
+ data = cbt->iface.value.data;
+ size = (uint32_t)cbt->iface.value.size;
+ update_no_copy = false;
+ break;
+ case WT_UPDATE_STANDARD:
+ data = upd->data;
+ size = upd->size;
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ deleted = true;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ goto next;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state. We can't simply assign
+ * the data values into the last buffer because they may
+ * be a temporary copy built from a chain of modified
+ * updates and creating the next record will overwrite
+ * that memory. Check, we'd like to avoid the copy. If
+ * data was taken from an update structure, we can just
+ * use the pointers, they're not moving.
+ */
+ if (!deleted) {
+ if (update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+
+ /* Ready for the next loop, reset the RLE counter. */
+ last_deleted = deleted;
+ rle = 1;
+
+ /*
+ * Move to the next record. It's not a simple increment
+ * because if it's the maximum record, incrementing it
+ * wraps to 0 and this turns into an infinite loop.
+ */
+next: if (src_recno == UINT64_MAX)
+ break;
+ ++src_recno;
+ }
+
+ /*
+ * Execute this loop once without an insert item to catch any
+ * missing records due to a split, then quit.
+ */
+ if (ins == NULL)
+ break;
+ }
+
+ /* If we were tracking a record, write it. */
+ if (rle != 0)
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, last_deleted, 0, rle));
+
+ /* Write the remnant page. */
+ ret = __wt_rec_split_finish(session, r);
+
+err: __wt_scr_free(session, &orig);
+ return (ret);
+}
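The body of __wt_rec_col_var above is, at heart, run-length encoding: track the last value, extend the run while the current value matches, and flush the run when it doesn't. A minimal sketch of that idea over a plain integer array, where flush_run() is only a stand-in for the cell-writing helper:

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the helper that writes an RLE cell onto the page. */
static void
flush_run(int64_t value, uint64_t rle)
{
	printf("value %" PRId64 " repeated %" PRIu64 " times\n", value, rle);
}

static void
rle_encode(const int64_t *items, size_t n)
{
	uint64_t rle;
	size_t i;
	int64_t last;

	if (n == 0)
		return;
	last = items[0];
	rle = 1;
	for (i = 1; i < n; ++i)
		if (items[i] == last)
			++rle;			/* Extend the tracked run. */
		else {
			flush_run(last, rle);	/* Write the previous run. */
			last = items[i];
			rle = 1;
		}
	flush_run(last, rle);			/* Write the final run. */
}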
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c b/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c
new file mode 100644
index 00000000000..11707f77620
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_dictionary.c
@@ -0,0 +1,200 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rec_dictionary_skip_search --
+ * Search a dictionary skiplist.
+ */
+static WT_REC_DICTIONARY *
+__rec_dictionary_skip_search(WT_REC_DICTIONARY **head, uint64_t hash)
+{
+ WT_REC_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ if ((*e)->hash == hash) /* Exact match */
+ return (*e);
+ if ((*e)->hash > hash) { /* Drop down a level */
+ --i;
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ * Search a dictionary skiplist, returning an insert/remove stack.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+ WT_REC_DICTIONARY **head, WT_REC_DICTIONARY ***stack, uint64_t hash)
+{
+ WT_REC_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+ if (*e == NULL || (*e)->hash > hash)
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ * Insert an entry into the dictionary skip-list.
+ */
+static void
+__rec_dictionary_skip_insert(
+ WT_REC_DICTIONARY **head, WT_REC_DICTIONARY *e, uint64_t hash)
+{
+ WT_REC_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Insert the new entry into the skiplist. */
+ __rec_dictionary_skip_search_stack(head, stack, hash);
+ for (i = 0; i < e->depth; ++i) {
+ e->next[i] = *stack[i];
+ *stack[i] = e;
+ }
+}
+
+/*
+ * __wt_rec_dictionary_init --
+ * Allocate and initialize the dictionary.
+ */
+int
+__wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
+{
+ u_int depth, i;
+
+ /* Free any previous dictionary. */
+ __wt_rec_dictionary_free(session, r);
+
+ r->dictionary_slots = slots;
+ WT_RET(__wt_calloc(session,
+ r->dictionary_slots, sizeof(WT_REC_DICTIONARY *), &r->dictionary));
+ for (i = 0; i < r->dictionary_slots; ++i) {
+ depth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_REC_DICTIONARY) +
+ depth * sizeof(WT_REC_DICTIONARY *), &r->dictionary[i]));
+ r->dictionary[i]->depth = depth;
+ }
+ return (0);
+}
+
+/*
+ * __wt_rec_dictionary_free --
+ * Free the dictionary.
+ */
+void
+__wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ u_int i;
+
+ if (r->dictionary == NULL)
+ return;
+
+ /*
+ * We don't correct dictionary_slots when we fail during allocation,
+ * but that's OK: the value is either NULL or a memory reference to
+ * be freed.
+ */
+ for (i = 0; i < r->dictionary_slots; ++i)
+ __wt_free(session, r->dictionary[i]);
+ __wt_free(session, r->dictionary);
+}
+
+/*
+ * __wt_rec_dictionary_reset --
+ * Reset the dictionary when reconciliation restarts and when crossing a
+ * page boundary (a potential split).
+ */
+void
+__wt_rec_dictionary_reset(WT_RECONCILE *r)
+{
+ if (r->dictionary_slots) {
+ r->dictionary_next = 0;
+ memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
+ }
+}
+
+/*
+ * __wt_rec_dictionary_lookup --
+ * Check the dictionary for a matching value on this page.
+ */
+int
+__wt_rec_dictionary_lookup(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp)
+{
+ WT_REC_DICTIONARY *dp, *next;
+ uint64_t hash;
+ bool match;
+
+ *dpp = NULL;
+
+ /* Search the dictionary, and return any match we find. */
+ hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+ for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
+ dp != NULL && dp->hash == hash; dp = dp->next[0]) {
+ WT_RET(__wt_cell_pack_data_match(
+ (WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset),
+ &val->cell, val->buf.data, &match));
+ if (match) {
+ WT_STAT_DATA_INCR(session, rec_dictionary);
+ *dpp = dp;
+ return (0);
+ }
+ }
+
+ /*
+ * We're not doing value replacement in the dictionary. We stop adding
+ * new entries if we run out of empty dictionary slots (but continue to
+ * use the existing entries). I can't think of any reason a leaf page
+ * value is more likely to be seen because it was seen more recently
+ * than some other value: if we find working sets where that's not the
+ * case, it shouldn't be too difficult to maintain a pointer which is
+ * the next dictionary slot to re-use.
+ */
+ if (r->dictionary_next >= r->dictionary_slots)
+ return (0);
+
+ /*
+ * Set the hash value, we'll add this entry into the dictionary when we
+ * write it into the page's disk image buffer (because that's when we
+ * know where on the page it will be written).
+ */
+ next = r->dictionary[r->dictionary_next++];
+ next->offset = 0; /* Not necessary, just cautious. */
+ next->hash = hash;
+ __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
+ *dpp = next;
+ return (0);
+}
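Dictionary lookups above hash the value bytes and then walk a skiplist ordered by hash. For reference, a textbook FNV-1a 64-bit hash, similar in spirit to the internal __wt_hash_fnv64 (the internal variant may differ in detail):

#include <stddef.h>
#include <stdint.h>

static uint64_t
fnv1a_64(const void *data, size_t len)
{
	const uint8_t *p;
	uint64_t hash;
	size_t i;

	hash = 14695981039346656037ULL;		/* FNV-1a offset basis */
	for (p = data, i = 0; i < len; ++i) {
		hash ^= p[i];
		hash *= 1099511628211ULL;	/* FNV-1a prime */
	}
	return (hash);
}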
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c
new file mode 100644
index 00000000000..dc249f6a22f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c
@@ -0,0 +1,1025 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rec_key_state_update --
+ * Update prefix and suffix compression based on the last key.
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
+{
+ WT_ITEM *a;
+
+ /*
+ * If writing an overflow key onto the page, don't update the "last key"
+ * value, and leave the state of prefix compression alone. (If we are
+ * currently doing prefix compression, we have a key state which will
+ * continue to work, we're just skipping the key just created because
+ * it's an overflow key and doesn't participate in prefix compression.
+ * If we are not currently doing prefix compression, we can't start, an
+ * overflow key doesn't give us any state.)
+ *
+ * Additionally, if we wrote an overflow key onto the page, turn off the
+ * suffix compression of row-store internal node keys. (When we split,
+ * "last key" is the largest key on the previous page, and "cur key" is
+ * the first key on the next page, which is being promoted. In some
+ * cases we can discard bytes from the "cur key" that are not needed to
+ * distinguish between the "last key" and "cur key", compressing the
+ * size of keys on internal nodes. If we just built an overflow key,
+ * we're not going to update the "last key", making suffix compression
+ * impossible for the next key. Alternatively, we could remember where
+ * the last key was on the page, detect it's an overflow key, read it
+ * from disk and do suffix compression, but that's too much work for an
+ * unlikely event.)
+ *
+ * If we're not writing an overflow key on the page, update the last-key
+ * value and turn on both prefix and suffix compression.
+ */
+ if (ovfl_key)
+ r->key_sfx_compress = false;
+ else {
+ a = r->cur;
+ r->cur = r->last;
+ r->last = a;
+
+ r->key_pfx_compress = r->key_pfx_compress_conf;
+ r->key_sfx_compress = r->key_sfx_compress_conf;
+ }
+}
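The suffix-compression case in the comment can be made concrete with a small helper; this is a sketch with hypothetical names, not WiredTiger's implementation. When promoting the first key of a new page into the parent, only the bytes up to and including the first byte that differs from the previous page's last key are needed to keep tree searches correct:

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Return how many leading bytes of "cur" must be kept so the
     * promoted key still sorts after "last"; the remaining bytes can
     * be discarded from the internal-page key.
     */
    static size_t
    suffix_compress_len(const uint8_t *last, size_t last_len,
        const uint8_t *cur, size_t cur_len)
    {
        size_t i, min_len;

        min_len = last_len < cur_len ? last_len : cur_len;
        for (i = 0; i < min_len; ++i)
            if (last[i] != cur[i])
                return (i + 1);    /* First differing byte decides */

        /* "last" is a prefix of "cur": keep one byte past the prefix. */
        return (min_len < cur_len ? min_len + 1 : cur_len);
    }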
+
+/*
+ * __rec_cell_build_int_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store internal page.
+ */
+static int
+__rec_cell_build_int_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_REC_KV *key;
+
+ *is_ovflp = false;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ /* Copy the bytes into the "current" and key buffers. */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+ WT_RET(__wt_buf_set(session, &key->buf, data, size));
+
+ /* Create an overflow object if the data won't fit. */
+ if (size > btree->maxintlkey) {
+ WT_STAT_DATA_INCR(session, rec_overflow_key_internal);
+
+ *is_ovflp = true;
+ return (__wt_rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+
+ key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_leaf_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store leaf page.
+ */
+static int
+__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_REC_KV *key;
+ size_t pfx_max;
+ const uint8_t *a, *b;
+ uint8_t pfx;
+
+ *is_ovflp = false;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ pfx = 0;
+ if (data == NULL)
+ /*
+ * When data is NULL, our caller has a prefix compressed key
+ * they can't use (probably because they just crossed a split
+ * point). Use the full key saved when last called, instead.
+ */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, r->cur->data, r->cur->size));
+ else {
+ /*
+ * Save a copy of the key for later reference: we use the full
+ * key for prefix-compression comparisons, and if we are, for
+ * any reason, unable to use the compressed key we generate.
+ */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+
+ /*
+ * Do prefix compression on the key. We know by definition the
+ * previous key sorts before the current key, which means the
+ * keys must differ and we just need to compare up to the
+ * shorter of the two keys.
+ */
+ if (r->key_pfx_compress) {
+ /*
+ * We can't compress out more than 256 bytes, limit the
+ * comparison to that.
+ */
+ pfx_max = UINT8_MAX;
+ if (size < pfx_max)
+ pfx_max = size;
+ if (r->last->size < pfx_max)
+ pfx_max = r->last->size;
+ for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
+ if (*a++ != *b++)
+ break;
+
+ /*
+ * Prefix compression may cost us CPU and memory when
+ * the page is re-loaded; don't do it unless there's a
+ * reasonable gain.
+ */
+ if (pfx < btree->prefix_compression_min)
+ pfx = 0;
+ else
+ WT_STAT_DATA_INCRV(
+ session, rec_prefix_compression, pfx);
+ }
+
+ /* Copy the non-prefix bytes into the key buffer. */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, (uint8_t *)data + pfx, size - pfx));
+ }
+
+ /* Optionally compress the key using the Huffman engine. */
+ if (btree->huffman_key != NULL)
+ WT_RET(__wt_huffman_encode(session, btree->huffman_key,
+ key->buf.data, (uint32_t)key->buf.size, &key->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (key->buf.size > btree->maxleafkey) {
+ /*
+ * Overflow objects aren't prefix compressed -- rebuild any
+ * object that was prefix compressed.
+ */
+ if (pfx == 0) {
+ WT_STAT_DATA_INCR(session, rec_overflow_key_leaf);
+
+ *is_ovflp = true;
+ return (__wt_rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+ return (
+ __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
+ }
+
+ key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
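A compact sketch of the prefix-compression calculation in the function above (hypothetical helper; it assumes, as the cell format does, that the prefix count fits in a single byte):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Return the number of leading bytes shared with the previous key,
     * capped at 255 because the count is stored in one byte, and
     * dropped entirely when the saving is below a configured minimum.
     */
    static uint8_t
    prefix_compress(const uint8_t *prev, size_t prev_len,
        const uint8_t *key, size_t key_len, uint8_t prefix_min)
    {
        size_t max;
        uint8_t pfx;

        max = UINT8_MAX;
        if (prev_len < max)
            max = prev_len;
        if (key_len < max)
            max = key_len;

        for (pfx = 0; pfx < max; ++pfx)
            if (prev[pfx] != key[pfx])
                break;

        return (pfx < prefix_min ? 0 : pfx);
    }

Returning zero below the minimum mirrors the prefix_compression_min check: a tiny prefix saves little space on disk but still costs CPU to rebuild the key when the page is read.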
+
+/*
+ * __wt_bulk_insert_row --
+ * Row-store bulk insert.
+ */
+int
+__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ WT_REC_KV *key, *val;
+ bool ovfl_key;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ key = &r->k;
+ val = &r->v;
+ WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
+ cursor->key.data, cursor->key.size, &ovfl_key));
+ WT_RET(__wt_rec_cell_build_val(session, r, /* Build value cell */
+ cursor->value.data, cursor->value.size, (uint64_t)0));
+
+ /* Boundary: split or write the page. */
+ if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
+ /*
+ * Turn off prefix compression until a full key is written to the
+ * new page, and (unless already working with an overflow key),
+ * rebuild the key without compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = false;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ WT_RET(__wt_rec_split_crossing_bnd(
+ session, r, key->len + val->len));
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __wt_rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = true;
+ else {
+ r->all_empty_value = false;
+ if (btree->dictionary)
+ WT_RET(__wt_rec_dict_replace(session, r, 0, val));
+ __wt_rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+ return (0);
+}
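For context, this function is driven by WiredTiger's public bulk-cursor API. Roughly how an application exercises it, with error handling omitted and the home directory and table name as placeholders; bulk cursors require keys in sorted order, which is what lets reconciliation build pages strictly left to right:

    #include <stdio.h>
    #include <wiredtiger.h>

    int
    main(void)
    {
        WT_CONNECTION *conn;
        WT_CURSOR *cursor;
        WT_SESSION *session;
        char key[32], value[32];
        int i;

        (void)wiredtiger_open("WT_HOME", NULL, "create", &conn);
        (void)conn->open_session(conn, NULL, NULL, &session);
        (void)session->create(
            session, "table:bulk_demo", "key_format=S,value_format=S");

        /* A bulk cursor; keys must be inserted in sorted order. */
        (void)session->open_cursor(
            session, "table:bulk_demo", NULL, "bulk", &cursor);
        for (i = 0; i < 1000; ++i) {
            (void)snprintf(key, sizeof(key), "key-%06d", i);
            (void)snprintf(value, sizeof(value), "value-%d", i);
            cursor->set_key(cursor, key);
            cursor->set_value(cursor, value);
            (void)cursor->insert(cursor);
        }
        (void)cursor->close(cursor);

        return (conn->close(conn, NULL));
    }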
+
+/*
+ * __rec_row_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REC_KV *key, *val;
+ uint32_t i;
+ bool ovfl_key;
+
+ mod = page->modify;
+
+ key = &r->k;
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Build the key and value cells. */
+ WT_RET(__rec_cell_build_int_key(session, r,
+ WT_IKEY_DATA(multi->key.ikey),
+ r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
+ r->cell_zero = false;
+
+ addr = &multi->addr;
+ __wt_rec_cell_build_addr(session, r,
+ addr->addr, addr->size, __wt_rec_vtype(addr), WT_RECNO_OOB);
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, key->len + val->len))
+ WT_RET(__wt_rec_split_crossing_bnd(
+ session, r, key->len + val->len));
+
+ /* Copy the key and value onto the page. */
+ __wt_rec_copy_incr(session, r, key);
+ __wt_rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+ return (0);
+}
+
+/*
+ * __wt_rec_row_int --
+ * Reconcile a row-store internal page.
+ */
+int
+__wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_CHILD_STATE state;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *child;
+ WT_REC_KV *key, *val;
+ WT_REF *ref;
+ size_t size;
+ u_int vtype;
+ bool hazard, key_onpage_ovfl, ovfl_key;
+ const void *p;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = false;
+
+ key = &r->k;
+ kpack = &_kpack;
+ WT_CLEAR(*kpack); /* -Wuninitialized */
+ val = &r->v;
+ vpack = &_vpack;
+ WT_CLEAR(*vpack); /* -Wuninitialized */
+
+ ikey = NULL; /* -Wuninitialized */
+ cell = NULL;
+ key_onpage_ovfl = false;
+
+ WT_RET(__wt_rec_split_init(
+ session, r, page, 0, btree->maxintlpage_precomp));
+
+ /*
+ * Ideally, we'd never store the 0th key on row-store internal pages
+ * because it's never used during tree search and there's no reason
+ * to waste the space. The problem is how we do splits: when we split,
+ * we've potentially picked out several "split points" in the buffer
+ * which is overflowing the maximum page size, and when the overflow
+ * happens, we go back and physically split the buffer, at those split
+ * points, into new pages. It would be both difficult and expensive
+ * to re-process the 0th key at each split point to be an empty key,
+ * so we don't do that. However, we are reconciling an internal page
+ * for whatever reason, and the 0th key is known to be useless. We
+ * truncate the key to a single byte instead of removing it entirely
+ * because that simplifies various things in other parts of the code
+ * (we don't have to special-case transforming the page from its disk
+ * image to its in-memory version, for example).
+ */
+ r->cell_zero = true;
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * There are different paths if the key is an overflow item vs.
+ * a straightforward on-page value. If it is an overflow item, we
+ * would have instantiated it, and we can use that fact to set
+ * things up.
+ *
+ * Note the cell reference and unpacked key cell are available
+ * only in the case of an instantiated, off-page key; we don't
+ * bother setting them if that's not possible.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) {
+ cell = NULL;
+ key_onpage_ovfl = false;
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey != NULL && ikey->cell_offset != 0) {
+ cell =
+ WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ __wt_cell_unpack(cell, kpack);
+ key_onpage_ovfl = kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM;
+ }
+ }
+
+ WT_ERR(__wt_rec_child_modify(session, r, ref, &hazard, &state));
+ addr = ref->addr;
+ child = ref->page;
+
+ switch (state) {
+ case WT_CHILD_IGNORE:
+ /*
+ * Ignored child.
+ *
+ * Overflow keys referencing pages we're not writing are
+ * no longer useful, schedule them for discard. Don't
+ * worry about instantiation, internal page keys are
+ * always instantiated. Don't worry about reuse,
+ * reusing this key in this reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+
+ case WT_CHILD_MODIFIED:
+ /*
+ * Modified child. Empty pages are merged into the
+ * parent and discarded.
+ */
+ switch (child->modify->rec_result) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Overflow keys referencing empty pages are no
+ * longer useful, schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ /*
+ * Overflow keys referencing split pages are no
+ * longer useful (the split page's key is the
+ * interesting key); schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+
+ WT_ERR(__rec_row_merge(session, r, child));
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ /*
+ * If the page is replaced, the page's modify
+ * structure has the page's address.
+ */
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(
+ session, child->modify->rec_result);
+ }
+ break;
+ case WT_CHILD_ORIGINAL:
+ /* Original child. */
+ break;
+ case WT_CHILD_PROXY:
+ /* Deleted child where we write a proxy cell. */
+ break;
+ }
+
+ /*
+ * Build the value cell, the child page's address. Addr points
+ * to an on-page cell or an off-page WT_ADDR structure. There's
+ * a special cell type in the case of page deletion requiring
+ * a proxy cell, otherwise use the information from the addr or
+ * original cell.
+ */
+ if (__wt_off_page(page, addr)) {
+ p = addr->addr;
+ size = addr->size;
+ vtype = state == WT_CHILD_PROXY ?
+ WT_CELL_ADDR_DEL : __wt_rec_vtype(addr);
+ } else {
+ __wt_cell_unpack(ref->addr, vpack);
+ p = vpack->data;
+ size = vpack->size;
+ vtype = state == WT_CHILD_PROXY ?
+ WT_CELL_ADDR_DEL : (u_int)vpack->raw;
+ }
+ __wt_rec_cell_build_addr(
+ session, r, p, size, vtype, WT_RECNO_OOB);
+ WT_CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /*
+ * Build key cell.
+ * Truncate any 0th key, internal pages don't need 0th keys.
+ */
+ if (key_onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = true;
+ } else {
+ __wt_ref_key(page, ref, &p, &size);
+ WT_ERR(__rec_cell_build_int_key(
+ session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
+ }
+ r->cell_zero = false;
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, key->len + val->len)) {
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the key now because we are
+ * about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session, r->cur,
+ WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = false;
+ }
+
+ WT_ERR(__wt_rec_split_crossing_bnd(
+ session, r, key->len + val->len));
+ }
+
+ /* Copy the key and value onto the page. */
+ __wt_rec_copy_incr(session, r, key);
+ __wt_rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__wt_rec_split_finish(session, r));
+
+err: WT_CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_leaf_insert --
+ * Walk an insert chain, writing K/V pairs.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BTREE *cbt;
+ WT_REC_KV *key, *val;
+ WT_UPDATE *upd;
+ bool ovfl_key, upd_saved;
+
+ btree = S2BT(session);
+ cbt = &r->update_modify_cbt;
+
+ key = &r->k;
+ val = &r->v;
+
+ for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+ WT_RET(__wt_rec_txn_read(
+ session, r, ins, NULL, NULL, &upd_saved, &upd));
+
+ if (upd == NULL) {
+ /*
+ * If no update is visible but some were saved, check
+ * for splits.
+ */
+ if (!upd_saved)
+ continue;
+ if (!__wt_rec_need_split(r, WT_INSERT_KEY_SIZE(ins)))
+ continue;
+
+ /* Copy the current key into place and then split. */
+ WT_RET(__wt_buf_set(session, r->cur,
+ WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
+ WT_RET(__wt_rec_split_crossing_bnd(
+ session, r, WT_INSERT_KEY_SIZE(ins)));
+
+ /*
+ * Turn off prefix and suffix compression until a full
+ * key is written into the new page.
+ */
+ r->key_pfx_compress = r->key_sfx_compress = false;
+ continue;
+ }
+
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ /*
+ * Impossible slot, there's no backing on-page
+ * item.
+ */
+ cbt->slot = UINT32_MAX;
+ WT_RET(__wt_value_return_upd(
+ session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL)));
+ WT_RET(__wt_rec_cell_build_val(session, r,
+ cbt->iface.value.data,
+ cbt->iface.value.size, (uint64_t)0));
+ break;
+ case WT_UPDATE_STANDARD:
+ if (upd->size == 0)
+ val->len = 0;
+ else
+ WT_RET(__wt_rec_cell_build_val(session,
+ r, upd->data, upd->size,
+ (uint64_t)0));
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ continue;
+ WT_ILLEGAL_VALUE(session, upd->type);
+ }
+
+ /* Build key cell. */
+ WT_RET(__rec_cell_build_leaf_key(session, r,
+ WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, key->len + val->len)) {
+ /*
+ * Turn off prefix compression until a full key is written
+ * to the new page, and (unless already working with an
+ * overflow key), rebuild the key without compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = false;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+
+ WT_RET(__wt_rec_split_crossing_bnd(
+ session, r, key->len + val->len));
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __wt_rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = true;
+ else {
+ r->all_empty_value = false;
+ if (btree->dictionary)
+ WT_RET(__wt_rec_dict_replace(
+ session, r, 0, val));
+ __wt_rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_rec_row_leaf --
+ * Reconcile a row-store leaf page.
+ */
+int
+__wt_rec_row_leaf(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(tmpkey);
+ WT_DECL_ITEM(tmpval);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_INSERT *ins;
+ WT_REC_KV *key, *val;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ size_t size;
+ uint64_t slvg_skip;
+ uint32_t i;
+ bool dictionary, key_onpage_ovfl, ovfl_key;
+ void *copy;
+ const void *p;
+
+ btree = S2BT(session);
+ cbt = &r->update_modify_cbt;
+ slvg_skip = salvage == NULL ? 0 : salvage->skip;
+
+ key = &r->k;
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__wt_rec_split_init(
+ session, r, page, 0, btree->maxleafpage_precomp));
+
+ /*
+ * Write any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
+ WT_RET(__rec_row_leaf_insert(session, r, ins));
+
+ /*
+ * Temporary buffers in which to instantiate any uninstantiated keys
+ * or value items we need.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmpkey));
+ WT_ERR(__wt_scr_alloc(session, 0, &tmpval));
+
+ /* For each entry in the page... */
+ WT_ROW_FOREACH(page, rip, i) {
+ /*
+ * The salvage code, on some rare occasions, wants to reconcile
+ * a page but skip some leading records on the page. Because
+ * the row-store leaf reconciliation function copies keys from
+ * the original disk page, this is non-trivial -- just changing
+ * the in-memory pointers isn't sufficient; we have to change
+ * the WT_CELL structures on the disk page, too. It's ugly, but
+ * we pass in a value that tells us how many records to skip in
+ * this case.
+ */
+ if (slvg_skip != 0) {
+ --slvg_skip;
+ continue;
+ }
+
+ /*
+ * Figure out the key: set any cell reference (and unpack it),
+ * set any instantiated key reference.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (cell == NULL)
+ kpack = NULL;
+ else {
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ }
+
+ /* Unpack the on-page value cell, and look for an update. */
+ __wt_row_leaf_value_cell(page, rip, NULL, vpack);
+ WT_ERR(__wt_rec_txn_read(
+ session, r, NULL, rip, vpack, NULL, &upd));
+
+ /* Build value cell. */
+ dictionary = false;
+ if (upd == NULL) {
+ /*
+ * When the page was read into memory, there may not
+ * have been a value item.
+ *
+ * If there was a value item, check if it's a dictionary
+ * cell (a copy of another item on the page). If it's a
+ * copy, we have to create a new value item as the old
+ * item might have been discarded from the page.
+ */
+ if (vpack->raw == WT_CELL_VALUE_COPY) {
+ /* If the item is Huffman encoded, decode it. */
+ if (btree->huffman_value == NULL) {
+ p = vpack->data;
+ size = vpack->size;
+ } else {
+ WT_ERR(__wt_huffman_decode(session,
+ btree->huffman_value,
+ vpack->data, vpack->size,
+ tmpval));
+ p = tmpval->data;
+ size = tmpval->size;
+ }
+ WT_ERR(__wt_rec_cell_build_val(
+ session, r, p, size, (uint64_t)0));
+ dictionary = true;
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing an update save and restore, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * If necessary, when the overflow value was
+ * originally removed, reconciliation appended
+ * a globally visible copy of the value to the
+ * key's update list, meaning the on-page item
+ * isn't accessed after page re-instantiation.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_REC_UPDATE_RESTORE));
+
+ /*
+ * If the key is also a removed overflow item,
+ * don't write anything at all.
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is that
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__wt_rec_cell_build_val(session, r,
+ "ovfl-unused", strlen("ovfl-unused"),
+ (uint64_t)0));
+ } else {
+ val->buf.data = vpack->cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = true;
+ }
+ } else {
+ /*
+ * The first time we find an overflow record we're not
+ * going to use, discard the underlying blocks.
+ */
+ if (vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_remove(session,
+ page, vpack, F_ISSET(r, WT_REC_EVICT)));
+
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ cbt->slot = WT_ROW_SLOT(page, rip);
+ WT_ERR(__wt_value_return_upd(session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
+ WT_ERR(__wt_rec_cell_build_val(session, r,
+ cbt->iface.value.data,
+ cbt->iface.value.size, (uint64_t)0));
+ dictionary = true;
+ break;
+ case WT_UPDATE_STANDARD:
+ /*
+ * If no value, nothing needs to be copied.
+ * Otherwise, build the value's chunk from the
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len =
+ val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__wt_rec_cell_build_val(
+ session, r,
+ upd->data, upd->size, (uint64_t)0));
+ dictionary = true;
+ }
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ /*
+ * If this key/value pair was deleted, we're
+ * done.
+ *
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+ * example, cursor-based truncation), do
+ * it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, true));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ WT_ILLEGAL_VALUE_ERR(session, upd->type);
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ key_onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (key_onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = true;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = true;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or by
+ * building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+ * Grow the buffer as necessary, ensuring data
+ * has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ if (__wt_rec_need_split(r, key->len + val->len)) {
+ /*
+ * If we copied address blocks from the page rather than
+ * building the actual key, we have to build the key now
+ * because we are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_ROW_LEAF, kpack, r->cur));
+ WT_NOT_READ(key_onpage_ovfl, false);
+ }
+
+ /*
+ * Turn off prefix compression until a full key is written
+ * to the new page, and (unless already working with an
+ * overflow key), rebuild the key without compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = false;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+
+ WT_ERR(__wt_rec_split_crossing_bnd(
+ session, r, key->len + val->len));
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __wt_rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = true;
+ else {
+ r->all_empty_value = false;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__wt_rec_dict_replace(
+ session, r, 0, val));
+ __wt_rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __wt_rec_split_finish(session, r);
+
+err: __wt_scr_free(session, &tmpkey);
+ __wt_scr_free(session, &tmpval);
+ return (ret);
+}
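The fast-path key reconstruction inside the loop above (truncate the previous key to the cell's prefix count, then append the stored suffix bytes) can be shown in isolation; a sketch with hypothetical names:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Rebuild a full key from the previous full key plus a cell's
     * (prefix count, suffix bytes) pair: keep "prefix" bytes of the
     * previous key, then append the suffix stored in the cell.
     */
    static int
    key_from_prefix(const uint8_t *prev_key, size_t prev_len,
        uint8_t prefix, const uint8_t *suffix, size_t suffix_len,
        uint8_t **keyp, size_t *lenp)
    {
        uint8_t *key;

        if (prefix > prev_len)            /* Corrupted cell */
            return (-1);

        if ((key = malloc((size_t)prefix + suffix_len)) == NULL)
            return (-1);
        memcpy(key, prev_key, prefix);
        memcpy(key + prefix, suffix, suffix_len);

        *keyp = key;
        *lenp = (size_t)prefix + suffix_len;
        return (0);
    }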
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
new file mode 100644
index 00000000000..97903db9e9e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -0,0 +1,405 @@
+/*-
+ * Copyright (c) 2014-2019 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rec_update_save --
+ * Save a WT_UPDATE list for later restoration.
+ */
+static int
+__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd, size_t upd_memsize)
+{
+ WT_RET(__wt_realloc_def(
+ session, &r->supd_allocated, r->supd_next + 1, &r->supd));
+ r->supd[r->supd_next].ins = ins;
+ r->supd[r->supd_next].ripcip = ripcip;
+ r->supd[r->supd_next].onpage_upd = onpage_upd;
+ ++r->supd_next;
+ r->supd_memsize += upd_memsize;
+ return (0);
+}
+
+/*
+ * __rec_append_orig_value --
+ * Append the key's original value to its update list.
+ */
+static int
+__rec_append_orig_value(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_UPDATE *append;
+ size_t size;
+
+ /* Done if at least one self-contained update is globally visible. */
+ for (;; upd = upd->next) {
+ if (WT_UPDATE_DATA_VALUE(upd) &&
+ __wt_txn_upd_visible_all(session, upd))
+ return (0);
+
+ /* Add the original value after birthmarks. */
+ if (upd->type == WT_UPDATE_BIRTHMARK) {
+ WT_ASSERT(session, unpack != NULL &&
+ unpack->type != WT_CELL_DEL);
+ break;
+ }
+
+ /* Leave reference at the last item in the chain. */
+ if (upd->next == NULL)
+ break;
+ }
+
+ /*
+ * We need the original on-page value for some reader: get a copy and
+ * append it to the end of the update list with a transaction ID that
+ * guarantees its visibility.
+ *
+ * If we don't have a value cell, it's an insert/append list key/value
+ * pair which simply doesn't exist for some reader; place a deleted
+ * record at the end of the update list.
+ */
+ append = NULL; /* -Wconditional-uninitialized */
+ size = 0; /* -Wconditional-uninitialized */
+ if (unpack == NULL || unpack->type == WT_CELL_DEL)
+ WT_RET(__wt_update_alloc(session,
+ NULL, &append, &size, WT_UPDATE_TOMBSTONE));
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
+ WT_ERR(__wt_update_alloc(
+ session, tmp, &append, &size, WT_UPDATE_STANDARD));
+ }
+
+ /*
+ * If we're saving the original value for a birthmark, transfer over
+ * the transaction ID and clear out the birthmark update.
+ *
+ * Else, set the entry's transaction information to the lowest possible
+ * value. Cleared memory already matches the lowest possible transaction
+ * ID and timestamp, so do nothing.
+ */
+ if (upd->type == WT_UPDATE_BIRTHMARK) {
+ append->txnid = upd->txnid;
+ append->timestamp = upd->timestamp;
+ append->next = upd->next;
+ }
+
+ /* Append the new entry into the update list. */
+ WT_PUBLISH(upd->next, append);
+ __wt_cache_page_inmem_incr(session, page, size);
+
+ if (upd->type == WT_UPDATE_BIRTHMARK) {
+ upd->type = WT_UPDATE_STANDARD;
+ upd->txnid = WT_TXN_ABORTED;
+ }
+
+err: __wt_scr_free(session, &tmp);
+ return (ret);
+}
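The WT_PUBLISH at the end of the function orders the append so a concurrent reader either misses the new entry entirely or sees it fully initialized. A simplified analogue using C11 atomics, assuming a single appender and concurrent readers (hypothetical types, not WiredTiger's update structure):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct chain_upd {
        uint64_t txnid;                      /* Owning transaction */
        _Atomic(struct chain_upd *) next;    /* Next (older) update */
    };

    /*
     * Append a fully built entry at the tail of an update chain: the
     * entry is initialized first, then published with a release store
     * so readers never observe a partially constructed update.
     */
    static int
    chain_append(struct chain_upd *head, uint64_t txnid)
    {
        struct chain_upd *entry, *last, *next;

        if ((entry = calloc(1, sizeof(*entry))) == NULL)
            return (-1);
        entry->txnid = txnid;    /* Initialize before publishing */

        for (last = head; (next = atomic_load_explicit(
            &last->next, memory_order_acquire)) != NULL; last = next)
            ;
        atomic_store_explicit(&last->next, entry, memory_order_release);
        return (0);
    }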
+
+/*
+ * __wt_rec_txn_read --
+ * Return the update in a list that should be written (or NULL if none can
+ * be written).
+ */
+int
+__wt_rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack,
+ bool *upd_savedp, WT_UPDATE **updp)
+{
+ WT_PAGE *page;
+ WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
+ wt_timestamp_t timestamp;
+ size_t upd_memsize;
+ uint64_t max_txn, txnid;
+ bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved;
+
+ if (upd_savedp != NULL)
+ *upd_savedp = false;
+ *updp = NULL;
+
+ page = r->page;
+ first_ts_upd = first_txn_upd = NULL;
+ upd_memsize = 0;
+ max_txn = WT_TXN_NONE;
+ prepared = skipped_birthmark = uncommitted = upd_saved = false;
+
+ /*
+ * If called with a WT_INSERT item, use its WT_UPDATE list (which must
+ * exist); otherwise check for an on-page row-store WT_UPDATE list
+ * (which may not exist). Return immediately if the item has no updates.
+ */
+ if (ins != NULL)
+ first_upd = ins->upd;
+ else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL)
+ return (0);
+
+ for (upd = first_upd; upd != NULL; upd = upd->next) {
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
+
+ ++r->updates_seen;
+ upd_memsize += WT_UPDATE_MEMSIZE(upd);
+
+ /*
+ * Track the first update in the chain that is not aborted and
+ * the maximum transaction ID.
+ */
+ if (first_txn_upd == NULL)
+ first_txn_upd = upd;
+
+ /* Track the largest transaction ID seen. */
+ if (WT_TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+
+ /*
+ * Check whether the update was committed before reconciliation
+ * started. The global commit point can move forward during
+ * reconciliation so we use a cached copy to avoid races when a
+ * concurrent transaction commits or rolls back while we are
+ * examining its updates. Because prepared transaction IDs are
+ * globally visible, we need to check the update state as well.
+ */
+ if (F_ISSET(r, WT_REC_EVICT)) {
+ if (upd->prepare_state == WT_PREPARE_LOCKED ||
+ upd->prepare_state == WT_PREPARE_INPROGRESS)
+ prepared = true;
+
+ if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ WT_TXNID_LE(r->last_running, txnid) :
+ !__txn_visible_id(session, txnid))
+ uncommitted = r->update_uncommitted = true;
+
+ if (prepared || uncommitted)
+ continue;
+ }
+
+ /* Track the first update with non-zero timestamp. */
+ if (first_ts_upd == NULL && upd->timestamp != 0)
+ first_ts_upd = upd;
+
+ /*
+ * Find the first update we can use.
+ *
+ * Update/restore eviction can handle any update (including
+ * uncommitted updates). Lookaside eviction can save any
+ * committed update. Regular eviction checks that the maximum
+ * transaction ID and timestamp seen are stable.
+ *
+ * Lookaside and update/restore eviction try to choose the same
+ * version as a subsequent checkpoint, so that checkpoint can
+ * skip over pages with lookaside entries. If the application
+ * has supplied a stable timestamp, we assume (a) that it is
+ * old, and (b) that the next checkpoint will use it, so we wait
+ * to see a stable update. If there is no stable timestamp, we
+ * assume the next checkpoint will write the most recent version
+ * (but we save enough information that checkpoint can fix
+ * things up if we choose an update that is too new).
+ */
+ if (*updp == NULL && r->las_skew_newest)
+ *updp = upd;
+
+ if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ !__wt_txn_upd_visible_all(session, upd) :
+ !__wt_txn_upd_visible(session, upd)) {
+ if (F_ISSET(r, WT_REC_EVICT))
+ ++r->updates_unstable;
+
+ /*
+ * Rare case: when applications run at low isolation
+ * levels, update/restore eviction may see a stable
+ * update followed by an uncommitted update. Give up
+ * in that case: we need to discard updates from the
+ * stable update and older for correctness and we can't
+ * discard an uncommitted update.
+ */
+ if (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
+ *updp != NULL && (uncommitted || prepared)) {
+ r->leave_dirty = true;
+ return (__wt_set_return(session, EBUSY));
+ }
+
+ if (upd->type == WT_UPDATE_BIRTHMARK)
+ skipped_birthmark = true;
+
+ continue;
+ }
+
+ /*
+ * Lookaside without stable timestamp was taken care of above
+ * (set to the first uncommitted transaction). Lookaside with
+ * stable timestamp always takes the first stable update.
+ */
+ if (*updp == NULL)
+ *updp = upd;
+
+ if (!F_ISSET(r, WT_REC_EVICT))
+ break;
+ }
+
+ /* Keep track of the selected update. */
+ upd = *updp;
+
+ /* Reconciliation should never see an aborted or reserved update. */
+ WT_ASSERT(session, upd == NULL ||
+ (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE));
+
+ /* If all of the updates were aborted, quit. */
+ if (first_txn_upd == NULL) {
+ WT_ASSERT(session, upd == NULL);
+ return (0);
+ }
+
+ /* If no updates were skipped, record that we're making progress. */
+ if (upd == first_txn_upd)
+ r->update_used = true;
+
+ /*
+ * The checkpoint transaction is special. Make sure we never write
+ * metadata updates from a checkpoint in a concurrent session.
+ */
+ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
+ upd == NULL || upd->txnid == WT_TXN_NONE ||
+ upd->txnid != S2C(session)->txn_global.checkpoint_state.id ||
+ WT_SESSION_IS_CHECKPOINT(session));
+
+ /*
+ * Track the most recent transaction in the page. We store this in the
+ * tree at the end of reconciliation in the service of checkpoints; it
+ * is used to avoid discarding trees from memory when they have changes
+ * required to satisfy a snapshot read.
+ */
+ if (WT_TXNID_LT(r->max_txn, max_txn))
+ r->max_txn = max_txn;
+
+ /* Update the maximum timestamp. */
+ if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->timestamp)
+ r->max_timestamp = first_ts_upd->timestamp;
+
+ /*
+ * If the update we chose was a birthmark, or we are doing
+ * update-restore and we skipped a birthmark, the original on-page
+ * value must be retained.
+ */
+ if (upd != NULL &&
+ (upd->type == WT_UPDATE_BIRTHMARK ||
+ (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark)))
+ *updp = NULL;
+
+ /*
+ * Check if all updates on the page are visible. If not, it must stay
+ * dirty unless we are saving updates to the lookaside table.
+ *
+ * Updates can be out of transaction ID order (but not out of timestamp
+ * order), so we track the maximum transaction ID and the newest update
+ * with a timestamp (if any).
+ */
+ timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->timestamp;
+ all_visible = upd == first_txn_upd && !(uncommitted || prepared) &&
+ (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ __wt_txn_visible_all(session, max_txn, timestamp) :
+ __wt_txn_visible(session, max_txn, timestamp));
+
+ if (all_visible)
+ goto check_original_value;
+
+ r->leave_dirty = true;
+
+ if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation error, update not visible");
+
+ /*
+ * If not trying to evict the page, we know what we'll write and we're
+ * done.
+ */
+ if (!F_ISSET(r, WT_REC_EVICT))
+ goto check_original_value;
+
+ /*
+ * We are attempting eviction with changes that are not yet stable
+ * (i.e., globally visible). There are two ways to continue: the
+ * save/restore eviction path or the lookaside table eviction path.
+ * The two cannot both be configured because the paths track different
+ * information. The update/restore path can handle uncommitted changes,
+ * by evicting most of the page and then creating a new, smaller page
+ * to which we re-attach those changes. Lookaside eviction writes
+ * changes into the lookaside table and restores them on demand if and
+ * when the page is read back into memory.
+ *
+ * Both paths are configured outside of reconciliation: the save/restore
+ * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is
+ * the WT_REC_LOOKASIDE flag.
+ */
+ if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE))
+ return (__wt_set_return(session, EBUSY));
+ if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
+ return (__wt_set_return(session, EBUSY));
+
+ WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
+
+ /*
+ * The order of the updates on the list matters: we can't move only the
+ * unresolved updates, so we move the entire update list.
+ */
+ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize));
+ upd_saved = true;
+ if (upd_savedp != NULL)
+ *upd_savedp = true;
+
+ /*
+ * Track the first off-page update when saving history in the lookaside
+ * table. When skewing newest, we want the first (non-aborted) update
+ * after the one stored on the page. Otherwise, we want the update
+ * before the on-page update.
+ */
+ if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) {
+ if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid))
+ r->unstable_txn = first_upd->txnid;
+ if (first_ts_upd != NULL &&
+ r->unstable_timestamp < first_ts_upd->timestamp)
+ r->unstable_timestamp = first_ts_upd->timestamp;
+ } else if (F_ISSET(r, WT_REC_LOOKASIDE)) {
+ for (upd = first_upd; upd != *updp; upd = upd->next) {
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
+
+ if (upd->txnid != WT_TXN_NONE &&
+ WT_TXNID_LT(upd->txnid, r->unstable_txn))
+ r->unstable_txn = upd->txnid;
+ if (upd->timestamp < r->unstable_timestamp)
+ r->unstable_timestamp = upd->timestamp;
+ }
+ }
+
+check_original_value:
+ /*
+ * Paranoia: check that we didn't choose an update that has since been
+ * rolled back.
+ */
+ WT_ASSERT(session, *updp == NULL || (*updp)->txnid != WT_TXN_ABORTED);
+
+ /*
+ * Returning an update means the original on-page value might be lost,
+ * and that's a problem if there's a reader that needs it. This call
+ * makes a copy of the on-page value and if there is a birthmark in the
+ * update list, replaces it. We do that any time there are saved
+ * updates and during reconciliation of a backing overflow record that
+ * will be physically removed once it's no longer needed.
+ */
+ if (*updp != NULL && (upd_saved ||
+ (vpack != NULL && vpack->ovfl &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)))
+ WT_RET(
+ __rec_append_orig_value(session, page, first_upd, vpack));
+
+ return (0);
+}
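A heavily simplified model of the selection loop above, reduced to transaction IDs only (no timestamps, prepared transactions, or lookaside skew; hypothetical types): walk newest to oldest, skip aborted entries, and return the first update every possible reader is guaranteed to see.

    #include <stddef.h>
    #include <stdint.h>

    #define TXNID_ABORTED UINT64_MAX    /* Stand-in for WT_TXN_ABORTED */

    struct list_upd {
        uint64_t txnid;
        struct list_upd *next;          /* Next entry is older */
    };

    /*
     * Return the newest update that is visible to every reader, or
     * NULL if nothing is stable yet, in which case the page must stay
     * dirty or its updates must be saved for later restoration.
     */
    static struct list_upd *
    select_stable_update(struct list_upd *head, uint64_t oldest_running_id)
    {
        struct list_upd *upd;

        for (upd = head; upd != NULL; upd = upd->next) {
            if (upd->txnid == TXNID_ABORTED)
                continue;
            if (upd->txnid < oldest_running_id)
                return (upd);
        }
        return (NULL);
    }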
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 87ce7ca1cc3..1c873fc3d8a 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -8,307 +8,18 @@
#include "wt_internal.h"
-struct __rec_chunk; typedef struct __rec_chunk WT_CHUNK;
-struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
-struct __rec_kv; typedef struct __rec_kv WT_KV;
-
-/*
- * Reconciliation is the process of taking an in-memory page, walking each entry
- * in the page, building a backing disk image in a temporary buffer representing
- * that information, and writing that buffer to disk. What could be simpler?
- *
- * WT_RECONCILE --
- * Information tracking a single page reconciliation.
- */
-typedef struct {
- WT_REF *ref; /* Page being reconciled */
- WT_PAGE *page;
- uint32_t flags; /* Caller's configuration */
-
- /*
- * Track start/stop write generation to decide if all changes to the
- * page are written.
- */
- uint32_t orig_write_gen;
-
- /*
- * Track start/stop checkpoint generations to decide if lookaside table
- * records are correct.
- */
- uint64_t orig_btree_checkpoint_gen;
- uint64_t orig_txn_checkpoint_gen;
-
- /*
- * Track the oldest running transaction and whether to skew lookaside
- * to the newest update.
- */
- bool las_skew_newest;
- uint64_t last_running;
-
- /* Track the page's min/maximum transactions. */
- uint64_t max_txn;
- wt_timestamp_t max_timestamp;
-
- /* Lookaside boundary tracking. */
- uint64_t unstable_txn;
- wt_timestamp_t unstable_timestamp;
-
- u_int updates_seen; /* Count of updates seen. */
- u_int updates_unstable; /* Count of updates not visible_all. */
-
- bool update_uncommitted; /* An update was uncommitted */
- bool update_used; /* An update could be used */
-
- /*
- * When we can't mark the page clean (for example, checkpoint found some
- * uncommitted updates), there's a leave-dirty flag.
- */
- bool leave_dirty;
-
- /*
- * Track if reconciliation has seen any overflow items. If a leaf page
- * with no overflow items is written, the parent page's address cell is
- * set to the leaf-no-overflow type. This means we can delete the leaf
- * page without reading it because we don't have to discard any overflow
- * items it might reference.
- *
- * The test test is per-page reconciliation, that is, once we see an
- * overflow item on the page, all subsequent leaf pages written for the
- * page will not be leaf-no-overflow type, regardless of whether or not
- * they contain overflow items. In other words, leaf-no-overflow is not
- * guaranteed to be set on every page that doesn't contain an overflow
- * item, only that if it is set, the page contains no overflow items.
- * XXX
- * This was originally done because raw compression couldn't do better,
- * now that raw compression has been removed, we should do better.
- */
- bool ovfl_items;
-
- /*
- * Track if reconciliation of a row-store leaf page has seen empty (zero
- * length) values. We don't write out anything for empty values, so if
- * there are empty values on a page, we have to make two passes over the
- * page when it's read to figure out how many keys it has, expensive in
- * the common case of no empty values and (entries / 2) keys. Likewise,
- * a page with only empty values is another common data set, and keys on
- * that page will be equal to the number of entries. In both cases, set
- * a flag in the page's on-disk header.
- *
- * The test is per-page reconciliation as described above for the
- * overflow-item test.
- */
- bool all_empty_value, any_empty_value;
-
- /*
- * Reconciliation gets tricky if we have to split a page, which happens
- * when the disk image we create exceeds the page type's maximum disk
- * image size.
- *
- * First, the target size of the page we're building.
- */
- uint32_t page_size; /* Page size */
-
- /*
- * Second, the split size: if we're doing the page layout, split to a
- * smaller-than-maximum page size when a split is required so we don't
- * repeatedly split a packed page.
- */
- uint32_t split_size; /* Split page size */
- uint32_t min_split_size; /* Minimum split page size */
-
- /*
- * We maintain two split chunks in the memory during reconciliation to
- * be written out as pages. As we get to the end of the data, if the
- * last one turns out to be smaller than the minimum split size, we go
- * back into the penultimate chunk and split at this minimum split size
- * boundary. This moves some data from the penultimate chunk to the last
- * chunk, hence increasing the size of the last page written without
- * decreasing the penultimate page size beyond the minimum split size.
- * For this reason, we maintain an expected split percentage boundary
- * and a minimum split percentage boundary.
- *
- * Chunks are referenced by current and previous pointers. In case of a
- * split, previous references the first chunk and current switches to
- * the second chunk. If reconciliation generates more split chunks, the
- * the previous chunk is written to the disk and current and previous
- * swap.
- */
- struct __rec_chunk {
- /*
- * The recno and entries fields are the starting record number
- * of the split chunk (for column-store splits), and the number
- * of entries in the split chunk.
- *
- * The key for a row-store page; no column-store key is needed
- * because the page's recno, stored in the recno field, is the
- * column-store key.
- */
- uint32_t entries;
- uint64_t recno;
- WT_ITEM key;
-
- uint32_t min_entries;
- uint64_t min_recno;
- WT_ITEM min_key;
-
- /* Minimum split-size boundary buffer offset. */
- size_t min_offset;
-
- WT_ITEM image; /* disk-image */
- } chunkA, chunkB, *cur_ptr, *prev_ptr;
-
- /*
- * We track current information about the current record number, the
- * number of entries copied into the disk image buffer, where we are
- * in the buffer, and how much memory remains. Those values are
- * packaged here rather than passing pointers to stack locations
- * around the code.
- */
- uint64_t recno; /* Current record number */
- uint32_t entries; /* Current number of entries */
- uint8_t *first_free; /* Current first free byte */
- size_t space_avail; /* Remaining space in this chunk */
- /* Remaining space in this chunk to put a minimum size boundary */
- size_t min_space_avail;
-
- /*
- * Saved update list, supporting the WT_REC_UPDATE_RESTORE and
- * WT_REC_LOOKASIDE configurations. While reviewing updates for each
- * page, we save WT_UPDATE lists here, and then move them to per-block
- * areas as the blocks are defined.
- */
- WT_SAVE_UPD *supd; /* Saved updates */
- uint32_t supd_next;
- size_t supd_allocated;
- size_t supd_memsize; /* Size of saved update structures */
-
- /* List of pages we've written so far. */
- WT_MULTI *multi;
- uint32_t multi_next;
- size_t multi_allocated;
-
- /*
- * Root pages are written when wrapping up the reconciliation, remember
- * the image we're going to write.
- */
- WT_ITEM *wrapup_checkpoint;
- bool wrapup_checkpoint_compressed;
-
- /*
- * We don't need to keep the 0th key around on internal pages, the
- * search code ignores them as nothing can sort less by definition.
- * There's some trickiness here, see the code for comments on how
- * these fields work.
- */
- bool cell_zero; /* Row-store internal page 0th key */
-
- /*
- * We calculate checksums to find previously written identical blocks,
- * but once a match fails during an eviction, there's no point trying
- * again.
- */
- bool evict_matching_checksum_failed;
-
- /*
- * WT_DICTIONARY --
- * We optionally build a dictionary of values for leaf pages. Where
- * two value cells are identical, only write the value once, the second
- * and subsequent copies point to the original cell. The dictionary is
- * fixed size, but organized in a skip-list to make searches faster.
- */
- struct __rec_dictionary {
- uint64_t hash; /* Hash value */
- uint32_t offset; /* Matching cell */
-
- u_int depth; /* Skiplist */
- WT_DICTIONARY *next[0];
- } **dictionary; /* Dictionary */
- u_int dictionary_next, dictionary_slots; /* Next, max entries */
- /* Skiplist head. */
- WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
-
- /*
- * WT_KV--
- * An on-page key/value item we're building.
- */
- struct __rec_kv {
- WT_ITEM buf; /* Data */
- WT_CELL cell; /* Cell and cell's length */
- size_t cell_len;
- size_t len; /* Total length of cell + data */
- } k, v; /* Key/Value being built */
-
- WT_ITEM *cur, _cur; /* Key/Value being built */
- WT_ITEM *last, _last; /* Last key/value built */
-
- bool key_pfx_compress; /* If can prefix-compress next key */
- bool key_pfx_compress_conf; /* If prefix compression configured */
- bool key_sfx_compress; /* If can suffix-compress next key */
- bool key_sfx_compress_conf; /* If suffix compression configured */
-
- bool is_bulk_load; /* If it's a bulk load */
-
- WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
-
- bool cache_write_lookaside; /* Used the lookaside table */
- bool cache_write_restore; /* Used update/restoration */
-
- uint32_t tested_ref_state; /* Debugging information */
-
- /*
- * XXX
- * In the case of a modified update, we may need a copy of the current
- * value as a set of bytes. We call back into the btree code using a
- * fake cursor to do that work. This a layering violation and fragile,
- * we need a better solution.
- */
- WT_CURSOR_BTREE update_modify_cbt;
-} WT_RECONCILE;
-
-#define WT_CROSSING_MIN_BND(r, next_len) \
- ((r)->cur_ptr->min_offset == 0 && \
- (next_len) > (r)->min_space_avail)
-#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail)
-#define WT_CHECK_CROSSING_BND(r, next_len) \
- (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len))
-
-static void __rec_cell_build_addr(WT_SESSION_IMPL *,
- WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
-static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
- WT_RECONCILE *, const void *, size_t, bool *);
-static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
- WT_RECONCILE *, const void *, size_t, bool *);
-static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
-static int __rec_cell_build_val(WT_SESSION_IMPL *,
- WT_RECONCILE *, const void *, size_t, uint64_t);
static void __rec_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *);
-static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *);
-static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *);
-static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *);
-static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
-static int __rec_col_var(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *);
-static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
- WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t);
static int __rec_destroy_session(WT_SESSION_IMPL *);
static int __rec_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static int __rec_las_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_las_wrapup_err(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
-static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
-static int __rec_row_leaf(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
-static int __rec_row_leaf_insert(
- WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
-static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
-static int __rec_split_write(
- WT_SESSION_IMPL *, WT_RECONCILE *, WT_CHUNK *, WT_ITEM *, bool);
+static int __rec_split_write(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_REC_CHUNK *, WT_ITEM *, bool);
static int __rec_write_check_complete(
WT_SESSION_IMPL *, WT_RECONCILE *, int, bool *);
static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
@@ -316,12 +27,6 @@ static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
-static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
-static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
-static int __rec_dictionary_lookup(
- WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
-static void __rec_dictionary_reset(WT_RECONCILE *);
-
/*
* __wt_reconcile --
* Reconcile an in-memory page into its on-disk format, and write it.
@@ -435,23 +140,23 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
switch (page->type) {
case WT_PAGE_COL_FIX:
if (salvage != NULL)
- ret = __rec_col_fix_slvg(session, r, ref, salvage);
+ ret = __wt_rec_col_fix_slvg(session, r, ref, salvage);
else
- ret = __rec_col_fix(session, r, ref);
+ ret = __wt_rec_col_fix(session, r, ref);
break;
case WT_PAGE_COL_INT:
WT_WITH_PAGE_INDEX(session,
- ret = __rec_col_int(session, r, ref));
+ ret = __wt_rec_col_int(session, r, ref));
break;
case WT_PAGE_COL_VAR:
- ret = __rec_col_var(session, r, ref, salvage);
+ ret = __wt_rec_col_var(session, r, ref, salvage);
break;
case WT_PAGE_ROW_INT:
WT_WITH_PAGE_INDEX(session,
- ret = __rec_row_int(session, r, page));
+ ret = __wt_rec_row_int(session, r, page));
break;
case WT_PAGE_ROW_LEAF:
- ret = __rec_row_leaf(session, r, page, salvage);
+ ret = __wt_rec_row_leaf(session, r, page, salvage);
break;
default:
ret = __wt_illegal_value(session, page->type);
@@ -870,6 +575,12 @@ __rec_init(WT_SESSION_IMPL *session,
* history, or the stable timestamp hasn't changed since last time this
* page was successfully reconciled, skew oldest instead.
*/
+ if (F_ISSET(S2C(session)->cache, WT_CACHE_EVICT_DEBUG_MODE) &&
+ __wt_random(&session->rnd) % 3 == 0)
+ r->las_skew_newest = false;
+ else
+ r->las_skew_newest =
+ LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL);
r->las_skew_newest =
LF_ISSET(WT_REC_LOOKASIDE) && LF_ISSET(WT_REC_VISIBLE_ALL);
if (r->las_skew_newest &&
@@ -965,9 +676,9 @@ __rec_init(WT_SESSION_IMPL *session,
* Sanity check the size: 100 slots is the smallest dictionary we use.
*/
if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
- WT_RET(__rec_dictionary_init(session,
+ WT_RET(__wt_rec_dictionary_init(session,
r, btree->dictionary < 100 ? 100 : btree->dictionary));
- __rec_dictionary_reset(r);
+ __wt_rec_dictionary_reset(r);
/*
* Prefix compression discards repeated prefix bytes from row-store leaf
@@ -1059,7 +770,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__wt_free(session, r->supd);
- __rec_dictionary_free(session, r);
+ __wt_rec_dictionary_free(session, r);
__wt_buf_free(session, &r->k.buf);
__wt_buf_free(session, &r->v.buf);
@@ -1083,914 +794,6 @@ __rec_destroy_session(WT_SESSION_IMPL *session)
}
/*
- * __rec_update_save --
- * Save a WT_UPDATE list for later restoration.
- */
-static int
-__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd, size_t upd_memsize)
-{
- WT_RET(__wt_realloc_def(
- session, &r->supd_allocated, r->supd_next + 1, &r->supd));
- r->supd[r->supd_next].ins = ins;
- r->supd[r->supd_next].ripcip = ripcip;
- r->supd[r->supd_next].onpage_upd = onpage_upd;
- ++r->supd_next;
- r->supd_memsize += upd_memsize;
- return (0);
-}
-
-/*
- * __rec_append_orig_value --
- * Append the key's original value to its update list.
- */
-static int
-__rec_append_orig_value(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
-{
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_UPDATE *append;
- size_t size;
-
- /* Done if at least one self-contained update is globally visible. */
- for (;; upd = upd->next) {
- if (WT_UPDATE_DATA_VALUE(upd) &&
- __wt_txn_upd_visible_all(session, upd))
- return (0);
-
- /* Add the original value after birthmarks. */
- if (upd->type == WT_UPDATE_BIRTHMARK) {
- WT_ASSERT(session, unpack != NULL &&
- unpack->type != WT_CELL_DEL);
- break;
- }
-
- /* Leave reference at the last item in the chain. */
- if (upd->next == NULL)
- break;
- }
-
- /*
- * We need the original on-page value for some reader: get a copy and
- * append it to the end of the update list with a transaction ID that
- * guarantees its visibility.
- *
- * If we don't have a value cell, it's an insert/append list key/value
- * pair which simply doesn't exist for some reader; place a deleted
- * record at the end of the update list.
- */
- append = NULL; /* -Wconditional-uninitialized */
- size = 0; /* -Wconditional-uninitialized */
- if (unpack == NULL || unpack->type == WT_CELL_DEL)
- WT_RET(__wt_update_alloc(session,
- NULL, &append, &size, WT_UPDATE_TOMBSTONE));
- else {
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
- WT_ERR(__wt_update_alloc(
- session, tmp, &append, &size, WT_UPDATE_STANDARD));
- }
-
- /*
- * If we're saving the original value for a birthmark, transfer over
- * the transaction ID and clear out the birthmark update.
- *
- * Else, set the entry's transaction information to the lowest possible
- * value: cleared memory already matches the lowest possible transaction
- * ID and timestamp, so there is nothing to do.
- */
- if (upd->type == WT_UPDATE_BIRTHMARK) {
- append->txnid = upd->txnid;
- append->timestamp = upd->timestamp;
- append->next = upd->next;
- }
-
- /* Append the new entry into the update list. */
- WT_PUBLISH(upd->next, append);
- __wt_cache_page_inmem_incr(session, page, size);
-
- if (upd->type == WT_UPDATE_BIRTHMARK) {
- upd->type = WT_UPDATE_STANDARD;
- upd->txnid = WT_TXN_ABORTED;
- }
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
-}
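
A note on the WT_PUBLISH call in the function above: the new entry is fully initialized first, and only then made visible to concurrent readers by a single release-ordered store of the next pointer. Below is a minimal standalone sketch of that publish pattern using C11 atomics; the struct layout and publish_append() are illustrative only, not WiredTiger code.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct upd {
    _Atomic(struct upd *) next; /* Singly-linked update chain. */
    size_t size;                /* Value length. */
    char data[32];              /* Inline value, sketch only. */
};

/*
 * publish_append --
 *    Append a fully-built entry so a concurrent reader either misses it or
 * sees it completely initialized (the WT_PUBLISH idea: barrier, then store).
 */
static int
publish_append(struct upd *tail, const char *value)
{
    struct upd *new_upd;

    if ((new_upd = calloc(1, sizeof(*new_upd))) == NULL)
        return (-1);
    (void)snprintf(new_upd->data, sizeof(new_upd->data), "%s", value);
    new_upd->size = strlen(new_upd->data);
    atomic_store_explicit(&new_upd->next, NULL, memory_order_relaxed);

    /* Release store: readers that see the pointer see the contents. */
    atomic_store_explicit(&tail->next, new_upd, memory_order_release);
    return (0);
}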
-
-/*
- * __rec_txn_read --
- * Return the update in a list that should be written (or NULL if none can
- * be written).
- */
-static int
-__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack,
- bool *upd_savedp, WT_UPDATE **updp)
-{
- WT_PAGE *page;
- WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
- wt_timestamp_t timestamp;
- size_t upd_memsize;
- uint64_t max_txn, txnid;
- bool all_visible, prepared, skipped_birthmark, uncommitted, upd_saved;
-
- if (upd_savedp != NULL)
- *upd_savedp = false;
- *updp = NULL;
-
- page = r->page;
- first_ts_upd = first_txn_upd = NULL;
- upd_memsize = 0;
- max_txn = WT_TXN_NONE;
- prepared = skipped_birthmark = uncommitted = upd_saved = false;
-
- /*
- * If called with a WT_INSERT item, use its WT_UPDATE list (which must
- * exist), otherwise check for an on-page row-store WT_UPDATE list
- * (which may not exist). Return immediately if the item has no updates.
- */
- if (ins != NULL)
- first_upd = ins->upd;
- else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL)
- return (0);
-
- for (upd = first_upd; upd != NULL; upd = upd->next) {
- if ((txnid = upd->txnid) == WT_TXN_ABORTED)
- continue;
-
- ++r->updates_seen;
- upd_memsize += WT_UPDATE_MEMSIZE(upd);
-
- /*
- * Track the first update in the chain that is not aborted and
- * the maximum transaction ID.
- */
- if (first_txn_upd == NULL)
- first_txn_upd = upd;
-
- /* Track the largest transaction ID seen. */
- if (WT_TXNID_LT(max_txn, txnid))
- max_txn = txnid;
-
- /*
- * Check whether the update was committed before reconciliation
- * started. The global commit point can move forward during
- * reconciliation so we use a cached copy to avoid races when a
- * concurrent transaction commits or rolls back while we are
- * examining its updates. Prepared transaction IDs are globally
- * visible, so we need to check the update state as well.
- */
- if (F_ISSET(r, WT_REC_EVICT)) {
- if (upd->prepare_state == WT_PREPARE_LOCKED ||
- upd->prepare_state == WT_PREPARE_INPROGRESS)
- prepared = true;
-
- if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- WT_TXNID_LE(r->last_running, txnid) :
- !__txn_visible_id(session, txnid))
- uncommitted = r->update_uncommitted = true;
-
- if (prepared || uncommitted)
- continue;
- }
-
- /* Track the first update with non-zero timestamp. */
- if (first_ts_upd == NULL && upd->timestamp != 0)
- first_ts_upd = upd;
-
- /*
- * Find the first update we can use.
- *
- * Update/restore eviction can handle any update (including
- * uncommitted updates). Lookaside eviction can save any
- * committed update. Regular eviction checks that the maximum
- * transaction ID and timestamp seen are stable.
- *
- * Lookaside and update/restore eviction try to choose the same
- * version as a subsequent checkpoint, so that checkpoint can
- * skip over pages with lookaside entries. If the application
- * has supplied a stable timestamp, we assume (a) that it is
- * old, and (b) that the next checkpoint will use it, so we wait
- * to see a stable update. If there is no stable timestamp, we
- * assume the next checkpoint will write the most recent version
- * (but we save enough information that checkpoint can fix
- * things up if we choose an update that is too new).
- */
- if (*updp == NULL && r->las_skew_newest)
- *updp = upd;
-
- if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- !__wt_txn_upd_visible_all(session, upd) :
- !__wt_txn_upd_visible(session, upd)) {
- if (F_ISSET(r, WT_REC_EVICT))
- ++r->updates_unstable;
-
- /*
- * Rare case: when applications run at low isolation
- * levels, update/restore eviction may see a stable
- * update followed by an uncommitted update. Give up
- * in that case: we need to discard updates from the
- * stable update and older for correctness and we can't
- * discard an uncommitted update.
- */
- if (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
- *updp != NULL && (uncommitted || prepared)) {
- r->leave_dirty = true;
- return (__wt_set_return(session, EBUSY));
- }
-
- if (upd->type == WT_UPDATE_BIRTHMARK)
- skipped_birthmark = true;
-
- continue;
- }
-
- /*
- * Lookaside without stable timestamp was taken care of above
- * (set to the first uncommitted transaction). Lookaside with
- * stable timestamp always takes the first stable update.
- */
- if (*updp == NULL)
- *updp = upd;
- }
-
- /* Keep track of the selected update. */
- upd = *updp;
-
- /* Reconciliation should never see an aborted or reserved update. */
- WT_ASSERT(session, upd == NULL ||
- (upd->txnid != WT_TXN_ABORTED && upd->type != WT_UPDATE_RESERVE));
-
- /* If all of the updates were aborted, quit. */
- if (first_txn_upd == NULL) {
- WT_ASSERT(session, upd == NULL);
- return (0);
- }
-
- /* If no updates were skipped, record that we're making progress. */
- if (upd == first_txn_upd)
- r->update_used = true;
-
- /*
- * The checkpoint transaction is special. Make sure we never write
- * metadata updates from a checkpoint in a concurrent session.
- */
- WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
- upd == NULL || upd->txnid == WT_TXN_NONE ||
- upd->txnid != S2C(session)->txn_global.checkpoint_state.id ||
- WT_SESSION_IS_CHECKPOINT(session));
-
- /*
- * Track the most recent transaction in the page. We store this in the
- * tree at the end of reconciliation in the service of checkpoints, it
- * is used to avoid discarding trees from memory when they have changes
- * required to satisfy a snapshot read.
- */
- if (WT_TXNID_LT(r->max_txn, max_txn))
- r->max_txn = max_txn;
-
- /* Update the maximum timestamp. */
- if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->timestamp)
- r->max_timestamp = first_ts_upd->timestamp;
-
- /*
- * If the update we chose was a birthmark, or we are doing
- * update-restore and we skipped a birthmark, the original on-page
- * value must be retained.
- */
- if (upd != NULL &&
- (upd->type == WT_UPDATE_BIRTHMARK ||
- (F_ISSET(r, WT_REC_UPDATE_RESTORE) && skipped_birthmark)))
- *updp = NULL;
-
- /*
- * Check if all updates on the page are visible. If not, it must stay
- * dirty unless we are saving updates to the lookaside table.
- *
- * Updates can be out of transaction ID order (but not out of timestamp
- * order), so we track the maximum transaction ID and the newest update
- * with a timestamp (if any).
- */
- timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->timestamp;
- all_visible = upd == first_txn_upd && !(uncommitted || prepared) &&
- (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- __wt_txn_visible_all(session, max_txn, timestamp) :
- __wt_txn_visible(session, max_txn, timestamp));
-
- if (all_visible)
- goto check_original_value;
-
- r->leave_dirty = true;
-
- if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
- WT_PANIC_RET(session, EINVAL,
- "reconciliation error, update not visible");
-
- /*
- * If not trying to evict the page, we know what we'll write and we're
- * done.
- */
- if (!F_ISSET(r, WT_REC_EVICT))
- goto check_original_value;
-
- /*
- * We are attempting eviction with changes that are not yet stable
- * (i.e. globally visible). There are two ways to continue, the
- * save/restore eviction path or the lookaside table eviction path.
- * Both cannot be configured because the paths track different
- * information. The update/restore path can handle uncommitted changes,
- * by evicting most of the page and then creating a new, smaller page
- * to which we re-attach those changes. Lookaside eviction writes
- * changes into the lookaside table and restores them on demand if and
- * when the page is read back into memory.
- *
- * Both paths are configured outside of reconciliation: the save/restore
- * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is
- * the WT_REC_LOOKASIDE flag.
- */
- if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE))
- return (__wt_set_return(session, EBUSY));
- if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
- return (__wt_set_return(session, EBUSY));
-
- WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
-
- /*
- * The order of the updates on the list matters: we can't move only the
- * unresolved updates, so we move the entire update list.
- */
- WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize));
- upd_saved = true;
- if (upd_savedp != NULL)
- *upd_savedp = true;
-
- /*
- * Track the first off-page update when saving history in the lookaside
- * table. When skewing newest, we want the first (non-aborted) update
- * after the one stored on the page. Otherwise, we want the update
- * before the on-page update.
- */
- if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) {
- if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid))
- r->unstable_txn = first_upd->txnid;
- if (first_ts_upd != NULL &&
- r->unstable_timestamp < first_ts_upd->timestamp)
- r->unstable_timestamp = first_ts_upd->timestamp;
- } else if (F_ISSET(r, WT_REC_LOOKASIDE)) {
- for (upd = first_upd; upd != *updp; upd = upd->next) {
- if (upd->txnid == WT_TXN_ABORTED)
- continue;
-
- if (upd->txnid != WT_TXN_NONE &&
- WT_TXNID_LT(upd->txnid, r->unstable_txn))
- r->unstable_txn = upd->txnid;
- if (upd->timestamp < r->unstable_timestamp)
- r->unstable_timestamp = upd->timestamp;
- }
- }
-
-check_original_value:
- /*
- * Paranoia: check that we didn't choose an update that has since been
- * rolled back.
- */
- WT_ASSERT(session, *updp == NULL || (*updp)->txnid != WT_TXN_ABORTED);
-
- /*
- * Returning an update means the original on-page value might be lost,
- * and that's a problem if there's a reader that needs it. This call
- * makes a copy of the on-page value and if there is a birthmark in the
- * update list, replaces it. We do that any time there are saved
- * updates and during reconciliation of a backing overflow record that
- * will be physically removed once it's no longer needed.
- */
- if (*updp != NULL && (upd_saved ||
- (vpack != NULL && vpack->ovfl &&
- vpack->raw != WT_CELL_VALUE_OVFL_RM)))
- WT_RET(
- __rec_append_orig_value(session, page, first_upd, vpack));
-
- return (0);
-}
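
The skew decision described in the comments above ultimately just changes which update in a chain gets written to the page. Here is a self-contained model with simplified structures and a stubbed visibility test (none of these names are WiredTiger types): skewing newest takes the first non-aborted update, skewing oldest takes the first update the stub considers visible.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TXN_ABORTED UINT64_MAX

struct upd {
    uint64_t txnid;    /* Committing transaction ID. */
    struct upd *next;  /* Next (older) update. */
};

/* Stand-in for the visibility check; the real code consults the snapshot. */
static bool
upd_visible(const struct upd *upd, uint64_t oldest_running)
{
    return (upd->txnid < oldest_running);
}

/*
 * select_upd --
 *    Skew newest: write the newest non-aborted update and push history to
 * lookaside. Skew oldest: write the first visible update and save the rest.
 */
static struct upd *
select_upd(struct upd *chain, bool skew_newest, uint64_t oldest_running)
{
    struct upd *upd;

    for (upd = chain; upd != NULL; upd = upd->next) {
        if (upd->txnid == TXN_ABORTED)
            continue;
        if (skew_newest || upd_visible(upd, oldest_running))
            return (upd);
    }
    return (NULL);
}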
-
-/*
- * WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR --
- * Macros to clean up during internal-page reconciliation, releasing the
- * hazard pointer we're holding on child pages.
- */
-#define WT_CHILD_RELEASE(session, hazard, ref) do { \
- if (hazard) { \
- (hazard) = false; \
- WT_TRET( \
- __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
- } \
-} while (0)
-#define WT_CHILD_RELEASE_ERR(session, hazard, ref) do { \
- WT_CHILD_RELEASE(session, hazard, ref); \
- WT_ERR(ret); \
-} while (0)
-
-typedef enum {
- WT_CHILD_IGNORE, /* Ignored child */
- WT_CHILD_MODIFIED, /* Modified child */
- WT_CHILD_ORIGINAL, /* Original child */
- WT_CHILD_PROXY /* Deleted child: proxy */
-} WT_CHILD_STATE;
-
-/*
- * __rec_child_deleted --
- * Handle pages with leaf pages in the WT_REF_DELETED state.
- */
-static int
-__rec_child_deleted(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep)
-{
- WT_PAGE_DELETED *page_del;
-
- page_del = ref->page_del;
-
- /*
- * Internal pages with child leaf pages in the WT_REF_DELETED state are
- * a special case during reconciliation. First, if the deletion was a
- * result of a session truncate call, the deletion may not be visible to
- * us. In that case, we proceed as with any change not visible during
- * reconciliation by ignoring the change for the purposes of writing the
- * internal page.
- *
- * In this case, there must be an associated page-deleted structure, and
- * it holds the transaction ID we care about.
- *
- * In some cases, there had better not be any updates we can't see.
- *
- * A visible update must be in the READY state (i.e. not in the LOCKED
- * or PREPARED state) to be truly visible to others.
- */
- if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL &&
- __wt_page_del_active(session, ref, false))
- WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
-
- /*
- * Deal with any underlying disk blocks.
- *
- * First, check to see if there is an address associated with this leaf:
- * if there isn't, we're done, the underlying page is already gone. If
- * the page still exists, check for any transactions in the system that
- * might want to see the page's state before it's deleted.
- *
- * If any such transactions exist, we cannot discard the underlying leaf
- * page to the block manager because the transaction may eventually read
- * it. However, this write might be part of a checkpoint, and should we
- * recover to that checkpoint, we'll need to delete the leaf page, else
- * we'd leak it. The solution is to write a proxy cell on the internal
- * page ensuring the leaf page is eventually discarded.
- *
- * If no such transactions exist, we can discard the leaf page to the
- * block manager and no cell needs to be written at all. We do this
- * outside of the underlying tracking routines because this action is
- * permanent and irrevocable. (Clearing the address means we've lost
- * track of the disk address in a permanent way. This is safe because
- * there's no path to reading the leaf page again: if there's ever a
- * read into this part of the name space again, the cache read function
- * instantiates an entirely new page.)
- */
- if (ref->addr != NULL && !__wt_page_del_active(session, ref, true)) {
- /*
- * Minor memory cleanup: if a truncate call deleted this page
- * and we were ever forced to instantiate the page in memory,
- * we would have built a list of updates in the page reference
- * in order to be able to commit/rollback the truncate. We just
- * passed a visibility test, discard the update list.
- */
- if (page_del != NULL) {
- __wt_free(session, ref->page_del->update_list);
- __wt_free(session, ref->page_del);
- }
-
- WT_RET(__wt_ref_block_free(session, ref));
- }
-
- /*
- * If the original page is gone, we can skip the slot on the internal
- * page.
- */
- if (ref->addr == NULL) {
- *statep = WT_CHILD_IGNORE;
- return (0);
- }
-
- /*
- * Internal pages with deletes that aren't stable cannot be evicted, we
- * don't have sufficient information to restore the page's information
- * if subsequently read (we wouldn't know which transactions should see
- * the original page and which should see the deleted page).
- */
- if (F_ISSET(r, WT_REC_EVICT))
- return (__wt_set_return(session, EBUSY));
-
- /*
- * If there are deleted child pages we can't discard immediately, keep
- * the page dirty so they are eventually freed.
- */
- r->leave_dirty = true;
-
- /*
- * If the original page cannot be freed, we need to keep a slot on the
- * page to reference it from the parent page.
- *
- * If the delete is not visible in this checkpoint, write the original
- * address normally. Otherwise, we have to write a proxy record.
- * If the delete state is not ready, the delete is not visible because
- * it is still in the prepared state.
- */
- if (!__wt_page_del_active(session, ref, false))
- *statep = WT_CHILD_PROXY;
-
- return (0);
-}
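
Setting aside the eviction/EBUSY short-circuit and the block-free side effect, the state the function above chooses can be restated as a small pure function; the names and booleans below are illustrative, not WiredTiger definitions.

#include <stdbool.h>

typedef enum {
    CHILD_IGNORE,    /* Skip the slot entirely. */
    CHILD_ORIGINAL,  /* Write the original child address. */
    CHILD_PROXY      /* Write a deleted-page proxy cell. */
} child_state_t;

/*
 * deleted_child_state --
 *    has_addr: the leaf's disk blocks still exist (not yet discarded).
 *    del_visible_all: the deletion is visible to every possible reader.
 *    del_visible_here: the deletion is visible to this reconciliation.
 */
static child_state_t
deleted_child_state(bool has_addr, bool del_visible_all, bool del_visible_here)
{
    /* Blocks already gone, or safely discardable right now: skip the slot. */
    if (!has_addr || del_visible_all)
        return (CHILD_IGNORE);

    /* Some reader may still need the page: keep referencing it. */
    return (del_visible_here ? CHILD_PROXY : CHILD_ORIGINAL);
}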
-
-/*
- * __rec_child_modify --
- * Return if the internal page's child references any modifications.
- */
-static int
-__rec_child_modify(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_REF *ref, bool *hazardp, WT_CHILD_STATE *statep)
-{
- WT_DECL_RET;
- WT_PAGE_MODIFY *mod;
-
- /* We may acquire a hazard pointer our caller must release. */
- *hazardp = false;
-
- /* Default to using the original child address. */
- *statep = WT_CHILD_ORIGINAL;
-
- /*
- * This function is called when walking an internal page to decide how
- * to handle child pages referenced by the internal page.
- *
- * Internal pages are reconciled for two reasons: first, when evicting
- * an internal page, second by the checkpoint code when writing internal
- * pages. During eviction, all pages should be in the WT_REF_DISK or
- * WT_REF_DELETED state. During checkpoint, eviction that might affect
- * review of an internal page is prohibited, however, as the subtree is
- * not reserved for our exclusive use, there are other page states that
- * must be considered.
- */
- for (;; __wt_yield()) {
- switch (r->tested_ref_state = ref->state) {
- case WT_REF_DISK:
- /* On disk, not modified by definition. */
- goto done;
-
- case WT_REF_DELETED:
- /*
- * The child is in a deleted state.
- *
- * It's possible the state could change underneath us as
- * the page is read in, and we can race between checking
- * for a deleted state and looking at the transaction ID
- * to see if the delete is visible to us. Lock down the
- * structure.
- */
- if (!WT_REF_CAS_STATE(
- session, ref, WT_REF_DELETED, WT_REF_LOCKED))
- break;
- ret = __rec_child_deleted(session, r, ref, statep);
- WT_REF_SET_STATE(ref, WT_REF_DELETED);
- goto done;
-
- case WT_REF_LOCKED:
- /*
- * Locked.
- *
- * We should never be here during eviction: active child
- * pages in an evicted page's subtree fail the eviction
- * attempt.
- */
- WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
- if (F_ISSET(r, WT_REC_EVICT))
- return (__wt_set_return(session, EBUSY));
-
- /*
- * If called during checkpoint, the child is being
- * considered by the eviction server or the child is a
- * truncated page being read. The eviction may have
- * started before the checkpoint and so we must wait
- * for the eviction to be resolved. I suspect we could
- * handle reads of truncated pages, but we can't
- * distinguish between the two and reads of truncated
- * pages aren't expected to be common.
- */
- break;
-
- case WT_REF_LIMBO:
- WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
- /* FALLTHROUGH */
- case WT_REF_LOOKASIDE:
- /*
- * On disk or in cache with lookaside updates.
- *
- * We should never be here during eviction: active
- * child pages in an evicted page's subtree fail the
- * eviction attempt.
- */
- if (F_ISSET(r, WT_REC_EVICT) &&
- __wt_page_las_active(session, ref)) {
- WT_ASSERT(session, false);
- return (__wt_set_return(session, EBUSY));
- }
-
- /*
- * A page evicted with lookaside entries may not have
- * an address, if no updates were visible to
- * reconciliation. Any child pages in that state
- * should be ignored.
- */
- if (ref->addr == NULL) {
- *statep = WT_CHILD_IGNORE;
- WT_CHILD_RELEASE(session, *hazardp, ref);
- }
- goto done;
-
- case WT_REF_MEM:
- /*
- * In memory.
- *
- * We should never be here during eviction: active child
- * pages in an evicted page's subtree fail the eviction
- * attempt.
- */
- WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
- if (F_ISSET(r, WT_REC_EVICT))
- return (__wt_set_return(session, EBUSY));
-
- /*
- * If called during checkpoint, acquire a hazard pointer
- * so the child isn't evicted, it's an in-memory case.
- *
- * This call cannot return split/restart, we have a lock
- * on the parent which prevents a child page split.
- *
- * Set WT_READ_NO_WAIT because we're only interested in
- * the WT_REF's final state. Pages in transition might
- * change WT_REF state during our read, and then return
- * WT_NOTFOUND to us. In that case, loop and look again.
- */
- ret = __wt_page_in(session, ref,
- WT_READ_CACHE | WT_READ_NO_EVICT |
- WT_READ_NO_GEN | WT_READ_NO_WAIT);
- if (ret == WT_NOTFOUND) {
- ret = 0;
- break;
- }
- WT_RET(ret);
- *hazardp = true;
- goto in_memory;
-
- case WT_REF_READING:
- /*
- * Being read, not modified by definition.
- *
- * We should never be here during eviction: active child
- * pages in an evicted page's subtree fail the eviction
- * attempt.
- */
- WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
- if (F_ISSET(r, WT_REC_EVICT))
- return (__wt_set_return(session, EBUSY));
- goto done;
-
- case WT_REF_SPLIT:
- /*
- * The page was split out from under us.
- *
- * We should never be here during eviction: active child
- * pages in an evicted page's subtree fail the eviction
- * attempt.
- *
- * We should never be here during checkpoint: dirty page
- * eviction is shut out during checkpoint, so all splits in
- * process will have completed before we walk any pages
- * for checkpoint.
- */
- WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT);
- return (__wt_set_return(session, EBUSY));
-
- WT_ILLEGAL_VALUE(session, r->tested_ref_state);
- }
- WT_STAT_CONN_INCR(session, child_modify_blocked_page);
- }
-
-in_memory:
- /*
- * In-memory states: the child is potentially modified if the page's
- * modify structure has been instantiated. If the modify structure
- * exists and the page has actually been modified, set that state.
- * If that's not the case, we would normally use the original cell's
- * disk address as our reference, however there are two special cases,
- * both flagged by a missing block address.
- *
- * First, if forced to instantiate a deleted child page and it's never
- * modified, we end up here with a page that has a modify structure, no
- * modifications, and no disk address. Ignore those pages, they're not
- * modified and there is no reason to write the cell.
- *
- * Second, insert splits are permitted during checkpoint. When doing the
- * final checkpoint pass, we first walk the internal page's page-index
- * and write out any dirty pages we find, then we write out the internal
- * page in post-order traversal. If we found the split page in the first
- * step, it will have an address; if we didn't find the split page in
- * the first step, it won't have an address and we ignore it: it's not
- * part of the checkpoint.
- */
- mod = ref->page->modify;
- if (mod != NULL && mod->rec_result != 0)
- *statep = WT_CHILD_MODIFIED;
- else if (ref->addr == NULL) {
- *statep = WT_CHILD_IGNORE;
- WT_CHILD_RELEASE(session, *hazardp, ref);
- }
-
-done: WT_DIAGNOSTIC_YIELD;
- return (ret);
-}
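
The WT_REF_DELETED arm above depends on atomically moving the reference from DELETED to LOCKED before inspecting the deletion, then putting it back. A standalone sketch of that lock-by-CAS step with C11 atomics; ref_state_t and try_lock_deleted() are illustrative, not the WiredTiger implementation.

#include <stdatomic.h>
#include <stdbool.h>

typedef enum { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM } ref_state_t;

struct ref {
    _Atomic ref_state_t state;
};

/*
 * try_lock_deleted --
 *    Atomically swap DELETED -> LOCKED so nobody can instantiate the page
 * while we examine its deletion; the caller restores DELETED when done.
 */
static bool
try_lock_deleted(struct ref *ref)
{
    ref_state_t expected = REF_DELETED;

    return (atomic_compare_exchange_strong(&ref->state, &expected, REF_LOCKED));
}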
-
-/*
- * __rec_incr --
- * Update the memory tracking structure for a set of new entries.
- */
-static inline void
-__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
-{
- /*
- * The buffer code is fragile and prone to off-by-one errors -- check
- * for overflow in diagnostic mode.
- */
- WT_ASSERT(session, r->space_avail >= size);
- WT_ASSERT(session, WT_BLOCK_FITS(r->first_free, size,
- r->cur_ptr->image.mem, r->cur_ptr->image.memsize));
-
- r->entries += v;
- r->space_avail -= size;
- r->first_free += size;
-
- /*
- * If offset for the minimum split size boundary is not set, we have not
- * yet reached the minimum boundary, reduce the space available for it.
- */
- if (r->cur_ptr->min_offset == 0) {
- if (r->min_space_avail >= size)
- r->min_space_avail -= size;
- else
- r->min_space_avail = 0;
- }
-}
-
-/*
- * __rec_copy_incr --
- * Copy a key/value cell and buffer pair into the new image.
- */
-static inline void
-__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
-{
- size_t len;
- uint8_t *p, *t;
-
- /*
- * If there's only one chunk of data to copy (because the cell and data
- * are being copied from the original disk page), the cell length won't
- * be set, the WT_ITEM data/length will reference the data to be copied.
- *
- * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
- * the copy in-line.
- */
- for (p = r->first_free,
- t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
- *p++ = *t++;
-
- /* The data can be quite large -- call memcpy. */
- if (kv->buf.size != 0)
- memcpy(p, kv->buf.data, kv->buf.size);
-
- WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
- __rec_incr(session, r, 1, kv->len);
-}
-
-/*
- * __rec_dict_replace --
- * Check for a dictionary match.
- */
-static int
-__rec_dict_replace(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
-{
- WT_DICTIONARY *dp;
- uint64_t offset;
-
- /*
- * We optionally create a dictionary of values and only write a unique
- * value once per page, using a special "copy" cell for all subsequent
- * copies of the value. We have to do the cell build and resolution at
- * this low level because we need physical cell offsets for the page.
- *
- * Sanity check: short-data cells can be smaller than dictionary-copy
- * cells. If the data is already small, don't bother doing the work.
- * This isn't just work avoidance: on-page cells can't grow as a result
- * of writing a dictionary-copy cell, the reconciliation functions do a
- * split-boundary test based on the size required by the value's cell;
- * if we grow the cell after that test we'll potentially write off the
- * end of the buffer's memory.
- */
- if (val->buf.size <= WT_INTPACK32_MAXSIZE)
- return (0);
- WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
- if (dp == NULL)
- return (0);
-
- /*
- * If the dictionary offset isn't set, we're creating a new entry in the
- * dictionary, set its location.
- *
- * If the dictionary offset is set, we have a matching value. Create a
- * copy cell instead.
- */
- if (dp->offset == 0)
- dp->offset = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem);
- else {
- /*
- * The offset is the byte offset from this cell to the previous,
- * matching cell, NOT the byte offset from the beginning of the
- * page.
- */
- offset = (uint64_t)WT_PTRDIFF(r->first_free,
- (uint8_t *)r->cur_ptr->image.mem + dp->offset);
- val->len = val->cell_len =
- __wt_cell_pack_copy(&val->cell, rle, offset);
- val->buf.data = NULL;
- val->buf.size = 0;
- }
- return (0);
-}
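
Stripped of the cell-packing details, the dictionary is "remember where each distinct value was written and emit a backward byte offset for repeats". A toy standalone version with a naive linear lookup (the real code hashes into a skiplist of WT_DICTIONARY entries); all names here are made up.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define DICT_SLOTS 100

struct dict_entry {
    const void *data;  /* Value bytes, as written to the page. */
    size_t size;
    size_t offset;     /* Byte offset of the value on the page. */
};

struct dict {
    struct dict_entry slot[DICT_SLOTS];
    int used;
};

/*
 * dict_offset --
 *    If the value was already written, return the distance back to its
 * first occurrence (to be encoded as a copy cell); otherwise record it
 * and return 0, meaning "write the value itself".
 */
static size_t
dict_offset(struct dict *d, const void *data, size_t size, size_t page_offset)
{
    int i;

    for (i = 0; i < d->used; ++i)
        if (d->slot[i].size == size &&
            memcmp(d->slot[i].data, data, size) == 0)
            return (page_offset - d->slot[i].offset);

    if (d->used < DICT_SLOTS) {
        d->slot[d->used].data = data;
        d->slot[d->used].size = size;
        d->slot[d->used].offset = page_offset;
        ++d->used;
    }
    return (0);
}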
-
-/*
- * __rec_key_state_update --
- * Update prefix and suffix compression based on the last key.
- */
-static inline void
-__rec_key_state_update(WT_RECONCILE *r, bool ovfl_key)
-{
- WT_ITEM *a;
-
- /*
- * If writing an overflow key onto the page, don't update the "last key"
- * value, and leave the state of prefix compression alone. (If we are
- * currently doing prefix compression, we have a key state which will
- * continue to work, we're just skipping the key just created because
- * it's an overflow key and doesn't participate in prefix compression.
- * If we are not currently doing prefix compression, we can't start, an
- * overflow key doesn't give us any state.)
- *
- * Additionally, if we wrote an overflow key onto the page, turn off the
- * suffix compression of row-store internal node keys. (When we split,
- * "last key" is the largest key on the previous page, and "cur key" is
- * the first key on the next page, which is being promoted. In some
- * cases we can discard bytes from the "cur key" that are not needed to
- * distinguish between the "last key" and "cur key", compressing the
- * size of keys on internal nodes. If we just built an overflow key,
- * we're not going to update the "last key", making suffix compression
- * impossible for the next key. Alternatively, we could remember where
- * the last key was on the page, detect it's an overflow key, read it
- * from disk and do suffix compression, but that's too much work for an
- * unlikely event.)
- *
- * If we're not writing an overflow key on the page, update the last-key
- * value and turn on both prefix and suffix compression.
- */
- if (ovfl_key)
- r->key_sfx_compress = false;
- else {
- a = r->cur;
- r->cur = r->last;
- r->last = a;
-
- r->key_pfx_compress = r->key_pfx_compress_conf;
- r->key_sfx_compress = r->key_sfx_compress_conf;
- }
-}
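
For reference, the prefix compression the comment above keeps mentioning is simply "drop the leading bytes the current key shares with the previous key". A minimal standalone helper that computes that count; it is not the WiredTiger key-building code.

#include <stddef.h>
#include <stdint.h>

/*
 * common_prefix --
 *    Return how many leading bytes key shares with the previously written
 * key; a row-store leaf cell then stores that count plus the key's suffix.
 */
static size_t
common_prefix(const uint8_t *last, size_t last_size,
    const uint8_t *key, size_t key_size)
{
    size_t i, max;

    max = last_size < key_size ? last_size : key_size;
    for (i = 0; i < max; ++i)
        if (last[i] != key[i])
            break;
    return (i);
}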
-
-/*
- * Macros to convert between fixed-length entries and bytes.
- */
-#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
- ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
-#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
- ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
-
-/*
* __rec_leaf_page_max --
* Figure out the maximum leaf page size for the reconciliation.
*/
@@ -2057,35 +860,6 @@ __rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_need_split --
- * Check whether adding some bytes to the page requires a split.
- */
-static bool
-__rec_need_split(WT_RECONCILE *r, size_t len)
-{
- /*
- * In the case of a row-store leaf page, trigger a split if a threshold
- * number of saved updates is reached. This allows pages to split for
- * update/restore and lookaside eviction when there is no visible data
- * causing the disk image to grow.
- *
- * In the case of small pages or large keys, we might try to split when
- * a page has no updates or entries, which isn't possible. To consider
- * update/restore or lookaside information, require either page entries
- * or updates that will be attached to the image. The limit is one of
- * either, but it doesn't make sense to create pages or images with few
- * entries or updates, even where page sizes are small (especially as
- * updates that will eventually become overflow items can throw off our
- * calculations). Bound the combination at something reasonable.
- */
- if (r->page->type == WT_PAGE_ROW_LEAF && r->entries + r->supd_next > 10)
- len += r->supd_memsize;
-
- /* Check for the disk image crossing a boundary. */
- return (WT_CHECK_CROSSING_BND(r, len));
-}
-
-/*
* __wt_split_page_size --
* Given a split percentage, calculate split page size in bytes.
*/
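
The body of __wt_split_page_size is not part of this hunk; as a reminder of what the calculation looks like, here is an illustrative standalone version (a sketch only, not necessarily identical to the real function): take split_pct percent of the maximum page size, align it to the allocation size, and fall back to the full page size if that comes out to zero.

#include <stdint.h>

/*
 * split_page_size_sketch --
 *    Illustrative only: split_pct percent of the maximum page size, rounded
 * down to a multiple of the block allocation size. Assumes a non-zero
 * allocation size and a percentage in the range (0, 100].
 */
static uint32_t
split_page_size_sketch(int split_pct, uint32_t maxpagesize, uint32_t allocsize)
{
    uint64_t split_size;

    split_size = ((uint64_t)maxpagesize * (uint32_t)split_pct) / 100;
    split_size -= split_size % allocsize;

    return (split_size == 0 ? maxpagesize : (uint32_t)split_size);
}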
@@ -2123,8 +897,8 @@ __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize)
* Initialize a single chunk structure.
*/
static int
-__rec_split_chunk_init(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_CHUNK *chunk, size_t memsize)
+__rec_split_chunk_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REC_CHUNK *chunk, size_t memsize)
{
chunk->min_recno = WT_RECNO_OOB;
chunk->min_entries = 0;
@@ -2158,16 +932,16 @@ __rec_split_chunk_init(
}
/*
- * __rec_split_init --
+ * __wt_rec_split_init --
* Initialization for the reconciliation split functions.
*/
-static int
-__rec_split_init(WT_SESSION_IMPL *session,
+int
+__wt_rec_split_init(WT_SESSION_IMPL *session,
WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint64_t max)
{
WT_BM *bm;
WT_BTREE *btree;
- WT_CHUNK *chunk;
+ WT_REC_CHUNK *chunk;
WT_REF *ref;
size_t corrected_page_size, disk_img_buf_size;
@@ -2463,21 +1237,21 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
}
/*
- * __rec_split --
+ * __wt_rec_split --
* Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
* has 3 doubled letters in a row? Sweet-tooth does, too.)
*/
-static int
-__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
+int
+__wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
{
WT_BTREE *btree;
- WT_CHUNK *tmp;
+ WT_REC_CHUNK *tmp;
size_t inuse;
btree = S2BT(session);
/* Fixed length col store can call with next_len 0 */
- WT_ASSERT(session, next_len == 0 || __rec_need_split(r, next_len));
+ WT_ASSERT(session, next_len == 0 || __wt_rec_need_split(r, next_len));
/*
* We should never split during salvage, and we're about to drop core
@@ -2495,11 +1269,11 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
* Additionally, grow the buffer to contain the current item if we
* haven't already consumed a reasonable portion of a split chunk.
*/
- if (inuse < r->split_size / 2 && !__rec_need_split(r, 0))
+ if (inuse < r->split_size / 2 && !__wt_rec_need_split(r, 0))
goto done;
/* All page boundaries reset the dictionary. */
- __rec_dictionary_reset(r);
+ __wt_rec_dictionary_reset(r);
/* Set the number of entries and size for the just finished chunk. */
r->cur_ptr->entries = r->entries;
@@ -2567,18 +1341,18 @@ done: /*
}
/*
- * __rec_split_crossing_bnd --
+ * __wt_rec_split_crossing_bnd --
* Save the details for the minimum split size boundary or call for a
* split.
*/
-static inline int
-__rec_split_crossing_bnd(
+int
+__wt_rec_split_crossing_bnd(
WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
{
WT_BTREE *btree;
size_t min_offset;
- WT_ASSERT(session, __rec_need_split(r, next_len));
+ WT_ASSERT(session, __wt_rec_need_split(r, next_len));
/*
* If crossing the minimum split size boundary, store the boundary
@@ -2587,7 +1361,7 @@ __rec_split_crossing_bnd(
* large enough, just split at this point.
*/
if (WT_CROSSING_MIN_BND(r, next_len) &&
- !WT_CROSSING_SPLIT_BND(r, next_len) && !__rec_need_split(r, 0)) {
+ !WT_CROSSING_SPLIT_BND(r, next_len) && !__wt_rec_need_split(r, 0)) {
btree = S2BT(session);
WT_ASSERT(session, r->cur_ptr->min_offset == 0);
@@ -2609,13 +1383,13 @@ __rec_split_crossing_bnd(
session, r, &r->cur_ptr->min_key, r->page->type));
/* All page boundaries reset the dictionary. */
- __rec_dictionary_reset(r);
+ __wt_rec_dictionary_reset(r);
return (0);
}
/* We are crossing a split boundary */
- return (__rec_split(session, r, next_len));
+ return (__wt_rec_split(session, r, next_len));
}
/*
@@ -2632,8 +1406,8 @@ static int
__rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
WT_BTREE *btree;
- WT_CHUNK *cur_ptr, *prev_ptr, *tmp;
WT_PAGE_HEADER *dsk;
+ WT_REC_CHUNK *cur_ptr, *prev_ptr, *tmp;
size_t combined_size, len_to_move;
uint8_t *cur_dsk_start;
@@ -2714,11 +1488,11 @@ __rec_split_finish_process_prev(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_split_finish --
+ * __wt_rec_split_finish --
* Finish processing a page.
*/
-static int
-__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+int
+__wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
/*
* We're done reconciling, write the final page. We may arrive here with
@@ -2771,13 +1545,13 @@ __rec_supd_move(
*/
static int
__rec_split_write_supd(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_CHUNK *chunk, WT_MULTI *multi, bool last_block)
+ WT_RECONCILE *r, WT_REC_CHUNK *chunk, WT_MULTI *multi, bool last_block)
{
WT_BTREE *btree;
- WT_CHUNK *next;
WT_DECL_ITEM(key);
WT_DECL_RET;
WT_PAGE *page;
+ WT_REC_CHUNK *next;
WT_SAVE_UPD *supd;
WT_UPDATE *upd;
uint32_t i, j;
@@ -2876,7 +1650,7 @@ err: __wt_scr_free(session, &key);
*/
static void
__rec_split_write_header(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_CHUNK *chunk, WT_MULTI *multi, WT_PAGE_HEADER *dsk)
+ WT_RECONCILE *r, WT_REC_CHUNK *chunk, WT_MULTI *multi, WT_PAGE_HEADER *dsk)
{
WT_BTREE *btree;
WT_PAGE *page;
@@ -3088,7 +1862,7 @@ __rec_compression_adjust(WT_SESSION_IMPL *session,
*/
static int
__rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block)
+ WT_REC_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block)
{
WT_BTREE *btree;
WT_MULTI *multi;
@@ -3304,7 +2078,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : 1;
- return (__rec_split_init(session,
+ return (__wt_rec_split_init(session,
r, cbulk->leaf, recno, btree->maxleafpage_precomp));
}
@@ -3326,7 +2100,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
switch (btree->type) {
case BTREE_COL_FIX:
if (cbulk->entry != 0)
- __rec_incr(session, r, cbulk->entry,
+ __wt_rec_incr(session, r, cbulk->entry,
__bitstr_size(
(size_t)cbulk->entry * btree->bitcnt));
break;
@@ -3338,7 +2112,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
break;
}
- WT_RET(__rec_split_finish(session, r));
+ WT_RET(__wt_rec_split_finish(session, r));
WT_RET(__rec_write_wrapup(session, r, r->page));
__rec_write_page_status(session, r);
@@ -3354,1912 +2128,6 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
}
/*
- * __wt_bulk_insert_row --
- * Row-store bulk insert.
- */
-int
-__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
-{
- WT_BTREE *btree;
- WT_CURSOR *cursor;
- WT_KV *key, *val;
- WT_RECONCILE *r;
- bool ovfl_key;
-
- r = cbulk->reconcile;
- btree = S2BT(session);
- cursor = &cbulk->cbt.iface;
-
- key = &r->k;
- val = &r->v;
- WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
- cursor->key.data, cursor->key.size, &ovfl_key));
- WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
- cursor->value.data, cursor->value.size, (uint64_t)0));
-
- /* Boundary: split or write the page. */
- if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) {
- /*
- * Turn off prefix compression until a full key written to the
- * new page, and (unless already working with an overflow key),
- * rebuild the key without compression.
- */
- if (r->key_pfx_compress_conf) {
- r->key_pfx_compress = false;
- if (!ovfl_key)
- WT_RET(__rec_cell_build_leaf_key(
- session, r, NULL, 0, &ovfl_key));
- }
- WT_RET(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
- }
-
- /* Copy the key/value pair onto the page. */
- __rec_copy_incr(session, r, key);
- if (val->len == 0)
- r->any_empty_value = true;
- else {
- r->all_empty_value = false;
- if (btree->dictionary)
- WT_RET(__rec_dict_replace(session, r, 0, val));
- __rec_copy_incr(session, r, val);
- }
-
- /* Update compression state. */
- __rec_key_state_update(r, ovfl_key);
-
- return (0);
-}
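
These __wt_bulk_insert_* functions sit underneath the public bulk cursor. From an application's point of view the feature looks roughly like the sketch below (error handling trimmed, the table name and schema are placeholders); a bulk cursor requires an empty object and keys supplied in sorted order, which is what lets reconciliation fill each leaf page sequentially.

#include <stdio.h>
#include <stdlib.h>
#include <wiredtiger.h>

int
bulk_load_example(const char *home)
{
    WT_CONNECTION *conn;
    WT_CURSOR *cursor;
    WT_SESSION *session;
    int i;
    char key[16], value[16];

    if (wiredtiger_open(home, NULL, "create", &conn) != 0)
        return (EXIT_FAILURE);
    (void)conn->open_session(conn, NULL, NULL, &session);
    (void)session->create(session, "table:bulk", "key_format=S,value_format=S");

    /* A bulk cursor requires an empty object and sorted keys. */
    (void)session->open_cursor(session, "table:bulk", NULL, "bulk", &cursor);
    for (i = 0; i < 1000; ++i) {
        (void)snprintf(key, sizeof(key), "key%06d", i);
        (void)snprintf(value, sizeof(value), "value%d", i);
        cursor->set_key(cursor, key);
        cursor->set_value(cursor, value);
        (void)cursor->insert(cursor);
    }
    (void)cursor->close(cursor);

    return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}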
-
-/*
- * __rec_col_fix_bulk_insert_split_check --
- * Check if a bulk-loaded fixed-length column store page needs to split.
- */
-static inline int
-__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
-{
- WT_BTREE *btree;
- WT_RECONCILE *r;
- WT_SESSION_IMPL *session;
-
- session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
- r = cbulk->reconcile;
- btree = S2BT(session);
-
- if (cbulk->entry == cbulk->nrecs) {
- if (cbulk->entry != 0) {
- /*
- * If everything didn't fit, update the counters and
- * split.
- *
- * Boundary: split or write the page.
- *
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
- */
- __rec_incr(session, r, cbulk->entry,
- __bitstr_size(
- (size_t)cbulk->entry * btree->bitcnt));
- WT_RET(__rec_split(session, r, 0));
- }
- cbulk->entry = 0;
- cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
- }
- return (0);
-}
-
-/*
- * __wt_bulk_insert_fix --
- * Fixed-length column-store bulk insert.
- */
-int
-__wt_bulk_insert_fix(
- WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
-{
- WT_BTREE *btree;
- WT_CURSOR *cursor;
- WT_RECONCILE *r;
-
- r = cbulk->reconcile;
- btree = S2BT(session);
- cursor = &cbulk->cbt.iface;
-
- WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
- __bit_setv(r->first_free, cbulk->entry,
- btree->bitcnt, deleted ? 0 : ((uint8_t *)cursor->value.data)[0]);
- ++cbulk->entry;
- ++r->recno;
-
- return (0);
-}
-
-/*
- * __wt_bulk_insert_fix_bitmap --
- * Fixed-length column-store bulk insert.
- */
-int
-__wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
-{
- WT_BTREE *btree;
- WT_CURSOR *cursor;
- WT_RECONCILE *r;
- uint32_t entries, offset, page_entries, page_size;
- const uint8_t *data;
-
- r = cbulk->reconcile;
- btree = S2BT(session);
- cursor = &cbulk->cbt.iface;
-
- if (((r->recno - 1) * btree->bitcnt) & 0x7)
- WT_RET_MSG(session, EINVAL,
- "Bulk bitmap load not aligned on a byte boundary");
- for (data = cursor->value.data,
- entries = (uint32_t)cursor->value.size;
- entries > 0;
- entries -= page_entries, data += page_size) {
- WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
-
- page_entries = WT_MIN(entries, cbulk->nrecs - cbulk->entry);
- page_size = __bitstr_size(page_entries * btree->bitcnt);
- offset = __bitstr_size(cbulk->entry * btree->bitcnt);
- memcpy(r->first_free + offset, data, page_size);
- cbulk->entry += page_entries;
- r->recno += page_entries;
- }
- return (0);
-}
-
-/*
- * __wt_bulk_insert_var --
- * Variable-length column-store bulk insert.
- */
-int
-__wt_bulk_insert_var(
- WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted)
-{
- WT_BTREE *btree;
- WT_KV *val;
- WT_RECONCILE *r;
-
- r = cbulk->reconcile;
- btree = S2BT(session);
-
- val = &r->v;
- if (deleted) {
- val->cell_len = __wt_cell_pack_del(&val->cell, cbulk->rle);
- val->buf.data = NULL;
- val->buf.size = 0;
- val->len = val->cell_len;
- } else
- /*
- * Store the bulk cursor's last buffer, not the current value,
- * we're tracking duplicates, which means we want the previous
- * value seen, not the current value.
- */
- WT_RET(__rec_cell_build_val(session,
- r, cbulk->last.data, cbulk->last.size, cbulk->rle));
-
- /* Boundary: split or write the page. */
- if (WT_CROSSING_SPLIT_BND(r, val->len))
- WT_RET(__rec_split_crossing_bnd(session, r, val->len));
-
- /* Copy the value onto the page. */
- if (btree->dictionary)
- WT_RET(__rec_dict_replace(session, r, cbulk->rle, val));
- __rec_copy_incr(session, r, val);
-
- /* Update the starting record number in case we split. */
- r->recno += cbulk->rle;
-
- return (0);
-}
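
The rle/last bookkeeping mentioned in the comment is ordinary run-length grouping: the bulk cursor layer compares each value with the previous one and only calls down here when a run ends. A standalone illustration of that grouping (the names are made up, not the WT_CURSOR_BULK fields).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Callback invoked once per run with the value and its repeat count. */
typedef void (*emit_fn)(const void *data, size_t size, uint64_t rle);

/*
 * rle_group --
 *    Walk an array of equal-sized values, emitting (value, count) runs,
 * the same shape of stream the variable-length bulk insert consumes.
 */
static void
rle_group(const uint8_t *values, size_t nvalues, size_t size, emit_fn emit)
{
    uint64_t rle;
    size_t i;
    const uint8_t *cur, *last;

    if (nvalues == 0)
        return;

    last = values;
    rle = 1;
    for (i = 1; i < nvalues; ++i) {
        cur = values + i * size;
        if (memcmp(cur, last, size) == 0) {
            ++rle;
            continue;
        }
        emit(last, size, rle);
        last = cur;
        rle = 1;
    }
    emit(last, size, rle);
}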
-
-/*
- * __rec_vtype --
- * Return a value cell's address type.
- */
-static inline u_int
-__rec_vtype(WT_ADDR *addr)
-{
- if (addr->type == WT_ADDR_INT)
- return (WT_CELL_ADDR_INT);
- if (addr->type == WT_ADDR_LEAF)
- return (WT_CELL_ADDR_LEAF);
- return (WT_CELL_ADDR_LEAF_NO);
-}
-
-/*
- * __rec_col_int --
- * Reconcile a column-store internal page.
- */
-static int
-__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
-{
- WT_ADDR *addr;
- WT_BTREE *btree;
- WT_CELL_UNPACK *vpack, _vpack;
- WT_CHILD_STATE state;
- WT_DECL_RET;
- WT_KV *val;
- WT_PAGE *child, *page;
- WT_REF *ref;
- bool hazard;
-
- btree = S2BT(session);
- page = pageref->page;
- child = NULL;
- hazard = false;
-
- val = &r->v;
- vpack = &_vpack;
-
- WT_RET(__rec_split_init(session,
- r, page, pageref->ref_recno, btree->maxintlpage_precomp));
-
- /* For each entry in the in-memory page... */
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- /* Update the starting record number in case we split. */
- r->recno = ref->ref_recno;
-
- /*
- * Modified child.
- * The page may be emptied or internally created during a split.
- * Deleted/split pages are merged into the parent and discarded.
- */
- WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
- addr = NULL;
- child = ref->page;
-
- switch (state) {
- case WT_CHILD_IGNORE:
- /* Ignored child. */
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
-
- case WT_CHILD_MODIFIED:
- /*
- * Modified child. Empty pages are merged into the
- * parent and discarded.
- */
- switch (child->modify->rec_result) {
- case WT_PM_REC_EMPTY:
- /*
- * Column-store pages are almost never empty, as
- * discarding a page would remove a chunk of the
- * name space. The exceptions are pages created
- * when the tree is created, and never filled.
- */
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
- case WT_PM_REC_MULTIBLOCK:
- WT_ERR(__rec_col_merge(session, r, child));
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
- case WT_PM_REC_REPLACE:
- addr = &child->modify->mod_replace;
- break;
- WT_ILLEGAL_VALUE_ERR(
- session, child->modify->rec_result);
- }
- break;
- case WT_CHILD_ORIGINAL:
- /* Original child. */
- break;
- case WT_CHILD_PROXY:
- /*
- * Deleted child where we write a proxy cell, not yet
- * supported for column-store.
- */
- WT_ERR(__wt_illegal_value(session, state));
- }
-
- /*
- * Build the value cell. The child page address is in one of 3
- * places: if the page was replaced, the page's modify structure
- * references it and we built the value cell just above in the
- * switch statement. Else, the WT_REF->addr reference points to
- * an on-page cell or an off-page WT_ADDR structure: if it's an
- * on-page cell we copy it from the page, else we build a new
- * cell.
- */
- if (addr == NULL && __wt_off_page(page, ref->addr))
- addr = ref->addr;
- if (addr == NULL) {
- __wt_cell_unpack(ref->addr, vpack);
- val->buf.data = ref->addr;
- val->buf.size = __wt_cell_total_len(vpack);
- val->cell_len = 0;
- val->len = val->buf.size;
- } else
- __rec_cell_build_addr(session, r,
- addr->addr, addr->size,
- __rec_vtype(addr), ref->ref_recno);
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, val->len))
- WT_ERR(__rec_split_crossing_bnd(session, r, val->len));
-
- /* Copy the value onto the page. */
- __rec_copy_incr(session, r, val);
- } WT_INTL_FOREACH_END;
-
- /* Write the remnant page. */
- return (__rec_split_finish(session, r));
-
-err: WT_CHILD_RELEASE(session, hazard, ref);
- return (ret);
-}
-
-/*
- * __rec_col_merge --
- * Merge in a split page.
- */
-static int
-__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
-{
- WT_ADDR *addr;
- WT_KV *val;
- WT_MULTI *multi;
- WT_PAGE_MODIFY *mod;
- uint32_t i;
-
- mod = page->modify;
-
- val = &r->v;
-
- /* For each entry in the split array... */
- for (multi = mod->mod_multi,
- i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
- /* Update the starting record number in case we split. */
- r->recno = multi->key.recno;
-
- /* Build the value cell. */
- addr = &multi->addr;
- __rec_cell_build_addr(session, r,
- addr->addr, addr->size, __rec_vtype(addr), r->recno);
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, val->len))
- WT_RET(__rec_split_crossing_bnd(session, r, val->len));
-
- /* Copy the value onto the page. */
- __rec_copy_incr(session, r, val);
- }
- return (0);
-}
-
-/*
- * __rec_col_fix --
- * Reconcile a fixed-width, column-store leaf page.
- */
-static int
-__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
-{
- WT_BTREE *btree;
- WT_INSERT *ins;
- WT_PAGE *page;
- WT_UPDATE *upd;
- uint64_t recno;
- uint32_t entry, nrecs;
-
- btree = S2BT(session);
- page = pageref->page;
-
- WT_RET(__rec_split_init(
- session, r, page, pageref->ref_recno, btree->maxleafpage));
-
- /* Copy the original, disk-image bytes into place. */
- memcpy(r->first_free, page->pg_fix_bitf,
- __bitstr_size((size_t)page->entries * btree->bitcnt));
-
- /* Update any changes to the original on-page data items. */
- WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
- WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, NULL, &upd));
- if (upd != NULL)
- __bit_setv(r->first_free,
- WT_INSERT_RECNO(ins) - pageref->ref_recno,
- btree->bitcnt, *upd->data);
- }
-
- /* Calculate the number of entries per page remainder. */
- entry = page->entries;
- nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries;
- r->recno += entry;
-
- /* Walk any append list. */
- for (ins =
- WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
- if (ins == NULL) {
- /*
- * If the page split, instantiate any missing records in
- * the page's name space. (Imagine record 98 is
- * transactionally visible, 99 wasn't created or is not
- * yet visible, 100 is visible. Then the page splits and
- * record 100 moves to another page. When we reconcile
- * the original page, we write record 98, then we don't
- * see record 99 for whatever reason. If we've moved
- * record 100, we don't know to write a deleted record
- * 99 on the page.)
- *
- * The record number recorded during the split is the
- * first key on the split page, that is, one larger than
- * the last key on this page, so we have to decrement it.
- */
- if ((recno =
- page->modify->mod_col_split_recno) == WT_RECNO_OOB)
- break;
- recno -= 1;
-
- /*
- * The following loop assumes records to write, and the
- * previous key might have been visible.
- */
- if (r->recno > recno)
- break;
- upd = NULL;
- } else {
- WT_RET(__rec_txn_read(
- session, r, ins, NULL, NULL, NULL, &upd));
- recno = WT_INSERT_RECNO(ins);
- }
- for (;;) {
- /*
- * The application may have inserted records which left
- * gaps in the name space.
- */
- for (;
- nrecs > 0 && r->recno < recno;
- --nrecs, ++entry, ++r->recno)
- __bit_setv(
- r->first_free, entry, btree->bitcnt, 0);
-
- if (nrecs > 0) {
- __bit_setv(r->first_free, entry, btree->bitcnt,
- upd == NULL ? 0 : *upd->data);
- --nrecs;
- ++entry;
- ++r->recno;
- break;
- }
-
- /*
- * If everything didn't fit, update the counters and
- * split.
- *
- * Boundary: split or write the page.
- *
- * No need to have a minimum split size boundary, all
- * pages are filled 100% except the last, allowing it to
- * grow in the future.
- */
- __rec_incr(session, r, entry,
- __bitstr_size((size_t)entry * btree->bitcnt));
- WT_RET(__rec_split(session, r, 0));
-
- /* Calculate the number of entries per page. */
- entry = 0;
- nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
- }
-
- /*
- * Execute this loop once without an insert item to catch any
- * missing records due to a split, then quit.
- */
- if (ins == NULL)
- break;
- }
-
- /* Update the counters. */
- __rec_incr(
- session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
-
- /* Write the remnant page. */
- return (__rec_split_finish(session, r));
-}
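
Fixed-length column stores pack each record into btree->bitcnt bits of a bitmap, which is all the __bit_setv/__bit_getv calls above do. A deliberately simple standalone version, bit at a time so it is easy to verify; the bit ordering is an arbitrary choice here and may not match WiredTiger's on-disk layout.

#include <stdint.h>

/*
 * bitmap_setv --
 *    Store an n-bit value for record `entry` into a packed bitmap.
 */
static void
bitmap_setv(uint8_t *bitmap, uint32_t entry, uint8_t bitcnt, uint8_t value)
{
    uint64_t bit;
    uint8_t i;

    for (i = 0; i < bitcnt; ++i) {
        bit = (uint64_t)entry * bitcnt + i;
        if (value & (1u << i))
            bitmap[bit >> 3] |= (uint8_t)(1u << (bit & 7));
        else
            bitmap[bit >> 3] &= (uint8_t)~(1u << (bit & 7));
    }
}

/*
 * bitmap_getv --
 *    Fetch the n-bit value for record `entry` from a packed bitmap.
 */
static uint8_t
bitmap_getv(const uint8_t *bitmap, uint32_t entry, uint8_t bitcnt)
{
    uint64_t bit;
    uint8_t i, value;

    value = 0;
    for (i = 0; i < bitcnt; ++i) {
        bit = (uint64_t)entry * bitcnt + i;
        if (bitmap[bit >> 3] & (1u << (bit & 7)))
            value |= (uint8_t)(1u << i);
    }
    return (value);
}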
-
-/*
- * __rec_col_fix_slvg --
- * Reconcile a fixed-width, column-store leaf page created during salvage.
- */
-static int
-__rec_col_fix_slvg(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
-{
- WT_BTREE *btree;
- WT_PAGE *page;
- uint64_t page_start, page_take;
- uint32_t entry, nrecs;
-
- btree = S2BT(session);
- page = pageref->page;
-
- /*
- * !!!
- * It's vanishingly unlikely and probably impossible for fixed-length
- * column-store files to have overlapping key ranges. It's possible
- * for an entire key range to go missing (if a page is corrupted and
- * lost), but because pages can't split, it shouldn't be possible to
- * find pages where the key ranges overlap. That said, we check for
- * it during salvage and clean up after it here because it doesn't
- * cost much and future column-store formats or operations might allow
- * for fixed-length format ranges to overlap during salvage, and I
- * don't want to have to retrofit the code later.
- */
- WT_RET(__rec_split_init(
- session, r, page, pageref->ref_recno, btree->maxleafpage));
-
- /* We may not be taking all of the entries on the original page. */
- page_take = salvage->take == 0 ? page->entries : salvage->take;
- page_start = salvage->skip == 0 ? 0 : salvage->skip;
-
- /* Calculate the number of entries per page. */
- entry = 0;
- nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
-
- for (; nrecs > 0 && salvage->missing > 0;
- --nrecs, --salvage->missing, ++entry)
- __bit_setv(r->first_free, entry, btree->bitcnt, 0);
-
- for (; nrecs > 0 && page_take > 0;
- --nrecs, --page_take, ++page_start, ++entry)
- __bit_setv(r->first_free, entry, btree->bitcnt,
- __bit_getv(page->pg_fix_bitf,
- (uint32_t)page_start, btree->bitcnt));
-
- r->recno += entry;
- __rec_incr(session, r, entry,
- __bitstr_size((size_t)entry * btree->bitcnt));
-
- /*
- * We can't split during salvage -- if everything didn't fit, it's
- * all gone wrong.
- */
- if (salvage->missing != 0 || page_take != 0)
- WT_PANIC_RET(session, WT_PANIC,
- "%s page too large, attempted split during salvage",
- __wt_page_type_string(page->type));
-
- /* Write the page. */
- return (__rec_split_finish(session, r));
-}
-
-/*
- * __rec_col_var_helper --
- * Create a column-store variable length record cell and write it onto a
- * page.
- */
-static int
-__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- WT_SALVAGE_COOKIE *salvage,
- WT_ITEM *value, bool deleted, uint8_t overflow_type, uint64_t rle)
-{
- WT_BTREE *btree;
- WT_KV *val;
-
- btree = S2BT(session);
-
- val = &r->v;
-
- /*
- * Occasionally, salvage needs to discard records from the beginning or
- * end of the page, and because the items may be part of a RLE cell, do
- * the adjustments here. It's not a mistake that we don't bother telling
- * our caller we've handled all the records from the page we care about
- * and it can quit processing the page: salvage is a rare operation and I
- * don't want to complicate our caller's loop.
- */
- if (salvage != NULL) {
- if (salvage->done)
- return (0);
- if (salvage->skip != 0) {
- if (rle <= salvage->skip) {
- salvage->skip -= rle;
- return (0);
- }
- rle -= salvage->skip;
- salvage->skip = 0;
- }
- if (salvage->take != 0) {
- if (rle <= salvage->take)
- salvage->take -= rle;
- else {
- rle = salvage->take;
- salvage->take = 0;
- }
- if (salvage->take == 0)
- salvage->done = true;
- }
- }
-
- if (deleted) {
- val->cell_len = __wt_cell_pack_del(&val->cell, rle);
- val->buf.data = NULL;
- val->buf.size = 0;
- val->len = val->cell_len;
- } else if (overflow_type) {
- val->cell_len = __wt_cell_pack_ovfl(
- &val->cell, overflow_type, rle, value->size);
- val->buf.data = value->data;
- val->buf.size = value->size;
- val->len = val->cell_len + value->size;
- } else
- WT_RET(__rec_cell_build_val(
- session, r, value->data, value->size, rle));
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, val->len))
- WT_RET(__rec_split_crossing_bnd(session, r, val->len));
-
- /* Copy the value onto the page. */
- if (!deleted && !overflow_type && btree->dictionary)
- WT_RET(__rec_dict_replace(session, r, rle, val));
- __rec_copy_incr(session, r, val);
-
- /* Update the starting record number in case we split. */
- r->recno += rle;
-
- return (0);
-}
-
-/*
- * __rec_col_var --
- * Reconcile a variable-width column-store leaf page.
- */
-static int
-__rec_col_var(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
-{
- enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
- WT_BTREE *btree;
- WT_CELL *cell;
- WT_CELL_UNPACK *vpack, _vpack;
- WT_COL *cip;
- WT_CURSOR_BTREE *cbt;
- WT_DECL_ITEM(orig);
- WT_DECL_RET;
- WT_INSERT *ins;
- WT_ITEM *last;
- WT_PAGE *page;
- WT_UPDATE *upd;
- uint64_t n, nrepeat, repeat_count, rle, skip, src_recno;
- uint32_t i, size;
- bool deleted, last_deleted, orig_deleted, update_no_copy;
- const void *data;
-
- btree = S2BT(session);
- page = pageref->page;
- last = r->last;
- vpack = &_vpack;
- cbt = &r->update_modify_cbt;
-
- WT_RET(__rec_split_init(session,
- r, page, pageref->ref_recno, btree->maxleafpage_precomp));
-
- WT_RET(__wt_scr_alloc(session, 0, &orig));
- data = NULL;
- size = 0;
- upd = NULL;
-
- /*
- * The salvage code may be calling us to reconcile a page where there
- * were missing records in the column-store name space. If taking the
- * first record from on the page, it might be a deleted record, so we
- * have to give the RLE code a chance to figure that out. Else, if
- * not taking the first record from the page, write a single element
- * representing the missing records onto a new page. (Don't pass the
- * salvage cookie to our helper function in this case, we're handling
- * one of the salvage cookie fields on our own, and we don't need the
- * helper function's assistance.)
- */
- rle = 0;
- last_deleted = false;
- if (salvage != NULL && salvage->missing != 0) {
- if (salvage->skip == 0) {
- rle = salvage->missing;
- last_deleted = true;
-
- /*
- * Correct the number of records we're going to "take",
- * pretending the missing records were on the page.
- */
- salvage->take += salvage->missing;
- } else
- WT_ERR(__rec_col_var_helper(session,
- r, NULL, NULL, true, false, salvage->missing));
- }
-
- /*
- * We track two data items through this loop: the previous (last) item
- * and the current item: if the last item is the same as the current
- * item, we increment the RLE count for the last item; if the last item
- * is different from the current item, we write the last item onto the
- * page, and replace it with the current item. The r->recno counter
- * tracks records written to the page, and is incremented by the helper
- * function immediately after writing records to the page. The record
- * number of our source record, that is, the current item, is maintained
- * in src_recno.
- */
- src_recno = r->recno + rle;
-
- /* For each entry in the in-memory page... */
- WT_COL_FOREACH(page, cip, i) {
- ovfl_state = OVFL_IGNORE;
- if ((cell = WT_COL_PTR(page, cip)) == NULL) {
- nrepeat = 1;
- ins = NULL;
- orig_deleted = true;
- } else {
- __wt_cell_unpack(cell, vpack);
- nrepeat = __wt_cell_rle(vpack);
- ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
-
- /*
- * If the original value is "deleted", there's no value
- * to compare, we're done.
- */
- orig_deleted = vpack->type == WT_CELL_DEL;
- if (orig_deleted)
- goto record_loop;
-
- /*
- * Overflow items are tricky: we don't know until we're
- * finished processing the set of values if we need the
- * overflow value or not. If we don't use the overflow
- * item at all, we have to discard it from the backing
- * file, otherwise we'll leak blocks on the checkpoint.
- * That's safe because if the backing overflow value is
- * still needed by any running transaction, we'll cache
- * a copy in the update list.
- *
- * Regardless, we avoid copying in overflow records: if
- * there's a WT_INSERT entry that modifies a reference
- * counted overflow record, we may have to write copies
- * of the overflow record, and in that case we'll do the
- * comparisons, but we don't read overflow items just to
- * see if they match records on either side.
- */
- if (vpack->ovfl) {
- ovfl_state = OVFL_UNUSED;
- goto record_loop;
- }
-
- /*
- * If data is Huffman encoded, we have to decode it in
- * order to compare it with the last item we saw, which
- * may have been an update string. This guarantees we
- * find every single pair of objects we can RLE encode,
- * including applications updating an existing record
- * where the new value happens (?) to match a Huffman-
- * encoded value in a previous or next record.
- */
- WT_ERR(__wt_dsk_cell_data_ref(
- session, WT_PAGE_COL_VAR, vpack, orig));
- }
-
-record_loop: /*
- * Generate on-page entries: loop repeat records, looking for
- * WT_INSERT entries matching the record number. The WT_INSERT
- * lists are in sorted order, so we only need to check the next one.
- */
- for (n = 0;
- n < nrepeat; n += repeat_count, src_recno += repeat_count) {
- upd = NULL;
- if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
- WT_ERR(__rec_txn_read(
- session, r, ins, cip, vpack, NULL, &upd));
- ins = WT_SKIP_NEXT(ins);
- }
-
- update_no_copy = true; /* No data copy */
- repeat_count = 1; /* Single record */
- deleted = false;
-
- if (upd != NULL) {
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- cbt->slot = WT_COL_SLOT(page, cip);
- WT_ERR(__wt_value_return_upd(
- session, cbt, upd,
- F_ISSET(r, WT_REC_VISIBLE_ALL)));
- data = cbt->iface.value.data;
- size = (uint32_t)cbt->iface.value.size;
- update_no_copy = false;
- break;
- case WT_UPDATE_STANDARD:
- data = upd->data;
- size = upd->size;
- break;
- case WT_UPDATE_TOMBSTONE:
- deleted = true;
- break;
- WT_ILLEGAL_VALUE_ERR(session, upd->type);
- }
- } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
- /*
- * If doing an update save and restore, and the
- * underlying value is a removed overflow value,
- * we end up here.
- *
- * If necessary, when the overflow value was
- * originally removed, reconciliation appended
- * a globally visible copy of the value to the
- * key's update list, meaning the on-page item
- * isn't accessed after page re-instantiation.
- *
- * Assert the case.
- */
- WT_ASSERT(session,
- F_ISSET(r, WT_REC_UPDATE_RESTORE));
-
- /*
- * The on-page value will never be accessed,
- * write a placeholder record.
- */
- data = "ovfl-unused";
- size = WT_STORE_SIZE(strlen("ovfl-unused"));
- } else {
- update_no_copy = false; /* Maybe data copy */
-
- /*
- * The repeat count is the number of records up
- * to the next WT_INSERT record, or up to the
- * end of the entry if we have no more WT_INSERT
- * records.
- */
- if (ins == NULL)
- repeat_count = nrepeat - n;
- else
- repeat_count =
- WT_INSERT_RECNO(ins) - src_recno;
-
- deleted = orig_deleted;
- if (deleted)
- goto compare;
-
- /*
- * If we are handling overflow items, use the
- * overflow item itself exactly once, after
- * which we have to copy it into a buffer and
- * from then on use a complete copy because we
- * are re-creating a new overflow record each
- * time.
- */
- switch (ovfl_state) {
- case OVFL_UNUSED:
- /*
- * An as-yet-unused overflow item.
- *
- * We're going to copy the on-page cell,
- * write out any record we're tracking.
- */
- if (rle != 0) {
- WT_ERR(__rec_col_var_helper(
- session, r, salvage, last,
- last_deleted, 0, rle));
- rle = 0;
- }
-
- last->data = vpack->data;
- last->size = vpack->size;
- WT_ERR(__rec_col_var_helper(
- session, r, salvage, last, false,
- WT_CELL_VALUE_OVFL, repeat_count));
-
- /* Track if page has overflow items. */
- r->ovfl_items = true;
-
- ovfl_state = OVFL_USED;
- continue;
- case OVFL_USED:
- /*
- * Original is an overflow item; we used
- * it for a value and now we need another
- * copy; read it into memory.
- */
- WT_ERR(__wt_dsk_cell_data_ref(session,
- WT_PAGE_COL_VAR, vpack, orig));
-
- ovfl_state = OVFL_IGNORE;
- /* FALLTHROUGH */
- case OVFL_IGNORE:
- /*
- * Original is an overflow item and we
- * were forced to copy it into memory,
- * or the original wasn't an overflow
- * item; use the data copied into orig.
- */
- data = orig->data;
- size = (uint32_t)orig->size;
- break;
- }
- }
-
-compare: /*
- * If we have a record against which to compare, and
- * the records compare equal, increment the rle counter
- * and continue. If the records don't compare equal,
- * output the last record and swap the last and current
- * buffers: do NOT update the starting record number,
- * we've been doing that all along.
- */
- if (rle != 0) {
- if ((deleted && last_deleted) ||
- (!last_deleted && !deleted &&
- last->size == size &&
- memcmp(last->data, data, size) == 0)) {
- rle += repeat_count;
- continue;
- }
- WT_ERR(__rec_col_var_helper(session, r,
- salvage, last, last_deleted, 0, rle));
- }
-
- /*
- * Swap the current/last state.
- *
- * Reset RLE counter and turn on comparisons.
- */
- if (!deleted) {
- /*
- * We can't simply assign the data values into
- * the last buffer because they may have come
- * from a copy built from an encoded/overflow
- * cell and creating the next record is going
- * to overwrite that memory. Check, because
- * encoded/overflow cells aren't that common
- * and we'd like to avoid the copy. If data
- * was taken from the current unpack structure
- * (which points into the page), or was taken
- * from an update structure, we can just use
- * the pointers, they're not moving.
- */
- if (data == vpack->data || update_no_copy) {
- last->data = data;
- last->size = size;
- } else
- WT_ERR(__wt_buf_set(
- session, last, data, size));
- }
- last_deleted = deleted;
- rle = repeat_count;
- }
-
- /*
- * The first time we find an overflow record we never used,
- * discard the underlying blocks, they're no longer useful.
- */
- if (ovfl_state == OVFL_UNUSED &&
- vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(
- session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
- }
-
- /* Walk any append list. */
- for (ins =
- WT_SKIP_FIRST(WT_COL_APPEND(page));; ins = WT_SKIP_NEXT(ins)) {
- if (ins == NULL) {
- /*
- * If the page split, instantiate any missing records in
- * the page's name space. (Imagine record 98 is
- * transactionally visible, 99 wasn't created or is not
- * yet visible, 100 is visible. Then the page splits and
- * record 100 moves to another page. When we reconcile
- * the original page, we write record 98, then we don't
- * see record 99 for whatever reason. If we've moved
- * record 100, we don't know to write a deleted record
- * 99 on the page.)
- *
- * Assert the recorded record number is past the end of
- * the page.
- *
- * The record number recorded during the split is the
- * first key on the split page, that is, one larger than
- * the last key on this page, we have to decrement it.
- */
- if ((n = page->
- modify->mod_col_split_recno) == WT_RECNO_OOB)
- break;
- WT_ASSERT(session, n >= src_recno);
- n -= 1;
-
- upd = NULL;
- } else {
- WT_ERR(__rec_txn_read(
- session, r, ins, NULL, NULL, NULL, &upd));
- n = WT_INSERT_RECNO(ins);
- }
- while (src_recno <= n) {
- deleted = false;
- update_no_copy = true;
-
- /*
- * The application may have inserted records which left
- * gaps in the name space, and these gaps can be huge.
- * If we're in a set of deleted records, skip the boring
- * part.
- */
- if (src_recno < n) {
- deleted = true;
- if (last_deleted) {
- /*
- * The record adjustment is decremented
- * by one so we can naturally fall into
- * the RLE accounting below, where we
- * increment rle by one, then continue
- * in the outer loop, where we increment
- * src_recno by one.
- */
- skip = (n - src_recno) - 1;
- rle += skip;
- src_recno += skip;
- }
- } else if (upd == NULL)
- deleted = true;
- else
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- /*
- * Impossible slot, there's no backing
- * on-page item.
- */
- cbt->slot = UINT32_MAX;
- WT_ERR(__wt_value_return_upd(
- session, cbt, upd,
- F_ISSET(r, WT_REC_VISIBLE_ALL)));
- data = cbt->iface.value.data;
- size = (uint32_t)cbt->iface.value.size;
- update_no_copy = false;
- break;
- case WT_UPDATE_STANDARD:
- data = upd->data;
- size = upd->size;
- break;
- case WT_UPDATE_TOMBSTONE:
- deleted = true;
- break;
- WT_ILLEGAL_VALUE_ERR(session, upd->type);
- }
-
- /*
- * Handle RLE accounting and comparisons -- see comment
- * above, this code fragment does the same thing.
- */
- if (rle != 0) {
- if ((deleted && last_deleted) ||
- (!last_deleted && !deleted &&
- last->size == size &&
- memcmp(last->data, data, size) == 0)) {
- ++rle;
- goto next;
- }
- WT_ERR(__rec_col_var_helper(session, r,
- salvage, last, last_deleted, 0, rle));
- }
-
- /*
- * Swap the current/last state. We can't simply assign
- * the data values into the last buffer because they may
- * be a temporary copy built from a chain of modified
- * updates and creating the next record will overwrite
- * that memory. Check, we'd like to avoid the copy. If
- * data was taken from an update structure, we can just
- * use the pointers, they're not moving.
- */
- if (!deleted) {
- if (update_no_copy) {
- last->data = data;
- last->size = size;
- } else
- WT_ERR(__wt_buf_set(
- session, last, data, size));
- }
-
- /* Ready for the next loop, reset the RLE counter. */
- last_deleted = deleted;
- rle = 1;
-
- /*
- * Move to the next record. It's not a simple increment
- * because if it's the maximum record, incrementing it
- * wraps to 0 and this turns into an infinite loop.
- */
-next: if (src_recno == UINT64_MAX)
- break;
- ++src_recno;
- }
-
- /*
- * Execute this loop once without an insert item to catch any
- * missing records due to a split, then quit.
- */
- if (ins == NULL)
- break;
- }
-
- /* If we were tracking a record, write it. */
- if (rle != 0)
- WT_ERR(__rec_col_var_helper(
- session, r, salvage, last, last_deleted, 0, rle));
-
- /* Write the remnant page. */
- ret = __rec_split_finish(session, r);
-
-err: __wt_scr_free(session, &orig);
- return (ret);
-}
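The loop above reduces to a simple run-length tracking pattern: keep the previous item and a run-length count, extend the run when the next item matches, otherwise flush the run and start a new one. A minimal standalone sketch of that pattern (not WiredTiger code) follows; the item struct and flush_run() are hypothetical stand-ins for WT_ITEM and __rec_col_var_helper.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct item {
	const char *data;
	size_t size;
};

static void
flush_run(const struct item *last, uint64_t rle)
{
	/* Stand-in for writing one value cell with an RLE count. */
	printf("write %zu-byte value, rle=%" PRIu64 "\n", last->size, rle);
}

int
main(void)
{
	static const struct item src[] = {
		{ "aaa", 3 }, { "aaa", 3 }, { "bbb", 3 }, { "aaa", 3 }
	};
	struct item last;
	uint64_t rle;
	size_t i;

	last = src[0];
	rle = 1;
	for (i = 1; i < sizeof(src) / sizeof(src[0]); ++i) {
		if (src[i].size == last.size &&
		    memcmp(src[i].data, last.data, last.size) == 0) {
			++rle;			/* Same item: extend the run. */
			continue;
		}
		flush_run(&last, rle);		/* New item: flush the old run. */
		last = src[i];
		rle = 1;
	}
	flush_run(&last, rle);			/* Flush the final run. */
	return (0);
}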
-
-/*
- * __rec_row_int --
- * Reconcile a row-store internal page.
- */
-static int
-__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
-{
- WT_ADDR *addr;
- WT_BTREE *btree;
- WT_CELL *cell;
- WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
- WT_CHILD_STATE state;
- WT_DECL_RET;
- WT_IKEY *ikey;
- WT_KV *key, *val;
- WT_PAGE *child;
- WT_REF *ref;
- size_t size;
- u_int vtype;
- bool hazard, key_onpage_ovfl, ovfl_key;
- const void *p;
-
- btree = S2BT(session);
- child = NULL;
- hazard = false;
-
- key = &r->k;
- kpack = &_kpack;
- WT_CLEAR(*kpack); /* -Wuninitialized */
- val = &r->v;
- vpack = &_vpack;
- WT_CLEAR(*vpack); /* -Wuninitialized */
-
- ikey = NULL; /* -Wuninitialized */
- cell = NULL;
- key_onpage_ovfl = false;
-
- WT_RET(__rec_split_init(
- session, r, page, 0, btree->maxintlpage_precomp));
-
- /*
- * Ideally, we'd never store the 0th key on row-store internal pages
- * because it's never used during tree search and there's no reason
- * to waste the space. The problem is how we do splits: when we split,
- * we've potentially picked out several "split points" in the buffer
- * which is overflowing the maximum page size, and when the overflow
- * happens, we go back and physically split the buffer, at those split
- * points, into new pages. It would be both difficult and expensive
- * to re-process the 0th key at each split point to be an empty key,
- * so we don't do that. However, we are reconciling an internal page
- * for whatever reason, and the 0th key is known to be useless. We
- * truncate the key to a single byte instead of removing it entirely
- * because that simplifies things in other parts of the code (we don't
- * have to special case transforming the page from its disk image to
- * its in-memory version, for example).
- */
- r->cell_zero = true;
-
- /* For each entry in the in-memory page... */
- WT_INTL_FOREACH_BEGIN(session, page, ref) {
- /*
- * There are different paths if the key is an overflow item vs.
- * a straight-forward on-page value. If an overflow item, we
- * would have instantiated it, and we can use that fact to set
- * things up.
- *
- * Note the cell reference and unpacked key cell are available
- * only in the case of an instantiated, off-page key, we don't
- * bother setting them if that's not possible.
- */
- if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) {
- cell = NULL;
- key_onpage_ovfl = false;
- ikey = __wt_ref_key_instantiated(ref);
- if (ikey != NULL && ikey->cell_offset != 0) {
- cell =
- WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
- __wt_cell_unpack(cell, kpack);
- key_onpage_ovfl = kpack->ovfl &&
- kpack->raw != WT_CELL_KEY_OVFL_RM;
- }
- }
-
- WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
- addr = ref->addr;
- child = ref->page;
-
- switch (state) {
- case WT_CHILD_IGNORE:
- /*
- * Ignored child.
- *
- * Overflow keys referencing pages we're not writing are
- * no longer useful, schedule them for discard. Don't
- * worry about instantiation, internal page keys are
- * always instantiated. Don't worry about reuse,
- * reusing this key in this reconciliation is unlikely.
- */
- if (key_onpage_ovfl)
- WT_ERR(__wt_ovfl_discard_add(
- session, page, kpack->cell));
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
-
- case WT_CHILD_MODIFIED:
- /*
- * Modified child. Empty pages are merged into the
- * parent and discarded.
- */
- switch (child->modify->rec_result) {
- case WT_PM_REC_EMPTY:
- /*
- * Overflow keys referencing empty pages are no
- * longer useful, schedule them for discard.
- * Don't worry about instantiation, internal
- * page keys are always instantiated. Don't
- * worry about reuse, reusing this key in this
- * reconciliation is unlikely.
- */
- if (key_onpage_ovfl)
- WT_ERR(__wt_ovfl_discard_add(
- session, page, kpack->cell));
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
- case WT_PM_REC_MULTIBLOCK:
- /*
- * Overflow keys referencing split pages are no
- * longer useful (the split page's key is the
- * interesting key); schedule them for discard.
- * Don't worry about instantiation, internal
- * page keys are always instantiated. Don't
- * worry about reuse, reusing this key in this
- * reconciliation is unlikely.
- */
- if (key_onpage_ovfl)
- WT_ERR(__wt_ovfl_discard_add(
- session, page, kpack->cell));
-
- WT_ERR(__rec_row_merge(session, r, child));
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
- continue;
- case WT_PM_REC_REPLACE:
- /*
- * If the page is replaced, the page's modify
- * structure has the page's address.
- */
- addr = &child->modify->mod_replace;
- break;
- WT_ILLEGAL_VALUE_ERR(
- session, child->modify->rec_result);
- }
- break;
- case WT_CHILD_ORIGINAL:
- /* Original child. */
- break;
- case WT_CHILD_PROXY:
- /* Deleted child where we write a proxy cell. */
- break;
- }
-
- /*
- * Build the value cell, the child page's address. Addr points
- * to an on-page cell or an off-page WT_ADDR structure. There's
- * a special cell type in the case of page deletion requiring
- * a proxy cell, otherwise use the information from the addr or
- * original cell.
- */
- if (__wt_off_page(page, addr)) {
- p = addr->addr;
- size = addr->size;
- vtype = state == WT_CHILD_PROXY ?
- WT_CELL_ADDR_DEL : __rec_vtype(addr);
- } else {
- __wt_cell_unpack(ref->addr, vpack);
- p = vpack->data;
- size = vpack->size;
- vtype = state == WT_CHILD_PROXY ?
- WT_CELL_ADDR_DEL : (u_int)vpack->raw;
- }
- __rec_cell_build_addr(session, r, p, size, vtype, WT_RECNO_OOB);
- WT_CHILD_RELEASE_ERR(session, hazard, ref);
-
- /*
- * Build key cell.
- * Truncate any 0th key, internal pages don't need 0th keys.
- */
- if (key_onpage_ovfl) {
- key->buf.data = cell;
- key->buf.size = __wt_cell_total_len(kpack);
- key->cell_len = 0;
- key->len = key->buf.size;
- ovfl_key = true;
- } else {
- __wt_ref_key(page, ref, &p, &size);
- WT_ERR(__rec_cell_build_int_key(
- session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
- }
- r->cell_zero = false;
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, key->len + val->len)) {
- /*
- * In one path above, we copied address blocks from the
- * page rather than building the actual key. In that
- * case, we have to build the key now because we are
- * about to promote it.
- */
- if (key_onpage_ovfl) {
- WT_ERR(__wt_buf_set(session, r->cur,
- WT_IKEY_DATA(ikey), ikey->size));
- key_onpage_ovfl = false;
- }
-
- WT_ERR(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
- }
-
- /* Copy the key and value onto the page. */
- __rec_copy_incr(session, r, key);
- __rec_copy_incr(session, r, val);
-
- /* Update compression state. */
- __rec_key_state_update(r, ovfl_key);
- } WT_INTL_FOREACH_END;
-
- /* Write the remnant page. */
- return (__rec_split_finish(session, r));
-
-err: WT_CHILD_RELEASE(session, hazard, ref);
- return (ret);
-}
-
-/*
- * __rec_row_merge --
- * Merge in a split page.
- */
-static int
-__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
-{
- WT_ADDR *addr;
- WT_KV *key, *val;
- WT_MULTI *multi;
- WT_PAGE_MODIFY *mod;
- uint32_t i;
- bool ovfl_key;
-
- mod = page->modify;
-
- key = &r->k;
- val = &r->v;
-
- /* For each entry in the split array... */
- for (multi = mod->mod_multi,
- i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
- /* Build the key and value cells. */
- WT_RET(__rec_cell_build_int_key(session, r,
- WT_IKEY_DATA(multi->key.ikey),
- r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
- r->cell_zero = false;
-
- addr = &multi->addr;
- __rec_cell_build_addr(session, r,
- addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, key->len + val->len))
- WT_RET(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
-
- /* Copy the key and value onto the page. */
- __rec_copy_incr(session, r, key);
- __rec_copy_incr(session, r, val);
-
- /* Update compression state. */
- __rec_key_state_update(r, ovfl_key);
- }
- return (0);
-}
-
-/*
- * __rec_row_leaf --
- * Reconcile a row-store leaf page.
- */
-static int
-__rec_row_leaf(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
-{
- WT_BTREE *btree;
- WT_CELL *cell;
- WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
- WT_CURSOR_BTREE *cbt;
- WT_DECL_ITEM(tmpkey);
- WT_DECL_ITEM(tmpval);
- WT_DECL_RET;
- WT_IKEY *ikey;
- WT_INSERT *ins;
- WT_KV *key, *val;
- WT_ROW *rip;
- WT_UPDATE *upd;
- size_t size;
- uint64_t slvg_skip;
- uint32_t i;
- bool dictionary, key_onpage_ovfl, ovfl_key;
- void *copy;
- const void *p;
-
- btree = S2BT(session);
- cbt = &r->update_modify_cbt;
- slvg_skip = salvage == NULL ? 0 : salvage->skip;
-
- key = &r->k;
- val = &r->v;
- vpack = &_vpack;
-
- WT_RET(__rec_split_init(
- session, r, page, 0, btree->maxleafpage_precomp));
-
- /*
- * Write any K/V pairs inserted into the page before the first from-disk
- * key on the page.
- */
- if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
- WT_RET(__rec_row_leaf_insert(session, r, ins));
-
- /*
- * Temporary buffers in which to instantiate any uninstantiated keys
- * or value items we need.
- */
- WT_ERR(__wt_scr_alloc(session, 0, &tmpkey));
- WT_ERR(__wt_scr_alloc(session, 0, &tmpval));
-
- /* For each entry in the page... */
- WT_ROW_FOREACH(page, rip, i) {
- /*
- * The salvage code, on some rare occasions, wants to reconcile
- * a page but skip some leading records on the page. Because
- * the row-store leaf reconciliation function copies keys from
- * the original disk page, this is non-trivial -- just changing
- * the in-memory pointers isn't sufficient, we have to change
- * the WT_CELL structures on the disk page, too. It's ugly, but
- * we pass in a value that tells us how many records to skip in
- * this case.
- */
- if (slvg_skip != 0) {
- --slvg_skip;
- continue;
- }
-
- /*
- * Figure out the key: set any cell reference (and unpack it),
- * set any instantiated key reference.
- */
- copy = WT_ROW_KEY_COPY(rip);
- (void)__wt_row_leaf_key_info(
- page, copy, &ikey, &cell, NULL, NULL);
- if (cell == NULL)
- kpack = NULL;
- else {
- kpack = &_kpack;
- __wt_cell_unpack(cell, kpack);
- }
-
- /* Unpack the on-page value cell, and look for an update. */
- __wt_row_leaf_value_cell(page, rip, NULL, vpack);
- WT_ERR(__rec_txn_read(
- session, r, NULL, rip, vpack, NULL, &upd));
-
- /* Build value cell. */
- dictionary = false;
- if (upd == NULL) {
- /*
- * When the page was read into memory, there may not
- * have been a value item.
- *
- * If there was a value item, check if it's a dictionary
- * cell (a copy of another item on the page). If it's a
- * copy, we have to create a new value item as the old
- * item might have been discarded from the page.
- */
- if (vpack->raw == WT_CELL_VALUE_COPY) {
- /* If the item is Huffman encoded, decode it. */
- if (btree->huffman_value == NULL) {
- p = vpack->data;
- size = vpack->size;
- } else {
- WT_ERR(__wt_huffman_decode(session,
- btree->huffman_value,
- vpack->data, vpack->size,
- tmpval));
- p = tmpval->data;
- size = tmpval->size;
- }
- WT_ERR(__rec_cell_build_val(
- session, r, p, size, (uint64_t)0));
- dictionary = true;
- } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
- /*
- * If doing an update save and restore, and the
- * underlying value is a removed overflow value,
- * we end up here.
- *
- * If necessary, when the overflow value was
- * originally removed, reconciliation appended
- * a globally visible copy of the value to the
- * key's update list, meaning the on-page item
- * isn't accessed after page re-instantiation.
- *
- * Assert the case.
- */
- WT_ASSERT(session,
- F_ISSET(r, WT_REC_UPDATE_RESTORE));
-
- /*
- * If the key is also a removed overflow item,
- * don't write anything at all.
- *
- * We don't have to write anything because the
- * code re-instantiating the page gets the key
- * to match the saved list of updates from the
- * original page. By not putting the key on
- * the page, we'll move the key/value set from
- * a row-store leaf page slot to an insert list,
- * but that shouldn't matter.
- *
- * The reason we bother with the test is because
- * overflows are expensive to write. It's hard
- * to imagine a real workload where this test is
- * worth the effort, but it's a simple test.
- */
- if (kpack != NULL &&
- kpack->raw == WT_CELL_KEY_OVFL_RM)
- goto leaf_insert;
-
- /*
- * The on-page value will never be accessed,
- * write a placeholder record.
- */
- WT_ERR(__rec_cell_build_val(session, r,
- "ovfl-unused", strlen("ovfl-unused"),
- (uint64_t)0));
- } else {
- val->buf.data = vpack->cell;
- val->buf.size = __wt_cell_total_len(vpack);
- val->cell_len = 0;
- val->len = val->buf.size;
-
- /* Track if page has overflow items. */
- if (vpack->ovfl)
- r->ovfl_items = true;
- }
- } else {
- /*
- * The first time we find an overflow record we're not
- * going to use, discard the underlying blocks.
- */
- if (vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(session,
- page, vpack, F_ISSET(r, WT_REC_EVICT)));
-
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- cbt->slot = WT_ROW_SLOT(page, rip);
- WT_ERR(__wt_value_return_upd(session, cbt, upd,
- F_ISSET(r, WT_REC_VISIBLE_ALL)));
- WT_ERR(__rec_cell_build_val(session, r,
- cbt->iface.value.data,
- cbt->iface.value.size, (uint64_t)0));
- dictionary = true;
- break;
- case WT_UPDATE_STANDARD:
- /*
- * If no value, nothing needs to be copied.
- * Otherwise, build the value's chunk from the
- * update value.
- */
- if (upd->size == 0) {
- val->buf.data = NULL;
- val->cell_len =
- val->len = val->buf.size = 0;
- } else {
- WT_ERR(__rec_cell_build_val(session, r,
- upd->data, upd->size,
- (uint64_t)0));
- dictionary = true;
- }
- break;
- case WT_UPDATE_TOMBSTONE:
- /*
- * If this key/value pair was deleted, we're
- * done.
- *
- * Overflow keys referencing discarded values
- * are no longer useful, discard the backing
- * blocks. Don't worry about reuse, reusing
- * keys from a row-store page reconciliation
- * seems unlikely enough to ignore.
- */
- if (kpack != NULL && kpack->ovfl &&
- kpack->raw != WT_CELL_KEY_OVFL_RM) {
- /*
- * Keys are part of the name-space, we
- * can't remove them from the in-memory
- * tree; if an overflow key was deleted
- * without being instantiated (for
- * example, cursor-based truncation), do
- * it now.
- */
- if (ikey == NULL)
- WT_ERR(__wt_row_leaf_key(
- session,
- page, rip, tmpkey, true));
-
- WT_ERR(__wt_ovfl_discard_add(
- session, page, kpack->cell));
- }
-
- /*
- * We aren't actually creating the key so we
- * can't use bytes from this key to provide
- * prefix information for a subsequent key.
- */
- tmpkey->size = 0;
-
- /* Proceed with appended key/value pairs. */
- goto leaf_insert;
- WT_ILLEGAL_VALUE_ERR(session, upd->type);
- }
- }
-
- /*
- * Build key cell.
- *
- * If the key is an overflow key that hasn't been removed, use
- * the original backing blocks.
- */
- key_onpage_ovfl = kpack != NULL &&
- kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
- if (key_onpage_ovfl) {
- key->buf.data = cell;
- key->buf.size = __wt_cell_total_len(kpack);
- key->cell_len = 0;
- key->len = key->buf.size;
- ovfl_key = true;
-
- /*
- * We aren't creating a key so we can't use this key as
- * a prefix for a subsequent key.
- */
- tmpkey->size = 0;
-
- /* Track if page has overflow items. */
- r->ovfl_items = true;
- } else {
- /*
- * Get the key from the page or an instantiated key, or
- * inline building the key from a previous key (it's a
- * fast path for simple, prefix-compressed keys), or by
- * building the key from scratch.
- */
- if (__wt_row_leaf_key_info(page, copy,
- NULL, &cell, &tmpkey->data, &tmpkey->size))
- goto build;
-
- kpack = &_kpack;
- __wt_cell_unpack(cell, kpack);
- if (btree->huffman_key == NULL &&
- kpack->type == WT_CELL_KEY &&
- tmpkey->size >= kpack->prefix) {
- /*
- * The previous clause checked for a prefix of
- * zero, which means the temporary buffer must
- * have a non-zero size, and it references a
- * valid key.
- */
- WT_ASSERT(session, tmpkey->size != 0);
-
- /*
- * Grow the buffer as necessary, ensuring data
- * has been copied into local buffer space,
- * then append the suffix to the prefix already
- * in the buffer.
- *
- * Don't grow the buffer unnecessarily or copy
- * data we don't need, truncate the item's data
- * length to the prefix bytes.
- */
- tmpkey->size = kpack->prefix;
- WT_ERR(__wt_buf_grow(session,
- tmpkey, tmpkey->size + kpack->size));
- memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
- kpack->data, kpack->size);
- tmpkey->size += kpack->size;
- } else
- WT_ERR(__wt_row_leaf_key_copy(
- session, page, rip, tmpkey));
-build:
- WT_ERR(__rec_cell_build_leaf_key(session, r,
- tmpkey->data, tmpkey->size, &ovfl_key));
- }
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, key->len + val->len)) {
- /*
- * If we copied address blocks from the page rather than
- * building the actual key, we have to build the key now
- * because we are about to promote it.
- */
- if (key_onpage_ovfl) {
- WT_ERR(__wt_dsk_cell_data_ref(session,
- WT_PAGE_ROW_LEAF, kpack, r->cur));
- WT_NOT_READ(key_onpage_ovfl, false);
- }
-
- /*
- * Turn off prefix compression until a full key is written
- * to the new page, and (unless already working with an
- * overflow key), rebuild the key without compression.
- */
- if (r->key_pfx_compress_conf) {
- r->key_pfx_compress = false;
- if (!ovfl_key)
- WT_ERR(__rec_cell_build_leaf_key(
- session, r, NULL, 0, &ovfl_key));
- }
-
- WT_ERR(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
- }
-
- /* Copy the key/value pair onto the page. */
- __rec_copy_incr(session, r, key);
- if (val->len == 0)
- r->any_empty_value = true;
- else {
- r->all_empty_value = false;
- if (dictionary && btree->dictionary)
- WT_ERR(__rec_dict_replace(session, r, 0, val));
- __rec_copy_incr(session, r, val);
- }
-
- /* Update compression state. */
- __rec_key_state_update(r, ovfl_key);
-
-leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
- if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
- WT_ERR(__rec_row_leaf_insert(session, r, ins));
- }
-
- /* Write the remnant page. */
- ret = __rec_split_finish(session, r);
-
-err: __wt_scr_free(session, &tmpkey);
- __wt_scr_free(session, &tmpval);
- return (ret);
-}
-
-/*
- * __rec_row_leaf_insert --
- * Walk an insert chain, writing K/V pairs.
- */
-static int
-__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
-{
- WT_BTREE *btree;
- WT_CURSOR_BTREE *cbt;
- WT_KV *key, *val;
- WT_UPDATE *upd;
- bool ovfl_key, upd_saved;
-
- btree = S2BT(session);
- cbt = &r->update_modify_cbt;
-
- key = &r->k;
- val = &r->v;
-
- for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
- WT_RET(__rec_txn_read(
- session, r, ins, NULL, NULL, &upd_saved, &upd));
-
- if (upd == NULL) {
- /*
- * If no update is visible but some were saved, check
- * for splits.
- */
- if (!upd_saved)
- continue;
- if (!__rec_need_split(r, WT_INSERT_KEY_SIZE(ins)))
- continue;
-
- /* Copy the current key into place and then split. */
- WT_RET(__wt_buf_set(session, r->cur,
- WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
- WT_RET(__rec_split_crossing_bnd(
- session, r, WT_INSERT_KEY_SIZE(ins)));
-
- /*
- * Turn off prefix and suffix compression until a full
- * key is written into the new page.
- */
- r->key_pfx_compress = r->key_sfx_compress = false;
- continue;
- }
-
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- /*
- * Impossible slot, there's no backing on-page
- * item.
- */
- cbt->slot = UINT32_MAX;
- WT_RET(__wt_value_return_upd(
- session, cbt, upd, F_ISSET(r, WT_REC_VISIBLE_ALL)));
- WT_RET(__rec_cell_build_val(session, r,
- cbt->iface.value.data,
- cbt->iface.value.size, (uint64_t)0));
- break;
- case WT_UPDATE_STANDARD:
- if (upd->size == 0)
- val->len = 0;
- else
- WT_RET(__rec_cell_build_val(session,
- r, upd->data, upd->size,
- (uint64_t)0));
- break;
- case WT_UPDATE_TOMBSTONE:
- continue;
- WT_ILLEGAL_VALUE(session, upd->type);
- }
-
- /* Build key cell. */
- WT_RET(__rec_cell_build_leaf_key(session, r,
- WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
-
- /* Boundary: split or write the page. */
- if (__rec_need_split(r, key->len + val->len)) {
- /*
- * Turn off prefix compression until a full key is written
- * to the new page, and (unless already working with an
- * overflow key), rebuild the key without compression.
- */
- if (r->key_pfx_compress_conf) {
- r->key_pfx_compress = false;
- if (!ovfl_key)
- WT_RET(__rec_cell_build_leaf_key(
- session, r, NULL, 0, &ovfl_key));
- }
-
- WT_RET(__rec_split_crossing_bnd(
- session, r, key->len + val->len));
- }
-
- /* Copy the key/value pair onto the page. */
- __rec_copy_incr(session, r, key);
- if (val->len == 0)
- r->any_empty_value = true;
- else {
- r->all_empty_value = false;
- if (btree->dictionary)
- WT_RET(__rec_dict_replace(session, r, 0, val));
- __rec_copy_incr(session, r, val);
- }
-
- /* Update compression state. */
- __rec_key_state_update(r, ovfl_key);
- }
-
- return (0);
-}
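Both leaf paths above reset prefix compression whenever a split boundary is crossed. The reason is that the first key on a new page must be written in full: there is no previous key on that page to supply the compressed-out prefix. A minimal sketch (not WiredTiger code) of that rule; need_split(), write_pair() and the page sizes are hypothetical.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rec_state {
	size_t space_avail;	/* Bytes left before the page must split. */
	bool key_pfx_compress;	/* Prefix-compress the next key? */
};

static bool
need_split(const struct rec_state *r, size_t len)
{
	return (len > r->space_avail);
}

static void
write_pair(struct rec_state *r, size_t len)
{
	if (need_split(r, len)) {
		printf("split: new page, first key written in full\n");
		r->space_avail = 4096;		/* Fresh page. */
		r->key_pfx_compress = false;	/* No previous key to reference. */
	}
	printf("write %zu bytes (prefix compression %s)\n",
	    len, r->key_pfx_compress ? "on" : "off");
	r->space_avail -= len;
	r->key_pfx_compress = true;	/* Later keys may compress again. */
}

int
main(void)
{
	struct rec_state r = { 100, true };

	write_pair(&r, 60);
	write_pair(&r, 60);	/* Crosses the boundary, forces a full key. */
	return (0);
}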
-
-/*
* __rec_split_discard --
* Discard the pages resulting from a previous split.
*/
@@ -5649,232 +2517,12 @@ __rec_las_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * __rec_cell_build_int_key --
- * Process a key and return a WT_CELL structure and byte string to be
- * stored on a row-store internal page.
- */
-static int
-__rec_cell_build_int_key(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
-{
- WT_BTREE *btree;
- WT_KV *key;
-
- *is_ovflp = false;
-
- btree = S2BT(session);
-
- key = &r->k;
-
- /* Copy the bytes into the "current" and key buffers. */
- WT_RET(__wt_buf_set(session, r->cur, data, size));
- WT_RET(__wt_buf_set(session, &key->buf, data, size));
-
- /* Create an overflow object if the data won't fit. */
- if (size > btree->maxintlkey) {
- WT_STAT_DATA_INCR(session, rec_overflow_key_internal);
-
- *is_ovflp = true;
- return (__rec_cell_build_ovfl(
- session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
- }
-
- key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
- key->len = key->cell_len + key->buf.size;
-
- return (0);
-}
-
-/*
- * __rec_cell_build_leaf_key --
- * Process a key and return a WT_CELL structure and byte string to be
- * stored on a row-store leaf page.
- */
-static int
-__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, const void *data, size_t size, bool *is_ovflp)
-{
- WT_BTREE *btree;
- WT_KV *key;
- size_t pfx_max;
- const uint8_t *a, *b;
- uint8_t pfx;
-
- *is_ovflp = false;
-
- btree = S2BT(session);
-
- key = &r->k;
-
- pfx = 0;
- if (data == NULL)
- /*
- * When data is NULL, our caller has a prefix compressed key
- * they can't use (probably because they just crossed a split
- * point). Use the full key saved when last called, instead.
- */
- WT_RET(__wt_buf_set(
- session, &key->buf, r->cur->data, r->cur->size));
- else {
- /*
- * Save a copy of the key for later reference: we use the full
- * key for prefix-compression comparisons, and if we are, for
- * any reason, unable to use the compressed key we generate.
- */
- WT_RET(__wt_buf_set(session, r->cur, data, size));
-
- /*
- * Do prefix compression on the key. We know by definition the
- * previous key sorts before the current key, which means the
- * keys must differ and we just need to compare up to the
- * shorter of the two keys.
- */
- if (r->key_pfx_compress) {
- /*
- * We can't compress out more than 256 bytes, limit the
- * comparison to that.
- */
- pfx_max = UINT8_MAX;
- if (size < pfx_max)
- pfx_max = size;
- if (r->last->size < pfx_max)
- pfx_max = r->last->size;
- for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
- if (*a++ != *b++)
- break;
-
- /*
- * Prefix compression may cost us CPU and memory when
- * the page is re-loaded, don't do it unless there's
- * reasonable gain.
- */
- if (pfx < btree->prefix_compression_min)
- pfx = 0;
- else
- WT_STAT_DATA_INCRV(
- session, rec_prefix_compression, pfx);
- }
-
- /* Copy the non-prefix bytes into the key buffer. */
- WT_RET(__wt_buf_set(
- session, &key->buf, (uint8_t *)data + pfx, size - pfx));
- }
-
- /* Optionally compress the key using the Huffman engine. */
- if (btree->huffman_key != NULL)
- WT_RET(__wt_huffman_encode(session, btree->huffman_key,
- key->buf.data, (uint32_t)key->buf.size, &key->buf));
-
- /* Create an overflow object if the data won't fit. */
- if (key->buf.size > btree->maxleafkey) {
- /*
- * Overflow objects aren't prefix compressed -- rebuild any
- * object that was prefix compressed.
- */
- if (pfx == 0) {
- WT_STAT_DATA_INCR(session, rec_overflow_key_leaf);
-
- *is_ovflp = true;
- return (__rec_cell_build_ovfl(
- session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
- }
- return (
- __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
- }
-
- key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
- key->len = key->cell_len + key->buf.size;
-
- return (0);
-}
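A minimal sketch (not WiredTiger code) of the prefix calculation above: count the bytes shared with the previous key, cap the count so it fits in a single byte, and drop prefixes too short to be worth the rebuild cost when the page is read. shared_prefix() and prefix_compression_min are illustrative stand-ins for the logic and the btree configuration value.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t
shared_prefix(const uint8_t *prev, size_t prev_size,
    const uint8_t *cur, size_t cur_size, uint8_t prefix_compression_min)
{
	size_t pfx_max;
	uint8_t pfx;

	/* Compare no more than the shorter key, and no more than 255 bytes. */
	pfx_max = UINT8_MAX;
	if (prev_size < pfx_max)
		pfx_max = prev_size;
	if (cur_size < pfx_max)
		pfx_max = cur_size;
	for (pfx = 0; pfx < pfx_max; ++pfx)
		if (prev[pfx] != cur[pfx])
			break;

	/* A tiny prefix isn't worth the rebuild cost, discard it. */
	return (pfx < prefix_compression_min ? 0 : pfx);
}

int
main(void)
{
	const uint8_t a[] = "customer/0001";
	const uint8_t b[] = "customer/0002";

	printf("prefix bytes: %u\n",
	    (unsigned)shared_prefix(a, sizeof(a) - 1, b, sizeof(b) - 1, 4));
	return (0);
}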
-
-/*
- * __rec_cell_build_addr --
- * Process an address reference and return a cell structure to be stored
- * on the page.
- */
-static void
-__rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- const void *addr, size_t size, u_int cell_type, uint64_t recno)
-{
- WT_KV *val;
-
- val = &r->v;
-
- WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL);
-
- /*
- * We don't check the address size because we can't store an address on
- * an overflow page: if the address won't fit, the overflow page's
- * address won't fit either. This possibility must be handled by Btree
- * configuration, we have to disallow internal page sizes that are too
- * small with respect to the largest address cookie the underlying block
- * manager might return.
- */
-
- /*
- * We don't copy the data into the buffer, it's not necessary; just
- * re-point the buffer's data/length fields.
- */
- val->buf.data = addr;
- val->buf.size = size;
- val->cell_len =
- __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
- val->len = val->cell_len + val->buf.size;
-}
-
-/*
- * __rec_cell_build_val --
- * Process a data item and return a WT_CELL structure and byte string to
- * be stored on the page.
- */
-static int
-__rec_cell_build_val(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
-{
- WT_BTREE *btree;
- WT_KV *val;
-
- btree = S2BT(session);
-
- val = &r->v;
-
- /*
- * We don't copy the data into the buffer, it's not necessary; just
- * re-point the buffer's data/length fields.
- */
- val->buf.data = data;
- val->buf.size = size;
-
- /* Handle zero-length cells quickly. */
- if (size != 0) {
- /* Optionally compress the data using the Huffman engine. */
- if (btree->huffman_value != NULL)
- WT_RET(__wt_huffman_encode(
- session, btree->huffman_value,
- val->buf.data, (uint32_t)val->buf.size, &val->buf));
-
- /* Create an overflow object if the data won't fit. */
- if (val->buf.size > btree->maxleafvalue) {
- WT_STAT_DATA_INCR(session, rec_overflow_value);
-
- return (__rec_cell_build_ovfl(
- session, r, val, WT_CELL_VALUE_OVFL, rle));
- }
- }
- val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
- val->len = val->cell_len + val->buf.size;
-
- return (0);
-}
-
-/*
- * __rec_cell_build_ovfl --
+ * __wt_rec_cell_build_ovfl --
* Store overflow items in the file, returning the address cookie.
*/
-static int
-__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
+int
+__wt_rec_cell_build_ovfl(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REC_KV *kv, uint8_t type, uint64_t rle)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -5939,194 +2587,3 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session,
err: __wt_scr_free(session, &tmp);
return (ret);
}
-
-/*
- * __rec_dictionary_skip_search --
- * Search a dictionary skiplist.
- */
-static WT_DICTIONARY *
-__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
-{
- WT_DICTIONARY **e;
- int i;
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
- if (*e == NULL) { /* Empty levels */
- --i;
- --e;
- continue;
- }
-
- /*
- * Return any exact matches: we don't care in what search level
- * we found a match.
- */
- if ((*e)->hash == hash) /* Exact match */
- return (*e);
- if ((*e)->hash > hash) { /* Drop down a level */
- --i;
- --e;
- } else /* Keep going at this level */
- e = &(*e)->next[i];
- }
- return (NULL);
-}
-
-/*
- * __rec_dictionary_skip_search_stack --
- * Search a dictionary skiplist, returning an insert/remove stack.
- */
-static void
-__rec_dictionary_skip_search_stack(
- WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
-{
- WT_DICTIONARY **e;
- int i;
-
- /*
- * Start at the highest skip level, then go as far as possible at each
- * level before stepping down to the next.
- */
- for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
- if (*e == NULL || (*e)->hash > hash)
- stack[i--] = e--; /* Drop down a level */
- else
- e = &(*e)->next[i]; /* Keep going at this level */
-}
-
-/*
- * __rec_dictionary_skip_insert --
- * Insert an entry into the dictionary skip-list.
- */
-static void
-__rec_dictionary_skip_insert(
- WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
-{
- WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
- u_int i;
-
- /* Insert the new entry into the skiplist. */
- __rec_dictionary_skip_search_stack(head, stack, hash);
- for (i = 0; i < e->depth; ++i) {
- e->next[i] = *stack[i];
- *stack[i] = e;
- }
-}
-
-/*
- * __rec_dictionary_init --
- * Allocate and initialize the dictionary.
- */
-static int
-__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
-{
- u_int depth, i;
-
- /* Free any previous dictionary. */
- __rec_dictionary_free(session, r);
-
- r->dictionary_slots = slots;
- WT_RET(__wt_calloc(session,
- r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
- for (i = 0; i < r->dictionary_slots; ++i) {
- depth = __wt_skip_choose_depth(session);
- WT_RET(__wt_calloc(session, 1,
- sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
- &r->dictionary[i]));
- r->dictionary[i]->depth = depth;
- }
- return (0);
-}
-
-/*
- * __rec_dictionary_free --
- * Free the dictionary.
- */
-static void
-__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
-{
- u_int i;
-
- if (r->dictionary == NULL)
- return;
-
- /*
- * We don't correct dictionary_slots when we fail during allocation,
- * but that's OK, the value is either NULL or a memory reference to
- * be free'd.
- */
- for (i = 0; i < r->dictionary_slots; ++i)
- __wt_free(session, r->dictionary[i]);
- __wt_free(session, r->dictionary);
-}
-
-/*
- * __rec_dictionary_reset --
- * Reset the dictionary when reconciliation restarts and when crossing a
- * page boundary (a potential split).
- */
-static void
-__rec_dictionary_reset(WT_RECONCILE *r)
-{
- if (r->dictionary_slots) {
- r->dictionary_next = 0;
- memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
- }
-}
-
-/*
- * __rec_dictionary_lookup --
- * Check the dictionary for a matching value on this page.
- */
-static int
-__rec_dictionary_lookup(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
-{
- WT_DICTIONARY *dp, *next;
- uint64_t hash;
- bool match;
-
- *dpp = NULL;
-
- /* Search the dictionary, and return any match we find. */
- hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
- for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
- dp != NULL && dp->hash == hash; dp = dp->next[0]) {
- WT_RET(__wt_cell_pack_data_match(
- (WT_CELL *)((uint8_t *)r->cur_ptr->image.mem + dp->offset),
- &val->cell, val->buf.data, &match));
- if (match) {
- WT_STAT_DATA_INCR(session, rec_dictionary);
- *dpp = dp;
- return (0);
- }
- }
-
- /*
- * We're not doing value replacement in the dictionary. We stop adding
- * new entries if we run out of empty dictionary slots (but continue to
- * use the existing entries). I can't think of any reason a leaf page
- * value is more likely to be seen because it was seen more recently
- * than some other value: if we find working sets where that's not the
- * case, it shouldn't be too difficult to maintain a pointer which is
- * the next dictionary slot to re-use.
- */
- if (r->dictionary_next >= r->dictionary_slots)
- return (0);
-
- /*
- * Set the hash value, we'll add this entry into the dictionary when we
- * write it into the page's disk image buffer (because that's when we
- * know where on the page it will be written).
- */
- next = r->dictionary[r->dictionary_next++];
- next->offset = 0; /* Not necessary, just cautious. */
- next->hash = hash;
- __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
- *dpp = next;
- return (0);
-}
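A minimal sketch (not WiredTiger code) of the dictionary idea implemented above: hash each value written to the page, remember the page offset of its first occurrence, and emit a short copy cell for later duplicates instead of writing the value again. The fixed-size table and modulo slotting are simplifications of the skiplist, and unlike the real code this sketch trusts the hash instead of also byte-comparing the cells.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SLOTS	64

struct dict_entry {
	uint64_t hash;
	size_t offset;		/* Page offset of the first copy of the value. */
	int used;
};

static uint64_t
hash_bytes(const void *data, size_t size)
{
	const uint8_t *p;
	uint64_t h;

	/* FNV-1a, the same family of hash as the real code. */
	h = 14695981039346656037ULL;
	for (p = data; size > 0; ++p, --size)
		h = (h ^ *p) * 1099511628211ULL;
	return (h);
}

int
main(void)
{
	struct dict_entry dict[SLOTS];
	const char *values[] = { "red", "green", "red", "blue", "green" };
	size_t i, page_offset, slot;
	uint64_t h;

	memset(dict, 0, sizeof(dict));
	page_offset = 0;
	for (i = 0; i < sizeof(values) / sizeof(values[0]); ++i) {
		h = hash_bytes(values[i], strlen(values[i]));
		slot = (size_t)(h % SLOTS);
		if (dict[slot].used && dict[slot].hash == h)
			/* Duplicate: write a short copy cell instead. */
			printf("%s -> copy cell, offset %zu\n",
			    values[i], dict[slot].offset);
		else {
			/* First occurrence: write the value, remember it. */
			printf("%s -> value cell at offset %zu\n",
			    values[i], page_offset);
			dict[slot].used = 1;
			dict[slot].hash = h;
			dict[slot].offset = page_offset;
			page_offset += strlen(values[i]);
		}
	}
	return (0);
}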
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 534d598b3f3..8160ef92bbd 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -781,6 +781,8 @@ static const char * const __stats_connection_desc[] = {
"cache: cache overflow score",
"cache: cache overflow table entries",
"cache: cache overflow table insert calls",
+ "cache: cache overflow table max on-disk size",
+ "cache: cache overflow table on-disk size",
"cache: cache overflow table remove calls",
"cache: checkpoint blocked page eviction",
"cache: eviction calls to get a page",
@@ -1204,6 +1206,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing cache_lookaside_score */
/* not clearing cache_lookaside_entries */
stats->cache_lookaside_insert = 0;
+ /* not clearing cache_lookaside_ondisk_max */
+ /* not clearing cache_lookaside_ondisk */
stats->cache_lookaside_remove = 0;
stats->cache_eviction_checkpoint = 0;
stats->cache_eviction_get_ref = 0;
@@ -1616,6 +1620,10 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, cache_lookaside_entries);
to->cache_lookaside_insert +=
WT_STAT_READ(from, cache_lookaside_insert);
+ to->cache_lookaside_ondisk_max +=
+ WT_STAT_READ(from, cache_lookaside_ondisk_max);
+ to->cache_lookaside_ondisk +=
+ WT_STAT_READ(from, cache_lookaside_ondisk);
to->cache_lookaside_remove +=
WT_STAT_READ(from, cache_lookaside_remove);
to->cache_eviction_checkpoint +=
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 81bf2bdea4f..b21ccd355ce 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -485,7 +485,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
if (cval.val)
F_SET(txn, WT_TXN_IGNORE_PREPARE);
- WT_RET(__wt_txn_parse_read_timestamp(session, cfg));
+ WT_RET(__wt_txn_parse_read_timestamp(session, cfg, NULL));
return (0);
}
@@ -933,8 +933,12 @@ __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
- /* Transaction should not have updated any of the logged tables. */
- WT_ASSERT(session, txn->logrec == NULL);
+ /*
+ * A transaction should not have updated any of the logged tables
+ * if debug mode logging is not turned on.
+ */
+ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ WT_ASSERT(session, txn->logrec == NULL);
WT_RET(__wt_txn_context_check(session, true));
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index f55715eb91b..1b5beff581f 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -38,6 +38,7 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
if (cbt->ins == NULL) {
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
+ WT_ASSERT(session, cbt->slot < page->entries);
rip = &page->pg_row[cbt->slot];
WT_ASSERT(session,
__wt_row_leaf_key(session, page, rip, &key, false) == 0);
@@ -59,19 +60,15 @@ __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
* Log an operation for the current transaction.
*/
static int
-__txn_op_log(WT_SESSION_IMPL *session,
- WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
+__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid)
{
WT_CURSOR *cursor;
WT_ITEM value;
WT_UPDATE *upd;
uint64_t recno;
- uint32_t fileid;
cursor = &cbt->iface;
-
- fileid = op->btree->id;
-
upd = op->u.op_upd;
value.data = upd->data;
value.size = upd->size;
@@ -210,7 +207,16 @@ __txn_logrec_init(WT_SESSION_IMPL *session)
if (txn->logrec != NULL)
return (0);
- WT_ASSERT(session, txn->id != WT_TXN_NONE);
+ /*
+ * The only way we should ever get in here without a txn id is if we
+ * are recording diagnostic information. In that case, allocate an id.
+ */
+ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) &&
+ txn->id == WT_TXN_NONE)
+ WT_RET(__wt_txn_id_check(session));
+ else
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+
WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
@@ -233,6 +239,7 @@ err: __wt_logrec_free(session, &logrec);
int
__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_ITEM *logrec;
WT_TXN *txn;
@@ -240,11 +247,13 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
uint32_t fileid;
+ conn = S2C(session);
txn = &session->txn;
- if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
F_ISSET(session, WT_SESSION_NO_LOGGING) ||
- F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING))
+ (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)))
return (0);
/* We'd better have a transaction. */
@@ -255,6 +264,14 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
op = txn->mod + txn->mod_count - 1;
fileid = op->btree->id;
+ /*
+ * If this operation is diagnostic only, set the ignore bit on the
+ * fileid so that recovery can skip it.
+ */
+ if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ FLD_SET(fileid, WT_LOGOP_IGNORE);
+
WT_RET(__txn_logrec_init(session));
logrec = txn->logrec;
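A minimal sketch (assuming, not asserting, the shape of the real macros) of how an ignore flag folded into the 32-bit file id lets recovery skip debug-only operations; the bit value and macro names below are hypothetical stand-ins for WT_LOGOP_IGNORE and WT_LOGOP_IS_IGNORED.

#include <stdint.h>
#include <stdio.h>

#define LOGOP_IGNORE		0x80000000u	/* Hypothetical high bit. */
#define LOGOP_IS_IGNORED(id)	(((id) & LOGOP_IGNORE) != 0)
#define LOGOP_ID(id)		((id) & ~LOGOP_IGNORE)

int
main(void)
{
	uint32_t fileid;

	fileid = 7;		/* Ordinary table: recovery replays it. */
	printf("fileid %u ignored %d\n",
	    LOGOP_ID(fileid), LOGOP_IS_IGNORED(fileid));

	fileid |= LOGOP_IGNORE;	/* Debug-only record: recovery skips it. */
	printf("fileid %u ignored %d\n",
	    LOGOP_ID(fileid), LOGOP_IS_IGNORED(fileid));
	return (0);
}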
@@ -267,7 +284,7 @@ __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
break;
case WT_TXN_OP_BASIC_COL:
case WT_TXN_OP_BASIC_ROW:
- ret = __txn_op_log(session, logrec, op, cbt);
+ ret = __txn_op_log(session, logrec, op, cbt, fileid);
break;
case WT_TXN_OP_TRUNCATE_COL:
ret = __wt_logop_col_truncate_pack(session, logrec, fileid,
@@ -366,6 +383,47 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session,
}
/*
+ * __wt_txn_ts_log --
+ * Write a log record recording timestamps in the transaction.
+ */
+int
+__wt_txn_ts_log(WT_SESSION_IMPL *session)
+{
+ struct timespec t;
+ WT_CONNECTION_IMPL *conn;
+ WT_ITEM *logrec;
+ WT_TXN *txn;
+ wt_timestamp_t commit, durable, first, prepare, read;
+
+ conn = S2C(session);
+ txn = &session->txn;
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
+ F_ISSET(session, WT_SESSION_NO_LOGGING) ||
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ return (0);
+
+ /* We'd better have a transaction running. */
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
+
+ WT_RET(__txn_logrec_init(session));
+ logrec = txn->logrec;
+ commit = durable = first = prepare = read = WT_TS_NONE;
+ if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
+ commit = txn->commit_timestamp;
+ first = txn->first_commit_timestamp;
+ }
+ prepare = txn->prepare_timestamp;
+ if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ read = txn->read_timestamp;
+
+ __wt_epoch(session, &t);
+ return (__wt_logop_txn_timestamp_pack(session, logrec,
+ (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec,
+ commit, durable, first, prepare, read));
+}
+
+/*
* __wt_txn_checkpoint_log --
* Write a log record for a checkpoint operation.
*/
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 702196d17ee..b0960deb9c3 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -51,6 +51,11 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
c = NULL;
/*
+ * File ids with the bit set to ignore this operation are skipped.
+ */
+ if (WT_LOGOP_IS_IGNORED(id))
+ return (0);
+ /*
* Metadata operations have an id of 0. Match operations based
* on the id and the current pass of recovery for metadata.
*
@@ -115,7 +120,8 @@ __txn_op_apply(
WT_DECL_RET;
WT_ITEM key, start_key, stop_key, value;
WT_SESSION_IMPL *session;
- uint64_t recno, start_recno, stop_recno;
+ wt_timestamp_t commit, durable, first, prepare, read;
+ uint64_t recno, start_recno, stop_recno, t_nsec, t_sec;
uint32_t fileid, mode, optype, opsize;
session = r->session;
@@ -125,6 +131,16 @@ __txn_op_apply(
WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
end = *pp + opsize;
+ /*
+ * If it is an operation type that should be ignored, we're done.
+ * Note that file ids within known operations also use the same
+ * macros to indicate that the operation should be ignored.
+ */
+ if (WT_LOGOP_IS_IGNORED(optype)) {
+ *pp += opsize;
+ goto done;
+ }
+
switch (optype) {
case WT_LOGOP_COL_MODIFY:
WT_ERR(__wt_logop_col_modify_unpack(session, pp, end,
@@ -266,10 +282,20 @@ __txn_op_apply(
WT_TRET(stop->close(stop));
WT_ERR(ret);
break;
+ case WT_LOGOP_TXN_TIMESTAMP:
+ /*
+ * Timestamp records are informational only. We have to
+ * unpack the record to move forward in the log to the
+ * next operation, but we otherwise ignore it.
+ */
+ WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec,
+ &t_nsec, &commit, &durable, &first, &prepare, &read));
+ break;
WT_ILLEGAL_VALUE_ERR(session, optype);
}
+done:
/* Reset the cursor so it doesn't block eviction. */
if (cursor != NULL)
WT_ERR(cursor->reset(cursor));
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index b50da548f71..5ae391127b5 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -667,8 +667,10 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_RET;
WT_TXN *txn;
wt_timestamp_t ts;
+ bool set_ts;
txn = &session->txn;
+ set_ts = false;
/* Look for a commit timestamp. */
ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
@@ -678,6 +680,7 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
WT_RET(__wt_timestamp_validate(session, "commit", ts, &cval));
txn->commit_timestamp = ts;
+ set_ts = true;
__wt_txn_set_commit_timestamp(session);
} else
/*
@@ -687,7 +690,10 @@ __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_txn_context_prepare_check(session));
/* Look for a read timestamp. */
- WT_RET(__wt_txn_parse_read_timestamp(session, cfg));
+ WT_RET(__wt_txn_parse_read_timestamp(session, cfg, &set_ts));
+
+ if (set_ts)
+ WT_RET(__wt_txn_ts_log(session));
return (0);
}
@@ -775,7 +781,8 @@ __wt_txn_parse_prepare_timestamp(
* Parse a request to set a transaction's read_timestamp.
*/
int
-__wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session,
+ const char *cfg[], bool *set_tsp)
{
WT_CONFIG_ITEM cval;
WT_TXN *txn;
@@ -844,6 +851,8 @@ __wt_txn_parse_read_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
round_to_oldest = false;
}
+ if (set_tsp != NULL)
+ *set_tsp = true;
__wt_txn_set_read_timestamp(session);
__wt_readunlock(session, &txn_global->rwlock);
if (round_to_oldest) {
diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am
index 362d0775a88..cb3e24b5177 100644
--- a/src/third_party/wiredtiger/test/csuite/Makefile.am
+++ b/src/third_party/wiredtiger/test/csuite/Makefile.am
@@ -127,6 +127,10 @@ test_wt4333_handle_locks_SOURCES = wt4333_handle_locks/main.c
noinst_PROGRAMS += test_wt4333_handle_locks
all_TESTS += test_wt4333_handle_locks
+test_wt4803_cache_overflow_abort_SOURCES = wt4803_cache_overflow_abort/main.c
+noinst_PROGRAMS += test_wt4803_cache_overflow_abort
+all_TESTS += test_wt4803_cache_overflow_abort
+
# Run this during a "make check" smoke test.
TESTS = $(all_TESTS)
LOG_COMPILER = $(TEST_WRAPPER)
diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
index bf59fe674a7..23e2ca3ebd9 100644
--- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
+++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c
@@ -71,9 +71,10 @@ static char home[1024]; /* Program working dir */
#define SESSION_MAX (MAX_TH + 3 + MAX_TH * PREPARE_PCT)
static const char * table_pfx = "table";
+static const char * const uri_collection = "collection";
static const char * const uri_local = "local";
static const char * const uri_oplog = "oplog";
-static const char * const uri_collection = "collection";
+static const char * const uri_shadow = "shadow";
static const char * const ckpt_file = "checkpoint_done";
@@ -82,9 +83,13 @@ static volatile uint64_t global_ts = 1;
#define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")"
#define ENV_CONFIG_DEF \
- "create,log=(archive=false,file_max=10M,enabled),session_max=%" PRIu32
+ "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \
+ "debug_mode=(table_logging=true,checkpoint_retention=5)," \
+ "statistics=(fast),statistics_log=(wait=1,json=true),session_max=%" PRIu32
#define ENV_CONFIG_TXNSYNC \
- "create,log=(archive=false,file_max=10M,enabled)," \
+ "cache_size=20M,create,log=(archive=true,file_max=10M,enabled)," \
+ "debug_mode=(table_logging=true,checkpoint_retention=5)," \
+ "statistics=(fast),statistics_log=(wait=1,json=true)," \
"transaction_sync=(enabled,method=none),session_max=%" PRIu32
#define ENV_CONFIG_REC "log=(archive=false,recover=on)"
@@ -225,7 +230,7 @@ static WT_THREAD_RET
thread_run(void *arg)
{
FILE *fp;
- WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
+ WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow;
WT_ITEM data;
WT_RAND_STATE rnd;
WT_SESSION *prepared_session, *session;
@@ -286,6 +291,15 @@ thread_run(void *arg)
testutil_check(session->open_cursor(session,
uri, NULL, NULL, &cur_coll));
testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow));
+ if (use_prep)
+ testutil_check(prepared_session->open_cursor(prepared_session,
+ uri, NULL, NULL, &cur_shadow));
+ else
+ testutil_check(session->open_cursor(session,
+ uri, NULL, NULL, &cur_shadow));
+
+ testutil_check(__wt_snprintf(
uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
if (use_prep)
testutil_check(prepared_session->open_cursor(prepared_session,
@@ -315,7 +329,7 @@ thread_run(void *arg)
if (use_ts) {
testutil_check(pthread_rwlock_rdlock(&ts_lock));
- active_ts = __wt_atomic_addv64(&global_ts, 1);
+ active_ts = __wt_atomic_addv64(&global_ts, 2);
testutil_check(__wt_snprintf(tscfg,
sizeof(tscfg), "commit_timestamp=%" PRIx64,
active_ts));
@@ -334,6 +348,7 @@ thread_run(void *arg)
cur_coll->set_key(cur_coll, kname);
cur_local->set_key(cur_local, kname);
cur_oplog->set_key(cur_oplog, kname);
+ cur_shadow->set_key(cur_shadow, kname);
/*
* Put an informative string into the value so that it
* can be viewed well in a binary dump.
@@ -351,6 +366,20 @@ thread_run(void *arg)
data.data = cbuf;
cur_coll->set_value(cur_coll, &data);
testutil_check(cur_coll->insert(cur_coll));
+ cur_shadow->set_value(cur_shadow, &data);
+ if (use_ts) {
+ /*
+ * Change the timestamp in the middle of the
+ * transaction so that we simulate a secondary.
+ */
+ ++active_ts;
+ testutil_check(__wt_snprintf(tscfg,
+ sizeof(tscfg), "commit_timestamp=%" PRIx64,
+ active_ts));
+ testutil_check(session->timestamp_transaction(
+ session, tscfg));
+ }
+ testutil_check(cur_shadow->insert(cur_shadow));
data.size = __wt_random(&rnd) % MAX_VAL;
data.data = obuf;
cur_oplog->set_value(cur_oplog, &data);
@@ -437,6 +466,10 @@ run_workload(uint32_t nth)
testutil_check(session->create(session, uri,
"key_format=S,value_format=u,log=(enabled=false)"));
testutil_check(__wt_snprintf(
+ uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow));
+ testutil_check(session->create(session, uri,
+ "key_format=S,value_format=u,log=(enabled=false)"));
+ testutil_check(__wt_snprintf(
uri, sizeof(uri), "%s:%s", table_pfx, uri_local));
testutil_check(session->create(session,
uri, "key_format=S,value_format=u"));
@@ -548,7 +581,7 @@ main(int argc, char *argv[])
FILE *fp;
REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH];
WT_CONNECTION *conn;
- WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
+ WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow;
WT_RAND_STATE rnd;
WT_SESSION *session;
pid_t pid;
@@ -725,6 +758,10 @@ main(int argc, char *argv[])
testutil_check(session->open_cursor(session,
buf, NULL, NULL, &cur_coll));
testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "%s:%s", table_pfx, uri_shadow));
+ testutil_check(session->open_cursor(session,
+ buf, NULL, NULL, &cur_shadow));
+ testutil_check(__wt_snprintf(
buf, sizeof(buf), "%s:%s", table_pfx, uri_local));
testutil_check(session->open_cursor(session,
buf, NULL, NULL, &cur_local));
@@ -798,13 +835,20 @@ main(int argc, char *argv[])
cur_coll->set_key(cur_coll, kname);
cur_local->set_key(cur_local, kname);
cur_oplog->set_key(cur_oplog, kname);
+ cur_shadow->set_key(cur_shadow, kname);
/*
* The collection table should always only have the
- * data as of the checkpoint.
+ * data as of the checkpoint. The shadow table should
+ * always agree with the collection table: either both
+ * have the record or neither does.
*/
if ((ret = cur_coll->search(cur_coll)) != 0) {
if (ret != WT_NOTFOUND)
testutil_die(ret, "search");
+ if ((ret = cur_shadow->search(cur_shadow)) == 0)
+ testutil_die(ret,
+ "shadow search success");
+
/*
* If we don't find a record, the stable
* timestamp written to our file better be
@@ -841,7 +885,10 @@ main(int argc, char *argv[])
" > stable ts %" PRIu64 "\n",
fname, key, stable_fp, stable_val);
fatal = true;
- }
+ } else if ((ret = cur_shadow->search(cur_shadow)) != 0)
+ /* Collection and shadow both have the data. */
+ testutil_die(ret, "shadow search failure");
+
/*
* The local table should always have all data.
*/
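
A minimal sketch (WiredTiger Python API; table names and timestamps are
illustrative) of the mid-transaction timestamp move the changes above add
for the shadow table: the collection and shadow inserts land in the same
transaction but at consecutive commit timestamps, mimicking a secondary
applying oplog entries.

    import os
    import wiredtiger

    home = 'WT_HOME_SHADOW'
    if not os.path.isdir(home):
        os.mkdir(home)

    conn = wiredtiger.wiredtiger_open(home, 'create,log=(enabled)')
    session = conn.open_session('isolation=snapshot')
    for name in ('collection', 'shadow'):
        session.create('table:' + name,
            'key_format=S,value_format=S,log=(enabled=false)')
    coll = session.open_cursor('table:collection')
    shadow = session.open_cursor('table:shadow')

    ts = 10
    session.begin_transaction()
    session.timestamp_transaction('commit_timestamp=' + '%x' % ts)
    coll['key0'] = 'value0'
    # Move the commit timestamp before the shadow insert: the collection
    # update keeps timestamp ts, the shadow update gets ts + 1.
    session.timestamp_transaction('commit_timestamp=' + '%x' % (ts + 1))
    shadow['key0'] = 'value0'
    session.commit_transaction()
    conn.close()
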
diff --git a/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c
new file mode 100644
index 00000000000..7d9b0baf132
--- /dev/null
+++ b/src/third_party/wiredtiger/test/csuite/wt4803_cache_overflow_abort/main.c
@@ -0,0 +1,239 @@
+/*-
+ * Public Domain 2014-2019 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "test_util.h"
+
+#include <signal.h>
+#include <sys/wait.h>
+
+/*
+ * JIRA ticket reference: WT-4803
+ * Test case description: This test is checking the functionality of the
+ * lookaside file_max configuration. When the size of the lookaside file exceeds
+ * this value, we expect to panic.
+ * Failure mode: receiving a panic in a test case where we weren't expecting
+ * one, or not receiving a panic where we were.
+ */
+
+#define NUM_KEYS 2000
+
+/*
+ * This is a global flag that should be set before running test_las_workload.
+ * It lets the child process know whether it should be expecting a panic or not
+ * so that it can adjust its exit code as needed.
+ */
+static bool expect_panic;
+
+static int
+handle_message(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ (void)fprintf(
+ stderr, "%s: %s\n", message, session->strerror(session, error));
+
+ if (error == WT_PANIC &&
+ strstr(message, "exceeds maximum size") != NULL) {
+ fprintf(stderr, "Got cache overflow error (expect_panic=%s)\n",
+ expect_panic ? "true" : "false");
+
+ /*
+ * If we're expecting a panic, exit with zero to indicate to the
+ * parent that this test was successful.
+ *
+ * If not, don't intercept. We'll naturally exit with non-zero
+ * if we're terminating due to panic.
+ */
+ if (expect_panic)
+ exit(EXIT_SUCCESS);
+ }
+
+ return (0);
+}
+
+static WT_EVENT_HANDLER event_handler = {
+ handle_message,
+ NULL,
+ NULL,
+ NULL
+};
+
+static void
+las_workload(TEST_OPTS *opts, const char *las_file_max)
+{
+ WT_CURSOR *cursor;
+ WT_SESSION *other_session, *session;
+ int i;
+ char buf[WT_MEGABYTE], open_config[128];
+
+ testutil_check(__wt_snprintf(open_config, sizeof(open_config),
+ "create,cache_size=50MB,cache_overflow=(file_max=%s)",
+ las_file_max));
+
+ testutil_check(wiredtiger_open(
+ opts->home, &event_handler, open_config, &opts->conn));
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &session));
+ testutil_check(
+ session->create(session, opts->uri, "key_format=i,value_format=S"));
+ testutil_check(
+ session->open_cursor(session, opts->uri, NULL, NULL, &cursor));
+
+ memset(buf, 0xA, WT_MEGABYTE);
+ buf[WT_MEGABYTE - 1] = '\0';
+
+ /* Populate the table. */
+ for (i = 0; i < NUM_KEYS; ++i) {
+ cursor->set_key(cursor, i);
+ cursor->set_value(cursor, buf);
+ testutil_check(cursor->insert(cursor));
+ }
+
+ /*
+ * Open a snapshot isolation transaction in another session. This forces
+ * the cache to retain all previous values. Then update all keys with a
+ * new value in the original session while keeping that snapshot
+ * transaction open. With the large value buffer, small cache and lots
+ * of keys, this will force a lot of lookaside usage.
+ *
+ * When the file_max setting is small, the maximum size should easily be
+ * reached and we should panic. When the maximum size is large or not
+ * set, then we should succeed.
+ */
+ testutil_check(
+ opts->conn->open_session(opts->conn, NULL, NULL, &other_session));
+ testutil_check(other_session->begin_transaction(
+ other_session, "isolation=snapshot"));
+
+ memset(buf, 0xB, WT_MEGABYTE);
+ buf[WT_MEGABYTE - 1] = '\0';
+
+ for (i = 0; i < NUM_KEYS; ++i) {
+ cursor->set_key(cursor, i);
+ cursor->set_value(cursor, buf);
+ testutil_check(cursor->update(cursor));
+ }
+
+ /*
+ * Cleanup.
+ * We do not get here when the file_max size is small because we will
+ * have already hit the maximum and exited. This code only executes on
+ * the successful path.
+ */
+ testutil_check(
+ other_session->rollback_transaction(other_session, NULL));
+ testutil_check(other_session->close(other_session, NULL));
+
+ testutil_check(cursor->close(cursor));
+ testutil_check(session->close(session, NULL));
+}
+
+static int
+test_las_workload(TEST_OPTS *opts, const char *las_file_max)
+{
+ pid_t pid;
+ int status;
+
+ /*
+ * We're going to run this workload for different configurations of
+ * file_max. So clean out the work directory each time.
+ */
+ testutil_make_work_dir(opts->home);
+
+ /*
+ * Since it's possible that the workload will panic and abort, we will
+ * fork the process and execute the workload in the child process.
+ *
+ * This way, we can safely check the exit code of the child process and
+ * confirm that it is what we expected.
+ */
+ pid = fork();
+ if (pid < 0)
+ /* Failed fork. */
+ testutil_die(errno, "fork");
+ else if (pid == 0) {
+ /* Child process from here. */
+ las_workload(opts, las_file_max);
+
+ /*
+ * If we're expecting a panic during the workload, we shouldn't
+ * get to this point. Exit with non-zero to indicate to parent
+ * that we should fail this test.
+ */
+ fprintf(stderr,
+ "Successfully completed workload (expect_panic=%s)\n",
+ expect_panic ? "true" : "false");
+
+ if (expect_panic)
+ exit(EXIT_FAILURE);
+ else
+ exit(EXIT_SUCCESS);
+ }
+
+ /* Parent process from here. */
+ if (waitpid(pid, &status, 0) == -1)
+ testutil_die(errno, "waitpid");
+
+ return (status);
+}
+
+int
+main(int argc, char **argv)
+{
+ TEST_OPTS opts;
+
+ memset(&opts, 0x0, sizeof(opts));
+ testutil_check(testutil_parse_opts(argc, argv, &opts));
+
+ /*
+ * The lookaside is unbounded.
+ * We don't expect any failure since we can use as much as needed.
+ */
+ expect_panic = false;
+ testutil_check(test_las_workload(&opts, "0"));
+
+ /*
+ * The lookaside is limited to 5GB.
+ * This is more than enough for this workload so we don't expect any
+ * failure.
+ */
+ expect_panic = false;
+ testutil_check(test_las_workload(&opts, "5GB"));
+
+ /*
+ * The lookaside is limited to 100MB.
+ * This is insufficient for this workload so we're expecting a failure.
+ */
+ expect_panic = true;
+ testutil_check(test_las_workload(&opts, "100MB"));
+
+ testutil_cleanup(&opts);
+
+ return (0);
+}
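
For reference, a minimal sketch (WiredTiger Python API; home path and sizes
are illustrative) of the cache_overflow knob this test exercises: cap the
lookaside file at open time and raise the cap later with reconfigure.
Exceeding the cap at runtime panics the connection, which is why the
workload above runs in a forked child.

    import os
    import wiredtiger

    home = 'WT_HOME_LAS'
    if not os.path.isdir(home):
        os.mkdir(home)

    # file_max=0 would leave the lookaside file unbounded.
    conn = wiredtiger.wiredtiger_open(home,
        'create,cache_size=50MB,cache_overflow=(file_max=100MB)')

    # The cap can be raised (or removed with 0) at runtime; values below
    # the 100MB minimum are rejected with a configuration error.
    conn.reconfigure('cache_overflow=(file_max=1GB)')
    conn.close()
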
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index be111c6432c..01aff272320 100644
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -8,7 +8,7 @@ functions:
command: git.get_project
params:
directory: wiredtiger
- "fetch artifacts" :
+ "fetch artifacts" :
- command: s3.get
params:
aws_key: ${aws_key}
@@ -19,7 +19,7 @@ functions:
"fetch mongo-tests repo" :
command: shell.exec
params:
- script: |
+ script: |
git clone https://github.com/wiredtiger/mongo-tests
"compile wiredtiger":
command: shell.exec
@@ -35,7 +35,7 @@ functions:
./build_posix/reconf
${configure_env_vars|} ./configure --enable-diagnostic --enable-python --enable-zlib --enable-strict --enable-static --prefix=$(pwd)/LOCAL_INSTALL
${make_command|make} ${smp_command|} 2>&1
-
+
# On macOS, change the binary location with install_name_tool since DYLD_LIBRARY_PATH
# appears not to work for dynamic modules loaded by python. For wt, the libtool generated
# script has the wrong path for running on test machines.
@@ -609,6 +609,20 @@ tasks:
${test_env_vars|} $(pwd)/test_wt4156_metadata_salvage 2>&1
+ - name: csuite-wt4803-cache-overflow-abort-test
+ depends_on:
+ - name: compile
+ commands:
+ - func: "fetch artifacts"
+ - command: shell.exec
+ params:
+ working_dir: "wiredtiger/build_posix/test/csuite"
+ script: |
+ set -o errexit
+ set -o verbose
+
+ ${test_env_vars|} $(pwd)/test_wt4803_cache_overflow_abort 2>&1
+
- name: csuite-rwlock-test
depends_on:
- name: compile
@@ -760,9 +774,9 @@ tasks:
# Break out Python unit tests into multiple buckets/tasks based on test name and runtime
# The test/suite/run.py script can work out test names by casting each command argument
- # with "test_" prefix and "*.py" postfix.
+ # with "test_" prefix and "*.py" postfix.
#
- # One example:
+ # One example:
# "test/suite/run.py [ab]" will be translated to testing "test_a*.py" and "test_b*.py"
- name: unit-test-bucket00
@@ -934,18 +948,18 @@ tasks:
- name: million-collection-test
depends_on: []
- run_on:
+ run_on:
- rhel62-large
- commands:
+ commands:
- func: "fetch mongo-tests repo"
- command: shell.exec
params:
working_dir: mongo-tests
- script: |
+ script: |
set -o errexit
set -o verbose
ulimit -n 1000000
- ulimit -c unlimited
+ ulimit -c unlimited
largescale/run-million-collection-test.sh .
buildvariants:
@@ -1051,4 +1065,3 @@ buildvariants:
- name: make-check-test
- name: unit-test
- name: fops
-
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode01.py b/src/third_party/wiredtiger/test/suite/test_debug_mode01.py
new file mode 100644
index 00000000000..88ba81f9c1c
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode01.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_debug_mode01.py
+# Test the debug mode settings. Test rollback_error in this one.
+class test_debug_mode01(wttest.WiredTigerTestCase):
+ conn_config = 'log=(enabled=true),debug_mode=(rollback_error=5)'
+ session_config = 'isolation=snapshot'
+ uri = 'file:test_debug'
+
+ entries = 22
+ min_error = entries // 5
+
+ def rollback_error(self, val, insert=True):
+ keys = range(1, self.entries)
+ c = self.session.open_cursor(self.uri, None)
+ # We expect some operations to return an exception, so we cannot
+ # use the simple 'c[k] = 1'. Instead we must explicitly set the key
+ # and value and then use the insert or update primitives.
+ #
+ # Look for a generic 'WT_ROLLBACK' string, not the specific
+ # simulated reason string.
+ msg = '/WT_ROLLBACK/'
+ rollback = 0
+ for k in keys:
+ self.session.begin_transaction()
+ c.set_key(k)
+ c.set_value(val)
+ # Execute the insert or update. It will return true if the simulated
+ # conflict exception is raised, false if no exception occurred.
+ if insert:
+ conflict = self.assertRaisesException(wiredtiger.WiredTigerError, \
+ lambda:c.insert(), msg, True)
+ else:
+ conflict = self.assertRaisesException(wiredtiger.WiredTigerError, \
+ lambda:c.update(), msg, True)
+
+ if conflict:
+ rollback += 1
+ self.pr("Key: " + str(k) + " Rolled back")
+ self.session.rollback_transaction()
+ else:
+ self.session.commit_transaction()
+ c.close()
+ return rollback
+
+ def test_rollback_error(self):
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ rollback = self.rollback_error(1)
+ rollback += self.rollback_error(2, False)
+ self.pr("Rollback: " + str(rollback))
+ self.pr("Minimum: " + str(self.min_error))
+ self.assertTrue(rollback >= self.min_error)
+
+ def test_rollback_error_off(self):
+ # The setting is added in to wiredtiger_open via the config above.
+ # Test that we can properly turn the setting off via reconfigure.
+ # There should then be no rollback errors.
+ self.conn.reconfigure("debug_mode=(rollback_error=0)")
+
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ rollback = self.rollback_error(1)
+ rollback += self.rollback_error(2)
+ self.assertTrue(rollback == 0)
+
+if __name__ == '__main__':
+ wttest.run()
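
Outside the test harness, the same debug setting can be exercised directly.
A minimal sketch (home path and table name are illustrative) of enabling
rollback_error at open, handling the injected WT_ROLLBACK, and switching it
off again with reconfigure:

    import os
    import wiredtiger

    home = 'WT_HOME_DEBUG'
    if not os.path.isdir(home):
        os.mkdir(home)

    # Return a simulated WT_ROLLBACK from roughly every 5th update.
    conn = wiredtiger.wiredtiger_open(home,
        'create,log=(enabled),debug_mode=(rollback_error=5)')
    session = conn.open_session('isolation=snapshot')
    session.create('table:debug_example', 'key_format=i,value_format=i')
    cursor = session.open_cursor('table:debug_example')

    for k in range(1, 25):
        session.begin_transaction()
        try:
            cursor[k] = 1
        except wiredtiger.WiredTigerError:
            # The injected conflict surfaces as WT_ROLLBACK.
            session.rollback_transaction()
        else:
            session.commit_transaction()

    # Turn the setting off; no further rollbacks are injected.
    conn.reconfigure('debug_mode=(rollback_error=0)')
    conn.close()
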
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode02.py b/src/third_party/wiredtiger/test/suite/test_debug_mode02.py
new file mode 100644
index 00000000000..0452e60fbd1
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode02.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, time, wiredtiger, wttest
+
+# test_debug_mode02.py
+# Test the debug mode settings. Test checkpoint_retention use.
+class test_debug_mode02(wttest.WiredTigerTestCase):
+ uri = 'file:test_debug'
+
+ entries = 100
+ loop = 0
+ retain = 5
+ log1 = 'WiredTigerLog.0000000001'
+ log2 = 'WiredTigerLog.0000000002'
+
+ def conn_config(self):
+ return 'log=(enabled=true,file_max=100K),debug_mode=(checkpoint_retention=%d)' % self.retain
+
+ def log_set(self):
+ logs = fnmatch.filter(os.listdir(self.home), "*gerLog*")
+ return set(logs)
+
+ def check_archive(self, logfile):
+ archived = False
+ for i in range(1,90):
+ # Sleep and then see if archive ran. We do this in a loop
+ # for slow machines. Max out at 90 seconds.
+ time.sleep(1.0)
+ if not os.path.exists(logfile):
+ archived = True
+ break
+ self.assertTrue(archived)
+
+ def advance_log_checkpoint(self):
+ # Advance the log file to the next file and write a checkpoint.
+ keys = range(1, self.entries)
+ cur_set = self.log_set()
+ c = self.session.open_cursor(self.uri, None)
+ new_set = cur_set
+ # Write data in small chunks until we switch log files.
+ while cur_set == new_set:
+ for k in keys:
+ c[k + (self.loop * self.entries)] = 1
+ self.loop += 1
+ new_set = self.log_set()
+ c.close()
+ # Write a checkpoint into the new log file.
+ self.session.checkpoint()
+
+ def test_checkpoint_retain(self):
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ # No log files should be archived while we have fewer than the
+ # retention number of logs. Make sure that on each iteration the
+ # new set of log files is a superset of the previous one.
+ for i in range(1, self.retain):
+ cur_set = self.log_set()
+ self.advance_log_checkpoint()
+ # We don't accommodate slow machines here because we don't expect
+ # the files to change and there is no other way to know whether
+ # archive ran.
+ time.sleep(1.0)
+ new_set = self.log_set()
+ self.assertTrue(new_set.issuperset(cur_set))
+
+ self.assertTrue(os.path.exists(self.log1))
+ self.advance_log_checkpoint()
+ self.check_archive(self.log1)
+
+ # Test that both zero and one archive as usual. And test reconfigure.
+ def test_checkpoint_retain_off(self):
+ self.conn.reconfigure("debug_mode=(checkpoint_retention=0)")
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+
+ self.advance_log_checkpoint()
+ self.check_archive(self.log1)
+
+ self.conn.reconfigure("debug_mode=(checkpoint_retention=1)")
+ self.advance_log_checkpoint()
+ self.check_archive(self.log2)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode03.py b/src/third_party/wiredtiger/test/suite/test_debug_mode03.py
new file mode 100644
index 00000000000..feb5c0d904a
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode03.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_debug_mode03.py
+# Test the debug mode settings. Test table_logging use.
+class test_debug_mode03(wttest.WiredTigerTestCase):
+ conn_config = 'log=(enabled=true,file_max=100K),debug_mode=(table_logging=true)'
+ uri = 'file:test_debug'
+ entries = 100
+ value = b'\x01\x02abcd\x03\x04'
+
+ def add_data(self):
+ # Add a binary value we can search for in the log.
+ keys = range(0, self.entries)
+ c = self.session.open_cursor(self.uri, None)
+ for k in keys:
+ c[k] = self.value
+ c.close()
+
+ def find_log_recs(self):
+ # Open a log cursor. We should find log records that have
+ # the value we inserted.
+ c = self.session.open_cursor("log:", None)
+ count = 0
+ while c.next() == 0:
+ # lsn.file, lsn.offset, opcount
+ keys = c.get_key()
+ # txnid, rectype, optype, fileid, logrec_key, logrec_value
+ values = c.get_value()
+ # Look for log records that have a key/value pair.
+ if values[4] != b'':
+ if self.value in values[5]: # logrec_value
+ count += 1
+ c.close()
+ return count
+
+ def test_table_logging(self):
+ self.session.create(self.uri, 'key_format=i,value_format=u,log=(enabled=false)')
+ self.add_data()
+ count = self.find_log_recs()
+ self.assertEqual(count, self.entries)
+
+ # Test that no log records are written for the table once
+ # table_logging is turned back off via reconfigure.
+ def test_table_logging_off(self):
+ self.conn.reconfigure("debug_mode=(table_logging=false)")
+ self.session.create(self.uri, 'key_format=i,value_format=u,log=(enabled=false)')
+ self.add_data()
+ count = self.find_log_recs()
+ self.assertEqual(count, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode04.py b/src/third_party/wiredtiger/test/suite/test_debug_mode04.py
new file mode 100644
index 00000000000..1f5429495e8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_debug_mode04.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+
+# test_debug_mode04.py
+# Test the debug mode settings. Test eviction use.
+class test_debug_mode04(wttest.WiredTigerTestCase):
+ conn_config = 'log=(enabled=true,file_max=100K),debug_mode=(eviction=true)'
+ uri = 'file:test_debug'
+ entries = 100
+ value = b'\x01\x02abcd\x03\x04'
+
+ def add_data(self):
+ keys = range(0, self.entries)
+ c = self.session.open_cursor(self.uri, None)
+ for k in keys:
+ c[k] = self.value
+ c.close()
+
+ # Just test turning it on and off. There really isn't anything
+ # specific to verify.
+ def test_table_logging(self):
+ self.session.create(self.uri, 'key_format=i,value_format=u')
+ self.add_data()
+
+ def test_table_logging_off(self):
+ self.conn.reconfigure("debug_mode=(eviction=false)")
+ self.session.create(self.uri, 'key_format=i,value_format=u')
+ self.add_data()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_las04.py b/src/third_party/wiredtiger/test/suite/test_las04.py
new file mode 100644
index 00000000000..9d35d3c17f3
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_las04.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_las04.py
+# Test file_max configuration and reconfiguration for the lookaside table.
+#
+
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+# Taken from src/include/misc.h.
+WT_MB = 1048576
+
+class test_las04(wttest.WiredTigerTestCase):
+ uri = 'table:las_04'
+ in_memory_values = [
+ ('false', dict(in_memory=False)),
+ ('none', dict(in_memory=None)),
+ ('true', dict(in_memory=True))
+ ]
+ init_file_max_values = [
+ ('default', dict(init_file_max=None, init_stat_val=0)),
+ ('non-zero', dict(init_file_max='100MB', init_stat_val=(WT_MB * 100))),
+ ('zero', dict(init_file_max='0', init_stat_val=0))
+ ]
+ reconfig_file_max_values = [
+ ('non-zero', dict(reconfig_file_max='100MB',
+ reconfig_stat_val=(WT_MB * 100))),
+ ('too-low', dict(reconfig_file_max='99MB', reconfig_stat_val=None)),
+ ('zero', dict(reconfig_file_max='0', reconfig_stat_val=0))
+ ]
+ scenarios = make_scenarios(init_file_max_values, reconfig_file_max_values,
+ in_memory_values)
+
+ def conn_config(self):
+ config = 'statistics=(fast)'
+ if self.init_file_max is not None:
+ config += ',cache_overflow=(file_max={})'.format(self.init_file_max)
+ if self.in_memory is not None:
+ config += ',in_memory=' + ('true' if self.in_memory else 'false')
+ return config
+
+ def get_stat(self, stat):
+ stat_cursor = self.session.open_cursor('statistics:')
+ val = stat_cursor[stat][2]
+ stat_cursor.close()
+ return val
+
+ def test_las(self):
+ self.session.create(self.uri, 'key_format=S,value_format=S')
+
+ if self.in_memory:
+ # For in-memory configurations, we simply ignore any
+ # lookaside-related configuration.
+ self.assertEqual(
+ self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max),
+ 0)
+ else:
+ self.assertEqual(
+ self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max),
+ self.init_stat_val)
+
+ reconfigure = lambda: self.conn.reconfigure(
+ 'cache_overflow=(file_max={})'.format(self.reconfig_file_max))
+
+ # We expect an error when the statistic value is None because the value
+ # is out of range.
+ if self.reconfig_stat_val is None:
+ self.assertRaisesWithMessage(
+ wiredtiger.WiredTigerError, reconfigure, '/below minimum/')
+ return
+
+ reconfigure()
+
+ if self.in_memory:
+ self.assertEqual(
+ self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max),
+ 0)
+ else:
+ self.assertEqual(
+ self.get_stat(wiredtiger.stat.conn.cache_lookaside_ondisk_max),
+ self.reconfig_stat_val)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp17.py b/src/third_party/wiredtiger/test/suite/test_timestamp17.py
new file mode 100644
index 00000000000..f03b002c0ed
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp17.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2019 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_timestamp17.py
+# Test unintended timestamp usage on an update and ensure behavior
+# matches expectations. Additionally, move the timestamp to ensure
+# that values read are still consistent after those timestamps are
+# moved.
+#
+
+import random
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' % t
+
+class test_timestamp17(wttest.WiredTigerTestCase, suite_subprocess):
+ tablename = 'test_timestamp17'
+ uri = 'table:' + tablename
+ session_config = 'isolation=snapshot'
+
+ def test_inconsistent_timestamping(self):
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ self.session.begin_transaction()
+ cur1 = self.session.open_cursor(self.uri)
+ cur1[1] = 1
+ self.session.commit_transaction('commit_timestamp=25')
+
+ self.session.begin_transaction()
+ cur1[1] = 2
+ self.session.commit_transaction('commit_timestamp=50')
+
+ self.session.begin_transaction()
+ cur1[1] = 3
+ self.session.commit_transaction('commit_timestamp=200')
+
+ self.session.begin_transaction()
+ cur1.set_key(1)
+ cur1.remove()
+ self.session.commit_transaction('commit_timestamp=100')
+
+ # Read before any updates and ensure we cannot find the key or value.
+ self.session.begin_transaction('read_timestamp=20')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Read at 25 and we should see 1.
+ self.session.begin_transaction('read_timestamp=25')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, 0)
+ value1 = cur1.get_value()
+ self.session.commit_transaction()
+ self.assertEqual(1, value1)
+
+ # Read at 50 and we should see 2.
+ self.session.begin_transaction('read_timestamp=50')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, 0)
+ value1 = cur1.get_value()
+ self.session.commit_transaction()
+ self.assertEqual(2, value1)
+
+ # Read at 100 and we should not find anything.
+ self.session.begin_transaction('read_timestamp=100')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Read at 200 and we should still not find anything.
+ self.session.begin_transaction('read_timestamp=200')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Read at 300 for further validation.
+ self.session.begin_transaction('read_timestamp=300')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Move oldest timestamp forward and
+ # confirm we see the correct numbers.
+ self.conn.set_timestamp('oldest_timestamp=49')
+
+ # Read at 49 and we should see 1.
+ self.session.begin_transaction('read_timestamp=49')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, 0)
+ value1 = cur1.get_value()
+ self.session.commit_transaction()
+ self.assertEqual(1, value1)
+
+ self.conn.set_timestamp('oldest_timestamp=99')
+
+ # Read at 99 and we should see 2.
+ self.session.begin_transaction('read_timestamp=99')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, 0)
+ value1 = cur1.get_value()
+ self.session.commit_transaction()
+ self.assertEqual(2, value1)
+
+ # Move oldest to the point at which we deleted.
+ self.conn.set_timestamp('oldest_timestamp=100')
+
+ # Read at 100 and we should not find anything.
+ self.session.begin_transaction('read_timestamp=100')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Read at 200 and we should not find anything.
+ self.session.begin_transaction('read_timestamp=200')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ # Move oldest timestamp to 200 to ensure history
+ # works as expected and we do not see the value 3.
+ self.conn.set_timestamp('oldest_timestamp=200')
+
+ self.session.begin_transaction('read_timestamp=200')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+ self.session.begin_transaction('read_timestamp=250')
+ cur1.set_key(1)
+ search_success = cur1.search()
+ self.assertEqual(search_success, wiredtiger.WT_NOTFOUND)
+ self.session.commit_transaction()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py
index c0b755d2230..045cac26552 100644
--- a/src/third_party/wiredtiger/test/suite/wttest.py
+++ b/src/third_party/wiredtiger/test/suite/wttest.py
@@ -508,8 +508,9 @@ class WiredTigerTestCase(unittest.TestCase):
"""
Like TestCase.assertRaises(), with some additional options.
If the exceptionString argument is used, the exception's string
- must match it. If optional is set, then no assertion occurs
- if the exception doesn't occur.
+ must match it, or its pattern if the string starts and ends with
+ a slash. If optional is set, then no assertion occurs if the
+ exception doesn't occur.
Returns true if the assertion is raised.
"""
raised = False
@@ -519,9 +520,19 @@ class WiredTigerTestCase(unittest.TestCase):
if not isinstance(err, exceptionType):
self.fail('Exception of incorrect type raised, got type: ' + \
str(type(err)))
- if exceptionString != None and exceptionString != str(err):
- self.fail('Exception with incorrect string raised, got: "' + \
- str(err) + '"')
+ if exceptionString != None:
+ # Match either a pattern or an exact string.
+ fail = False
+ self.pr('Expecting string msg: ' + exceptionString)
+ if len(exceptionString) > 2 and \
+ exceptionString[0] == '/' and exceptionString[-1] == '/' :
+ if re.search(exceptionString[1:-1], str(err)) == None:
+ fail = True
+ elif exceptionString != str(err):
+ fail = True
+ if fail:
+ self.fail('Exception with incorrect string raised, got: "' + \
+ str(err) + '" Expected: ' + exceptionString)
raised = True
if not raised and not optional:
self.fail('no assertion raised')
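
The new convention accepts either an exact message or a '/regex/' pattern.
A small standalone sketch of the matching rule (the message texts are made
up for illustration):

    import re

    def matches(expected, actual):
        # '/pattern/' is re.search()-ed against the message; anything else
        # must match the message exactly.
        if len(expected) > 2 and expected[0] == '/' and expected[-1] == '/':
            return re.search(expected[1:-1], actual) is not None
        return expected == actual

    assert matches('/WT_ROLLBACK/', 'WT_ROLLBACK: simulated conflict')
    assert not matches('WT_ROLLBACK', 'WT_ROLLBACK: simulated conflict')
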