summaryrefslogtreecommitdiff
path: root/src/third_party
diff options
context:
space:
mode:
author Alex Gorrod <alexander.gorrod@mongodb.com> 2017-10-10 16:29:49 +1100
committer Alex Gorrod <alexander.gorrod@mongodb.com> 2017-10-10 16:37:55 +1100
commit 39998ac6928c4e7f3acd2f7ee2fc5fb4df056c18 (patch)
tree c075233cd32c6ec0205af77db475836c0fba60e9 /src/third_party
parent dd094ce1bc1fb424ccc6dd71939e5c7a30159e2e (diff)
download mongo-39998ac6928c4e7f3acd2f7ee2fc5fb4df056c18.tar.gz
Import wiredtiger: 0cd3d5bbd8a5c8779f1129c6754b4463403e788f from branch mongodb-3.6
ref: 6f561957cb..0cd3d5bbd8
for: 3.5.14

WT-3200 LSM bug: Failed lookup in bloom filter.
WT-3435 Lookaside eviction should be able to save unstable updates
WT-3453 Enhance lookaside table test coverage in Python suite
WT-3559 Detect when a checkpoint races with metadata changes
WT-3579 Enhance support for running wtperf workloads with workgen
WT-3582 Cache stuck full of internal pages
WT-3593 Add an API to enforce consistent use of timestamps (#3667)
WT-3599 reconciliation calculates block matching checksums too frequently.
WT-3600 timestamp API lets you set timestamps with invalid characters
WT-3612 Improve documentation of durability with backup cursors
WT-3613 test/format cache full with LSM
WT-3618 WT remove solaris from evergreen builds
WT-3620 POSIX thread attribute structures must be destroyed
WT-3621 Add test for full backups with concurrent table creation
WT-3622 Allow upper case hexadecimal timestamps
WT-3627 test_txn14.test_txn14.test_log_flush timeout
WT-3631 Convert timestamps to integers in Python tests before comparing
WT-3636 Account for page image sizes in cache consistently
WT-3638 format failure, update list without complete visible record
WT-3639 Test/format tried to drop named checkpoints during a hot backup
WT-3641 Track maximum timestamp used in each btree
WT-3642 Avoid lookaside reads for dead trees
Diffstat (limited to 'src/third_party')
-rw-r--r--src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py2
-rw-r--r--src/third_party/wiredtiger/bench/workgen/runner/runner/core.py160
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen.cxx168
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen.h46
-rw-r--r--src/third_party/wiredtiger/bench/workgen/workgen_int.h15
-rw-r--r--src/third_party/wiredtiger/bench/workgen/wtperf.py413
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf2
-rw-r--r--src/third_party/wiredtiger/dist/api_data.py14
-rw-r--r--src/third_party/wiredtiger/dist/flags.py19
-rw-r--r--src/third_party/wiredtiger/dist/s_define.list1
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py1
-rw-r--r--src/third_party/wiredtiger/examples/c/ex_all.c2
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/bloom/bloom.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c8
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_debug.c3
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_delete.c7
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_discard.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_handle.c25
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_ovfl.c4
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_page.c16
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_random.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_read.c258
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_rebalance.c5
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_slvg.c17
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c59
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c145
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_walk.c5
-rw-r--r--src/third_party/wiredtiger/src/btree/col_srch.c11
-rw-r--r--src/third_party/wiredtiger/src/btree/row_srch.c12
-rw-r--r--src/third_party/wiredtiger/src/cache/cache_las.c182
-rw-r--r--src/third_party/wiredtiger/src/config/config_def.c42
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_api.c3
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache.c5
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_ckpt.c5
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c14
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_open.c6
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_sweep.c30
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_file.c8
-rw-r--r--src/third_party/wiredtiger/src/docs/backup.dox10
-rw-r--r--src/third_party/wiredtiger/src/docs/checkpoint.dox4
-rw-r--r--src/third_party/wiredtiger/src/docs/transactions.dox4
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_file.c48
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c54
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_page.c95
-rw-r--r--src/third_party/wiredtiger/src/include/api.h7
-rw-r--r--src/third_party/wiredtiger/src/include/btmem.h66
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h7
-rw-r--r--src/third_party/wiredtiger/src/include/btree.i11
-rw-r--r--src/third_party/wiredtiger/src/include/connection.h4
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h13
-rw-r--r--src/third_party/wiredtiger/src/include/flags.h69
-rw-r--r--src/third_party/wiredtiger/src/include/mutex.i8
-rw-r--r--src/third_party/wiredtiger/src/include/schema.h2
-rw-r--r--src/third_party/wiredtiger/src/include/session.h3
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h1
-rw-r--r--src/third_party/wiredtiger/src/include/thread_group.h5
-rw-r--r--src/third_party/wiredtiger/src/include/txn.h26
-rw-r--r--src/third_party/wiredtiger/src/include/txn.i51
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in408
-rw-r--r--src/third_party/wiredtiger/src/include/wt_internal.h2
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_tree.c11
-rw-r--r--src/third_party/wiredtiger/src/lsm/lsm_work_unit.c25
-rw-r--r--src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c7
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c944
-rw-r--r--src/third_party/wiredtiger/src/session/session_api.c3
-rw-r--r--src/third_party/wiredtiger/src/support/hex.c12
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c4
-rw-r--r--src/third_party/wiredtiger/src/support/thread_group.c6
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c15
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c25
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c65
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_timestamp.c46
-rw-r--r--src/third_party/wiredtiger/test/fops/file.c64
-rw-r--r--src/third_party/wiredtiger/test/fops/t.c13
-rw-r--r--src/third_party/wiredtiger/test/fops/thread.h1
-rw-r--r--src/third_party/wiredtiger/test/format/format.h1
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c139
-rw-r--r--src/third_party/wiredtiger/test/format/t.c2
-rw-r--r--src/third_party/wiredtiger/test/format/util.c80
-rw-r--r--src/third_party/wiredtiger/test/mciproject.yml14
-rw-r--r--src/third_party/wiredtiger/test/suite/test_assert01.py114
-rw-r--r--src/third_party/wiredtiger/test/suite/test_assert02.py141
-rw-r--r--src/third_party/wiredtiger/test/suite/test_assert03.py86
-rw-r--r--src/third_party/wiredtiger/test/suite/test_backup07.py117
-rw-r--r--src/third_party/wiredtiger/test/suite/test_las.py88
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp01.py30
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp02.py12
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp03.py8
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp04.py7
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp05.py6
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp07.py41
-rw-r--r--src/third_party/wiredtiger/test/suite/test_timestamp09.py6
-rw-r--r--src/third_party/wiredtiger/test/suite/test_txn14.py9
-rw-r--r--src/third_party/wiredtiger/test/suite/wttest.py6
95 files changed, 3140 insertions, 1644 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
index ed21fffe8dc..2d60e1522f5 100644
--- a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py
@@ -88,5 +88,5 @@ except:
shutil.rmtree('WT_TEST', True)
os.mkdir('WT_TEST')
-from .core import txn, extensions_config, op_group_transaction, op_log_like, op_multi_table
+from .core import txn, extensions_config, op_append, op_group_transaction, op_log_like, op_multi_table, op_populate_with_range
from .latency import workload_latency
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
index 2c8311c4ca7..a8977d9593e 100644
--- a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
@@ -28,7 +28,7 @@
#
# runner/core.py
# Core functions available to all runners
-import glob, os
+import glob, os, random
from workgen import Key, Operation, OpList, Table, Transaction, Value
# txn --
@@ -100,14 +100,73 @@ def extensions_config(exts):
result = ',extensions=[' + ','.join(extfiles.values()) + ']'
return result
-def _op_multi_table_as_list(ops_arg, tables):
+_PARETO_SHAPE = 1.5
+_BILLION = 1000000000
+
+# Choose a value from a range of ints based on the pareto parameter
+# The pareto value is interpreted as in wtperf, a number between 0 and 100.
+def _choose_pareto(nrange, pareto):
+ rval = random.randint(0, _BILLION)
+
+ # Use Pareto distribution to give 80/20 hot/cold values.
+ S1 = -1 / _PARETO_SHAPE
+ S2 = nrange * (pareto.param / 100.0) * (_PARETO_SHAPE - 1)
+ U = 1 - rval / (_BILLION * 1.0)
+ rval = (pow(U, S1) - 1) * S2
+ if rval >= nrange:
+ rval = 0
+ return int(rval)
+
+# Get the list of subordinate operations that are listed in the group.
+# Generally, when op._optype == Operation.OP_NONE, it indicates that
+# the operation contains a group of subordinates.
+#
+# XXX
+# Note that this function should be called for all iterations, rather than:
+# for o in op._group
+# because a bug in SWIG versions <= 2.0.11 would cause the above fragment
+# to produce a segmentation violation as described here:
+# https://sourceforge.net/p/swig/mailman/message/32838320/
+def _op_get_group_list(op):
+ grouplist = op._group
+ result = []
+ if grouplist != None:
+ result.extend(grouplist)
+ return result
+
+def _op_multi_table_as_list(ops_arg, tables, pareto_tables, multiplier):
result = []
if ops_arg._optype != Operation.OP_NONE:
- for table in tables:
- result.append(Operation(ops_arg._optype, table, ops_arg._key, ops_arg._value))
+ if pareto_tables <= 0:
+ for table in tables:
+ for i in range(0, multiplier):
+ result.append(Operation(ops_arg._optype, table, ops_arg._key, ops_arg._value))
+ else:
+ # Use the multiplier unless the length of the list will be large.
+ # In any case, make sure there's at least a multiplier of 3, to
+ # give a chance to hit all/most of the tables.
+ ntables = len(tables)
+ count = ntables * multiplier
+ if count > 1000:
+ count = 1000
+ mincount = ntables * 3
+ if mincount > count:
+ count = mincount
+ for i in range(0, count):
+ tnum = _choose_pareto(ntables, pareto_tables)
+ # Modify the pareto value to make it more flat
+ # as tnum gets higher. Workgen knows how to handle
+ # a portion of a pareto range.
+ table = tables[tnum]
+ key = Key(ops_arg._key)
+ key._pareto.range_low = (1.0 * i)/count
+ key._pareto.range_high = (1.0 * (i + 1))/count
+ result.append(Operation(ops_arg._optype, table, key, ops_arg._value))
else:
- for op in ops._group:
- result.extend(_op_multi_table_as_list(op, tables))
+ for op in _op_get_group_list(ops_arg):
+ for o in _op_multi_table_as_list(op, tables, pareto_tables, \
+ multiplier):
+ result.append(Operation(o))
return result
# A convenient way to build a list of operations
@@ -118,11 +177,52 @@ def op_append(op1, op2):
op1 += op2
return op1
+# Require consistent use of pareto on the set of operations,
+# that keeps our algorithm reasonably simple.
+def _check_pareto(ops_arg, cur = 0):
+ if ops_arg._key != None and ops_arg._key._keytype == Key.KEYGEN_PARETO:
+ p = ops_arg._key._pareto
+ if cur != 0 and p != cur:
+ raise Exception('mixed pareto values for ops within a ' + \
+ 'single thread not supported')
+ cur = p
+ if ops_arg._group != None:
+ for op in _op_get_group_list(ops_arg):
+ cur = _check_pareto(op, cur)
+ return cur
+
+_primes = [83, 89, 97, 101, 103, 107, 109, 113]
+
# Emulate wtperf's table_count option. Spread the given operations over
-# a set of tables.
-def op_multi_table(ops_arg, tables):
+# a set of tables. For example, given 5 operations and 4 tables, we return
+# a set of 20 operations for all possibilities.
+#
+# When we detect that pareto is used with a range partition, things get
+# trickier, because we'll want a higher proportion of operations channelled
+# to the first tables. Workgen only supports individual operations on a
+# single table, so to get good Pareto distribution, we first expand the
+# number in the total set of operations, and then choose a higher proportion
+# of the tables. We need to expand the number of operations to make sure
+# that the lower tables get some hits. While it's not perfect (without
+# creating a huge multiplier) it's a reasonable approximation for most
+# cases. Within each table's access, the pareto parameters have to be
+# adjusted to account for each table's position in the total
+# distribution. For example, the lowest priority table will have a much
+# more even distribution.
+def op_multi_table(ops_arg, tables, range_partition = False):
ops = None
- for op in _op_multi_table_as_list(ops_arg, tables):
+ multiplier = 1
+ if range_partition:
+ pareto_tables = _check_pareto(ops_arg)
+ else:
+ pareto_tables = 0
+ if pareto_tables != 0:
+ multiplier = _primes[random.randint(0, len(_primes) - 1)]
+ ops_list = _op_multi_table_as_list(ops_arg, tables, pareto_tables, \
+ multiplier)
+ if pareto_tables != 0:
+ random.shuffle(ops_list)
+ for op in ops_list:
ops = op_append(ops, op)
return ops
@@ -152,7 +252,7 @@ def op_log_like(op, log_table, ops_per_txn):
op = txn(op) # txn for each action.
else:
oplist = []
- for op2 in op._group:
+ for op2 in _op_get_group_list(op):
if op2._optype == Operation.OP_NONE:
oplist.append(op_log_like(op2, log_table))
elif ops_per_txn == 0 and _optype_is_write(op2._optype):
@@ -182,10 +282,8 @@ def op_group_transaction(ops_arg, ops_per_txn, txn_config):
raise Exception('grouping transactions with multipliers not supported')
oplist = []
- ops = None
- nops = 0
txgroup = []
- for op in ops_arg._group:
+ for op in _op_get_group_list(ops_arg):
if op.optype == Operation.OP_NONE:
oplist.append(_op_transaction_list(txgroup, txn_config))
txgroup = []
@@ -199,3 +297,39 @@ def op_group_transaction(ops_arg, ops_per_txn, txn_config):
oplist.append(_op_transaction_list(txgroup, txn_config))
ops_arg._group = OpList(oplist)
return ops_arg
+
+# Populate using range partition with the random range.
+# We will totally fill 0 or more tables (fill_tables), and 0 or
+# 1 table will be partially filled. The rest (if any) will
+# be completely unfilled, to be filled/accessed during
+# the regular part of the run.
+def op_populate_with_range(ops_arg, tables, icount, random_range, pop_threads):
+ table_count = len(tables)
+ entries_per_table = (icount + random_range) / table_count
+ if entries_per_table == 0:
+ # This can happen if table_count is huge relative to
+ # icount/random_range. Not really worth handling.
+ raise Exception('table_count > (icount + random_range), seems absurd')
+ if (icount + random_range) % table_count != 0:
+ # This situation is not handled well by our simple algorithm,
+ # we won't get exactly icount entries added during the populate.
+ raise Exception('(icount + random_range) is not evenly divisible by ' +
+ 'table_count')
+ if entries_per_table % pop_threads != 0:
+ # Another situation that is not handled exactly.
+ raise Exception('(icount + random_range) is not evenly divisible by ' +
+ 'populate_threads')
+ fill_tables = icount / entries_per_table
+ fill_per_thread = entries_per_table / pop_threads
+ ops = None
+ for i in range(0, fill_tables):
+ op = Operation(ops_arg)
+ op._table = tables[i]
+ ops = op_append(ops, op * fill_per_thread)
+ partial_fill = icount % entries_per_table
+ if partial_fill > 0:
+ fill_per_thread = partial_fill / pop_threads
+ op = Operation(ops_arg)
+ op._table = tables[fill_tables]
+ ops = op_append(ops, op * fill_per_thread)
+ return ops
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
index ce9debcca2f..31e21e6f6c9 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
@@ -240,7 +240,8 @@ Context& Context::operator=(const Context &other) {
}
ContextInternal::ContextInternal() : _tint(), _table_names(),
- _recno(NULL), _recno_alloced(0), _tint_last(0), _context_count(0) {
+ _table_runtime(NULL), _runtime_alloced(0), _tint_last(0),
+ _context_count(0) {
uint32_t count;
if ((count = workgen_atomic_add32(&context_count, 1)) != 1)
THROW("multiple Contexts not supported");
@@ -248,20 +249,20 @@ ContextInternal::ContextInternal() : _tint(), _table_names(),
}
ContextInternal::~ContextInternal() {
- if (_recno != NULL)
- delete _recno;
+ if (_table_runtime != NULL)
+ delete _table_runtime;
}
int ContextInternal::create_all() {
- if (_recno_alloced != _tint_last) {
+ if (_runtime_alloced != _tint_last) {
// The array references are 1-based, we'll waste one entry.
- uint64_t *new_recno = new uint64_t[_tint_last + 1];
- memcpy(new_recno, _recno, sizeof(uint64_t) * _recno_alloced);
- memset(&new_recno[_recno_alloced], 0,
- sizeof(uint64_t) * (_tint_last - _recno_alloced + 1));
- delete _recno;
- _recno = new_recno;
- _recno_alloced = _tint_last;
+ TableRuntime *new_table_runtime = new TableRuntime[_tint_last + 1];
+ memcpy(new_table_runtime, _table_runtime, sizeof(uint64_t) * _runtime_alloced);
+ memset(&new_table_runtime[_runtime_alloced], 0,
+ sizeof(uint64_t) * (_tint_last - _runtime_alloced + 1));
+ delete _table_runtime;
+ _table_runtime = new_table_runtime;
+ _runtime_alloced = _tint_last;
}
return (0);
}
@@ -301,7 +302,9 @@ int Monitor::run() {
workgen_version(version, sizeof(version));
Stats prev_interval;
while (!_stop) {
- for (int i = 0; i < options->sample_interval && !_stop; i++)
+ int waitsecs = (first && options->warmup > 0) ? options->warmup :
+ options->sample_interval;
+ for (int i = 0; i < waitsecs && !_stop; i++)
sleep(1);
if (_stop)
break;
@@ -387,6 +390,22 @@ int Monitor::run() {
return (0);
}
+ParetoOptions ParetoOptions::DEFAULT;
+ParetoOptions::ParetoOptions(int param_arg) : param(param_arg), range_low(0.0),
+ range_high(1.0), _options() {
+ _options.add_int("param", param,
+ "0 is disabled, otherwise a range from 1 (most aggressive) to "
+ "100 (least aggressive)");
+ _options.add_double("range_low", range_low,
+ "between 0.0 and 1.0, starting range of the pareto distribution");
+ _options.add_double("range_high", range_high,
+ "between 0.0 and 1.0, ending range of the pareto distribution");
+}
+ParetoOptions::ParetoOptions(const ParetoOptions &other) :
+ param(other.param), range_low(other.range_low),
+ range_high(other.range_high), _options(other._options) {}
+ParetoOptions::~ParetoOptions() {}
+
ThreadRunner::ThreadRunner() :
_errno(0), _exception(), _thread(NULL), _context(NULL), _icontext(NULL),
_workload(NULL), _wrunner(NULL), _rand_state(NULL),
@@ -536,9 +555,12 @@ void ThreadRunner::op_create_all(Operation *op, size_t &keysize,
op->create_all();
if (op->_optype != Operation::OP_NONE) {
- op->kv_compute_max(true);
+ op->kv_compute_max(true, false);
if (OP_HAS_VALUE(op))
- op->kv_compute_max(false);
+ op->kv_compute_max(false, op->_table.options.random_value);
+ if (op->_key._keytype == Key::KEYGEN_PARETO &&
+ op->_key._pareto.param == 0)
+ THROW("Key._pareto value must be set if KEYGEN_PARETO specified");
op->kv_size_buffer(true, keysize);
op->kv_size_buffer(false, valuesize);
@@ -575,17 +597,66 @@ void ThreadRunner::op_create_all(Operation *op, size_t &keysize,
op_create_all(&*i, keysize, valuesize);
}
-uint64_t ThreadRunner::op_get_key_recno(Operation *op, tint_t tint) {
+
+#define PARETO_SHAPE 1.5
+
+// Return a value within the interval [ 0, recno_max )
+// that is weighted toward lower numbers with pareto_param at 0 (the minimum),
+// and more evenly distributed with pareto_param at 100 (the maximum).
+//
+static uint64_t
+pareto_calculation(uint32_t randint, uint64_t recno_max,
+ ParetoOptions &pareto) {
+ double S1, S2, U;
+ uint32_t result;
+ double r;
+
+ r = (double)randint;
+ if (pareto.range_high != 1.0 || pareto.range_low != 0.0) {
+ if (pareto.range_high <= pareto.range_low ||
+ pareto.range_high > 1.0 || pareto.range_low < 0.0)
+ THROW("Pareto illegal range");
+ r = (pareto.range_low * (double)UINT32_MAX) +
+ r * (pareto.range_high - pareto.range_low);
+ }
+ S1 = (-1 / PARETO_SHAPE);
+ S2 = recno_max * (pareto.param / 100.0) * (PARETO_SHAPE - 1);
+ U = 1 - r / (double)UINT32_MAX; // interval [0, 1)
+ result = (uint64_t)((pow(U, S1) - 1) * S2);
+
+ // This Pareto calculation chooses out of range values less than 20%
+ // of the time, depending on pareto_param. For param of 0, it is
+ // never out of range, for param of 100, 19.2%. For the default
+ // pareto_param of 20, it will be out of range 2.7% of the time.
+ // Out of range values are channelled into the first key,
+ // making it "hot". Unfortunately, that means that using a higher
+ // param can get a lot lumped into the first bucket.
+ //
+ // XXX This matches the behavior of wtperf, we may consider instead
+ // retrying (modifying the random number) until we get a good value.
+ //
+ if (result > recno_max)
+ result = 0;
+ return (result);
+}
+
+uint64_t ThreadRunner::op_get_key_recno(Operation *op, uint64_t range,
+ tint_t tint) {
uint64_t recno_count;
- uint32_t rand;
+ uint32_t rval;
(void)op;
- recno_count = _icontext->_recno[tint];
+ if (range > 0)
+ recno_count = range;
+ else
+ recno_count = _icontext->_table_runtime[tint]._max_recno;
if (recno_count == 0)
// The file has no entries, returning 0 forces a WT_NOTFOUND return.
return (0);
- rand = workgen_random(_rand_state);
- return (rand % recno_count + 1); // recnos are one-based.
+ rval = workgen_random(_rand_state);
+ if (op->_key._keytype == Key::KEYGEN_PARETO)
+ rval = pareto_calculation(rval, recno_count, op->_key._pareto);
+ return (rval % recno_count + 1); // recnos are one-based.
}
int ThreadRunner::op_run(Operation *op) {
@@ -594,12 +665,14 @@ int ThreadRunner::op_run(Operation *op) {
WT_CURSOR *cursor;
WT_DECL_RET;
uint64_t recno;
+ uint64_t range;
bool measure_latency, own_cursor;
track = NULL;
cursor = NULL;
recno = 0;
own_cursor = false;
+ range = op->_table.options.range;
if (_throttle != NULL) {
if (_throttle_ops >= _throttle_limit && !_in_transaction) {
WT_ERR(_throttle->throttle(_throttle_ops,
@@ -621,19 +694,24 @@ int ThreadRunner::op_run(Operation *op) {
switch (op->_optype) {
case Operation::OP_INSERT:
track = &_stats.insert;
- recno = workgen_atomic_add64(&_icontext->_recno[tint], 1);
+ if (op->_key._keytype == Key::KEYGEN_APPEND ||
+ op->_key._keytype == Key::KEYGEN_AUTO)
+ recno = workgen_atomic_add64(
+ &_icontext->_table_runtime[tint]._max_recno, 1);
+ else
+ recno = op_get_key_recno(op, range, tint);
break;
case Operation::OP_REMOVE:
track = &_stats.remove;
- recno = op_get_key_recno(op, tint);
+ recno = op_get_key_recno(op, range, tint);
break;
case Operation::OP_SEARCH:
track = &_stats.read;
- recno = op_get_key_recno(op, tint);
+ recno = op_get_key_recno(op, range, tint);
break;
case Operation::OP_UPDATE:
track = &_stats.update;
- recno = op_get_key_recno(op, tint);
+ recno = op_get_key_recno(op, range, tint);
break;
case Operation::OP_NONE:
recno = 0;
@@ -651,6 +729,7 @@ int ThreadRunner::op_run(Operation *op) {
track->track_latency() &&
(track->ops % _workload->options.sample_rate == 0);
+ VERBOSE(*this, "OP " << op->_optype << " " << op->_table._uri.c_str() << ", recno=" << recno);
timespec start;
if (measure_latency)
workgen_epoch(&start);
@@ -663,10 +742,13 @@ int ThreadRunner::op_run(Operation *op) {
_in_transaction = true;
}
if (op->_optype != Operation::OP_NONE) {
- op->kv_gen(true, recno, _keybuf);
+ op->kv_gen(true, 0, recno, _keybuf);
cursor->set_key(cursor, _keybuf);
if (OP_HAS_VALUE(op)) {
- op->kv_gen(false, recno, _valuebuf);
+ uint32_t r = 0;
+ if (op->_table.options.random_value)
+ r = workgen_random(_rand_state);
+ op->kv_gen(false, r, recno, _valuebuf);
cursor->set_value(cursor, _valuebuf);
}
switch (op->_optype) {
@@ -969,7 +1051,7 @@ void Operation::get_static_counts(Stats &stats, int multiplier) {
i->get_static_counts(stats, multiplier * _repeatgroup);
}
-void Operation::kv_compute_max(bool iskey) {
+void Operation::kv_compute_max(bool iskey, bool has_random) {
uint64_t max;
int size;
@@ -981,6 +1063,14 @@ void Operation::kv_compute_max(bool iskey) {
THROW("Key.size too small for table '" << _table._uri << "'");
if (!iskey && size < 1)
THROW("Value.size too small for table '" << _table._uri << "'");
+ if (has_random) {
+ if (iskey)
+ THROW("Random keys not allowed");
+ size -= RANDOMIZER_SIZE;
+ if (size < 1)
+ THROW("Value.size with random values too small for table '"
+ << _table._uri << "'");
+ }
if (size > 1)
max = power64(10, (size - 1)) - 1;
@@ -1006,7 +1096,8 @@ void Operation::kv_size_buffer(bool iskey, size_t &maxsize) const {
}
}
-void Operation::kv_gen(bool iskey, uint64_t n, char *result) const {
+void Operation::kv_gen(bool iskey, uint32_t randomizer, uint64_t n,
+ char *result) const {
uint64_t max;
int size;
@@ -1015,6 +1106,12 @@ void Operation::kv_gen(bool iskey, uint64_t n, char *result) const {
if (n > max)
THROW((iskey ? "Key" : "Value") << " (" << n
<< ") too large for size (" << size << ")");
+ if (randomizer != 0) {
+ randomizer %= 1000;
+ snprintf(result, 6, ":%3.3d:", randomizer);
+ n -= RANDOMIZER_SIZE;
+ result += RANDOMIZER_SIZE;
+ }
workgen_u64_to_string_zf(n, result, size);
}
@@ -1338,14 +1435,20 @@ void Stats::track_latency(bool latency) {
truncate.track_latency(latency);
}
-TableOptions::TableOptions() : key_size(0), value_size(0), _options() {
+TableOptions::TableOptions() : key_size(0), value_size(0),
+ random_value(false), range(0), _options() {
_options.add_int("key_size", key_size,
"default size of the key, unless overridden by Key.size");
_options.add_int("value_size", value_size,
"default size of the value, unless overridden by Value.size");
+ _options.add_bool("random_value", random_value,
+ "generate random content for the value");
+ _options.add_int("range", range,
+ "if zero, keys are inserted at the end and reads/updates are in the current range, if non-zero, inserts/reads/updates are at a random key between 0 and the given range");
}
TableOptions::TableOptions(const TableOptions &other) :
key_size(other.key_size), value_size(other.value_size),
+ random_value(other.random_value), range(other.range),
_options(other._options) {}
TableOptions::~TableOptions() {}
@@ -1376,7 +1479,7 @@ TableInternal::~TableInternal() {}
WorkloadOptions::WorkloadOptions() : max_latency(0),
report_file("workload.stat"), report_interval(0), run_time(0),
- sample_file("sample.json"), sample_interval(0), sample_rate(1),
+ sample_file("sample.json"), sample_interval(0), sample_rate(1), warmup(0),
_options() {
_options.add_int("max_latency", max_latency,
"prints warning if any latency measured exceeds this number of "
@@ -1399,6 +1502,8 @@ WorkloadOptions::WorkloadOptions() : max_latency(0),
_options.add_int("sample_rate", sample_rate,
"how often the latency of operations is measured. 1 for every operation, "
"2 for every second operation, 3 for every third operation etc.");
+ _options.add_int("warmup", warmup,
+ "how long to run the workload phase before starting measurements");
}
WorkloadOptions::WorkloadOptions(const WorkloadOptions &other) :
@@ -1569,7 +1674,8 @@ int WorkloadRunner::run_all() {
workgen_epoch(&_start);
timespec end = _start + options->run_time;
- timespec next_report = _start + options->report_interval;
+ timespec next_report = _start +
+ ((options->warmup > 0) ? options->warmup : options->report_interval);
// Start all threads
if (options->sample_interval > 0) {
@@ -1653,6 +1759,8 @@ int WorkloadRunner::run_all() {
if (exception == NULL && !_trunners[i]._exception._str.empty())
exception = &_trunners[i]._exception;
}
+
+ workgen_epoch(&now);
if (options->sample_interval > 0) {
WT_TRET(pthread_join(monitor._handle, &status));
if (monitor._errno != 0)
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.h b/src/third_party/wiredtiger/bench/workgen/workgen.h
index a12e4dc4c89..2a116e1c89e 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.h
@@ -171,6 +171,8 @@ struct Context {
struct TableOptions {
int key_size;
int value_size;
+ bool random_value;
+ int range;
TableOptions();
TableOptions(const TableOptions &other);
@@ -179,6 +181,8 @@ struct TableOptions {
void describe(std::ostream &os) const {
os << "key_size " << key_size;
os << ", value_size " << value_size;
+ os << ", random_value " << random_value;
+ os << ", range " << range;
}
std::string help() const { return _options.help(); }
@@ -210,16 +214,46 @@ struct Table {
#endif
};
+struct ParetoOptions {
+ int param;
+ double range_low;
+ double range_high;
+ ParetoOptions(int param = 0);
+ ParetoOptions(const ParetoOptions &other);
+ ~ParetoOptions();
+
+ void describe(std::ostream &os) const {
+ os << "parameter " << param;
+ if (range_low != 0.0 || range_high != 1.0) {
+ os << "range [" << range_low << "-" << range_high << "]";
+ }
+ }
+
+ std::string help() const { return _options.help(); }
+ std::string help_description(const char *option_name) const {
+ return _options.help_description(option_name); }
+ std::string help_type(const char *option_name) const {
+ return _options.help_type(option_name); }
+
+ static ParetoOptions DEFAULT;
+private:
+ OptionsList _options;
+};
+
struct Key {
typedef enum {
KEYGEN_AUTO, KEYGEN_APPEND, KEYGEN_PARETO, KEYGEN_UNIFORM } KeyType;
KeyType _keytype;
int _size;
+ ParetoOptions _pareto;
/* XXX specify more about key distribution */
- Key() : _keytype(KEYGEN_AUTO), _size(0) {}
- Key(KeyType keytype, int size) : _keytype(keytype), _size(size) {}
- Key(const Key &other) : _keytype(other._keytype), _size(other._size) {}
+ Key() : _keytype(KEYGEN_AUTO), _size(0), _pareto(ParetoOptions::DEFAULT) {}
+ Key(KeyType keytype, int size=0,
+ const ParetoOptions &pareto=ParetoOptions::DEFAULT) :
+ _keytype(keytype), _size(size), _pareto(pareto) {}
+ Key(const Key &other) : _keytype(other._keytype), _size(other._size),
+ _pareto(other._pareto) {}
~Key() {}
void describe(std::ostream &os) const {
@@ -273,8 +307,9 @@ struct Operation {
Operation& operator=(const Operation &other);
void create_all();
void get_static_counts(Stats &stats, int multiplier);
- void kv_compute_max(bool);
- void kv_gen(bool, uint64_t, char *) const;
+ void kv_compute_max(bool iskey, bool has_random);
+ void kv_gen(bool iskey, uint32_t randomizer, uint64_t n,
+ char *result) const;
void kv_size_buffer(bool iskey, size_t &size) const;
void size_check() const;
#endif
@@ -365,6 +400,7 @@ struct WorkloadOptions {
int sample_interval;
int sample_rate;
std::string sample_file;
+ int warmup;
WorkloadOptions();
WorkloadOptions(const WorkloadOptions &other);
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_int.h b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
index a8d008a3bc5..c7a5a7121e9 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen_int.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
@@ -36,6 +36,8 @@ extern "C" {
}
#endif
+#define RANDOMIZER_SIZE 5 /* ":000:" prefix */
+
namespace workgen {
// A 'tint' or ('table integer') is a unique small value integer
@@ -126,7 +128,7 @@ struct ThreadRunner {
int run();
void op_create_all(Operation *, size_t &keysize, size_t &valuesize);
- uint64_t op_get_key_recno(Operation *, tint_t tint);
+ uint64_t op_get_key_recno(Operation *, uint64_t range, tint_t tint);
void op_get_static_counts(Operation *, Stats &, int);
int op_run(Operation *);
@@ -153,11 +155,18 @@ struct Monitor {
int run();
};
+// Runtime bookkeeping kept for one table: the highest record number
+// allocated so far and whether the table's key space has holes.
+struct TableRuntime {
+ uint64_t _max_recno; // highest recno allocated
+ bool _disjoint; // does key space have holes?
+
+ // A new entry starts with no recnos allocated and _disjoint false
+ // (the integer 0 converts to false).
+ TableRuntime() : _max_recno(0), _disjoint(0) {}
+};
+
struct ContextInternal {
std::map<std::string, tint_t> _tint; // maps uri -> tint_t
std::map<tint_t, std::string> _table_names; // reverse mapping
- uint64_t *_recno; // # entries per tint_t
- uint32_t _recno_alloced; // length of allocated _recno
+ TableRuntime *_table_runtime; // # entries per tint_t
+ uint32_t _runtime_alloced; // length of _table_runtime
tint_t _tint_last; // last tint allocated
// unique id per context, to work with multiple contexts, starts at 1.
uint32_t _context_count;
diff --git a/src/third_party/wiredtiger/bench/workgen/wtperf.py b/src/third_party/wiredtiger/bench/workgen/wtperf.py
index 3a196fe7b57..2837be6d064 100644
--- a/src/third_party/wiredtiger/bench/workgen/wtperf.py
+++ b/src/third_party/wiredtiger/bench/workgen/wtperf.py
@@ -34,7 +34,7 @@
# See also the usage() function.
#
from __future__ import print_function
-import os, sys, tempfile
+import os, shutil, sys, tempfile
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
@@ -52,13 +52,15 @@ class Options(object):
pass
class Translator:
- def __init__(self, filename, prefix, verbose):
+ def __init__(self, filename, prefix, verbose, homedir):
self.filename = filename
self.prefix = prefix
self.verbose = verbose
+ self.homedir = homedir
self.linenum = 0
- self.opts = {}
- self.used_opts = {}
+ self.opts_map = {}
+ self.opts_used = {}
+ self.options = lambda: None # options behaves as an attribute dict
self.has_error = False
def error_file_line(self, fname, linenum, msg):
@@ -70,15 +72,17 @@ class Translator:
self.error_file_line(self.filename, self.linenum, msg)
# Report an error and unwind the stack
- def fatal_error(self, msg, errtype):
+ def fatal_error(self, msg, errtype = 'configuration error'):
self.error(msg)
raise TranslateException(errtype)
- supported_opt_list = [ 'compression', 'conn_config', 'icount',
- 'key_sz', 'log_like_table',
+ supported_opt_list = [ 'close_conn', 'compression', 'compact',
+ 'conn_config', 'create', 'icount',
+ 'key_sz', 'log_like_table', 'pareto',
'populate_ops_per_txn', 'populate_threads',
- 'reopen_connection',
- 'table_config', 'table_count',
+ 'random_range', 'random_value', 'range_partition',
+ 'readonly', 'reopen_connection', 'run_ops',
+ 'sess_config', 'table_config', 'table_count',
'threads', 'transaction_config', 'value_sz' ]
def set_opt(self, optname, val):
@@ -98,23 +102,32 @@ class Translator:
v = int(val) # it might be an integer
except ValueError:
v = val # it's a string after all
- self.opts[optname] = OptionValue(v, self.filename, self.linenum)
+ self.opts_map[optname] = OptionValue(v, self.filename, self.linenum)
- def get_opt(self, optname, dfault):
- if optname in self.opts:
- ret = self.opts[optname]
+ def _get_opt(self, optname, dfault):
+ if optname in self.opts_map:
+ ret = self.opts_map[optname]
self.filename = ret.filename
self.linenum = ret.linenum
- self.used_opts[optname] = 1
+ self.opts_used[optname] = 1
return ret.value
else:
return dfault
+ # Look up optname via _get_opt (which returns dfault when the option
+ # was not set), record the result as an attribute of self.options,
+ # and return it. No conversion is applied to the value here.
+ def get_string_opt(self, optname, dfault):
+ v = self._get_opt(optname, dfault)
+ setattr(self.options, optname, v)
+ return v
+
def get_int_opt(self, optname, dfault):
- return self.get_opt(optname, dfault) + 0
+ v = self._get_opt(optname, dfault) + 0
+ setattr(self.options, optname, v)
+ return v
def get_boolean_opt(self, optname, dfault):
- return not not self.get_opt(optname, dfault)
+ v = not not self._get_opt(optname, dfault)
+ setattr(self.options, optname, v)
+ return v
# Split a string 'left_side=right_side' into two parts
def split_assign(self, s):
@@ -159,17 +172,33 @@ class Translator:
def assign_str(self, left, right):
return left + '=' + str(right) + '\n'
- def add_operation_str(self, count, opname, multi):
+ def add_operation_str(self, count, opname, multi, pareto):
result = ''
tablename = 'tables[0]' if multi else 'table'
if count > 1:
result += str(count) + ' * '
if count > 0:
- result += 'Operation(Operation.' + opname + ', ' + \
- tablename + ') + \\\n'
+ result += 'Operation(Operation.' + opname + ', ' + tablename
+ if pareto > 0:
+ result += ', Key(Key.KEYGEN_PARETO, 0, ParetoOptions(' + \
+ str(pareto) + '))'
+ elif opname == 'OP_INSERT' and self.options.random_range != 0:
+ result += ', Key(Key.KEYGEN_UNIFORM)'
+ result += ') + \\\n'
result += ' '
return result
+ # Copy the input file (self.filename) into self.homedir as
+ # CONFIG.wtperf, or as CONFIG.wtperf.<n> for the first n >= 1 whose
+ # name does not already exist, so earlier copies are not overwritten.
+ def copy_config(self):
+ # Note: If we add the capability of setting options on the command
+ # line, we won't be able to do a simple copy.
+ config_save = os.path.join(self.homedir, 'CONFIG.wtperf')
+ suffix = 0
+ # Keep bumping the numeric suffix until an unused name is found.
+ while os.path.exists(config_save):
+ suffix += 1
+ config_save = os.path.join(self.homedir, \
+ 'CONFIG.wtperf.' + str(suffix))
+ shutil.copyfile(self.filename, config_save)
+
# Wtperf's throttle is based on the number of regular operations,
# not including log_like operations. Workgen counts all operations,
# it doesn't treat log operations any differently. Adjust the throttle
@@ -191,11 +220,13 @@ class Translator:
return (new_throttle, comment)
def parse_threads(self, threads_config):
+ opts = self.options
tdecls = ''
tlist = self.split_config_parens(threads_config)
table_count = self.get_int_opt('table_count', 1)
log_like_table = self.get_boolean_opt('log_like_table', False)
- txn_config = self.get_opt('transaction_config', '')
+ txn_config = self.get_string_opt('transaction_config', '')
+ run_ops = self.get_int_opt('run_ops', -1)
if log_like_table:
tdecls += 'log_name = "table:log"\n'
tdecls += 's.create(log_name, "key_format=S,value_format=S," +' + \
@@ -219,6 +250,7 @@ class Translator:
topts.throttle = 0
topts.update = 0
topts.updates = 0
+ topts.random_range = 0
for o in self.split_config_parens(t):
(k, v) = self.split_assign(o)
@@ -239,19 +271,41 @@ class Translator:
if topts.inserts + topts.reads + topts.updates == 0:
self.fatal_error('need read/insert/update/...',
'thread config error')
+
tdecls += 'ops = '
- tdecls += self.add_operation_str(topts.inserts, 'OP_INSERT', multi)
- tdecls += self.add_operation_str(topts.reads, 'OP_SEARCH', multi)
- tdecls += self.add_operation_str(topts.updates, 'OP_UPDATE', multi)
+ tdecls += self.add_operation_str(topts.inserts, 'OP_INSERT',
+ multi, opts.pareto)
+ tdecls += self.add_operation_str(topts.reads, 'OP_SEARCH',
+ multi, opts.pareto)
+ tdecls += self.add_operation_str(topts.updates, 'OP_UPDATE',
+ multi, opts.pareto)
tdecls = tdecls.rstrip(' \n\\+') + '\n'
+ range_partition = opts.range_partition
+
+ # Pareto with multiple tables is handled in op_multi_table.
if multi:
- tdecls += 'ops = op_multi_table(ops, tables)\n'
+ tdecls += 'ops = op_multi_table(ops, tables, ' + \
+ str(range_partition) + ')\n'
if topts.ops_per_txn > 0:
tdecls += 'ops = op_group_transaction(ops, ' + \
str(topts.ops_per_txn) + ', "' + txn_config + '")\n'
if log_like_table:
tdecls += 'ops = op_log_like(ops, log_table, ' + \
str(topts.ops_per_txn) + ')\n'
+ if run_ops != -1:
+ if len(tlist) > 1:
+ self.fatal_error('run_ops currently supported with a '
+ 'single type of thread')
+ tdecls += '\n'
+ if multi:
+ tdecls += \
+ '# Note that op_multi_table has already multiplied\n' +\
+ '# the number of operations by the number of tables.\n'
+ tdecls += 'ops = ops * (' + \
+ str(run_ops) + ' / (' + str(topts.count) + \
+ ' * table_count))' + \
+ ' # run_ops = ' + str(run_ops) + \
+ ', thread.count = ' + str(topts.count) + '\n'
tdecls += thread_name + ' = Thread(ops)\n'
if topts.throttle > 0:
(throttle, comment) = self.calc_throttle(topts, log_like_table)
@@ -273,6 +327,134 @@ class Translator:
# An error has already been reported
return None
+ # Call fatal_error unless (icount + random_range) is an exact
+ # multiple of divisor. divisor_name is used only to build the
+ # message; when random_range is 0 the message names just 'icount'.
+ def check_divisibility(self, icount, random_range, divisor_name, divisor):
+ if (icount + random_range) % divisor != 0:
+ if random_range == 0:
+ dividend = 'icount'
+ else:
+ dividend = '(icount + random_range)'
+ self.fatal_error(dividend + ' is not evenly divisible by ' +
+ divisor_name + ', this is not handled ' +
+ 'precisely by wtperf.py')
+
+ # Return the generated source text that defines the table
+ # configuration strings, creates table_count tables through the
+ # session 's', and appends a Table object for each one to 'tables'.
+ # All settings are read from self.options.
+ def translate_table_create(self):
+ opts = self.options
+ s = ''
+ s += 'wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\\\n'
+ s += ' "exclusive=true,allocation_size=4kb," +\\\n'
+ s += ' "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"\n'
+ if opts.compression != '':
+ s += 'compress_table_config = "block_compressor=' + opts.compression + ',"\n'
+ else:
+ s += 'compress_table_config = ""\n'
+ s += 'table_config = "' + opts.table_config + '"\n'
+ s += 'tables = []\n'
+ s += 'table_count = ' + str(opts.table_count) + '\n'
+ # With a single table the per-table statements are emitted as
+ # straight-line code; otherwise they are emitted as the body of a
+ # generated for loop, and 'indent' is prefixed to each of them.
+ if opts.table_count == 1:
+ s += 'tname = "table:test.wt"\n'
+ indent = ''
+ else:
+ s += 'for i in range(0, table_count):\n'
+ s += ' tname = "table:test" + str(i) + ".wt"\n'
+ indent = ' '
+
+ s += indent + 'table = Table(tname)\n'
+ s += indent + 's.create(tname, wtperf_table_config +\\\n'
+ s += indent + ' compress_table_config + table_config)\n'
+ s += indent + 'table.options.key_size = ' + str(opts.key_sz) + '\n'
+ s += indent + 'table.options.value_size = ' + str(opts.value_sz) + '\n'
+ if opts.random_value:
+ s += indent + 'table.options.random_value = True\n'
+ if opts.random_range != 0:
+ # In wtperf, the icount plus random_range is the key range
+ table_range = (opts.random_range + opts.icount) / opts.table_count
+ s += indent + 'table.options.range = ' + str(table_range) + '\n'
+ s += indent + 'tables.append(table)\n'
+ return s
+
+ # Return the generated source text for the populate phase: the
+ # populate operations, an optional transaction grouping, the
+ # populate workload run and, when opts.compact is set, an async
+ # compact of every table. Returns '' when icount is 0.
+ def translate_populate(self):
+ opts = self.options
+ s = '\n'
+ # NOTE(review): translate_inner reads populate_threads with a
+ # default of 1, so this error may also be reported when the option
+ # was never set in the input file -- confirm that is intended.
+ if opts.icount == 0:
+ if opts.populate_threads != 0:
+ self.error("populate_threads > 0, icount == 0")
+ return ''
+ if opts.populate_threads == 0:
+ self.fatal_error('icount != 0 and populate_threads == 0: ' +\
+ 'cannot populate entries with no threads')
+ s += 'populate_threads = ' + str(opts.populate_threads) + '\n'
+ s += 'icount = ' + str(opts.icount) + '\n'
+ need_ops_per_thread = True
+
+ # Since we're separating the populating by table, and also
+ # into multiple threads, we currently require that
+ # (icount + random_range) is evenly divisible by table count
+ # and by number of populating threads. It's possible to handle
+ # the cases when this is not true, but it hardly seems worth
+ # the extra complexity. Also, these could be made into warnings,
+ # and actually create fewer entries than icount, but that could be
+ # confusing.
+ self.check_divisibility(opts.icount, opts.random_range,
+ 'table_count', opts.table_count)
+ self.check_divisibility(opts.icount, opts.random_range,
+ '(populate_threads * table_count)',
+ opts.populate_threads * opts.table_count)
+
+ if opts.table_count == 1:
+ s += 'pop_ops = Operation(Operation.OP_INSERT, table)\n'
+ elif opts.range_partition and opts.random_range > 0:
+ # Populating using a range partition is complex enough
+ # to handle in its own function. It does all the operations
+ # for the thread, so we don't need a multiplier at the end.
+ need_ops_per_thread = False
+
+ s += 'random_range = ' + str(opts.random_range) + '\n'
+ s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n'
+ s += 'pop_ops = op_populate_with_range(pop_ops, tables, ' + \
+ 'icount, random_range, populate_threads)\n'
+ else:
+ s += '# There are multiple tables to be filled during populate,\n'
+ s += '# the icount is split between them all.\n'
+ s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n'
+ s += 'pop_ops = op_multi_table(pop_ops, tables)\n'
+
+ # op_mult is the text appended to 'pop_ops' in the generated Thread()
+ # call; it is empty when the operations already cover the whole thread.
+ if need_ops_per_thread:
+ s += 'nops_per_thread = icount / (populate_threads * table_count)\n'
+ op_mult = ' * nops_per_thread'
+ else:
+ op_mult = ''
+
+ pop_per_txn = opts.populate_ops_per_txn
+ if pop_per_txn > 0:
+ s += 'pop_ops = op_group_transaction(pop_ops, ' + \
+ str(pop_per_txn) + ', "' + opts.transaction_config + '")\n'
+ s += 'pop_thread = Thread(pop_ops' + op_mult + ')\n'
+ s += 'pop_workload = Workload(context, populate_threads * pop_thread)\n'
+ if self.verbose > 0:
+ s += 'print("populate:")\n'
+ s += 'pop_workload.run(conn)\n'
+
+ # If configured, compact to allow LSM merging to complete. We
+ # set an unlimited timeout because if we close the connection
+ # then any in-progress compact/merge is aborted.
+ if opts.compact:
+ # The compact code generated below goes through the connection's
+ # async calls, so a zero async_threads is treated as fatal here.
+ if opts.async_threads == 0:
+ self.fatal_error('unexpected value for async_threads')
+ s += '\n'
+ if self.verbose > 0:
+ s += 'print("compact after populate:")\n'
+ s += 'import time\n'
+ s += 'start_time = time.time()\n'
+ s += 'async_callback = WtperfAsyncCallback()\n'
+ s += 'for i in range(0, table_count):\n'
+ s += ' op = conn.async_new_op(tables[i]._uri, "timeout=0", async_callback)\n'
+ s += ' op.compact()\n'
+ s += 'conn.async_flush()\n'
+ s += 'print("compact completed in {} seconds".format(' + \
+ 'time.time() - start_time))\n'
+
+ return s
+
def translate_inner(self):
workloadopts = ''
with open(self.filename) as fin:
@@ -286,19 +468,40 @@ class Translator:
continue
(key, val) = self.split_assign(line)
if key in [ 'max_latency', 'report_file', 'report_interval',
- 'run_time', 'sample_interval', 'sample_rate' ]:
+ 'run_time', 'sample_interval', 'sample_rate',
+ 'warmup' ]:
workloadopts += 'workload.options.' + key + '=' + val + '\n'
else:
self.set_opt(key, val)
- table_count = self.get_int_opt('table_count', 1)
- conn_config = self.get_opt('conn_config', '')
- table_config = self.get_opt('table_config', '')
- key_sz = self.get_int_opt('key_sz', 20)
- value_sz = self.get_int_opt('value_sz', 100)
- reopen = self.get_boolean_opt('reopen_connection', False)
- compression = self.get_opt('compression', '')
- txn_config = self.get_opt('transaction_config', '')
+ conn_config = self.get_string_opt('conn_config', '')
+ sess_config = self.get_string_opt('sess_config', '')
+ create = self.get_boolean_opt('create', True)
+ reopen_connection = self.get_boolean_opt('reopen_connection', False)
+ readonly = self.get_boolean_opt('readonly', False)
+ close_conn = self.get_boolean_opt('close_conn', True)
+ compression = self.get_string_opt('compression', '')
+ self.get_int_opt('table_count', 1)
+ self.get_string_opt('table_config', '')
+ self.get_int_opt('key_sz', 20)
+ self.get_int_opt('value_sz', 100)
+ self.get_int_opt('icount', 0)
+ self.get_int_opt('populate_threads', 1)
+ self.get_int_opt('populate_ops_per_txn', 0)
+ self.get_boolean_opt('range_partition', False)
+ self.get_int_opt('random_range', 0)
+ self.get_boolean_opt('random_value', False)
+ self.get_string_opt('transaction_config', '')
+ self.get_boolean_opt('compact', False)
+ self.get_int_opt('async_threads', 0)
+ self.get_int_opt('pareto', 0)
+ opts = self.options
+ if opts.range_partition and opts.random_range == 0:
+ self.fatal_error('range_partition requires random_range to be set')
+ if opts.random_range > 0 and not opts.range_partition and \
+ opts.table_count != 1:
+ self.fatal_error('random_range and multiple tables without ' + \
+ 'range_partition is not supported')
s = '#/usr/bin/env python\n'
s += '# generated from ' + self.filename + '\n'
@@ -307,93 +510,75 @@ class Translator:
s += 'from wiredtiger import *\n'
s += 'from workgen import *\n'
s += '\n'
+ async_config = ''
+ if opts.compact and opts.async_threads == 0:
+ opts.async_threads = 2;
+ if opts.async_threads > 0:
+ # Assume the default of 1024 for the max ops, although we
+ # could bump that up to 4096 if needed.
+ async_config = ',async=(enabled=true,threads=' + \
+ str(opts.async_threads) + ')'
+ s += '# this can be further customized\n'
+ s += 'class WtperfAsyncCallback(AsyncCallback):\n'
+ s += ' def __init__(self):\n'
+ s += ' pass\n'
+ s += ' def notify_error(self, key, value, optype, desc):\n'
+ s += ' print("ERROR: async notify(" + str(key) + "," + \\\n'
+ s += ' str(value) + "," + str(optype) + "): " + desc)\n'
+ s += ' def notify(self, op, op_ret, flags):\n'
+ s += ' if op_ret != 0:\n'
+ s += ' self.notify_error(op._key, op._value,\\\n'
+ s += ' op._optype, wiredtiger_strerror(op_ret))\n'
+ s += ' return op_ret\n'
+ s += '\n'
s += 'context = Context()\n'
- s += 'conn_config = "' + conn_config + '"\n'
+ extra_config = ''
+ s += 'conn_config = ""\n'
+
+ if async_config != '':
+ s += 'conn_config += ",' + async_config + '" # async config\n'
+ if conn_config != '':
+ s += 'conn_config += ",' + conn_config + '" # explicitly added\n'
if compression != '':
s += 'conn_config += extensions_config(["compressors/' + \
- compression + '"])\n'
+ compression + '"])\n'
compression = 'block_compressor=' + compression + ','
- s += 'conn = wiredtiger_open("WT_TEST", "create," + conn_config)\n'
- s += 's = conn.open_session()\n'
+ s += 'conn = wiredtiger_open("' + self.homedir + \
+ '", "create," + conn_config)\n'
+ s += 's = conn.open_session("' + sess_config + '")\n'
s += '\n'
- s += 'wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\\\n'
- s += ' "exclusive=true,allocation_size=4kb," +\\\n'
- s += ' "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"\n'
- s += 'compress_table_config = "' + compression + '"\n'
- s += 'table_config = "' + table_config + '"\n'
- if table_count == 1:
- s += 'tname = "file:test.wt"\n'
- s += 's.create(tname, wtperf_table_config +\\\n'
- s += ' compress_table_config + table_config)\n'
- s += 'table = Table(tname)\n'
- s += 'table.options.key_size = ' + str(key_sz) + '\n'
- s += 'table.options.value_size = ' + str(value_sz) + '\n'
- else:
- s += 'table_count = ' + str(table_count) + '\n'
- s += 'tables = []\n'
- s += 'for i in range(0, table_count):\n'
- s += ' tname = "file:test" + str(i) + ".wt"\n'
- s += ' s.create(tname, ' + \
- 'wtperf_table_config + ' + \
- 'compress_table_config + table_config)\n'
- s += ' t = Table(tname)\n'
- s += ' t.options.key_size = ' + str(key_sz) + '\n'
- s += ' t.options.value_size = ' + str(value_sz) + '\n'
- s += ' tables.append(t)\n'
- s += '\n'
-
- icount = self.get_int_opt('icount', 0)
- pop_thread = self.get_int_opt('populate_threads', 1)
- pop_per_txn = self.get_int_opt('populate_ops_per_txn', 0)
- if icount != 0:
- if pop_thread == 0:
- self.fatal_error('icount != 0 and populate_threads == 0: ' +\
- 'cannot populate entries with no threads')
- elif pop_thread == 1:
- mult = ''
- else:
- mult = str(pop_thread) + ' * '
-
- # if there are multiple tables to be filled during populate,
- # the icount is split between them all.
- nops_per_thread = icount / (pop_thread * table_count)
- if table_count == 1:
- s += 'pop_ops = Operation(Operation.OP_INSERT, table)\n'
- else:
- s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n'
- s += 'pop_ops = op_multi_table(pop_ops, tables)\n'
- if pop_per_txn > 0:
- s += 'pop_ops = op_group_transaction(pop_ops, ' + \
- str(pop_per_txn) + ', "' + txn_config + '")\n'
- s += 'pop_thread = Thread(pop_ops * ' + str(nops_per_thread) + ')\n'
- s += 'pop_workload = Workload(context, ' + mult + 'pop_thread)\n'
- if self.verbose > 0:
- s += 'print("populate:")\n'
- s += 'pop_workload.run(conn)\n'
- else:
- if self.get_int_opt('populate_threads', 0) != 0:
- self.error("populate_threads > 0, icount == 0")
+ s += self.translate_table_create()
+ if create:
+ s += self.translate_populate()
- thread_config = self.get_opt('threads', '')
+ thread_config = self.get_string_opt('threads', '')
if thread_config != '':
(t_create, t_var) = self.parse_threads(thread_config)
s += '\n' + t_create
- if reopen:
+ if reopen_connection:
s += '\n# reopen the connection\n'
s += 'conn.close()\n'
+ if readonly:
+ # Append the readonly setting to the generated script. Without
+ # the 's +=' this line is a bare string expression that is
+ # evaluated and discarded, so the option never took effect.
+ s += 'conn_config += ",readonly=true"\n'
s += 'conn = wiredtiger_open(' + \
- '"WT_TEST", "create," + conn_config)\n'
+ '"' + self.homedir + '", "create," + conn_config)\n'
s += '\n'
s += 'workload = Workload(context, ' + t_var + ')\n'
s += workloadopts
if self.verbose > 0:
s += 'print("workload:")\n'
- s += 'workload.run(conn)\n'
-
- for o in self.used_opts:
- del self.opts[o]
- if len(self.opts) != 0:
- self.error('internal error, options not handled: ' + str(self.opts))
+ s += 'workload.run(conn)\n\n'
+ s += 'latency_filename = "' + self.homedir + '/latency.out"\n'
+ s += 'latency.workload_latency(workload, latency_filename)\n'
+
+ if close_conn:
+ s += 'conn.close()\n'
+
+ for o in self.opts_used:
+ del self.opts_map[o]
+ if len(self.opts_map) != 0:
+ self.error('internal error, options not handled: ' +
+ str(self.opts_map))
return s
def usage():
@@ -416,13 +601,17 @@ prefix = (
'sys.path.append("' + runner_dir + '")\n\n')
exit_status = 0
+homedir = 'WT_TEST'
for arg in sys.argv[1:]:
- if arg == '--python':
+ if arg == '--pydebug':
+ import pdb
+ pdb.set_trace()
+ elif arg == '--python':
py_out = True
elif arg == '--verbose' or arg == '-v':
verbose += 1
elif arg.endswith('.wtperf'):
- translator = Translator(arg, prefix, verbose)
+ translator = Translator(arg, prefix, verbose, homedir)
pysrc = translator.translate()
if translator.has_error:
exit_status = 1
@@ -432,8 +621,20 @@ for arg in sys.argv[1:]:
(outfd, tmpfile) = tempfile.mkstemp(suffix='.py')
os.write(outfd, pysrc)
os.close(outfd)
- execfile(tmpfile)
+ # We make a copy of the configuration file in the home
+ # directory after the run, because the wiredtiger_open
+ # in the generated code will clean out the directory first.
+ raised = None
+ try:
+ execfile(tmpfile)
+ except Exception, exception:
+ raised = exception
+ if not os.path.isdir(homedir):
+ os.makedirs(homedir)
+ translator.copy_config()
os.remove(tmpfile)
+ if raised != None:
+ raise raised
else:
usage()
sys.exit(1)
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf
index de5299bbac1..8b56a86e022 100644
--- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf
@@ -11,7 +11,7 @@ compression="snappy"
# close_conn as false allows this test to close/finish faster, but if running
# as the set, the next test will need to run recovery.
close_conn=false
-sess_config="isolation=snapshot
+sess_config="isolation=snapshot"
table_count=2
key_sz=40
value_sz=120
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index dbd3dcbb233..32faec8709d 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -131,6 +131,20 @@ file_runtime_config = [
do not ever evict the object's pages from cache. Not compatible with
LSM tables; see @ref tuning_cache_resident for more information''',
type='boolean'),
+ Config('assert', '', r'''
+ enable enhanced checking. ''',
+ type='category', subconfig= [
+ Config('commit_timestamp', 'none', r'''
+ verify that timestamps should 'always' or 'never' be used
+ on modifications with this table. Verification is 'none'
+ if mixed update use is allowed.''',
+ choices=['always','never','none']),
+ Config('read_timestamp', 'none', r'''
+ verify that timestamps should 'always' or 'never' be used
+ on reads with this table. Verification is 'none'
+ if mixed read use is allowed.''',
+ choices=['always','never','none'])
+ ], undoc=True),
Config('log', '', r'''
the transaction log configuration for this object. Only valid if
log is enabled in ::wiredtiger_open''',
diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py
index 8c0448b27c1..df897bcb91e 100644
--- a/src/third_party/wiredtiger/dist/flags.py
+++ b/src/third_party/wiredtiger/dist/flags.py
@@ -23,6 +23,7 @@ flags = {
],
'page_read' : [
'READ_CACHE',
+ 'READ_LOOKASIDE',
'READ_NOTFOUND_OK',
'READ_NO_EMPTY',
'READ_NO_EVICT',
@@ -35,14 +36,15 @@ flags = {
'READ_WONT_NEED',
],
'rec_write' : [
- 'CHECKPOINTING',
- 'EVICTING',
- 'EVICT_IN_MEMORY',
- 'EVICT_INMEM_SPLIT',
- 'EVICT_LOOKASIDE',
- 'EVICT_SCRUB',
- 'EVICT_UPDATE_RESTORE',
- 'VISIBILITY_ERR',
+ 'REC_CHECKPOINT',
+ 'REC_EVICT',
+ 'REC_INMEM_SPLIT',
+ 'REC_IN_MEMORY',
+ 'REC_LOOKASIDE',
+ 'REC_SCRUB',
+ 'REC_UPDATE_RESTORE',
+ 'REC_VISIBILITY_ERR',
+ 'REC_VISIBLE_ALL',
],
'timing_stress_for_test' : [
'TIMING_STRESS_CHECKPOINT_SLOW',
@@ -102,6 +104,7 @@ flags = {
'CONN_CKPT_SYNC',
'CONN_CLOSING',
'CONN_CLOSING_NO_MORE_OPENS',
+ 'CONN_EVICTION_NO_LOOKASIDE',
'CONN_EVICTION_RUN',
'CONN_IN_MEMORY',
'CONN_LAS_OPEN',
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list
index dcaf975434f..b2f6cbec43e 100644
--- a/src/third_party/wiredtiger/dist/s_define.list
+++ b/src/third_party/wiredtiger/dist/s_define.list
@@ -58,6 +58,7 @@ WT_STAT_INCRV_BASE
WT_STAT_WRITE
WT_TIMEDIFF_US
WT_TRET_ERROR_OK
+WT_TXN_TIMESTAMP_FLAG_CHECK
WT_UPDATE_SIZE
WT_WITH_LOCK_NOWAIT
WT_WITH_LOCK_WAIT
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index 06e7dccd943..24610b9ab14 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -254,6 +254,7 @@ connection_stats = [
CacheStat('cache_hazard_walks', 'hazard pointer check entries walked'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
+ CacheStat('cache_lookaside_entries', 'lookaside table entries', 'no_clear,no_scale'),
CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c
index a0c6f87ceda..dcd9dd406df 100644
--- a/src/third_party/wiredtiger/examples/c/ex_all.c
+++ b/src/third_party/wiredtiger/examples/c/ex_all.c
@@ -209,9 +209,9 @@ cursor_ops(WT_SESSION *session)
value.size = strlen("another value");
cursor->set_value(cursor, &value);
/*! [Set the cursor's raw value] */
- }
error_check(cursor->insert(cursor));
+ }
/*! [Return the next record] */
error_check(cursor->next(cursor));
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 3ed326b1854..6c4f2ee7138 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "6f561957cb5606f504f9fe5a124c80386b210b1a",
+ "commit": "0cd3d5bbd8a5c8779f1129c6754b4463403e788f",
"github": "wiredtiger/wiredtiger.git",
"vendor": "wiredtiger",
"branch": "mongodb-3.6"
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
index a39d50e68c1..6f4050b3eb6 100644
--- a/src/third_party/wiredtiger/src/bloom/bloom.c
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -274,6 +274,7 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
WT_ASSERT(bloom->session, bloom->bitstring == NULL);
/* Create a cursor on the first time through. */
+ c = NULL;
WT_ERR(__bloom_open_cursor(bloom, NULL));
c = bloom->c;
@@ -301,6 +302,8 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
err: /* Don't return WT_NOTFOUND from a failed search. */
if (ret == WT_NOTFOUND)
ret = WT_ERROR;
+ if (c != NULL)
+ (void)c->reset(c);
__wt_err(bloom->session, ret, "Failed lookup in bloom filter");
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 51882a7e466..ee800ca80ee 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -334,7 +334,7 @@ __cursor_col_search(
WT_DECL_RET;
WT_WITH_PAGE_INDEX(session,
- ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt));
+ ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt, false));
return (ret);
}
@@ -348,8 +348,8 @@ __cursor_row_search(
{
WT_DECL_RET;
- WT_WITH_PAGE_INDEX(session,
- ret = __wt_row_search(session, &cbt->iface.key, leaf, cbt, insert));
+ WT_WITH_PAGE_INDEX(session, ret = __wt_row_search(
+ session, &cbt->iface.key, leaf, cbt, insert, false));
return (ret);
}
@@ -445,6 +445,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_STAT_CONN_INCR(session, cursor_search);
WT_STAT_DATA_INCR(session, cursor_search);
+ WT_RET(__wt_txn_search_check(session));
__cursor_state_save(cursor, &state);
/*
@@ -534,6 +535,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
WT_STAT_CONN_INCR(session, cursor_search_near);
WT_STAT_DATA_INCR(session, cursor_search_near);
+ WT_RET(__wt_txn_search_check(session));
__cursor_state_save(cursor, &state);
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 778adcc3dfd..f0388bd1f07 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -1124,6 +1124,9 @@ __debug_ref(WT_DBG *ds, WT_REF *ref)
case WT_REF_LOCKED:
state = "locked";
break;
+ case WT_REF_LOOKASIDE:
+ state = "lookaside";
+ break;
case WT_REF_MEM:
state = "memory";
break;
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index 093192dbaa0..20e592d12bc 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -85,12 +85,6 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* Atomically switch the page's state to lock it. If the page is not
* on-disk, other threads may be using it, no fast delete.
- *
- * Possible optimization: if the page is already deleted and the delete
- * is visible to us (the delete has been committed), we could skip the
- * page instead of instantiating it and figuring out there are no rows
- * in the page. While that's a huge amount of work to no purpose, it's
- * unclear optimizing for overlapping range deletes is worth the effort.
*/
if (ref->state != WT_REF_DISK ||
!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
@@ -164,6 +158,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
for (sleep_count = yield_count = 0;;) {
switch (ref->state) {
case WT_REF_DISK:
+ case WT_REF_LOOKASIDE:
case WT_REF_READING:
WT_ASSERT(session, 0); /* Impossible, assert */
break;
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index 806a9770057..1aae991a407 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -316,8 +316,14 @@ __wt_free_ref(
*/
__wt_ref_addr_free(session, ref);
- /* Free any page-deleted information. */
- if (ref->page_del != NULL) {
+ /*
+ * Free any lookaside or page-deleted information. We only expect a
+ * lookaside structure for lookaside references, but can see
+ * page-deleted information in other cases (such as WT_REF_MEM).
+ */
+ if (ref->state == WT_REF_LOOKASIDE)
+ __wt_free(session, ref->page_las);
+ else if (ref->page_del != NULL) {
__wt_free(session, ref->page_del->update_list);
__wt_free(session, ref->page_del);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index fd52c53861a..4ab88cea01e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -398,6 +398,29 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
else
btree->checksum = CKSUM_UNCOMPRESSED;
+ /* Debugging information */
+ WT_RET(__wt_config_gets(session,
+ cfg, "assert.commit_timestamp", &cval));
+ if (WT_STRING_MATCH("always", cval.str, cval.len)) {
+ FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS);
+ FLD_CLR(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER);
+ } else if (WT_STRING_MATCH("never", cval.str, cval.len)) {
+ FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER);
+ FLD_CLR(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS);
+ } else
+ FLD_CLR(btree->assert_flags,
+ WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_COMMIT_TS_NEVER);
+ WT_RET(__wt_config_gets(session, cfg, "assert.read_timestamp", &cval));
+ if (WT_STRING_MATCH("always", cval.str, cval.len)) {
+ FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS);
+ FLD_CLR(btree->assert_flags, WT_ASSERT_READ_TS_NEVER);
+ } else if (WT_STRING_MATCH("never", cval.str, cval.len)) {
+ FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER);
+ FLD_CLR(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS);
+ } else
+ FLD_CLR(btree->assert_flags,
+ WT_ASSERT_READ_TS_ALWAYS | WT_ASSERT_READ_TS_NEVER);
+
/* Huffman encoding */
WT_RET(__wt_btree_huffman_open(session));
@@ -549,7 +572,7 @@ __wt_btree_tree_open(
* the allocated copy of the disk image on return, the in-memory object
* steals it.
*/
- WT_ERR(__wt_page_inmem(session, NULL, dsk.data, dsk.memsize,
+ WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
WT_DATA_IN_ITEM(&dsk) ?
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
dsk.mem = NULL;
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index ebd0eb0cb71..d65073a398f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -147,7 +147,7 @@ err: __wt_scr_free(session, &tmp);
*/
int
__wt_ovfl_remove(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint)
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting)
{
/*
* This function solves two problems in reconciliation.
@@ -188,7 +188,7 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session,
* We only have to do this for checkpoints: in any eviction mode, there
* can't be threads sitting in our update lists.
*/
- if (checkpoint)
+ if (!evicting)
WT_RET(__ovfl_cache(session, page, unpack));
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 5316b19a41e..d3df9f6bf78 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -127,8 +127,8 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
* Build in-memory page information.
*/
int
-__wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
- const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep)
+__wt_page_inmem(WT_SESSION_IMPL *session,
+ WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
WT_DECL_RET;
WT_PAGE *page;
@@ -196,8 +196,13 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
* Track the memory allocated to build this page so we can update the
* cache statistics in a single call. If the disk image is in allocated
* memory, start with that.
+ *
+ * Accounting is based on the page-header's in-memory disk size instead
+ * of the buffer memory used to instantiate the page image even though
+ * the values might not match exactly, because that's the only value we
+ * have when discarding the page image and accounting needs to match.
*/
- size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? memsize : 0;
+ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;
switch (page->type) {
case WT_PAGE_COL_FIX:
@@ -218,9 +223,10 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ILLEGAL_VALUE_ERR(session);
}
- /* Update the page's in-memory size and the cache statistics. */
+ /* Update the page's cache statistics. */
__wt_cache_page_inmem_incr(session, page, size);
- __wt_cache_page_image_incr(session, dsk->mem_size);
+ if (LF_ISSET(WT_PAGE_DISK_ALLOC))
+ __wt_cache_page_image_incr(session, dsk->mem_size);
/* Link the new internal page to the parent. */
if (ref != NULL) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index f28c4e10594..268b040bd6e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -231,15 +231,17 @@ restart: /*
for (i = 0; i < entries; ++i) {
descent =
pindex->index[__wt_random(&session->rnd) % entries];
- if (descent->state == WT_REF_MEM ||
- descent->state == WT_REF_DISK)
+ if (descent->state == WT_REF_DISK ||
+ descent->state == WT_REF_LOOKASIDE ||
+ descent->state == WT_REF_MEM)
break;
}
if (i == entries)
for (i = 0; i < entries; ++i) {
descent = pindex->index[i];
- if (descent->state == WT_REF_MEM ||
- descent->state == WT_REF_DISK)
+ if (descent->state == WT_REF_DISK ||
+ descent->state == WT_REF_LOOKASIDE ||
+ descent->state == WT_REF_MEM)
break;
}
if (i == entries || descent == NULL) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index edab3c8c217..ab8a8d7916b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -8,72 +8,8 @@
#include "wt_internal.h"
-static void __btree_verbose_lookaside_read(WT_SESSION_IMPL *);
-
-/*
- * __wt_las_remove_block --
- * Remove all records matching a key prefix from the lookaside store.
- */
-int
-__wt_las_remove_block(WT_SESSION_IMPL *session,
- WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
-{
- WT_DECL_RET;
- WT_ITEM las_addr, las_key, las_timestamp;
- uint64_t las_counter, las_txnid, remove_cnt;
- uint32_t las_id;
- int exact;
-
- remove_cnt = 0;
-
- /*
- * Search for the block's unique prefix and step through all matching
- * records, removing them.
- */
- las_addr.data = addr;
- las_addr.size = addr_size;
- las_key.size = 0;
- las_timestamp.size = 0;
- cursor->set_key(cursor, btree_id, &las_addr,
- (uint64_t)0, (uint32_t)0, &las_timestamp, &las_key);
- if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
- ret = cursor->next(cursor);
- for (; ret == 0; ret = cursor->next(cursor)) {
- WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
- &las_txnid, &las_timestamp, &las_key));
-
- /*
- * Confirm the search using the unique prefix; if not a match,
- * we're done searching for records for this page.
- */
- if (las_id != btree_id ||
- las_addr.size != addr_size ||
- memcmp(las_addr.data, addr, addr_size) != 0)
- break;
-
- /*
- * Cursor opened overwrite=true: won't return WT_NOTFOUND should
- * another thread remove the record before we do, and the cursor
- * remains positioned in that case.
- */
- WT_ERR(cursor->remove(cursor));
- ++remove_cnt;
- }
- WT_ERR_NOTFOUND_OK(ret);
-
-err: /*
- * If there were races to remove records, we can over-count. All
- * arithmetic is signed, so underflow isn't fatal, but check anyway so
- * we don't skew low over time.
- */
- if (remove_cnt > S2C(session)->las_record_cnt)
- S2C(session)->las_record_cnt = 0;
- else if (remove_cnt > 0)
- (void)__wt_atomic_sub64(
- &S2C(session)->las_record_cnt, remove_cnt);
-
- return (ret);
-}
+static void __btree_verbose_lookaside_read(
+ WT_SESSION_IMPL *, uint32_t, uint64_t);
/*
* __col_instantiate --
@@ -88,13 +24,17 @@ __col_instantiate(WT_SESSION_IMPL *session,
page = ref->page;
- /* Discard any of the updates we don't need. */
+ /*
+ * Discard any of the updates we don't need.
+ *
+ * Just free the memory: it hasn't been accounted for on the page yet.
+ */
if (updlist->next != NULL &&
(upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
+ __wt_free_update_list(session, upd);
/* Search the page and add updates. */
- WT_RET(__wt_col_search(session, recno, ref, cbt));
+ WT_RET(__wt_col_search(session, recno, ref, cbt, true));
WT_RET(__wt_col_modify(
session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false));
return (0);
@@ -113,13 +53,17 @@ __row_instantiate(WT_SESSION_IMPL *session,
page = ref->page;
- /* Discard any of the updates we don't need. */
+ /*
+ * Discard any of the updates we don't need.
+ *
+ * Just free the memory: it hasn't been accounted for on the page yet.
+ */
if (updlist->next != NULL &&
(upd = __wt_update_obsolete_check(session, page, updlist)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
+ __wt_free_update_list(session, upd);
/* Search the page and add updates. */
- WT_RET(__wt_row_search(session, key, ref, cbt, true));
+ WT_RET(__wt_row_search(session, key, ref, cbt, true, true));
WT_RET(__wt_row_modify(
session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false));
return (0);
@@ -130,23 +74,21 @@ __row_instantiate(WT_SESSION_IMPL *session,
* Instantiate lookaside update records in a recently read page.
*/
static int
-__las_page_instantiate(WT_SESSION_IMPL *session,
- WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
+__las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id)
{
WT_CURSOR *cursor;
WT_CURSOR_BTREE cbt;
WT_DECL_ITEM(current_key);
WT_DECL_RET;
- WT_DECL_TIMESTAMP(timestamp)
- WT_ITEM las_addr, las_key, las_timestamp, las_value;
+ WT_ITEM las_key, las_timestamp, las_value;
WT_PAGE *page;
WT_UPDATE *first_upd, *last_upd, *upd;
size_t incr, total_incr;
- uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
+ uint64_t current_recno, las_counter, las_pageid, las_txnid, recno;
uint32_t las_id, session_flags;
+ const uint8_t *p;
uint8_t upd_type;
int exact;
- const uint8_t *p;
cursor = NULL;
page = ref->page;
@@ -174,47 +116,29 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
* Search for the block's unique prefix, stepping through any matching
* records.
*/
- las_addr.data = addr;
- las_addr.size = addr_size;
- las_timestamp.size = 0;
- cursor->set_key(cursor, read_id, &las_addr,
- (uint64_t)0, (uint32_t)0, &las_timestamp, &las_key);
+ cursor->set_key(cursor,
+ btree_id, ref->page_las->las_pageid, (uint64_t)0, &las_key);
if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
ret = cursor->next(cursor);
for (; ret == 0; ret = cursor->next(cursor)) {
- WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
- &las_txnid, &las_timestamp, &las_key));
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, &las_pageid, &las_counter, &las_key));
/*
* Confirm the search using the unique prefix; if not a match,
* we're done searching for records for this page.
*/
- if (las_id != read_id ||
- las_addr.size != addr_size ||
- memcmp(las_addr.data, addr, addr_size) != 0)
+ if (las_id != btree_id ||
+ las_pageid != ref->page_las->las_pageid)
break;
- /*
- * If the on-page value has become globally visible, this record
- * is no longer needed.
- *
- * Copy the timestamp from the cursor to avoid unaligned reads.
- */
-#ifdef HAVE_TIMESTAMPS
- WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
- memcpy(&timestamp, las_timestamp.data, las_timestamp.size);
-#endif
- if (__wt_txn_visible_all(
- session, las_txnid, WT_TIMESTAMP_NULL(&timestamp)))
- continue;
-
/* Allocate the WT_UPDATE structure. */
WT_ERR(cursor->get_value(cursor,
- &upd_txnid, &las_timestamp, &upd_type, &las_value));
+ &las_txnid, &las_timestamp, &upd_type, &las_value));
WT_ERR(__wt_update_alloc(
session, &las_value, &upd, &incr, upd_type));
total_incr += incr;
- upd->txnid = upd_txnid;
+ upd->txnid = las_txnid;
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
memcpy(&upd->timestamp, las_timestamp.data, las_timestamp.size);
@@ -287,16 +211,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
if (total_incr != 0) {
__wt_cache_page_inmem_incr(session, page, total_incr);
- /*
- * We've modified/dirtied the page, but that's not necessary and
- * if we keep the page clean, it's easier to evict. We leave the
- * lookaside table updates in place, so if we evict this page
- * without dirtying it, any future instantiation of it will find
- * the records it needs. If the page is dirtied before eviction,
- * then we'll write any needed lookaside table records for the
- * new location of the page.
- */
- __wt_page_modify_clear(session, page);
+ /* Make sure the page is included in the next checkpoint. */
+ page->modify->first_dirty_txn = WT_TXN_FIRST;
}
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
@@ -384,12 +300,12 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
struct timespec start, stop;
WT_BTREE *btree;
+ WT_CURSOR *las_cursor;
WT_DECL_RET;
WT_ITEM tmp;
WT_PAGE *page;
- const WT_PAGE_HEADER *dsk;
size_t addr_size;
- uint32_t previous_state;
+ uint32_t new_state, previous_state, session_flags;
const uint8_t *addr;
bool timer;
@@ -404,26 +320,36 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Attempt to set the state to WT_REF_READING for normal reads, or
- * WT_REF_LOCKED, for deleted pages. If successful, we've won the
- * race, read the page.
+ * WT_REF_LOCKED, for deleted pages or pages with lookaside entries.
+ * If successful, we've won the race, read the page.
*/
- if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
- previous_state = WT_REF_DISK;
- else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
- previous_state = WT_REF_DELETED;
- else
+ switch (previous_state = ref->state) {
+ case WT_REF_DISK:
+ new_state = WT_REF_READING;
+ break;
+ case WT_REF_DELETED:
+ case WT_REF_LOOKASIDE:
+ new_state = WT_REF_LOCKED;
+ break;
+ default:
+ return (0);
+ }
+ if (!__wt_atomic_casv32(&ref->state, previous_state, new_state))
return (0);
/*
- * Get the address: if there is no address, the page was deleted, but a
- * subsequent search or insert is forcing re-creation of the name space.
+ * Get the address: if there is no address, the page was deleted or had
+ * only lookaside entries, and a subsequent search or insert is forcing
+ * re-creation of the name space.
*/
__wt_ref_info(ref, &addr, &addr_size, NULL);
if (addr == NULL) {
- WT_ASSERT(session, previous_state == WT_REF_DELETED);
+ WT_ASSERT(session, previous_state != WT_REF_DISK);
WT_ERR(__wt_btree_new_leaf_page(session, &page));
ref->page = page;
+ if (previous_state == WT_REF_LOOKASIDE)
+ goto skip_read;
goto done;
}
@@ -441,16 +367,18 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
WT_STAT_CONN_INCRV(session, cache_read_app_time,
WT_TIMEDIFF_US(stop, start));
}
- WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
- WT_DATA_IN_ITEM(&tmp) ?
- WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
/*
- * Clear the local reference to an allocated copy of the disk image on
- * return; the page steals it, errors in this code should not free it.
+ * Build the in-memory version of the page. Clear our local reference to
+ * the allocated copy of the disk image on return, the in-memory object
+ * steals it.
*/
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
tmp.mem = NULL;
+skip_read:
/*
* If reading for a checkpoint, there's no additional work to do, the
* page on disk is correct as written.
@@ -468,18 +396,31 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
* We only care if the lookaside table is currently active, check that
* before doing any work.
*/
- dsk = tmp.data;
- if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
- __btree_verbose_lookaside_read(session);
+ if (previous_state == WT_REF_LOOKASIDE) {
+ WT_ASSERT(session, (ref->page->dsk == NULL ||
+ F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE)));
+
+ __btree_verbose_lookaside_read(
+ session, btree->id, ref->page_las->las_pageid);
WT_STAT_CONN_INCR(session, cache_read_lookaside);
WT_STAT_DATA_INCR(session, cache_read_lookaside);
+ WT_ERR(__las_page_instantiate(session, ref, btree->id));
- WT_ERR(__las_page_instantiate(
- session, ref, btree->id, addr, addr_size));
+ /*
+ * The page is instantiated so we no longer need the lookaside
+ * entries. Note that we are discarding updates so the page
+ * must be marked available even if these operations fail.
+ */
+ __wt_las_cursor(session, &las_cursor, &session_flags);
+ WT_TRET(__wt_las_remove_block(
+ session, las_cursor, btree->id, ref->page_las->las_pageid));
+ __wt_free(session, ref->page_las);
+ WT_TRET(__wt_las_cursor_close(
+ session, &las_cursor, session_flags));
}
done: WT_PUBLISH(ref->state, WT_REF_MEM);
- return (0);
+ return (ret);
err: /*
* If the function building an in-memory version of the page failed,
@@ -512,7 +453,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
WT_PAGE *page;
uint64_t sleep_cnt, wait_cnt;
int force_attempts;
- bool busy, cache_work, evict_soon, stalled;
+ bool busy, cache_work, did_read, evict_soon, stalled;
btree = S2BT(session);
@@ -525,7 +466,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
WT_STAT_DATA_INCR(session, cache_pages_requested);
}
- for (evict_soon = stalled = false,
+ for (did_read = evict_soon = stalled = false,
force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
switch (ref->state) {
case WT_REF_DELETED:
@@ -534,8 +475,26 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
return (WT_NOTFOUND);
/* FALLTHROUGH */
case WT_REF_DISK:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
+ case WT_REF_LOOKASIDE:
+ if (LF_ISSET(WT_READ_CACHE)) {
+ if (ref->state != WT_REF_LOOKASIDE)
+ return (WT_NOTFOUND);
+ if (!LF_ISSET(WT_READ_LOOKASIDE))
+ return (WT_NOTFOUND);
+#ifdef HAVE_TIMESTAMPS
+ /*
+ * Skip lookaside pages if reading as of a
+ * timestamp and all the updates are in the
+ * future.
+ */
+ if (F_ISSET(
+ &session->txn, WT_TXN_HAS_TS_READ) &&
+ __wt_timestamp_cmp(
+ &ref->page_las->min_timestamp,
+ &session->txn.read_timestamp) > 0)
+ return (WT_NOTFOUND);
+#endif
+ }
/*
* The page isn't in memory, read it. If this thread is
@@ -548,6 +507,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
WT_RET(__page_read(session, ref));
/*
+ * We just read a page, don't evict it before we have a
+ * chance to use it.
+ */
+ did_read = true;
+
+ /*
* If configured to not trash the cache, leave the page
* generation unset, we'll set it before returning to
* the oldest read generation, so the page is forcibly
@@ -610,7 +575,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
* the page's generation number. If eviction isn't being
* done on this file, we're done.
*/
- if (LF_ISSET(WT_READ_NO_EVICT) ||
+ if (did_read || LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
btree->evict_disabled > 0 || btree->lsm_primary)
goto skip_evict;
@@ -706,7 +671,8 @@ skip_evict:
* performing a lookaside table read.
*/
static void
-__btree_verbose_lookaside_read(WT_SESSION_IMPL *session)
+__btree_verbose_lookaside_read(
+ WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid)
{
#ifdef HAVE_VERBOSE
WT_CONNECTION_IMPL *conn;
@@ -733,10 +699,14 @@ __btree_verbose_lookaside_read(WT_SESSION_IMPL *session)
if (__wt_atomic_casv64(&conn->las_verb_gen_read,
ckpt_gen_last, ckpt_gen_current)) {
__wt_verbose(session, WT_VERB_LOOKASIDE,
- "%s", "Read from lookaside file triggered.");
+ "Read from lookaside file triggered for "
+ "file ID %" PRIu32 ", page ID %" PRIu64,
+ las_id, las_pageid);
}
}
#else
WT_UNUSED(session);
+ WT_UNUSED(las_id);
+ WT_UNUSED(las_pageid);
#endif
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
index 7f9693f22c0..c6d9253b2d3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c
+++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c
@@ -262,9 +262,12 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session,
* We need the first key from a leaf page. Leaf pages are relatively
* complex (Huffman encoding, prefix compression, and so on), do the
* work to instantiate the page and copy the first key to the buffer.
+ *
+ * Page flags are 0 because we aren't releasing the memory used to read
+ * the page into memory and we don't want page discard to free it.
*/
WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len));
- WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page));
+ WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, &page));
ret = __wt_row_leaf_key_copy(session, page, &page->pg_row[0], key);
__wt_page_out(session, &page);
return (ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
index 4a43dd67ff6..e2da77348f0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_slvg.c
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -588,8 +588,12 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session,
* and copy the full keys, then free the page. We do this on
* every leaf page, and if you need to speed up the salvage,
* it's probably a great place to start.
+ *
+ * Page flags are 0 because we aren't releasing the memory used
+ * to read the page into memory and we don't want page discard
+ * to free it.
*/
- WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, 0, &page));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page));
WT_ERR(__wt_row_leaf_key_copy(session,
page, &page->pg_row[0], &trk->row_start));
WT_ERR(__wt_row_leaf_key_copy(session,
@@ -1285,7 +1289,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
+ WT_ERR(__wt_reconcile(
+ session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->pg_var = save_col_var;
@@ -1735,10 +1740,13 @@ __slvg_row_trk_update_start(
* Read and instantiate the WT_TRACK page (we don't have to verify the
* page, nor do we have to be quiet on error, we've already read this
* page successfully).
+ *
+ * Page flags are 0 because we aren't releasing the memory used to read
+ * the page into memory and we don't want page discard to free it.
*/
WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk));
WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size));
- WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, 0, &page));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk->data, 0, &page));
/*
* Walk the page, looking for a key sorting greater than the specified
@@ -1998,7 +2006,8 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
+ WT_ERR(__wt_reconcile(
+ session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->entries += skip_stop;
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 1e76deb66d7..884ee9b5c8b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1385,10 +1385,12 @@ __split_multi_inmem(
WT_DECL_RET;
WT_PAGE *page;
WT_SAVE_UPD *supd;
- WT_UPDATE *upd;
+ WT_UPDATE *prev_upd, *upd;
uint64_t recno;
uint32_t i, slot;
+ WT_ASSERT(session, multi->las_pageid == 0);
+
/*
* In 04/2016, we removed column-store record numbers from the WT_PAGE
* structure, leading to hard-to-debug problems because we corrupt the
@@ -1409,9 +1411,8 @@ __split_multi_inmem(
* when discarding the original page, and our caller will discard the
* allocated page on error, when discarding the allocated WT_REF.
*/
- WT_RET(__wt_page_inmem(session, ref,
- multi->disk_image, ((WT_PAGE_HEADER *)multi->disk_image)->mem_size,
- WT_PAGE_DISK_ALLOC, &page));
+ WT_RET(__wt_page_inmem(
+ session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, &page));
multi->disk_image = NULL;
/*
@@ -1434,7 +1435,7 @@ __split_multi_inmem(
__wt_btcur_open(&cbt);
/* Re-create each modification we couldn't write. */
- for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
switch (orig->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
@@ -1443,7 +1444,8 @@ __split_multi_inmem(
recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
- WT_ERR(__wt_col_search(session, recno, ref, &cbt));
+ WT_ERR(__wt_col_search(
+ session, recno, ref, &cbt, true));
/* Apply the modification. */
WT_ERR(__wt_col_modify(session, &cbt,
@@ -1465,7 +1467,8 @@ __split_multi_inmem(
}
/* Search the page. */
- WT_ERR(__wt_row_search(session, key, ref, &cbt, true));
+ WT_ERR(__wt_row_search(
+ session, key, ref, &cbt, true, true));
/* Apply the modification. */
WT_ERR(__wt_row_modify(session,
@@ -1474,6 +1477,37 @@ __split_multi_inmem(
WT_ILLEGAL_VALUE_ERR(session);
}
+ /*
+ * Discard the update used to create the on-page disk image.
+ * This is not just a performance issue: if the update used to
+ * create the value for this on-page disk image was a modify,
+ * and it was applied to the previous on-page value to
+ * determine a value to write to this disk image, that update
+ * cannot be applied to the new on-page value without risking
+ * corruption.
+ */
+ if (supd->onpage_upd != NULL) {
+ for (prev_upd = upd; prev_upd != NULL &&
+ prev_upd->next != supd->onpage_upd;
+ prev_upd = prev_upd->next)
+ ;
+ /*
+ * If the on-page update was in fact a tombstone, there
+ * will be no value on the page. Don't throw the
+ * tombstone away: we may need it to correctly resolve
+ * modifications.
+ */
+ if (supd->onpage_upd->type == WT_UPDATE_DELETED &&
+ prev_upd != NULL)
+ prev_upd = prev_upd->next;
+ if (prev_upd != NULL) {
+ __wt_update_obsolete_free(
+ session, page, prev_upd->next);
+ prev_upd->next = NULL;
+ }
+ }
+ }
+
/*
* When modifying the page we set the first dirty transaction to the
* last transaction currently running. However, the updates we made
@@ -1620,7 +1654,16 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
addr->type = multi->addr.type;
WT_RET(__wt_memdup(session,
multi->addr.addr, addr->size, &addr->addr));
- ref->state = WT_REF_DISK;
+ if (multi->las_pageid != 0) {
+ WT_RET(__wt_calloc_one(session, &ref->page_las));
+ ref->page_las->las_pageid = multi->las_pageid;
+#ifdef HAVE_TIMESTAMPS
+ __wt_timestamp_set(&ref->page_las->min_timestamp,
+ &multi->las_min_timestamp);
+#endif
+ ref->state = WT_REF_LOOKASIDE;
+ } else
+ ref->state = WT_REF_DISK;
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 75f1c6ef930..02ff0a1a4be 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -62,6 +62,81 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __sync_dup_walk --
+ * Duplicate a tree walk point, releasing any previous duplicate first.
+ */
+static inline int
+__sync_dup_walk(
+ WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp)
+{
+ WT_REF *old;
+ bool busy;
+
+ if ((old = *dupp) != NULL) {
+ *dupp = NULL;
+ WT_RET(__wt_page_release(session, old, flags));
+ }
+
+ /* It is okay to duplicate a walk before it starts (NULL or root). */
+ if (walk == NULL || __wt_ref_is_root(walk)) {
+ *dupp = walk;
+ return (0);
+ }
+
+ /* Get a duplicate hazard pointer. */
+ for (;;) {
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, walk, &busy, __func__, __LINE__));
+#else
+ WT_RET(__wt_hazard_set(session, walk, &busy));
+#endif
+ /*
+ * We already have a hazard pointer, we should generally be able
+ * to get another one. We can get spurious busy errors (e.g., if
+ * eviction is attempting to lock the page). Keep trying: we have
+ * one hazard pointer so we should be able to get another one.
+ */
+ if (!busy)
+ break;
+ __wt_yield();
+ }
+
+ *dupp = walk;
+ return (0);
+}
+
+/*
+ * __sync_evict_page --
+ * Attempt to evict a page during a checkpoint walk; on success, *walkp is set to the next ref.
+ */
+static int
+__sync_evict_page(WT_SESSION_IMPL *session, WT_REF **walkp, uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_REF *next, *to_evict;
+
+ to_evict = *walkp;
+ next = NULL;
+
+ /*
+ * Get the ref after the page we're trying to evict. If the
+ * eviction is successful, the walk will continue from here.
+ */
+ WT_RET(__sync_dup_walk(session, to_evict, flags, &next));
+ WT_ERR(__wt_tree_walk(session, &next, flags));
+
+ WT_ERR(__wt_page_release_evict(session, to_evict));
+
+ /* Success: continue the walk at the next page. */
+ *walkp = next;
+ return (0);
+
+err: WT_TRET(__wt_page_release(session, next, flags));
+ return (ret);
+}
+
+/*
* __sync_file --
* Flush pages for a specific file.
*/
@@ -73,22 +148,23 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
- WT_REF *walk;
+ WT_REF *prev, *walk;
WT_TXN *txn;
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
uint64_t oldest_id, saved_pinned_id;
uint32_t flags;
- bool timer;
+ bool evict_failed, skip_walk, timer;
conn = S2C(session);
btree = S2BT(session);
- walk = NULL;
+ prev = walk = NULL;
txn = &session->txn;
- saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
- flags = WT_READ_CACHE | WT_READ_NO_GEN;
+ evict_failed = skip_walk = false;
+ flags = WT_READ_CACHE | WT_READ_NO_GEN;
internal_bytes = leaf_bytes = 0;
internal_pages = leaf_pages = 0;
+ saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
if (timer)
__wt_epoch(session, &start);
@@ -119,8 +195,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
oldest_id = __wt_txn_oldest_id(session);
- flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
- for (walk = NULL;;) {
+ LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
+ for (;;) {
WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
@@ -139,7 +215,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
leaf_bytes += page->memory_footprint;
++leaf_pages;
WT_ERR(__wt_reconcile(session,
- walk, NULL, WT_CHECKPOINTING, NULL));
+ walk, NULL, WT_REC_CHECKPOINT, NULL));
}
}
break;
@@ -184,9 +260,19 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
btree->checkpointing = WT_CKPT_RUNNING;
/* Write all dirty in-cache pages. */
- flags |= WT_READ_NO_EVICT;
- for (walk = NULL;;) {
- WT_ERR(__wt_tree_walk(session, &walk, flags));
+ LF_SET(WT_READ_NO_EVICT);
+
+ /* Read pages with lookaside entries and evict them asap. */
+ LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);
+
+ for (;;) {
+ if (!skip_walk) {
+ WT_ERR(__sync_dup_walk(
+ session, walk, flags, &prev));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ }
+ skip_walk = false;
+
if (walk == NULL)
break;
@@ -221,8 +307,39 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
leaf_bytes += page->memory_footprint;
++leaf_pages;
}
+
+ /*
+ * If the page needs forced eviction, try to do that
+ * now.
+ *
+ * For eviction to have a chance, we first need to move
+ * the walk point to the next page checkpoint will
+ * visit. We want to avoid this code being too special
+ * purpose, so try to reuse the ordinary eviction path.
+ *
+ * If eviction succeeded, it steps to the next ref, so
+ * we have to skip the next walk. If eviction fails,
+ * remember so we don't retry it.
+ */
+ if (!WT_PAGE_IS_INTERNAL(page) &&
+ page->read_gen == WT_READGEN_OLDEST &&
+ !evict_failed) {
+ if ((ret = __sync_evict_page(
+ session, &walk, flags)) == 0) {
+ evict_failed = false;
+ skip_walk = true;
+ } else {
+ walk = prev;
+ prev = NULL;
+ evict_failed = true;
+ }
+ WT_ERR_BUSY_OK(ret);
+ continue;
+ }
+
+ evict_failed = false;
WT_ERR(__wt_reconcile(
- session, walk, NULL, WT_CHECKPOINTING, NULL));
+ session, walk, NULL, WT_REC_CHECKPOINT, NULL));
}
break;
case WT_SYNC_CLOSE:
@@ -244,8 +361,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
}
err: /* On error, clear any left-over tree walk. */
- if (walk != NULL)
- WT_TRET(__wt_page_release(session, walk, flags));
+ WT_TRET(__wt_page_release(session, walk, flags));
+ WT_TRET(__wt_page_release(session, prev, flags));
/*
* If we got a snapshot in order to write pages, and there was no
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index fcc2336a3e5..b68c6b9c5c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -472,6 +472,11 @@ restart: /*
if (LF_ISSET(WT_READ_NO_WAIT) &&
ref->state != WT_REF_MEM)
break;
+
+ /* Skip lookaside pages if not requested. */
+ if (ref->state == WT_REF_LOOKASIDE &&
+ !LF_ISSET(WT_READ_LOOKASIDE))
+ break;
} else if (LF_ISSET(WT_READ_TRUNCATE)) {
/*
* Avoid pulling a deleted page back in to try
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
index 78ee367dc69..10bc3894a0d 100644
--- a/src/third_party/wiredtiger/src/btree/col_srch.c
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -62,7 +62,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session,
*/
int
__wt_col_search(WT_SESSION_IMPL *session,
- uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+ uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -90,16 +90,15 @@ __wt_col_search(WT_SESSION_IMPL *session,
/*
* We may be searching only a single leaf page, not the full tree. In
- * the normal case where the page links to a parent, check the page's
+ * the normal case where we are searching a tree, check the page's
* parent keys before doing the full search, it's faster when the
- * cursor is being re-positioned. (One case where the page doesn't
- * have a parent is if it is being re-instantiated in memory as part
- * of a split).
+ * cursor is being re-positioned. Skip this if the page is being
+ * re-instantiated in memory.
*/
if (leaf != NULL) {
WT_ASSERT(session, search_recno != WT_RECNO_OOB);
- if (leaf->home != NULL) {
+ if (!restore) {
WT_RET(__check_leaf_key_range(
session, recno, leaf, cbt));
if (cbt->compare != 0) {
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 3a9a6eb0f9b..16081e841dc 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -205,7 +205,8 @@ __check_leaf_key_range(WT_SESSION_IMPL *session,
*/
int
__wt_row_search(WT_SESSION_IMPL *session,
- WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert)
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt,
+ bool insert, bool restore)
{
WT_BTREE *btree;
WT_COLLATOR *collator;
@@ -250,14 +251,13 @@ __wt_row_search(WT_SESSION_IMPL *session,
/*
* We may be searching only a single leaf page, not the full tree. In
- * the normal case where the page links to a parent, check the page's
+ * the normal case where we are searching a tree, check the page's
* parent keys before doing the full search, it's faster when the
- * cursor is being re-positioned. (One case where the page doesn't
- * have a parent is if it is being re-instantiated in memory as part
- * of a split).
+ * cursor is being re-positioned. Skip this if the page is being
+ * re-instantiated in memory.
*/
if (leaf != NULL) {
- if (leaf->home != NULL) {
+ if (!restore) {
WT_RET(__check_leaf_key_range(
session, srch_key, leaf, cbt));
if (cbt->compare != 0) {
diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c
index e1e47b9eecb..d9a5dbc2096 100644
--- a/src/third_party/wiredtiger/src/cache/cache_las.c
+++ b/src/third_party/wiredtiger/src/cache/cache_las.c
@@ -89,17 +89,24 @@ __wt_las_create(WT_SESSION_IMPL *session)
WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
/*
+ * Flag that the lookaside table has been created (before creating the
+ * connection's lookaside table session, it checks before creating a
+ * lookaside table cursor).
+ */
+ F_SET(conn, WT_CONN_LAS_OPEN);
+
+ /*
* Open a shared internal session used to access the lookaside table.
* This session should never be tapped for eviction.
*/
session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION;
- WT_RET(__wt_open_internal_session(
+ WT_ERR(__wt_open_internal_session(
conn, "lookaside table", true, session_flags, &conn->las_session));
- /* Flag that the lookaside table has been created. */
- F_SET(conn, WT_CONN_LAS_OPEN);
-
return (0);
+
+err: F_CLR(conn, WT_CONN_LAS_OPEN);
+ return (ret);
}
/*
@@ -127,38 +134,6 @@ __wt_las_destroy(WT_SESSION_IMPL *session)
}
/*
- * __wt_las_set_written --
- * Flag that the lookaside table has been written.
- */
-void
-__wt_las_set_written(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
- if (!conn->las_written) {
- conn->las_written = true;
-
- /*
- * Future page reads must deal with lookaside table records.
- * No write could be cached until a future read might matter,
- * the barrier is more documentation than requirement.
- */
- WT_FULL_BARRIER();
- }
-}
-
-/*
- * __wt_las_is_written --
- * Return if the lookaside table has been written.
- */
-bool
-__wt_las_is_written(WT_SESSION_IMPL *session)
-{
- return (S2C(session)->las_written);
-}
-
-/*
* __wt_las_cursor_open --
* Open a new lookaside table cursor.
*/
@@ -280,129 +255,48 @@ __wt_las_cursor_close(
}
/*
- * __wt_las_sweep --
- * Sweep the lookaside table.
+ * __wt_las_remove_block --
+ * Remove all records matching a key prefix from the lookaside store.
*/
int
-__wt_las_sweep(WT_SESSION_IMPL *session)
+__wt_las_remove_block(WT_SESSION_IMPL *session,
+ WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
{
- WT_CONNECTION_IMPL *conn;
- WT_CURSOR *cursor;
WT_DECL_RET;
- WT_DECL_TIMESTAMP(timestamp)
- WT_ITEM *key;
- WT_ITEM las_addr, las_key, las_timestamp;
- uint64_t cnt, las_counter, las_txnid, remove_cnt;
- uint32_t las_id, session_flags;
- int notused;
+ WT_ITEM las_key;
+ uint64_t las_counter, las_pageid, remove_cnt;
+ uint32_t las_id;
+ int exact;
- conn = S2C(session);
- cursor = NULL;
- key = &conn->las_sweep_key;
remove_cnt = 0;
- session_flags = 0; /* [-Werror=maybe-uninitialized] */
-
- __wt_las_cursor(session, &cursor, &session_flags);
/*
- * If we're not starting a new sweep, position the cursor using the key
- * from the last call (we don't care if we're before or after the key,
- * just roughly in the same spot is fine).
+ * Search for the block's unique prefix and step through all matching
+ * records, removing them.
*/
- if (key->size != 0) {
- __wt_cursor_set_raw_key(cursor, key);
- ret = cursor->search_near(cursor, &notused);
-
- /*
- * Don't search for the same key twice; if we don't set a new
- * key below, it's because we've reached the end of the table
- * and we want the next pass to start at the beginning of the
- * table. Searching for the same key could leave us stuck at
- * the end of the table, repeatedly checking the same rows.
- */
- key->size = 0;
- if (ret != 0)
- goto srch_notfound;
- }
-
- /*
- * The sweep server wakes up every 10 seconds (by default), it's a slow
- * moving thread. Try to review the entire lookaside table once every 5
- * minutes, or every 30 calls.
- *
- * The reason is because the lookaside table exists because we're seeing
- * cache/eviction pressure (it allows us to trade performance and disk
- * space for cache space), and it's likely lookaside blocks are being
- * evicted, and reading them back in doesn't help things. A trickier,
- * but possibly better, alternative might be to review all lookaside
- * blocks in the cache in order to get rid of them, and slowly review
- * lookaside blocks that have already been evicted.
- */
- cnt = WT_MAX(100, conn->las_record_cnt / 30);
-
- /* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_NO_CACHE);
+ las_key.size = 0;
+ cursor->set_key(cursor, btree_id, pageid, (uint64_t)0, &las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, &las_pageid, &las_counter, &las_key));
- /* Walk the file. */
- for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
/*
- * If the loop terminates after completing a work unit, we will
- * continue the table sweep next time. Get a local copy of the
- * sweep key, we're going to reset the cursor; do so before
- * calling cursor.remove, cursor.remove can discard our hazard
- * pointer and the page could be evicted from underneath us.
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page. Note that
+ * page ID zero is special: it is a wild card indicating that
+ * all pages in the tree should be removed.
*/
- if (cnt == 1) {
- WT_ERR(__wt_cursor_get_raw_key(cursor, key));
- if (!WT_DATA_IN_ITEM(key))
- WT_ERR(__wt_buf_set(
- session, key, key->data, key->size));
- }
+ if (las_id != btree_id ||
+ (pageid != 0 && las_pageid != pageid))
+ break;
- /*
- * Cursor opened overwrite=true: won't return WT_NOTFOUND should
- * another thread remove the record before we do, and the cursor
- * remains positioned in that case.
- */
- WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
- &las_txnid, &las_timestamp, &las_key));
-
- /*
- * If the on-page record transaction ID associated with the
- * record is globally visible, the record can be discarded.
- *
- * Copy the timestamp from the cursor to avoid unaligned reads.
- */
-#ifdef HAVE_TIMESTAMPS
- WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
- memcpy(&timestamp, las_timestamp.data, las_timestamp.size);
-#endif
- if (__wt_txn_visible_all(
- session, las_txnid, WT_TIMESTAMP_NULL(&timestamp))) {
- WT_ERR(cursor->remove(cursor));
- ++remove_cnt;
- }
+ WT_ERR(cursor->remove(cursor));
+ ++remove_cnt;
}
-
-srch_notfound:
WT_ERR_NOTFOUND_OK(ret);
- if (0) {
-err: __wt_buf_free(session, key);
- }
-
- WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
-
- /*
- * If there were races to remove records, we can over-count. Underflow
- * isn't fatal, but check anyway so we don't skew low over time.
- */
- if (remove_cnt > conn->las_record_cnt)
- conn->las_record_cnt = 0;
- else if (remove_cnt > 0)
- (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);
-
- F_CLR(session, WT_SESSION_NO_CACHE);
-
+err: WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index a16ba6ba28c..d7f4f6fe148 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -202,6 +202,16 @@ static const WT_CONFIG_CHECK confchk_WT_CURSOR_reconfigure[] = {
{ NULL, NULL, NULL, NULL, NULL, 0 }
};
+static const WT_CONFIG_CHECK confchk_assert_subconfigs[] = {
+ { "commit_timestamp", "string",
+ NULL, "choices=[\"always\",\"never\",\"none\"]",
+ NULL, 0 },
+ { "read_timestamp", "string",
+ NULL, "choices=[\"always\",\"never\",\"none\"]",
+ NULL, 0 },
+ { NULL, NULL, NULL, NULL, NULL, 0 }
+};
+
static const WT_CONFIG_CHECK
confchk_WT_SESSION_create_log_subconfigs[] = {
{ "enabled", "boolean", NULL, NULL, NULL, 0 },
@@ -212,6 +222,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_alter[] = {
{ "access_pattern_hint", "string",
NULL, "choices=[\"none\",\"random\",\"sequential\"]",
NULL, 0 },
+ { "assert", "category",
+ NULL, NULL,
+ confchk_assert_subconfigs, 2 },
{ "cache_resident", "boolean", NULL, NULL, NULL, 0 },
{ "log", "category",
NULL, NULL,
@@ -285,6 +298,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = {
NULL, "min=512B,max=128MB",
NULL, 0 },
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+ { "assert", "category",
+ NULL, NULL,
+ confchk_assert_subconfigs, 2 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -470,6 +486,9 @@ static const WT_CONFIG_CHECK confchk_file_config[] = {
NULL, "min=512B,max=128MB",
NULL, 0 },
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+ { "assert", "category",
+ NULL, NULL,
+ confchk_assert_subconfigs, 2 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -531,6 +550,9 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = {
NULL, "min=512B,max=128MB",
NULL, 0 },
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+ { "assert", "category",
+ NULL, NULL,
+ confchk_assert_subconfigs, 2 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -612,6 +634,9 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = {
NULL, "min=512B,max=128MB",
NULL, 0 },
{ "app_metadata", "string", NULL, NULL, NULL, 0 },
+ { "assert", "category",
+ NULL, NULL,
+ confchk_assert_subconfigs, 2 },
{ "block_allocation", "string",
NULL, "choices=[\"first\",\"best\"]",
NULL, 0 },
@@ -1180,8 +1205,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_CURSOR_reconfigure, 2
},
{ "WT_SESSION.alter",
- "access_pattern_hint=none,cache_resident=false,log=(enabled=true)",
- confchk_WT_SESSION_alter, 3
+ "access_pattern_hint=none,assert=(commit_timestamp=none,"
+ "read_timestamp=none),cache_resident=false,log=(enabled=true)",
+ confchk_WT_SESSION_alter, 4
},
{ "WT_SESSION.begin_transaction",
"isolation=,name=,priority=0,read_timestamp=,snapshot=,sync=",
@@ -1205,6 +1231,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "WT_SESSION.create",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "assert=(commit_timestamp=none,read_timestamp=none),"
"block_allocation=best,block_compressor=,cache_resident=false,"
"checksum=uncompressed,colgroups=,collator=,columns=,dictionary=0"
",encryption=(keyid=,name=),exclusive=false,extractor=,"
@@ -1220,7 +1247,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,source=,split_deepen_min_child=0,"
"split_deepen_per_child=0,split_pct=90,type=file,value_format=u",
- confchk_WT_SESSION_create, 42
+ confchk_WT_SESSION_create, 43
},
{ "WT_SESSION.drop",
"checkpoint_wait=true,force=false,lock_wait=true,"
@@ -1307,6 +1334,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "file.config",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "assert=(commit_timestamp=none,read_timestamp=none),"
"block_allocation=best,block_compressor=,cache_resident=false,"
"checksum=uncompressed,collator=,columns=,dictionary=0,"
"encryption=(keyid=,name=),format=btree,huffman_key=,"
@@ -1318,10 +1346,11 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,split_deepen_min_child=0,"
"split_deepen_per_child=0,split_pct=90,value_format=u",
- confchk_file_config, 35
+ confchk_file_config, 36
},
{ "file.meta",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "assert=(commit_timestamp=none,read_timestamp=none),"
"block_allocation=best,block_compressor=,cache_resident=false,"
"checkpoint=,checkpoint_lsn=,checksum=uncompressed,collator=,"
"columns=,dictionary=0,encryption=(keyid=,name=),format=btree,"
@@ -1334,7 +1363,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"os_cache_max=0,prefix_compression=false,prefix_compression_min=4"
",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,"
"value_format=u,version=(major=0,minor=0)",
- confchk_file_meta, 39
+ confchk_file_meta, 40
},
{ "index.meta",
"app_metadata=,collator=,columns=,extractor=,immutable=false,"
@@ -1343,6 +1372,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "lsm.meta",
"access_pattern_hint=none,allocation_size=4KB,app_metadata=,"
+ "assert=(commit_timestamp=none,read_timestamp=none),"
"block_allocation=best,block_compressor=,cache_resident=false,"
"checksum=uncompressed,chunks=,collator=,columns=,dictionary=0,"
"encryption=(keyid=,name=),format=btree,huffman_key=,"
@@ -1358,7 +1388,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false,"
"prefix_compression_min=4,split_deepen_min_child=0,"
"split_deepen_per_child=0,split_pct=90,value_format=u",
- confchk_lsm_meta, 39
+ confchk_lsm_meta, 40
},
{ "table.meta",
"app_metadata=,colgroups=,collator=,columns=,key_format=u,"
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 284e7e9883b..55251491129 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -1084,6 +1084,9 @@ err: /*
WT_TRET(wt_session->close(wt_session, config));
}
+ /* Shut down transactions (wait for in-flight operations to finish). */
+ WT_TRET(__wt_txn_global_shutdown(session));
+
/*
* Perform a system-wide checkpoint so that all tables are consistent
* with each other. All transactions are resolved but ignore
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
index 5515eb026ca..625350cf3e6 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -300,6 +300,11 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
"cache server: exiting with %" PRIu64 " pages in "
"memory and %" PRIu64 " pages evicted",
cache->pages_inmem, cache->pages_evict);
+ if (cache->bytes_image != 0)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " image bytes in "
+ "memory",
+ cache->bytes_image);
if (cache->bytes_inmem != 0)
__wt_errx(session,
"cache server: exiting with %" PRIu64 " bytes in memory",
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
index a47524af2d7..d968d4e4b2b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -161,8 +161,11 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn)
*
* Checkpoint does enough I/O it may be called upon to perform slow
* operations for the block manager.
+ *
+ * The checkpoint thread reads the lookaside table for outdated records,
+ * it gets its own cursor for that purpose.
*/
- session_flags = WT_SESSION_CAN_WAIT;
+ session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_LOOKASIDE_CURSOR;
WT_RET(__wt_open_internal_session(conn,
"checkpoint-server", true, session_flags, &conn->ckpt_session));
session = conn->ckpt_session;
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 56a37cf16eb..2606c9d083b 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -774,13 +774,14 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
__wt_session_close_cache(session);
/*
- * Close open data handles: first, everything but the metadata file (as
- * closing a normal file may open and write the metadata file), then
- * the metadata file.
+ * Close open data handles: first, everything apart from metadata and
+ * lookaside (as closing a normal file may write metadata and read
+ * lookaside entries). Then close whatever is left open.
*/
restart:
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
- if (WT_IS_METADATA(dhandle))
+ if (WT_IS_METADATA(dhandle) ||
+ strcmp(dhandle->name, WT_LAS_URI) == 0)
continue;
WT_WITH_DHANDLE(session, dhandle,
@@ -789,6 +790,9 @@ restart:
goto restart;
}
+ /* Shut down the lookaside table after all eviction is complete. */
+ WT_TRET(__wt_las_destroy(session));
+
/*
* Closing the files may have resulted in entries on our default
* session's list of open data handles, specifically, we added the
@@ -807,7 +811,7 @@ restart:
if (session->meta_cursor != NULL)
WT_TRET(session->meta_cursor->close(session->meta_cursor));
- /* Close the metadata file handle. */
+ /* Close the remaining handles. */
WT_TAILQ_SAFE_REMOVE_BEGIN(dhandle, &conn->dhqh, q, dhandle_tmp) {
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index 2865dc9e2fa..e72fa5c00a4 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -75,9 +75,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
wt_conn = &conn->iface;
session = conn->default_session;
- /* Shut down transactions (wait for in-flight operations to complete. */
- WT_TRET(__wt_txn_global_shutdown(session));
-
/* Shut down the subsystems, ensuring workers see the state change. */
F_SET(conn, WT_CONN_CLOSING);
WT_FULL_BARRIER();
@@ -111,9 +108,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
/* The eviction server is shut down last. */
WT_TRET(__wt_evict_destroy(session));
- /* Shut down the lookaside table, after all eviction is complete. */
- WT_TRET(__wt_las_destroy(session));
-
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
index 008aa6c08d8..a164e34fe33 100644
--- a/src/third_party/wiredtiger/src/conn/conn_sweep.c
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -278,12 +278,10 @@ __sweep_server(void *arg)
WT_DECL_RET;
WT_SESSION_IMPL *session;
time_t now;
- uint64_t last_las_sweep_id, oldest_id;
u_int dead_handles;
session = arg;
conn = S2C(session);
- last_las_sweep_id = WT_TXN_NONE;
/*
* Sweep for dead and excess handles.
@@ -302,26 +300,6 @@ __sweep_server(void *arg)
WT_STAT_CONN_INCR(session, dh_sweeps);
/*
- * Sweep the lookaside table. If the lookaside table hasn't yet
- * been written, there's no work to do.
- *
- * Don't sweep the lookaside table if the cache is stuck full.
- * The sweep uses the cache and can exacerbate the problem.
- * If we try to sweep when the cache is full or we aren't
- * making progress in eviction, sweeping can wind up constantly
- * bringing in and evicting pages from the lookaside table,
- * which will stop the cache from moving into the stuck state.
- */
- if (__wt_las_is_written(session) &&
- !__wt_cache_stuck(session)) {
- oldest_id = __wt_txn_oldest_id(session);
- if (WT_TXNID_LT(last_las_sweep_id, oldest_id)) {
- WT_ERR(__wt_las_sweep(session));
- last_las_sweep_id = oldest_id;
- }
- }
-
- /*
* Mark handles with a time of death, and report whether any
* handles are marked dead. If sweep_idle_time is 0, handles
* never become idle.
@@ -403,14 +381,9 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
* Handle sweep does enough I/O it may be called upon to perform slow
* operations for the block manager.
*
- * The sweep thread sweeps the lookaside table for outdated records,
- * it gets its own cursor for that purpose.
- *
* Don't tap the sweep thread for eviction.
*/
session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION;
- if (F_ISSET(conn, WT_CONN_LAS_OPEN))
- session_flags |= WT_SESSION_LOOKASIDE_CURSOR;
WT_RET(__wt_open_internal_session(
conn, "sweep-server", true, session_flags, &conn->sweep_session));
session = conn->sweep_session;
@@ -453,8 +426,5 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
conn->sweep_session = NULL;
}
- /* Discard any saved lookaside key. */
- __wt_buf_free(session, &conn->las_sweep_key);
-
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
index 9aa93ade372..22ba6d1dee1 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_file.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -263,14 +263,20 @@ __wt_curfile_insert_check(WT_CURSOR *cursor)
WT_CURSOR_BTREE *cbt;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int tret;
cbt = (WT_CURSOR_BTREE *)cursor;
+ tret = 0;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, update, cbt->btree);
WT_ERR(__cursor_checkkey(cursor));
- ret = __wt_btcur_insert_check(cbt);
+ tret = __wt_btcur_insert_check(cbt);
+ /*
+ * Detecting a conflict should not cause a transaction error.
+ */
err: CURSOR_UPDATE_API_END(session, ret);
+ WT_TRET(tret);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/docs/backup.dox b/src/third_party/wiredtiger/src/docs/backup.dox
index 91b15da9275..b952a975788 100644
--- a/src/third_party/wiredtiger/src/docs/backup.dox
+++ b/src/third_party/wiredtiger/src/docs/backup.dox
@@ -59,10 +59,12 @@ During the period the backup cursor is open, database checkpoints can
be created, but no checkpoints can be deleted. This may result in
significant file growth.
-Additionally, if a crash occurs during the period the backup cursor is open and
-logging is disabled, then the system will be restored to the most recent
-checkpoint prior to the opening of the backup cursor, even if later database
-checkpoints were created.
+Additionally, if a crash occurs during the period the backup cursor is
+open and logging is disabled (in other words, when depending on
+checkpoints for durability), then the system will be restored to the
+most recent checkpoint prior to the opening of the backup cursor, even
+if later database checkpoints were completed. <b>Note this exception to
+WiredTiger's checkpoint durability guarantees.</b>
The following is a programmatic example of creating a backup:
diff --git a/src/third_party/wiredtiger/src/docs/checkpoint.dox b/src/third_party/wiredtiger/src/docs/checkpoint.dox
index ec28fea13c3..3d636cd17b6 100644
--- a/src/third_party/wiredtiger/src/docs/checkpoint.dox
+++ b/src/third_party/wiredtiger/src/docs/checkpoint.dox
@@ -22,6 +22,10 @@ configuration to ::wiredtiger_open.
All transactional updates committed before a checkpoint are made durable
by the checkpoint, therefore the frequency of checkpoints limits the
volume of data that may be lost due to application or system failure.
+<b>This guarantee has an exception:</b> If a crash occurs when a backup
+cursor is open, then the system will be restored to the most recent
+checkpoint prior to the opening of the backup cursor, even if later
+database checkpoints were completed.
Data sources that are involved in an exclusive operation when the
checkpoint starts, including bulk load, verify or salvage, will be skipped
diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox
index d9cc72dcf24..4ba6d5d2526 100644
--- a/src/third_party/wiredtiger/src/docs/transactions.dox
+++ b/src/third_party/wiredtiger/src/docs/transactions.dox
@@ -165,8 +165,8 @@ transaction timestamp functionality.
Applications can assign explicit commit timestamps to transactions, then read
"as of" a timestamp. Timestamps are communicated to WiredTiger using a
-lower case hexadecimal encoding, so the encoded value can be twice as long as
-the raw timestamp value.
+hexadecimal encoding, so the encoded value can be twice as long as the raw
+timestamp value.
Setting a read timestamp in WT_SESSION::begin_transaction forces a transaction
to run at snapshot isolation and ignore any commits with a newer timestamp.
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 56638934305..f2a09a0a769 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -16,11 +16,15 @@ int
__wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
WT_BTREE *btree;
+ WT_CURSOR *las_cursor;
+ WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
WT_PAGE *page;
WT_REF *next_ref, *ref;
+ uint32_t session_flags, walk_flags;
- btree = S2BT(session);
+ dhandle = session->dhandle;
+ btree = dhandle->handle;
/*
* We need exclusive access to the file, we're about to discard the root
@@ -28,7 +32,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
WT_ASSERT(session,
btree->evict_disabled > 0 ||
- !F_ISSET(session->dhandle, WT_DHANDLE_OPEN));
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN));
/*
* We do discard objects without pages in memory. If that's the case,
@@ -37,14 +41,39 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
if (btree->root.page == NULL)
return (0);
+ walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT;
+
+ /*
+ * If discarding a dead tree, remove any lookaside entries. This deals
+ * with the case where a tree is dropped with "force=true". It happens
+ * that we also force-drop the lookaside table itself: it can never
+ * participate in lookaside eviction, and we can't open a cursor on it
+ * as we are discarding it.
+ *
+ * We use the special page ID zero so that all lookaside entries for
+ * the tree are removed.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
+ F_ISSET(S2C(session), WT_CONN_LAS_OPEN) &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE)) {
+ WT_ASSERT(session, !WT_IS_METADATA(dhandle));
+
+ __wt_las_cursor(session, &las_cursor, &session_flags);
+ WT_TRET(__wt_las_remove_block(
+ session, las_cursor, btree->id, 0));
+ WT_TRET(__wt_las_cursor_close(
+ session, &las_cursor, session_flags));
+ WT_RET(ret);
+ } else
+ FLD_SET(walk_flags, WT_READ_LOOKASIDE);
+
/* Make sure the oldest transaction ID is up-to-date. */
WT_RET(__wt_txn_update_oldest(
session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
/* Walk the tree, discarding pages. */
next_ref = NULL;
- WT_ERR(__wt_tree_walk(
- session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(session, &next_ref, walk_flags));
while ((ref = next_ref) != NULL) {
page = ref->page;
@@ -69,8 +98,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* error, retrying later.
*/
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
- WT_ERR(__wt_reconcile(
- session, ref, NULL, WT_EVICTING, NULL));
+ WT_ERR(__wt_reconcile(session, ref, NULL,
+ WT_REC_EVICT | WT_REC_VISIBLE_ALL, NULL));
/*
* We can't evict the page just returned to us (it marks our
@@ -81,8 +110,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* the reconciliation, the next walk call could miss a page in
* the tree.
*/
- WT_ERR(__wt_tree_walk(session,
- &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+ WT_ERR(__wt_tree_walk(session, &next_ref, walk_flags));
switch (syncop) {
case WT_SYNC_CLOSE:
@@ -96,7 +124,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* Discard the page regardless of whether it is dirty.
*/
WT_ASSERT(session,
- F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
+ F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
__wt_page_can_evict(session, ref, NULL));
__wt_ref_out(session, ref);
break;
@@ -111,7 +139,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
err: /* On error, clear any left-over tree walk. */
if (next_ref != NULL)
WT_TRET(__wt_page_release(
- session, next_ref, WT_READ_NO_EVICT));
+ session, next_ref, walk_flags));
}
return (ret);
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 28d7bd2f1fa..8dd48738735 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -460,6 +460,7 @@ int
__wt_evict_create(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ uint32_t session_flags;
conn = S2C(session);
@@ -471,10 +472,12 @@ __wt_evict_create(WT_SESSION_IMPL *session)
* Create the eviction thread group.
* Set the group size to the maximum allowed sessions.
*/
+ session_flags = WT_THREAD_CAN_WAIT |
+ WT_THREAD_LOOKASIDE | WT_THREAD_PANIC_FAIL;
WT_RET(__wt_thread_group_create(session, &conn->evict_threads,
"eviction-server", conn->evict_threads_min, conn->evict_threads_max,
- WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_chk,
- __wt_evict_thread_run, __wt_evict_thread_stop));
+ session_flags, __wt_evict_thread_chk, __wt_evict_thread_run,
+ __wt_evict_thread_stop));
#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE)
/*
@@ -1874,6 +1877,24 @@ __evict_walk_file(WT_SESSION_IMPL *session,
F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
goto fast;
+ /*
+ * If application threads are blocked waiting for eviction (so
+ * we are going to consider lookaside), and the only thing
+ * preventing a clean page from being evicted is that it
+ * contains historical data, mark it dirty so we can do
+ * lookaside eviction.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD |
+ WT_CACHE_EVICT_DIRTY_HARD) &&
+ !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) &&
+ !modified && page->modify != NULL &&
+ !__wt_txn_visible_all(session, page->modify->rec_max_txn,
+ WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) {
+ __wt_page_only_modify_set(session, page);
+ modified = true;
+ goto fast;
+ }
+
/* Skip clean pages if appropriate. */
if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
continue;
@@ -1905,14 +1926,19 @@ __evict_walk_file(WT_SESSION_IMPL *session,
goto fast;
/*
- * If the oldest transaction hasn't changed since the last time
- * this page was written, it's unlikely we can make progress.
- * Similarly, if the most recent update on the page is not yet
- * globally visible, eviction will fail. These heuristics
- * attempt to avoid repeated attempts to evict the same page.
+ * If there are active transactions and the oldest transaction
+ * hasn't changed since the last time this page was written,
+ * it's unlikely we can make progress. Similarly, if the most
+ * recent update on the page is not yet globally visible,
+ * eviction will fail. This heuristic avoids repeated attempts
+ * to evict the same page.
+ *
+ * We skip this for the lookaside table because updates there
+ * can be evicted as soon as they are committed.
*/
mod = page->modify;
- if (modified && txn_global->current != txn_global->oldest_id &&
+ if (modified && !F_ISSET(btree, WT_BTREE_LOOKASIDE) &&
+ txn_global->current != txn_global->oldest_id &&
(mod->last_eviction_id == __wt_txn_oldest_id(session) ||
!__wt_txn_visible_all(session, mod->update_txn, NULL)))
continue;
@@ -2424,6 +2450,7 @@ static int
__verbose_dump_cache_single(WT_SESSION_IMPL *session,
uint64_t *total_bytesp, uint64_t *total_dirty_bytesp)
{
+ WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_PAGE *page;
WT_REF *next_walk;
@@ -2469,11 +2496,12 @@ __verbose_dump_cache_single(WT_SESSION_IMPL *session,
}
dhandle = session->dhandle;
- if (dhandle->checkpoint == NULL)
- WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name));
- else
- WT_RET(__wt_msg(session, "%s(checkpoint=%s):",
- dhandle->name, dhandle->checkpoint));
+ btree = dhandle->handle;
+ WT_RET(__wt_msg(session, "%s(%s%s)%s%s:",
+ dhandle->name, dhandle->checkpoint != NULL ? "checkpoint=" : "",
+ dhandle->checkpoint != NULL ? dhandle->checkpoint : "<live>",
+ btree->evict_disabled != 0 ? "eviction disabled" : "",
+ btree->evict_disabled_open ? " at open" : ""));
if (intl_pages != 0)
WT_RET(__wt_msg(session,
"internal: "
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index ada1c39ddcf..7536e3593e8 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -10,7 +10,7 @@
static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool);
static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool);
-static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t *, bool);
+static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, uint32_t *);
/*
* __evict_exclusive_clear --
@@ -127,9 +127,6 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
conn = S2C(session);
- /* Checkpoints should never do eviction. */
- WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session));
-
/* Enter the eviction generation. */
__wt_session_gen_enter(session, WT_GEN_EVICT);
@@ -146,13 +143,13 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* to make this check for clean pages, too: while unlikely eviction
* would choose an internal page with children, it's not disallowed.
*/
- WT_ERR(__evict_review(session, ref, &flags, closing));
+ WT_ERR(__evict_review(session, ref, closing, &flags));
/*
* If there was an in-memory split, the tree has been left in the state
* we want: there is nothing more to do.
*/
- if (LF_ISSET(WT_EVICT_INMEM_SPLIT))
+ if (LF_ISSET(WT_REC_INMEM_SPLIT))
goto done;
/* Count evictions of internal pages during normal operation. */
@@ -312,9 +309,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* of the page, if we're forced to "read" into that namespace,
* we'll instantiate a new page instead of trying to read from
* the backing store.
- *
- * Publish: a barrier to ensure the structure fields are set
- * before the state change makes the page available to readers.
*/
__wt_ref_out(session, ref);
ref->addr = NULL;
@@ -353,19 +347,37 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Publish: a barrier to ensure the structure fields are set
* before the state change makes the page available to readers.
*/
- WT_RET(__wt_calloc_one(session, &addr));
- *addr = mod->mod_replace;
- mod->mod_replace.addr = NULL;
- mod->mod_replace.size = 0;
- ref->addr = addr;
+ if (mod->mod_replace.addr == NULL)
+ ref->addr = NULL;
+ else {
+ WT_RET(__wt_calloc_one(session, &addr));
+ *addr = mod->mod_replace;
+ mod->mod_replace.addr = NULL;
+ mod->mod_replace.size = 0;
+ ref->addr = addr;
+ }
/*
* Eviction wants to keep this page if we have a disk image,
* re-instantiate the page in memory, else discard the page.
*/
if (mod->mod_disk_image == NULL) {
- __wt_ref_out(session, ref);
- WT_PUBLISH(ref->state, WT_REF_DISK);
+ if (mod->mod_replace_las_pageid != 0) {
+ WT_RET(
+ __wt_calloc_one(session, &ref->page_las));
+ ref->page_las->las_pageid =
+ mod->mod_replace_las_pageid;
+#ifdef HAVE_TIMESTAMPS
+ __wt_timestamp_set(
+ &ref->page_las->min_timestamp,
+ &mod->mod_replace_las_min_timestamp);
+#endif
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state, WT_REF_LOOKASIDE);
+ } else {
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ }
} else {
/*
* The split code works with WT_MULTI structures, build
@@ -413,7 +425,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent)
*/
static int
__evict_review(
- WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *flagsp, bool closing)
+ WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t *flagsp)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -423,7 +435,9 @@ __evict_review(
bool lookaside_retry, *lookaside_retryp, modified;
conn = S2C(session);
- flags = WT_EVICTING;
+ flags = WT_REC_EVICT;
+ if (!WT_SESSION_IS_CHECKPOINT(session))
+ LF_SET(WT_REC_VISIBLE_ALL);
*flagsp = flags;
/*
@@ -502,7 +516,7 @@ __evict_review(
* the page stays in memory and the tree is left in the desired
* state: avoid the usual cleanup.
*/
- if (LF_ISSET(WT_EVICT_INMEM_SPLIT))
+ if (LF_ISSET(WT_REC_INMEM_SPLIT))
return (__wt_split_insert(session, ref));
}
@@ -545,22 +559,27 @@ __evict_review(
lookaside_retryp = NULL;
if (closing)
- LF_SET(WT_VISIBILITY_ERR);
- else if (!WT_PAGE_IS_INTERNAL(page)) {
+ LF_SET(WT_REC_VISIBILITY_ERR);
+ else if (!WT_PAGE_IS_INTERNAL(page) &&
+ !F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE)) {
if (F_ISSET(conn, WT_CONN_IN_MEMORY))
- LF_SET(WT_EVICT_IN_MEMORY |
- WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE);
+ LF_SET(WT_REC_IN_MEMORY |
+ WT_REC_SCRUB | WT_REC_UPDATE_RESTORE);
else {
- LF_SET(WT_EVICT_UPDATE_RESTORE);
+ if (!WT_SESSION_IS_CHECKPOINT(session)) {
+ LF_SET(WT_REC_UPDATE_RESTORE);
- if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
- LF_SET(WT_EVICT_SCRUB);
+ if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
+ LF_SET(WT_REC_SCRUB);
+ }
/*
* Check if reconciliation suggests trying the
* lookaside table.
*/
- lookaside_retryp = &lookaside_retry;
+ if (__wt_cache_aggressive(session) &&
+ !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE))
+ lookaside_retryp = &lookaside_retry;
}
}
@@ -574,9 +593,9 @@ __evict_review(
* table, allowing the eviction of pages we'd otherwise have to retain
* in cache to support older readers.
*/
- if (ret == EBUSY && lookaside_retry && __wt_cache_stuck(session)) {
- LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE);
- LF_SET(WT_EVICT_LOOKASIDE);
+ if (ret == EBUSY && lookaside_retry) {
+ LF_CLR(WT_REC_SCRUB | WT_REC_UPDATE_RESTORE);
+ LF_SET(WT_REC_LOOKASIDE);
ret = __wt_reconcile(session, ref, NULL, flags, NULL);
}
@@ -584,6 +603,18 @@ __evict_review(
WT_RET(ret);
/*
+ * If attempting eviction in service of a checkpoint, we may
+ * successfully reconcile but then find that there are updates on the
+ * page too new to evict. Give up in that case: checkpoint will
+ * reconcile the page normally.
+ */
+ if (WT_SESSION_IS_CHECKPOINT(session) && !__wt_page_is_modified(page) &&
+ !LF_ISSET(WT_REC_LOOKASIDE) &&
+ !__wt_txn_visible_all(session, page->modify->rec_max_txn,
+ WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp)))
+ return (EBUSY);
+
+ /*
* Success: assert the page is clean or reconciliation was configured
* for update/restore. If the page is clean, assert that reconciliation
* was configured for a lookaside table, or it's not a durable object
@@ -591,10 +622,10 @@ __evict_review(
* visible.
*/
WT_ASSERT(session,
- !__wt_page_is_modified(page) || LF_ISSET(WT_EVICT_UPDATE_RESTORE));
+ !__wt_page_is_modified(page) || LF_ISSET(WT_REC_UPDATE_RESTORE));
WT_ASSERT(session,
__wt_page_is_modified(page) ||
- LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ LF_ISSET(WT_REC_LOOKASIDE) ||
F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session, page->modify->rec_max_txn,
WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp)));
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 60ed31b64e8..3eb951f81ac 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -39,9 +39,15 @@
} while (0)
/* An API call wrapped in a transaction if necessary. */
+#ifdef HAVE_TIMESTAMPS
+#define WT_TXN_TIMESTAMP_FLAG_CHECK(s) __wt_txn_timestamp_flags((s))
+#else
+#define WT_TXN_TIMESTAMP_FLAG_CHECK(s)
+#endif
#define TXN_API_CALL(s, h, n, bt, config, cfg) do { \
bool __autotxn = false; \
API_CALL(s, h, n, bt, config, cfg); \
+ WT_TXN_TIMESTAMP_FLAG_CHECK(s); \
__autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\
if (__autotxn) \
F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT)
@@ -50,6 +56,7 @@
#define TXN_API_CALL_NOCONF(s, h, n, dh) do { \
bool __autotxn = false; \
API_CALL_NOCONF(s, h, n, dh); \
+ WT_TXN_TIMESTAMP_FLAG_CHECK(s); \
__autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\
if (__autotxn) \
F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT)
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index f0d810281c2..486ab7562a1 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -165,13 +165,13 @@ struct __wt_ovfl_reuse {
* Lookaside table support: when a page is being reconciled for eviction and has
* updates that might be required by earlier readers in the system, the updates
* are written into a lookaside table, and restored as necessary if the page is
- * read. The key is a unique marker for the page (a file ID plus an address),
- * a counter (used to ensure the update records remain in the original order),
- * the on-page item's transaction ID and timestamp (so we can discard any
- * update records from the lookaside table once the on-page item's transaction
- * is globally visible), and the page key (byte-string for row-store, record
- * number for column-store). The value is the WT_UPDATE structure's
- * transaction ID, update size and value.
+ * read.
+ *
+ * The key is a unique marker for the page (a file ID plus a page ID), a
+ * counter (used to ensure the update records remain in the original order),
+ * and the record's key (byte-string for row-store, record number for
+ * column-store). The value is the WT_UPDATE structure's transaction ID,
+ * timestamp, update type and value.
*
* As the key for the lookaside table is different for row- and column-store, we
* store both key types in a WT_ITEM, building/parsing them in the code, because
@@ -182,7 +182,7 @@ struct __wt_ovfl_reuse {
* the row-store key is relatively large.
*/
#define WT_LAS_FORMAT \
- "key_format=" WT_UNCHECKED_STRING(IuQQuu) \
+ "key_format=" WT_UNCHECKED_STRING(IQQu) \
",value_format=" WT_UNCHECKED_STRING(QuBu)
/*
@@ -239,11 +239,19 @@ struct __wt_page_modify {
* re-instantiate the page in memory.
*/
void *disk_image;
+
+ /* The page has lookaside entries. */
+ uint64_t las_pageid;
+ WT_DECL_TIMESTAMP(las_min_timestamp)
} r;
#undef mod_replace
#define mod_replace u1.r.replace
#undef mod_disk_image
#define mod_disk_image u1.r.disk_image
+#undef mod_replace_las_pageid
+#define mod_replace_las_pageid u1.r.las_pageid
+#undef mod_replace_las_min_timestamp
+#define mod_replace_las_min_timestamp u1.r.las_min_timestamp
struct { /* Multiple replacement blocks */
struct __wt_multi {
@@ -274,8 +282,7 @@ struct __wt_page_modify {
struct __wt_save_upd {
WT_INSERT *ins; /* Insert list reference */
WT_ROW *ripcip; /* Original on-page reference */
- uint64_t onpage_txn;
- WT_DECL_TIMESTAMP(onpage_timestamp)
+ WT_UPDATE *onpage_upd;
} *supd;
uint32_t supd_entries;
@@ -289,6 +296,9 @@ struct __wt_page_modify {
WT_ADDR addr;
uint32_t size;
uint32_t checksum;
+
+ uint64_t las_pageid;
+ WT_DECL_TIMESTAMP(las_min_timestamp)
} *multi;
uint32_t multi_entries; /* Multiple blocks element count */
} m;
@@ -659,6 +669,10 @@ struct __wt_page {
* thread that set the page to WT_REF_LOCKED has exclusive access, no
* other thread may use the WT_REF until the state is changed.
*
+ * WT_REF_LOOKASIDE:
+ * The page is on disk (as per WT_REF_DISK) and has entries in the
+ * lookaside table that must be applied before the page can be read.
+ *
* WT_REF_MEM:
* Set by a reading thread once the page has been read from disk; the page
* is in the cache and the page reference is OK.
@@ -696,10 +710,20 @@ struct __wt_page {
* Related information for fast-delete, on-disk pages.
*/
struct __wt_page_deleted {
- volatile uint64_t txnid; /* Transaction ID */
+ volatile uint64_t txnid; /* Transaction ID */
WT_DECL_TIMESTAMP(timestamp)
- WT_UPDATE **update_list; /* List of updates for abort */
+ WT_UPDATE **update_list; /* List of updates for abort */
+};
+
+/*
+ * WT_PAGE_LOOKASIDE --
+ * Related information for on-disk pages with lookaside entries.
+ */
+struct __wt_page_lookaside {
+ uint64_t las_pageid; /* Page ID in lookaside */
+ WT_DECL_TIMESTAMP(min_timestamp) /* Oldest timestamp in
+ lookaside for the page */
};
/*
@@ -718,12 +742,13 @@ struct __wt_ref {
WT_PAGE * volatile home; /* Reference page */
volatile uint32_t pindex_hint; /* Reference page index hint */
-#define WT_REF_DISK 0 /* Page is on disk */
-#define WT_REF_DELETED 1 /* Page is on disk, but deleted */
-#define WT_REF_LOCKED 2 /* Page locked for exclusive access */
-#define WT_REF_MEM 3 /* Page is in cache and valid */
-#define WT_REF_READING 4 /* Page being read */
-#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */
+#define WT_REF_DISK 0 /* Page is on disk */
+#define WT_REF_DELETED 1 /* Page is on disk, but deleted */
+#define WT_REF_LOCKED 2 /* Page locked for exclusive access */
+#define WT_REF_LOOKASIDE 3 /* Page is on disk with lookaside */
+#define WT_REF_MEM 4 /* Page is in cache and valid */
+#define WT_REF_READING 5 /* Page being read */
+#define WT_REF_SPLIT 6 /* Parent page split (WT_REF dead) */
volatile uint32_t state; /* Page state */
/*
@@ -745,7 +770,10 @@ struct __wt_ref {
#undef ref_ikey
#define ref_ikey key.ikey
- WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
+ union {
+ WT_PAGE_DELETED *page_del; /* Deleted page information */
+ WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */
+ };
};
/*
* WT_REF_SIZE is the expected structure size -- we verify the build to ensure
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index 8184d606022..7dc9b4a11a7 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -97,6 +97,12 @@ struct __wt_btree {
uint64_t maxmempage; /* In-memory page max size */
uint64_t splitmempage; /* In-memory split trigger size */
+#define WT_ASSERT_COMMIT_TS_ALWAYS 0x0001
+#define WT_ASSERT_COMMIT_TS_NEVER 0x0002
+#define WT_ASSERT_READ_TS_ALWAYS 0x0004
+#define WT_ASSERT_READ_TS_NEVER 0x0008
+ uint32_t assert_flags; /* Debugging assertion information */
+
void *huffman_key; /* Key huffman encoding */
void *huffman_value; /* Value huffman encoding */
@@ -128,6 +134,7 @@ struct __wt_btree {
u_int rec_multiblock_max; /* Maximum blocks written for a page */
uint64_t last_recno; /* Column-store last record number */
+ uint64_t las_pageid; /* Lookaside table page ID counter */
WT_REF root; /* Root page reference */
bool modified; /* If the tree ever modified */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 4d6844e10cc..3b196dca673 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -1303,19 +1303,20 @@ __wt_page_can_evict(
*/
if (__wt_leaf_page_can_split(session, page)) {
if (evict_flagsp != NULL)
- FLD_SET(*evict_flagsp, WT_EVICT_INMEM_SPLIT);
+ FLD_SET(*evict_flagsp, WT_REC_INMEM_SPLIT);
return (true);
}
modified = __wt_page_is_modified(page);
/*
- * If the file is being checkpointed, we can't evict dirty pages:
- * if we write a page and free the previous version of the page, that
+ * If the file is being checkpointed, other threads can't evict dirty
+ * pages: if a page is written and the previous version freed, that
* previous version might be referenced by an internal page already
- * been written in the checkpoint, leaving the checkpoint inconsistent.
+ * written in the checkpoint, leaving the checkpoint inconsistent.
*/
- if (modified && btree->checkpointing != WT_CKPT_OFF) {
+ if (modified && btree->checkpointing != WT_CKPT_OFF &&
+ !WT_SESSION_IS_CHECKPOINT(session)) {
WT_STAT_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_DATA_INCR(session, cache_eviction_checkpoint);
return (false);
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index e5593357347..1d7b6142685 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -365,10 +365,6 @@ struct __wt_connection_impl {
*/
WT_SPINLOCK las_lock; /* Lookaside table spinlock */
WT_SESSION_IMPL *las_session; /* Lookaside table session */
- bool las_written; /* Lookaside table has been written */
-
- WT_ITEM las_sweep_key; /* Sweep server's saved key */
- uint64_t las_record_cnt;/* Count of lookaside records */
/*
* The "lookaside_activity" verbose messages are throttled to once per
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 362acc71c0f..23897a05dfb 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -152,14 +152,13 @@ extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref,
extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
@@ -186,7 +185,7 @@ extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_
extern int __wt_tree_walk_custom_skip( WT_SESSION_IMPL *session, WT_REF **refp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -200,16 +199,14 @@ extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_
extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern void __wt_las_set_written(WT_SESSION_IMPL *session);
-extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_checksum_init(void);
extern void __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
index ccb32900dc4..65b4ce34752 100644
--- a/src/third_party/wiredtiger/src/include/flags.h
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -2,32 +2,26 @@
* DO NOT EDIT: automatically built by dist/flags.py.
* flags section: BEGIN
*/
-#define WT_CHECKPOINTING 0x00000001
#define WT_CONN_CACHE_POOL 0x00000001
#define WT_CONN_CKPT_SYNC 0x00000002
#define WT_CONN_CLOSING 0x00000004
#define WT_CONN_CLOSING_NO_MORE_OPENS 0x00000008
-#define WT_CONN_EVICTION_RUN 0x00000010
-#define WT_CONN_IN_MEMORY 0x00000020
-#define WT_CONN_LAS_OPEN 0x00000040
-#define WT_CONN_LEAK_MEMORY 0x00000080
-#define WT_CONN_LSM_MERGE 0x00000100
-#define WT_CONN_PANIC 0x00000200
-#define WT_CONN_READONLY 0x00000400
-#define WT_CONN_RECOVERING 0x00000800
-#define WT_CONN_SERVER_ASYNC 0x00001000
-#define WT_CONN_SERVER_CHECKPOINT 0x00002000
-#define WT_CONN_SERVER_LOG 0x00004000
-#define WT_CONN_SERVER_LSM 0x00008000
-#define WT_CONN_SERVER_STATISTICS 0x00010000
-#define WT_CONN_SERVER_SWEEP 0x00020000
-#define WT_CONN_WAS_BACKUP 0x00040000
-#define WT_EVICTING 0x00000002
-#define WT_EVICT_INMEM_SPLIT 0x00000004
-#define WT_EVICT_IN_MEMORY 0x00000008
-#define WT_EVICT_LOOKASIDE 0x00000010
-#define WT_EVICT_SCRUB 0x00000020
-#define WT_EVICT_UPDATE_RESTORE 0x00000040
+#define WT_CONN_EVICTION_NO_LOOKASIDE 0x00000010
+#define WT_CONN_EVICTION_RUN 0x00000020
+#define WT_CONN_IN_MEMORY 0x00000040
+#define WT_CONN_LAS_OPEN 0x00000080
+#define WT_CONN_LEAK_MEMORY 0x00000100
+#define WT_CONN_LSM_MERGE 0x00000200
+#define WT_CONN_PANIC 0x00000400
+#define WT_CONN_READONLY 0x00000800
+#define WT_CONN_RECOVERING 0x00001000
+#define WT_CONN_SERVER_ASYNC 0x00002000
+#define WT_CONN_SERVER_CHECKPOINT 0x00004000
+#define WT_CONN_SERVER_LOG 0x00008000
+#define WT_CONN_SERVER_LSM 0x00010000
+#define WT_CONN_SERVER_STATISTICS 0x00020000
+#define WT_CONN_SERVER_SWEEP 0x00040000
+#define WT_CONN_WAS_BACKUP 0x00080000
#define WT_LOGSCAN_FIRST 0x00000001
#define WT_LOGSCAN_FROM_CKP 0x00000002
#define WT_LOGSCAN_ONE 0x00000004
@@ -38,16 +32,26 @@
#define WT_LOG_FSYNC 0x00000008
#define WT_LOG_SYNC_ENABLED 0x00000010
#define WT_READ_CACHE 0x00000001
-#define WT_READ_NOTFOUND_OK 0x00000002
-#define WT_READ_NO_EMPTY 0x00000004
-#define WT_READ_NO_EVICT 0x00000008
-#define WT_READ_NO_GEN 0x00000010
-#define WT_READ_NO_WAIT 0x00000020
-#define WT_READ_PREV 0x00000040
-#define WT_READ_RESTART_OK 0x00000080
-#define WT_READ_SKIP_INTL 0x00000100
-#define WT_READ_TRUNCATE 0x00000200
-#define WT_READ_WONT_NEED 0x00000400
+#define WT_READ_LOOKASIDE 0x00000002
+#define WT_READ_NOTFOUND_OK 0x00000004
+#define WT_READ_NO_EMPTY 0x00000008
+#define WT_READ_NO_EVICT 0x00000010
+#define WT_READ_NO_GEN 0x00000020
+#define WT_READ_NO_WAIT 0x00000040
+#define WT_READ_PREV 0x00000080
+#define WT_READ_RESTART_OK 0x00000100
+#define WT_READ_SKIP_INTL 0x00000200
+#define WT_READ_TRUNCATE 0x00000400
+#define WT_READ_WONT_NEED 0x00000800
+#define WT_REC_CHECKPOINT 0x00000001
+#define WT_REC_EVICT 0x00000002
+#define WT_REC_INMEM_SPLIT 0x00000004
+#define WT_REC_IN_MEMORY 0x00000008
+#define WT_REC_LOOKASIDE 0x00000010
+#define WT_REC_SCRUB 0x00000020
+#define WT_REC_UPDATE_RESTORE 0x00000040
+#define WT_REC_VISIBILITY_ERR 0x00000080
+#define WT_REC_VISIBLE_ALL 0x00000100
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_INTERNAL 0x00000002
#define WT_SESSION_LOCKED_CHECKPOINT 0x00000004
@@ -118,7 +122,6 @@
#define WT_VERB_VERIFY 0x10000000
#define WT_VERB_VERSION 0x20000000
#define WT_VERB_WRITE 0x40000000
-#define WT_VISIBILITY_ERR 0x00000080
/*
* flags section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
index 5b14bb24730..871ccf63be8 100644
--- a/src/third_party/wiredtiger/src/include/mutex.i
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -113,11 +113,15 @@ static inline int
__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
{
#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+ WT_DECL_RET;
pthread_mutexattr_t attr;
WT_RET(pthread_mutexattr_init(&attr));
- WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP));
- WT_RET(pthread_mutex_init(&t->lock, &attr));
+ ret = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+ if (ret == 0)
+ ret = pthread_mutex_init(&t->lock, &attr);
+ WT_TRET(pthread_mutexattr_destroy(&attr));
+ WT_RET(ret);
#else
WT_RET(pthread_mutex_init(&t->lock, NULL));
#endif
diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h
index 9ab4c12f0d0..bae5fc8cc04 100644
--- a/src/third_party/wiredtiger/src/include/schema.h
+++ b/src/third_party/wiredtiger/src/include/schema.h
@@ -296,7 +296,9 @@ struct __wt_table {
F_CLR(session, WT_SESSION_LOCKED_CHECKPOINT); \
__wt_spin_unlock(session, &__conn->checkpoint_lock); \
} \
+ __wt_yield(); \
op; \
+ __wt_yield(); \
if (__checkpoint_locked) { \
__wt_spin_lock(session, &__conn->checkpoint_lock); \
F_SET(session, WT_SESSION_LOCKED_CHECKPOINT); \
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index bd69cc36405..bea436e05e2 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -77,9 +77,6 @@ struct __wt_session_impl {
enum { WT_COMPACT_NONE=0,
WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
- /*
- * Lookaside table cursor, sweep and eviction worker threads only.
- */
WT_CURSOR *las_cursor; /* Lookaside table cursor */
WT_CURSOR *meta_cursor; /* Metadata file */
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index c7110c245c7..922b211bec4 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -341,6 +341,7 @@ struct __wt_connection_stats {
int64_t cache_eviction_internal;
int64_t cache_eviction_split_internal;
int64_t cache_eviction_split_leaf;
+ int64_t cache_lookaside_entries;
int64_t cache_lookaside_insert;
int64_t cache_lookaside_remove;
int64_t cache_bytes_max;
diff --git a/src/third_party/wiredtiger/src/include/thread_group.h b/src/third_party/wiredtiger/src/include/thread_group.h
index 7375f9dfd87..97eda6ab674 100644
--- a/src/third_party/wiredtiger/src/include/thread_group.h
+++ b/src/third_party/wiredtiger/src/include/thread_group.h
@@ -23,8 +23,9 @@ struct __wt_thread {
*/
#define WT_THREAD_ACTIVE 0x01 /* thread is active or paused */
#define WT_THREAD_CAN_WAIT 0x02 /* WT_SESSION_CAN_WAIT */
-#define WT_THREAD_PANIC_FAIL 0x04 /* panic if the thread fails */
-#define WT_THREAD_RUN 0x08 /* thread is running */
+#define WT_THREAD_LOOKASIDE 0x04 /* open lookaside cursor */
+#define WT_THREAD_PANIC_FAIL 0x08 /* panic if the thread fails */
+#define WT_THREAD_RUN 0x10 /* thread is running */
uint32_t flags;
/*
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 69481409aaf..6b78c78a5cd 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -246,17 +246,19 @@ struct __wt_txn {
WT_ITEM *ckpt_snapshot;
bool full_ckpt;
-#define WT_TXN_AUTOCOMMIT 0x001
-#define WT_TXN_ERROR 0x002
-#define WT_TXN_HAS_ID 0x004
-#define WT_TXN_HAS_SNAPSHOT 0x008
-#define WT_TXN_HAS_TS_COMMIT 0x010
-#define WT_TXN_HAS_TS_READ 0x020
-#define WT_TXN_NAMED_SNAPSHOT 0x040
-#define WT_TXN_PUBLIC_TS_COMMIT 0x080
-#define WT_TXN_PUBLIC_TS_READ 0x100
-#define WT_TXN_READONLY 0x200
-#define WT_TXN_RUNNING 0x400
-#define WT_TXN_SYNC_SET 0x800
+#define WT_TXN_AUTOCOMMIT 0x00001
+#define WT_TXN_ERROR 0x00002
+#define WT_TXN_HAS_ID 0x00004
+#define WT_TXN_HAS_SNAPSHOT 0x00008
+#define WT_TXN_HAS_TS_COMMIT 0x00010
+#define WT_TXN_HAS_TS_READ 0x00020
+#define WT_TXN_NAMED_SNAPSHOT 0x00040
+#define WT_TXN_PUBLIC_TS_COMMIT 0x00080
+#define WT_TXN_PUBLIC_TS_READ 0x00100
+#define WT_TXN_READONLY 0x00200
+#define WT_TXN_RUNNING 0x00400
+#define WT_TXN_SYNC_SET 0x00800
+#define WT_TXN_TS_COMMIT_ALWAYS 0x01000
+#define WT_TXN_TS_COMMIT_NEVER 0x02000
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index e53ab6a69ee..26dcd01fe5e 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -10,6 +10,26 @@ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
#ifdef HAVE_TIMESTAMPS
+/*
+ * __wt_txn_timestamp_flags --
+ * Set txn related timestamp flags.
+ *
+ * Mirrors the commit-timestamp assertion settings of the btree returned
+ * by S2BT(session) onto the session's transaction: WT_TXN_TS_COMMIT_ALWAYS
+ * is set when the btree's assert_flags include WT_ASSERT_COMMIT_TS_ALWAYS,
+ * and WT_TXN_TS_COMMIT_NEVER is set when they include
+ * WT_ASSERT_COMMIT_TS_NEVER. Flags are only ever set here, never cleared.
+ * Returns without touching the transaction if the session has no data
+ * handle or if S2BT(session) is NULL.
+ */
+static inline void
+__wt_txn_timestamp_flags(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ if (session->dhandle == NULL)
+ return;
+ btree = S2BT(session);
+ if (btree == NULL)
+ return;
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS))
+ F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS);
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER))
+ F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER);
+}
+
#if WT_TIMESTAMP_SIZE == 8
#define WT_WITH_TIMESTAMP_READLOCK(session, l, e) e
@@ -635,6 +655,37 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_search_check --
+ * Check if the current transaction can search.
+ *
+ * When HAVE_TIMESTAMPS is defined, the btree's read-timestamp assertions
+ * are compared against the transaction's WT_TXN_PUBLIC_TS_READ flag and
+ * EINVAL is raised through WT_RET_MSG on a mismatch (timestamp required
+ * but absent, or forbidden but present). If no assertion is violated, or
+ * HAVE_TIMESTAMPS is not defined, the function returns 0.
+ */
+static inline int
+__wt_txn_search_check(WT_SESSION_IMPL *session)
+{
+#ifdef HAVE_TIMESTAMPS
+ WT_BTREE *btree;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ btree = S2BT(session);
+ /*
+ * If the user says a table should always use a read timestamp,
+ * verify this transaction has one. Same if it should never have
+ * a read timestamp.
+ *
+ * NOTE(review): btree is dereferenced without the NULL checks that
+ * __wt_txn_timestamp_flags performs; presumably callers always have
+ * a data handle set on the session -- confirm against callers.
+ */
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) &&
+ !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ WT_RET_MSG(session, EINVAL, "read_timestamp required and "
+ "none set on this transaction");
+ if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) &&
+ F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ WT_RET_MSG(session, EINVAL, "no read_timestamp required and "
+ "timestamp set on this transaction");
+#endif
+ WT_UNUSED(session);
+ return (0);
+}
+
+/*
* __wt_txn_update_check --
* Check if the current transaction can update an item.
*/
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 1e526edaedc..830850f102b 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -4841,454 +4841,456 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1082
/*! cache: leaf pages split during eviction */
#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1083
+/*! cache: lookaside table entries */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1084
/*! cache: lookaside table insert calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1084
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1085
/*! cache: lookaside table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1085
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1086
/*! cache: maximum bytes configured */
-#define WT_STAT_CONN_CACHE_BYTES_MAX 1086
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1087
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1087
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1088
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1088
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1089
/*! cache: modified pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1089
+#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1090
/*! cache: overflow pages read into cache */
-#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1090
+#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1091
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1091
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1092
/*! cache: page written requiring lookaside records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1092
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1093
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1093
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1094
/*! cache: pages evicted because they exceeded the in-memory maximum count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1094
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1095
/*!
* cache: pages evicted because they exceeded the in-memory maximum time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1095
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1096
/*! cache: pages evicted because they had chains of deleted items count */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1096
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1097
/*!
* cache: pages evicted because they had chains of deleted items time
* (usecs)
*/
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1097
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1098
/*! cache: pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP 1098
+#define WT_STAT_CONN_CACHE_EVICTION_APP 1099
/*! cache: pages queued for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1099
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1100
/*! cache: pages queued for urgent eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1100
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1101
/*! cache: pages queued for urgent eviction during walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1101
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1102
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1102
+#define WT_STAT_CONN_CACHE_READ 1103
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1103
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1104
/*! cache: pages requested from the cache */
-#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1104
+#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1105
/*! cache: pages seen by eviction walk */
-#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1105
+#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1106
/*! cache: pages selected for eviction unable to be evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1106
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1107
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1107
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1108
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1108
+#define WT_STAT_CONN_CACHE_WRITE 1109
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1109
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1110
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1110
+#define WT_STAT_CONN_CACHE_OVERHEAD 1111
/*! cache: tracked bytes belonging to internal pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1111
+#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1112
/*! cache: tracked bytes belonging to leaf pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_LEAF 1112
+#define WT_STAT_CONN_CACHE_BYTES_LEAF 1113
/*! cache: tracked dirty bytes in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1113
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1114
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1114
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1115
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1115
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1116
/*! connection: auto adjusting condition resets */
-#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1116
+#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1117
/*! connection: auto adjusting condition wait calls */
-#define WT_STAT_CONN_COND_AUTO_WAIT 1117
+#define WT_STAT_CONN_COND_AUTO_WAIT 1118
/*! connection: detected system time went backwards */
-#define WT_STAT_CONN_TIME_TRAVEL 1118
+#define WT_STAT_CONN_TIME_TRAVEL 1119
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1119
+#define WT_STAT_CONN_FILE_OPEN 1120
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1120
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1121
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1121
+#define WT_STAT_CONN_MEMORY_FREE 1122
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1122
+#define WT_STAT_CONN_MEMORY_GROW 1123
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1123
+#define WT_STAT_CONN_COND_WAIT 1124
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1124
+#define WT_STAT_CONN_RWLOCK_READ 1125
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1125
+#define WT_STAT_CONN_RWLOCK_WRITE 1126
/*! connection: total fsync I/Os */
-#define WT_STAT_CONN_FSYNC_IO 1126
+#define WT_STAT_CONN_FSYNC_IO 1127
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1127
+#define WT_STAT_CONN_READ_IO 1128
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1128
+#define WT_STAT_CONN_WRITE_IO 1129
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1129
+#define WT_STAT_CONN_CURSOR_CREATE 1130
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1130
+#define WT_STAT_CONN_CURSOR_INSERT 1131
/*! cursor: cursor modify calls */
-#define WT_STAT_CONN_CURSOR_MODIFY 1131
+#define WT_STAT_CONN_CURSOR_MODIFY 1132
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1132
+#define WT_STAT_CONN_CURSOR_NEXT 1133
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1133
+#define WT_STAT_CONN_CURSOR_PREV 1134
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1134
+#define WT_STAT_CONN_CURSOR_REMOVE 1135
/*! cursor: cursor reserve calls */
-#define WT_STAT_CONN_CURSOR_RESERVE 1135
+#define WT_STAT_CONN_CURSOR_RESERVE 1136
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1136
+#define WT_STAT_CONN_CURSOR_RESET 1137
/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1137
+#define WT_STAT_CONN_CURSOR_RESTART 1138
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1138
+#define WT_STAT_CONN_CURSOR_SEARCH 1139
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1139
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1140
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1140
+#define WT_STAT_CONN_CURSOR_UPDATE 1141
/*! cursor: truncate calls */
-#define WT_STAT_CONN_CURSOR_TRUNCATE 1141
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1142
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1142
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1143
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1143
+#define WT_STAT_CONN_DH_SWEEP_REF 1144
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1144
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1145
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1145
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1146
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1146
+#define WT_STAT_CONN_DH_SWEEP_TOD 1147
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1147
+#define WT_STAT_CONN_DH_SWEEPS 1148
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1148
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1149
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1149
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1150
/*! lock: checkpoint lock acquisitions */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1150
+#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1151
/*! lock: checkpoint lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1151
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1152
/*! lock: checkpoint lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1152
+#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1153
/*!
* lock: dhandle lock application thread time waiting for the dhandle
* lock (usecs)
*/
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1153
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1154
/*!
* lock: dhandle lock internal thread time waiting for the dhandle lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1154
+#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1155
/*! lock: dhandle read lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1155
+#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1156
/*! lock: dhandle write lock acquisitions */
-#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1156
+#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1157
/*! lock: metadata lock acquisitions */
-#define WT_STAT_CONN_LOCK_METADATA_COUNT 1157
+#define WT_STAT_CONN_LOCK_METADATA_COUNT 1158
/*! lock: metadata lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1158
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1159
/*! lock: metadata lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1159
+#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1160
/*! lock: schema lock acquisitions */
-#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1160
+#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1161
/*! lock: schema lock application thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1161
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1162
/*! lock: schema lock internal thread wait time (usecs) */
-#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1162
+#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1163
/*!
* lock: table lock application thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1163
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1164
/*!
* lock: table lock internal thread time waiting for the table lock
* (usecs)
*/
-#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1164
+#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1165
/*! lock: table read lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1165
+#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1166
/*! lock: table write lock acquisitions */
-#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1166
+#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1167
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1167
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1168
/*! log: force checkpoint calls slept */
-#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1168
+#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1169
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1169
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1170
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1170
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1171
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1171
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1172
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1172
+#define WT_STAT_CONN_LOG_FLUSH 1173
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1173
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1174
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1174
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1175
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1175
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1176
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1176
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1177
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1177
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1178
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1178
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1179
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1179
+#define WT_STAT_CONN_LOG_SCANS 1180
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1180
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1181
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1181
+#define WT_STAT_CONN_LOG_WRITE_LSN 1182
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1182
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1183
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1183
+#define WT_STAT_CONN_LOG_SYNC 1184
/*! log: log sync time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DURATION 1184
+#define WT_STAT_CONN_LOG_SYNC_DURATION 1185
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1185
+#define WT_STAT_CONN_LOG_SYNC_DIR 1186
/*! log: log sync_dir time duration (usecs) */
-#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1186
+#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1187
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1187
+#define WT_STAT_CONN_LOG_WRITES 1188
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1188
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1189
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1189
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1190
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1190
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1191
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1191
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1192
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1192
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1193
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1193
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1194
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1194
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1195
/*! log: slot close lost race */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1195
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1196
/*! log: slot close unbuffered waits */
-#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1196
+#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1197
/*! log: slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1197
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1198
/*! log: slot join atomic update races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1198
+#define WT_STAT_CONN_LOG_SLOT_RACES 1199
/*! log: slot join calls atomic updates raced */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1199
+#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1200
/*! log: slot join calls did not yield */
-#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1200
+#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1201
/*! log: slot join calls found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1201
+#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1202
/*! log: slot join calls slept */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1202
+#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1203
/*! log: slot join calls yielded */
-#define WT_STAT_CONN_LOG_SLOT_YIELD 1203
+#define WT_STAT_CONN_LOG_SLOT_YIELD 1204
/*! log: slot join found active slot closed */
-#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1204
+#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1205
/*! log: slot joins yield time (usecs) */
-#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1205
+#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1206
/*! log: slot transitions unable to find free slot */
-#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1206
+#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1207
/*! log: slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1207
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1208
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1208
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1209
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1209
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1210
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1210
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1211
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1211
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1212
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1212
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1213
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1213
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1214
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1214
+#define WT_STAT_CONN_REC_PAGES 1215
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1215
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1216
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1216
+#define WT_STAT_CONN_REC_PAGE_DELETE 1217
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1217
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1218
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1218
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1219
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1219
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1220
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1220
+#define WT_STAT_CONN_SESSION_OPEN 1221
/*! session: table alter failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1221
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1222
/*! session: table alter successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1222
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1223
/*! session: table alter unchanged and skipped */
-#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1223
+#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1224
/*! session: table compact failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1224
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1225
/*! session: table compact successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1225
+#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1226
/*! session: table create failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1226
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1227
/*! session: table create successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1227
+#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1228
/*! session: table drop failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1228
+#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1229
/*! session: table drop successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1229
+#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1230
/*! session: table rebalance failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1230
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1231
/*! session: table rebalance successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1231
+#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1232
/*! session: table rename failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1232
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1233
/*! session: table rename successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1233
+#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1234
/*! session: table salvage failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1234
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1235
/*! session: table salvage successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1235
+#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1236
/*! session: table truncate failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1236
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1237
/*! session: table truncate successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1237
+#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1238
/*! session: table verify failed calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1238
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1239
/*! session: table verify successful calls */
-#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1239
+#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1240
/*! thread-state: active filesystem fsync calls */
-#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1240
+#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1241
/*! thread-state: active filesystem read calls */
-#define WT_STAT_CONN_THREAD_READ_ACTIVE 1241
+#define WT_STAT_CONN_THREAD_READ_ACTIVE 1242
/*! thread-state: active filesystem write calls */
-#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1242
+#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1243
/*! thread-yield: application thread time evicting (usecs) */
-#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1243
+#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1244
/*! thread-yield: application thread time waiting for cache (usecs) */
-#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1244
+#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1245
/*!
* thread-yield: connection close blocked waiting for transaction state
* stabilization
*/
-#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1245
+#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1246
/*! thread-yield: connection close yielded for lsm manager shutdown */
-#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1246
+#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1247
/*! thread-yield: data handle lock yielded */
-#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1247
+#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1248
/*!
* thread-yield: get reference for page index and slot time sleeping
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1248
+#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1249
/*! thread-yield: log server sync yielded for log write */
-#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1249
+#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1250
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1250
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1251
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1251
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1252
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1252
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1253
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1253
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1254
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1254
+#define WT_STAT_CONN_PAGE_SLEEP 1255
/*!
* thread-yield: page delete rollback time sleeping for state change
* (usecs)
*/
-#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1255
+#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1256
/*! thread-yield: page reconciliation yielded due to child modification */
-#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1256
+#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1257
/*!
* thread-yield: tree descend one level yielded for split page index
* update
*/
-#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1257
+#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1258
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1258
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1259
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1259
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1260
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1260
+#define WT_STAT_CONN_TXN_BEGIN 1261
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1261
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1262
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1262
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1263
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1263
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1264
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1264
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1265
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1265
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1266
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1266
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1267
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1267
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1268
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1268
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1269
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1269
+#define WT_STAT_CONN_TXN_CHECKPOINT 1270
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1270
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1271
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1271
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1272
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1272
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1273
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1273
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1274
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1274
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1275
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1275
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1276
/*!
* transaction: transaction range of IDs currently pinned by named
* snapshots
*/
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1276
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1277
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1277
+#define WT_STAT_CONN_TXN_SYNC 1278
/*! transaction: transactions commit timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1278
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1279
/*! transaction: transactions commit timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1279
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1280
/*! transaction: transactions commit timestamp queue length */
-#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1280
+#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1281
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1281
+#define WT_STAT_CONN_TXN_COMMIT 1282
/*! transaction: transactions read timestamp queue inserts to head */
-#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1282
+#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1283
/*! transaction: transactions read timestamp queue inserts total */
-#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1283
+#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1284
/*! transaction: transactions read timestamp queue length */
-#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1284
+#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1285
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1285
+#define WT_STAT_CONN_TXN_ROLLBACK 1286
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1286
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1287
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 84617dfcab8..b25ed08e30f 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -258,6 +258,8 @@ struct __wt_page_header;
typedef struct __wt_page_header WT_PAGE_HEADER;
struct __wt_page_index;
typedef struct __wt_page_index WT_PAGE_INDEX;
+struct __wt_page_lookaside;
+ typedef struct __wt_page_lookaside WT_PAGE_LOOKASIDE;
struct __wt_page_modify;
typedef struct __wt_page_modify WT_PAGE_MODIFY;
struct __wt_process;
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index a42fbbe511b..95d025247a6 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -102,8 +102,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
static void
__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
{
- int i;
-
/*
* Stop any new work units being added. The barrier is necessary
* because we rely on the state change being visible before checking
@@ -118,8 +116,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
* we know a user is holding a reference to the tree, so exclusive
* access is not available.
*/
- for (i = 0;
- lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1); ++i) {
+ while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) {
/*
* Remove any work units from the manager queues. Do this step
* repeatedly in case a work unit was in the process of being
@@ -133,10 +130,8 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final)
* other schema level operations will return EBUSY, even though
* we're dropping the schema lock here.
*/
- if (i % WT_THOUSAND == 0)
- WT_WITHOUT_LOCKS(session,
- __wt_lsm_manager_clear_tree(session, lsm_tree));
- __wt_yield();
+ WT_WITHOUT_LOCKS(session,
+ __wt_lsm_manager_clear_tree(session, lsm_tree));
}
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 879913bccec..05e5fe5b07e 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -320,11 +320,12 @@ int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_TXN_ISOLATION saved_isolation;
- bool flush_set, release_btree;
+ bool flush_set, release_dhandle;
- flush_set = release_btree = false;
+ flush_set = release_dhandle = false;
/*
* If the chunk is already checkpointed, make sure it is also evicted.
@@ -374,7 +375,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* take a long time.
*/
WT_ERR(__wt_session_get_dhandle(session, chunk->uri, NULL, NULL, 0));
- release_btree = true;
+ release_dhandle = true;
/*
* Set read-uncommitted: we have already checked that all of the updates
@@ -407,9 +408,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (ret != 0)
WT_ERR_MSG(session, ret, "LSM checkpoint");
- release_btree = false;
- WT_ERR(__wt_session_release_dhandle(session));
-
/* Now the file is written, get the chunk size. */
WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
@@ -429,6 +427,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
if (ret != 0)
WT_ERR_MSG(session, ret, "LSM metadata write");
+ /*
+ * Enable eviction on the live chunk so it doesn't block the cache.
+ * Future reads should direct to the on-disk chunk anyway.
+ */
+ btree = session->dhandle->handle;
+ if (btree->evict_disabled_open) {
+ btree->evict_disabled_open = false;
+ __wt_evict_file_exclusive_off(session);
+ }
+
+ release_dhandle = false;
+ WT_ERR(__wt_session_release_dhandle(session));
+
WT_PUBLISH(chunk->flushing, 0);
flush_set = false;
@@ -448,7 +459,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
err: if (flush_set)
WT_PUBLISH(chunk->flushing, 0);
- if (release_btree)
+ if (release_dhandle)
WT_TRET(__wt_session_release_dhandle(session));
return (ret);
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
index 5d0295d94ce..533d2a0ab08 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -26,8 +26,11 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
pthread_condattr_t condattr;
WT_ERR(pthread_condattr_init(&condattr));
- WT_ERR(pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC));
- WT_ERR(pthread_cond_init(&cond->cond, &condattr));
+ ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC);
+ if (ret == 0)
+ ret = pthread_cond_init(&cond->cond, &condattr);
+ WT_TRET(pthread_condattr_destroy(&condattr));
+ WT_ERR(ret);
}
#else
WT_ERR(pthread_cond_init(&cond->cond, NULL));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 4cb5ae12e5b..af43a56f877 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -45,13 +45,13 @@ typedef struct {
uint64_t last_running;
WT_DECL_TIMESTAMP(stable_timestamp)
- /* Track the page's maximum transaction. */
+ /* Track the page's min/maximum transactions. */
uint64_t max_txn;
WT_DECL_TIMESTAMP(max_timestamp)
+ WT_DECL_TIMESTAMP(min_saved_timestamp)
- uint64_t update_mem_all; /* Total update memory size */
- uint64_t update_mem_saved; /* Saved update memory size */
- uint64_t update_mem_uncommitted;/* Uncommitted update memory size */
+ bool update_uncommitted; /* An update was uncommitted */
+ bool update_used; /* An update could be used */
/*
* When we can't mark the page clean (for example, checkpoint found some
@@ -154,8 +154,6 @@ typedef struct {
*/
struct __rec_chunk {
/*
- * Current and minimum boundaries.
- *
* The recno and entries fields are the starting record number
* of the split chunk (for column-store splits), and the number
* of entries in the split chunk.
@@ -193,8 +191,8 @@ typedef struct {
size_t min_space_avail;
/*
- * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
- * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each
+ * Saved update list, supporting the WT_REC_UPDATE_RESTORE and
+ * WT_REC_LOOKASIDE configurations. While reviewing updates for each
* page, we save WT_UPDATE lists here, and then move them to per-block
* areas as the blocks are defined.
*/
@@ -220,7 +218,14 @@ typedef struct {
* There's some trickiness here, see the code for comments on how
* these fields work.
*/
- bool cell_zero; /* Row-store internal page 0th key */
+ bool cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * We calculate checksums to find previously written identical blocks,
+ * but once a match fails during an eviction, there's no point trying
+ * again.
+ */
+ bool evict_matching_checksum_failed;
/*
* WT_DICTIONARY --
@@ -324,7 +329,7 @@ static int __rec_split_write(
static int __rec_update_las(
WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_MULTI *);
static int __rec_write_check_complete(
- WT_SESSION_IMPL *, WT_RECONCILE *, bool *);
+ WT_SESSION_IMPL *, WT_RECONCILE *, int, bool *);
static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
@@ -335,7 +340,8 @@ static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
static int __rec_dictionary_lookup(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
static void __rec_dictionary_reset(WT_RECONCILE *);
-static void __rec_verbose_lookaside_write(WT_SESSION_IMPL *);
+static void __rec_verbose_lookaside_write(
+ WT_SESSION_IMPL *, uint32_t, uint64_t);
/*
* __wt_reconcile --
@@ -361,9 +367,21 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
__wt_verbose(session, WT_VERB_RECONCILE,
"%p reconcile %s (%s%s%s)",
(void *)ref, __wt_page_type_string(page->type),
- LF_ISSET(WT_EVICTING) ? "evict" : "checkpoint",
- LF_ISSET(WT_EVICT_LOOKASIDE) ? ", lookaside" : "",
- LF_ISSET(WT_EVICT_UPDATE_RESTORE) ? ", update/restore" : "");
+ LF_ISSET(WT_REC_EVICT) ? "evict" : "checkpoint",
+ LF_ISSET(WT_REC_LOOKASIDE) ? ", lookaside" : "",
+ LF_ISSET(WT_REC_UPDATE_RESTORE) ? ", update/restore" : "");
+
+ /*
+ * Sanity check flags.
+ *
+ * We can only do update/restore eviction when the version that ends up
+ * in the page image is the oldest one any reader could need.
+ * Otherwise we would need to keep updates in memory that go back older
+ * than the version in the disk image, and since modify operations
+ * aren't idempotent, that is problematic.
+ */
+ WT_ASSERT(session, !LF_ISSET(WT_REC_UPDATE_RESTORE) ||
+ LF_ISSET(WT_REC_VISIBLE_ALL));
/* We shouldn't get called with a clean page, that's an error. */
WT_ASSERT(session, __wt_page_is_modified(page));
@@ -380,7 +398,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_PAGE_LOCK(session, page);
oldest_id = __wt_txn_oldest_id(session);
- if (LF_ISSET(WT_EVICTING))
+ if (LF_ISSET(WT_REC_EVICT))
mod->last_eviction_id = oldest_id;
#ifdef HAVE_DIAGNOSTIC
@@ -426,9 +444,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ILLEGAL_VALUE_SET(session);
}
- /* Checks for a successful reconciliation. */
- if (ret == 0)
- ret = __rec_write_check_complete(session, r, lookaside_retryp);
+ /* Check for a successful reconciliation. */
+ WT_TRET(__rec_write_check_complete(session, r, ret, lookaside_retryp));
/* Wrap up the page reconciliation. */
if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0)
@@ -442,7 +459,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
/* Update statistics. */
WT_STAT_CONN_INCR(session, rec_pages);
WT_STAT_DATA_INCR(session, rec_pages);
- if (LF_ISSET(WT_EVICTING)) {
+ if (LF_ISSET(WT_REC_EVICT)) {
WT_STAT_CONN_INCR(session, rec_pages_eviction);
WT_STAT_DATA_INCR(session, rec_pages_eviction);
}
@@ -478,14 +495,16 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_TRET(session->block_manager_cleanup(session));
WT_TRET(__rec_destroy_session(session));
+ }
- /*
- * We track removed overflow objects in case there's a reader
- * in transit when they're removed. Any form of eviction locks
- * out readers, we can discard them all.
- */
+ /*
+ * We track removed overflow objects in case there's a reader in
+ * transit when they're removed. Any form of eviction locks out
+ * readers, we can discard them all.
+ */
+ if (LF_ISSET(WT_REC_EVICT))
__wt_ovfl_discard_remove(session, page);
- }
+
WT_RET(ret);
/*
@@ -531,7 +550,7 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* drain lookaside table reconciliations, and this isn't a problem for
* most workloads.
*/
- if (!F_ISSET(r, WT_EVICT_LOOKASIDE))
+ if (!F_ISSET(r, WT_REC_LOOKASIDE))
return (false);
if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
return (false);
@@ -549,7 +568,7 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
*/
static int
__rec_write_check_complete(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp)
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int tret, bool *lookaside_retryp)
{
/*
* Tests in this function are lookaside tests and tests to decide if
@@ -558,7 +577,7 @@ __rec_write_check_complete(
* checks for in-memory eviction because a small cache can force us to
* rewrite every possible page.
*/
- if (F_ISSET(r, WT_EVICT_IN_MEMORY))
+ if (F_ISSET(r, WT_REC_IN_MEMORY))
return (0);
/*
@@ -569,17 +588,26 @@ __rec_write_check_complete(
return (EBUSY);
/*
- * Eviction can configure lookaside table reconciliation, consider if
- * it's worth giving up this reconciliation attempt and falling back to
- * using the lookaside table. We continue with evict/restore if
- * switching to the lookaside doesn't make sense for any reason: we
- * won't retry an evict/restore reconciliation until/unless the
- * transactional system moves forward, so at worst it's a single wasted
- * effort.
+ * Fall back to lookaside eviction during checkpoints if a page can't
+ * be evicted.
+ */
+ if (tret == EBUSY && lookaside_retryp != NULL &&
+ !F_ISSET(r, WT_REC_UPDATE_RESTORE) && !r->update_uncommitted)
+ *lookaside_retryp = true;
+
+ /* Don't continue if we have already given up. */
+ WT_RET(tret);
+
+ /*
+ * Check if this reconciliation attempt is making progress. If there's
+ * any sign of progress, don't fall back to the lookaside table.
*
- * First, check if the lookaside table is a possible alternative.
+ * Check if the current reconciliation split, in which case we'll
+ * likely get to write at least one of the blocks. If we've created a
+ * page image for a page that previously didn't have one, or we had a
+ * page image and it is now empty, that's also progress.
*/
- if (lookaside_retryp == NULL)
+ if (r->multi_next > 1)
return (0);
/*
@@ -590,38 +618,20 @@ __rec_write_check_complete(
* If no updates were saved, eviction will succeed without needing to
* restore anything.
*/
- if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->supd == NULL)
- return (0);
-
- /*
- * Check if this reconciliation attempt is making progress. If there's
- * any sign of progress, don't fall back to the lookaside table.
- *
- * Check if the current reconciliation split, in which case we'll likely
- * get to write at least one of the blocks.
- */
- if (r->multi_next > 1)
+ if (!F_ISSET(r, WT_REC_UPDATE_RESTORE) || lookaside_retryp == NULL ||
+ (r->multi_next == 1 && r->multi->supd_entries == 0))
return (0);
/*
* Check if the current reconciliation applied some updates, in which
* case evict/restore should gain us some space.
- */
- if (r->update_mem_saved != r->update_mem_all)
- return (0);
-
- /*
+ *
* Check if lookaside eviction is possible. If any of the updates we
- * saw were uncommitted, the lookaside table cannot be used: it only
- * helps with older readers preventing eviction.
+ * saw were uncommitted, the lookaside table cannot be used.
*/
- if (r->update_mem_uncommitted != 0)
+ if (r->update_used || r->update_uncommitted)
return (0);
- /*
- * The current evict/restore approach shows no signs of being useful,
- * lookaside is possible, suggest the lookaside table.
- */
*lookaside_retryp = true;
return (EBUSY);
}
@@ -665,8 +675,8 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* eviction path.
*/
WT_ASSERT(session,
- !F_ISSET(r, WT_EVICTING) ||
- F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ !F_ISSET(r, WT_REC_EVICT) ||
+ F_ISSET(r, WT_REC_UPDATE_RESTORE));
} else {
/*
* Track the page's maximum transaction ID (used to decide if
@@ -685,7 +695,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
* about the maximum transaction ID of current updates in the
* tree, and checkpoint visits every dirty page in the tree.
*/
- if (!F_ISSET(r, WT_EVICTING)) {
+ if (!F_ISSET(r, WT_REC_EVICT)) {
if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
btree->rec_max_txn = r->max_txn;
#ifdef HAVE_TIMESTAMPS
@@ -707,7 +717,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
__wt_cache_dirty_decr(session, page);
else
- WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
}
}
@@ -903,49 +913,50 @@ __rec_init(WT_SESSION_IMPL *session,
#endif
/*
+ * When operating on the lookaside table, we should never try
+ * update/restore or lookaside eviction.
+ */
+ WT_ASSERT(session, !F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
+ !LF_ISSET(WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE));
+
+ /*
* Lookaside table eviction is configured when eviction gets aggressive,
* adjust the flags for cases we don't support.
+ *
+ * We don't yet support fixed-length column-store combined with the
+ * lookaside table. It's not hard to do, but the underlying function
+ * that reviews which updates can be written to the evicted page and
+ * which updates need to be written to the lookaside table needs access
+ * to the original value from the page being evicted, and there's no
+ * code path for that in the case of fixed-length column-store objects.
+ * (Row-store and variable-width column-store objects provide a
+ * reference to the unpacked on-page cell for this purpose, but there
+ * isn't an on-page cell for fixed-length column-store objects.) For
+ * now, turn it off.
*/
- if (LF_ISSET(WT_EVICT_LOOKASIDE)) {
- /*
- * Saving lookaside table updates into the lookaside table won't
- * work.
- */
- if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
- LF_CLR(WT_EVICT_LOOKASIDE);
+ if (page->type == WT_PAGE_COL_FIX)
+ LF_CLR(WT_REC_LOOKASIDE);
- /*
- * We don't yet support fixed-length column-store combined with
- * the lookaside table. It's not hard to do, but the underlying
- * function that reviews which updates can be written to the
- * evicted page and which updates need to be written to the
- * lookaside table needs access to the original value from the
- * page being evicted, and there's no code path for that in the
- * case of fixed-length column-store objects. (Row-store and
- * variable-width column-store objects provide a reference to
- * the unpacked on-page cell for this purpose, but there isn't
- * an on-page cell for fixed-length column-store objects.) For
- * now, turn it off.
- */
- if (page->type == WT_PAGE_COL_FIX)
- LF_CLR(WT_EVICT_LOOKASIDE);
+ /*
+ * Check for a lookaside table and checkpoint collision, and if we find
+ * one, turn off the lookaside file (we've gone to all the effort of
+ * getting exclusive access to the page, might as well try and evict
+ * it).
+ */
+ if (LF_ISSET(WT_REC_LOOKASIDE) && __rec_las_checkpoint_test(session, r))
+ LF_CLR(WT_REC_LOOKASIDE);
- /*
- * Check for a lookaside table and checkpoint collision, and if
- * we find one, turn off the lookaside file (we've gone to all
- * the effort of getting exclusive access to the page, might as
- * well try and evict it).
- */
- if (__rec_las_checkpoint_test(session, r))
- LF_CLR(WT_EVICT_LOOKASIDE);
- }
r->flags = flags;
- /* Track the page's maximum transaction ID. */
+ /* Track the page's min/maximum transaction */
r->max_txn = WT_TXN_NONE;
+#ifdef HAVE_TIMESTAMPS
+ __wt_timestamp_set_zero(&r->max_timestamp);
+ __wt_timestamp_set_inf(&r->min_saved_timestamp);
+#endif
- /* Track if all updates were skipped. */
- r->update_mem_all = r->update_mem_saved = r->update_mem_uncommitted = 0;
+ /* Track if updates were used and/or uncommitted. */
+ r->update_used = r->update_uncommitted = false;
/* Track if the page can be marked clean. */
r->leave_dirty = false;
@@ -973,6 +984,8 @@ __rec_init(WT_SESSION_IMPL *session,
r->wrapup_checkpoint = NULL;
r->wrapup_checkpoint_compressed = false;
+ r->evict_matching_checksum_failed = false;
+
/*
* Dictionary compression only writes repeated values once. We grow
* the dictionary as necessary, always using the largest size we've
@@ -1032,7 +1045,7 @@ __rec_init(WT_SESSION_IMPL *session,
/*
* __rec_cleanup --
* Clean up after a reconciliation run, except for structures cached
- * across runs.
+ * across runs.
*/
static void
__rec_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
@@ -1113,19 +1126,13 @@ __rec_destroy_session(WT_SESSION_IMPL *session)
*/
static int
__rec_update_save(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *upd)
+ WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd)
{
WT_RET(__wt_realloc_def(
session, &r->supd_allocated, r->supd_next + 1, &r->supd));
r->supd[r->supd_next].ins = ins;
r->supd[r->supd_next].ripcip = ripcip;
- r->supd[r->supd_next].onpage_txn =
- upd == NULL ? WT_TXN_NONE : upd->txnid;
-#ifdef HAVE_TIMESTAMPS
- if (upd != NULL)
- __wt_timestamp_set(
- &r->supd[r->supd_next].onpage_timestamp, &upd->timestamp);
-#endif
+ r->supd[r->supd_next].onpage_upd = onpage_upd;
++r->supd_next;
return (0);
}
@@ -1136,7 +1143,7 @@ __rec_update_save(WT_SESSION_IMPL *session,
*/
static int
__rec_append_orig_value(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack)
+ WT_PAGE *page, WT_UPDATE *first_upd, WT_CELL_UNPACK *unpack)
{
WT_DECL_ITEM(tmp);
WT_DECL_RET;
@@ -1147,7 +1154,7 @@ __rec_append_orig_value(WT_SESSION_IMPL *session,
* If at least one self-contained update is globally visible, we're
* done.
*/
- for (upd = upd_list; upd != NULL; upd = upd->next)
+ for (upd = first_upd; upd != NULL; upd = upd->next)
if (WT_UPDATE_DATA_VALUE(upd) &&
__wt_txn_upd_visible_all(session, upd))
return (0);
@@ -1180,7 +1187,7 @@ __rec_append_orig_value(WT_SESSION_IMPL *session,
*
* Append the new entry to the update list.
*/
- for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ for (upd = first_upd; upd->next != NULL; upd = upd->next)
;
WT_PUBLISH(upd->next, append);
__wt_cache_page_inmem_incr(session, page, size);
@@ -1192,138 +1199,114 @@ err: __wt_scr_free(session, &tmp);
/*
* __rec_txn_read --
* Return the update in a list that should be written (or NULL if none can
- * be written).
+ * be written).
*/
static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
WT_BTREE *btree;
- WT_DECL_TIMESTAMP(max_timestamp)
WT_PAGE *page;
- WT_UPDATE *upd, *upd_list;
- size_t update_mem;
+ WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd;
+ wt_timestamp_t *timestampp;
uint64_t max_txn, txnid;
- bool skipped;
+ bool all_visible, uncommitted;
*updp = NULL;
btree = S2BT(session);
page = r->page;
+ first_ts_upd = first_txn_upd = NULL;
+ max_txn = WT_TXN_NONE;
+ uncommitted = false;
/*
* If called with a WT_INSERT item, use its WT_UPDATE list (which must
* exist), otherwise check for an on-page row-store WT_UPDATE list
* (which may not exist). Return immediately if the item has no updates.
*/
- if (ins == NULL) {
- if ((upd_list = WT_ROW_UPDATE(page, ripcip)) == NULL)
- return (0);
- } else
- upd_list = ins->upd;
+ if (ins != NULL)
+ first_upd = ins->upd;
+ else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL)
+ return (0);
- skipped = false;
- update_mem = 0;
- max_txn = WT_TXN_NONE;
-#ifdef HAVE_TIMESTAMPS
- __wt_timestamp_set_zero(&max_timestamp);
-#endif
+ for (upd = first_upd; upd != NULL; upd = upd->next) {
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
- if (F_ISSET(r, WT_EVICTING)) {
- /* Discard obsolete updates. */
- if ((upd = __wt_update_obsolete_check(
- session, page, upd_list->next)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
+ /*
+ * Track the first update in the chain that is not aborted and
+ * the maximum transaction ID.
+ */
+ if (first_txn_upd == NULL)
+ first_txn_upd = upd;
- for (upd = upd_list; upd != NULL; upd = upd->next) {
- /* Track the total memory in the update chain. */
- update_mem += WT_UPDATE_MEMSIZE(upd);
+ /* Track the largest transaction ID seen. */
+ if (WT_TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
- if ((txnid = upd->txnid) == WT_TXN_ABORTED)
- continue;
+ /*
+ * Check whether the update was committed before reconciliation
+ * started. The global commit point can move forward during
+ * reconciliation so we use a cached copy to avoid races when a
+ * concurrent transaction commits or rolls back while we are
+ * examining its updates.
+ */
+ if (WT_TXNID_LE(r->last_running, txnid))
+ uncommitted = r->update_uncommitted = true;
- /*
- * Track the largest/smallest transaction IDs on the
- * list.
- */
- if (WT_TXNID_LT(max_txn, txnid))
- max_txn = txnid;
+ /*
+ * Find the first update we can use.
+ *
+ * Update/restore eviction can handle any update (including
+ * uncommitted updates). Lookaside eviction can save any
+ * committed update. Regular eviction checks that the maximum
+ * transaction ID and timestamp seen are stable.
+ *
+ * Use the first committed entry we find in the lookaside
+ * table.
+ */
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE) && !uncommitted) {
+ *updp = upd;
+ break;
+ }
+ if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ !__wt_txn_upd_visible_all(session, upd) :
+ !__wt_txn_upd_visible(session, upd)) {
/*
- * Find the first update we can use.
- *
- * Check whether the update was committed before
- * reconciliation started. The global commit point can
- * move forward during reconciliation so we use a
- * cached copy to avoid races when a concurrent
- * transaction commits or rolls back while we are
- * examining its updates.
- *
- * Lookaside eviction can cope with any committed
- * update. Other eviction modes check that the maximum
- * transaction ID and timestamp seen are stable.
- *
- * When reconciling for eviction, track whether any
- * uncommitted updates are found.
+ * Rare case: when applications run at low isolation
+ * levels, update/restore eviction may see a stable
+ * update followed by an uncommitted update. Give up
+ * in that case: we need to discard updates from the
+ * stable update and older for correctness and we can't
+ * discard an uncommitted update.
*/
- if (WT_TXNID_LE(r->last_running, txnid)) {
- skipped = true;
- continue;
- }
-
- if (*updp == NULL)
- *updp = upd;
+ if (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
+ *updp != NULL && uncommitted)
+ return (EBUSY);
-#ifdef HAVE_TIMESTAMPS
- /* Track min/max timestamps. */
- if (__wt_timestamp_cmp(
- &upd->timestamp, &max_timestamp) > 0)
- __wt_timestamp_set(
- &max_timestamp, &upd->timestamp);
-#endif
+ continue;
}
- } else
- for (upd = upd_list; upd != NULL; upd = upd->next) {
- if ((txnid = upd->txnid) == WT_TXN_ABORTED)
- continue;
- /* Track the largest transaction ID on the list. */
- if (WT_TXNID_LT(max_txn, txnid))
- max_txn = txnid;
+ if (*updp == NULL)
+ *updp = upd;
- /*
- * Find the first update we can use.
- *
- * Checkpoint can only write updates visible as of its
- * snapshot.
- *
- * When reconciling for a checkpoint, track whether any
- * updates were skipped on the way to finding the first
- * visible update.
- */
- if (*updp == NULL) {
- if (__wt_txn_upd_visible(session, upd))
- *updp = upd;
- else
- skipped = true;
- }
- }
+#ifdef HAVE_TIMESTAMPS
+ /* Track the first update with non-zero timestamp. */
+ if (first_ts_upd == NULL &&
+ !__wt_timestamp_iszero(&upd->timestamp))
+ first_ts_upd = upd;
+#endif
+ }
/* Reconciliation should never see an aborted or reserved update. */
WT_ASSERT(session, *updp == NULL ||
((*updp)->txnid != WT_TXN_ABORTED &&
(*updp)->type != WT_UPDATE_RESERVED));
- r->update_mem_all += update_mem;
-
- /*
- * If all of the updates were aborted, quit. This test is not strictly
- * necessary because the above loop exits with skipped not set and the
- * maximum transaction left at its initial value of WT_TXN_NONE, so
- * the test below will be branch true and return, but it's cheap and a
- * little more explicit, and makes Coverity happy.
- */
- if (max_txn == WT_TXN_NONE)
+ /* If all of the updates were aborted, quit. */
+ if (first_txn_upd == NULL)
return (0);
/*
@@ -1334,140 +1317,104 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
+
#ifdef HAVE_TIMESTAMPS
- if (__wt_timestamp_cmp(&r->max_timestamp, &max_timestamp) < 0)
- __wt_timestamp_set(&r->max_timestamp, &max_timestamp);
+ if (first_ts_upd != NULL &&
+ __wt_timestamp_cmp(&r->max_timestamp, &first_ts_upd->timestamp) < 0)
+ __wt_timestamp_set(&r->max_timestamp, &first_ts_upd->timestamp);
#endif
/*
- * If there are no skipped updates and all updates are globally visible,
- * the page can be marked clean and we're done, regardless if evicting
- * or checkpointing.
- *
- * We have to check both: the oldest transaction ID may have moved while
- * we were scanning the update list, so it is possible to find a skipped
- * update, but then find all updates are stable at the end of the scan.
- *
- * Skip the visibility check for the lookaside table as a special-case,
- * we know there are no older readers of that table.
+ * The checkpoint transaction is special. Make sure we never write
+ * (metadata) updates from a checkpoint in a concurrent session.
*/
- if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
- __wt_txn_visible_all(session,
- max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) {
- /*
- * The checkpoint transaction is special. Make sure we never
- * write (metadata) updates from a checkpoint in a concurrent
- * session.
- */
- WT_ASSERT(session, *updp == NULL ||
- (*updp)->txnid !=
- S2C(session)->txn_global.checkpoint_state.id ||
- WT_SESSION_IS_CHECKPOINT(session));
+ WT_ASSERT(session, *updp == NULL || (*updp)->txnid == WT_TXN_NONE ||
+ (*updp)->txnid != S2C(session)->txn_global.checkpoint_state.id ||
+ WT_SESSION_IS_CHECKPOINT(session));
- goto check_original_value;
- }
+ /*
+ * If there are no skipped updates, record that we're making progress.
+ */
+ if (*updp == first_txn_upd)
+ r->update_used = true;
/*
- * In some cases, there had better not be skipped updates or updates not
- * yet globally visible.
+ * Check if all updates on the page are visible. If not, it must stay
+ * dirty unless we are saving updates to the lookaside table.
+ *
+ * Updates can be out of transaction ID order (but not out of timestamp
+ * order), so we track the maximum transaction ID and the newest update
+ * with a timestamp (if any).
*/
- if (F_ISSET(r, WT_VISIBILITY_ERR))
+#ifdef HAVE_TIMESTAMPS
+ timestampp = first_ts_upd == NULL ? NULL : &first_ts_upd->timestamp;
+#else
+ WT_UNUSED(first_ts_upd);
+ timestampp = NULL;
+#endif
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ all_visible = !uncommitted;
+ else
+ all_visible = *updp == first_txn_upd &&
+ (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ __wt_txn_visible_all(session, max_txn, timestampp) :
+ __wt_txn_visible(session, max_txn, timestampp));
+
+ if (all_visible)
+ goto check_original_value;
+
+ if (F_ISSET(r, WT_REC_VISIBILITY_ERR))
WT_PANIC_RET(session, EINVAL,
- "reconciliation error, uncommitted update or update not "
- "globally visible");
+ "reconciliation error, update not visible");
+ if (!F_ISSET(r, WT_REC_LOOKASIDE))
+ r->leave_dirty = true;
/*
* If not trying to evict the page, we know what we'll write and we're
- * done. Because some updates were skipped or are not globally visible,
- * the page can't be marked clean.
+ * done.
*/
- if (!F_ISSET(r, WT_EVICTING)) {
- r->leave_dirty = true;
+ if (!F_ISSET(r, WT_REC_EVICT))
goto check_original_value;
- }
/*
- * Evicting with either uncommitted changes or not-yet-globally-visible
- * changes. There are two ways to continue, the save/restore eviction
- * path or the lookaside table eviction path. Both cannot be configured
- * because the paths track different information. The save/restore path
- * can handle both uncommitted and not-yet-globally-visible changes, by
- * evicting most of the page and then creating a new, smaller page into
- * which we re-instantiate those changes. The lookaside table path can
- * only handle not-yet-globally-visible changes by writing those changes
- * into the lookaside table and restoring them on demand if and when the
- * page is read back into memory.
+ * We are attempting eviction with changes that are not yet stable
+ * (i.e. globally visible). There are two ways to continue, the
+ * save/restore eviction path or the lookaside table eviction path.
+ * Both cannot be configured because the paths track different
+ * information. The update/restore path can handle uncommitted changes,
+ * by evicting most of the page and then creating a new, smaller page
+ * to which we re-attach those changes. Lookaside eviction writes
+ * changes into the lookaside table and restores them on demand if and
+ * when the page is read back into memory.
*
* Both paths are configured outside of reconciliation: the save/restore
- * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is
- * the WT_EVICT_LOOKASIDE flag.
+ * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is
+ * the WT_REC_LOOKASIDE flag.
*/
- if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE))
+ if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE) &&
+ !F_ISSET(btree, WT_BTREE_LOOKASIDE))
return (EBUSY);
- if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE))
return (EBUSY);
/*
- * Track the memory required by the update chain.
- *
- * A page with no uncommitted (skipped) updates, that can't be evicted
- * because some updates aren't yet globally visible, can be evicted by
- * writing previous versions of the updates to the lookaside file. That
- * test is just checking if the skipped updates memory is zero.
- *
- * If that's not possible (there are skipped updates), we can rewrite
- * the pages in-memory, but we don't want to unless there's memory to
- * recover. That test is comparing the memory we'd recover to the memory
- * we'd have to re-instantiate as part of the rewrite.
+ * The order of the updates on the list matters, we can't move only the
+ * unresolved updates, move the entire update list.
*/
- r->update_mem_saved += update_mem;
- if (skipped)
- r->update_mem_uncommitted += update_mem;
+ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp));
#ifdef HAVE_TIMESTAMPS
- /*
- * Don't allow lookaside eviction with updates newer than the stable
- * timestamp. Also don't recommend lookaside eviction in that case.
- */
- if (__wt_timestamp_cmp(&max_timestamp, &r->stable_timestamp) > 0) {
- if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
- return (EBUSY);
-
- if (!skipped)
- r->update_mem_uncommitted += update_mem;
+ /* Track the oldest saved timestamp for lookaside. */
+ if (F_ISSET(r, WT_REC_LOOKASIDE)) {
+ for (upd = first_upd; upd->next != NULL; upd = upd->next)
+ ;
+ if (__wt_timestamp_cmp(
+ &r->min_saved_timestamp, &upd->timestamp) > 0)
+ __wt_timestamp_set(
+ &r->min_saved_timestamp, &upd->timestamp);
}
#endif
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
- /*
- * The save/restore eviction path.
- *
- * Clear the returned update, it's not needed. If there's an
- * on-page key/value pair to which the update list applies, our
- * caller writes it to the disk image. If an insert/append list,
- * our caller can ignore the key/value pair (everything needed
- * is in the update list), or in the case of row-store, write
- * the key to the disk image to split up the insert/append list.
- */
- *updp = NULL;
-
- /* The page can't be marked clean. */
- r->leave_dirty = true;
- }
-
- /*
- * The order of the updates on the list matters, we can't move only the
- * unresolved updates, move the entire update list.
- *
- * If we skipped updates, the transaction value is never used. If we
- * didn't skip updates, the list of updates are eventually written to
- * the lookaside table, and associated with each update record is the
- * transaction ID of the update we wrote in the reconciled page; once
- * that transaction ID is globally visible, we know we no longer need
- * the lookaside table records, allowing them to be discarded.
- */
- WT_RET(__rec_update_save(session, r, ins, ripcip, *updp));
-
check_original_value:
/*
* Returning an update means the original on-page value might be lost,
@@ -1477,10 +1424,11 @@ check_original_value:
* record that will be physically removed once it's no longer needed.
*/
if (*updp != NULL &&
- (F_ISSET(r, WT_EVICT_LOOKASIDE) ||
- (vpack != NULL &&
+ (F_ISSET(r, WT_REC_LOOKASIDE) ||
+ (*updp != NULL && vpack != NULL &&
vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)))
- WT_RET(__rec_append_orig_value(session, page, *updp, vpack));
+ WT_RET(
+ __rec_append_orig_value(session, page, first_upd, vpack));
return (0);
}
@@ -1488,7 +1436,7 @@ check_original_value:
/*
* WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR --
* Macros to clean up during internal-page reconciliation, releasing the
- * hazard pointer we're holding on child pages.
+ * hazard pointer we're holding on child pages.
*/
#define WT_CHILD_RELEASE(session, hazard, ref) do { \
if (hazard) { \
@@ -1534,7 +1482,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
*
* In some cases, there had better not be any updates we can't see.
*/
- if (F_ISSET(r, WT_VISIBILITY_ERR) && page_del != NULL &&
+ if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL &&
!__wt_txn_visible(session,
page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)))
WT_PANIC_RET(session, EINVAL,
@@ -1600,7 +1548,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
* if subsequently read (we wouldn't know which transactions should see
* the original page and which should see the deleted page).
*/
- if (F_ISSET(r, WT_EVICTING))
+ if (F_ISSET(r, WT_REC_EVICT))
return (EBUSY);
/*
@@ -1683,10 +1631,9 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* pages in an evicted page's subtree fails the eviction
* attempt.
*/
- if (F_ISSET(r, WT_EVICTING)) {
- WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
return (EBUSY);
- }
/*
* If called during checkpoint, the child is being
@@ -1700,6 +1647,20 @@ __rec_child_modify(WT_SESSION_IMPL *session,
*/
break;
+ case WT_REF_LOOKASIDE:
+ /*
+ * On disk, with lookaside updates.
+ *
+ * We should never be here during eviction, active
+ * child pages in an evicted page's subtree fails the
+ * eviction attempt.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
+ return (EBUSY);
+
+ goto done;
+
case WT_REF_MEM:
/*
* In memory.
@@ -1708,10 +1669,9 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* pages in an evicted page's subtree fails the eviction
* attempt.
*/
- if (F_ISSET(r, WT_EVICTING)) {
- WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
return (EBUSY);
- }
/*
* If called during checkpoint, acquire a hazard pointer
@@ -1739,10 +1699,9 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* pages in an evicted page's subtree fails the eviction
* attempt.
*/
- if (F_ISSET(r, WT_EVICTING)) {
- WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT));
+ if (F_ISSET(r, WT_REC_EVICT))
return (EBUSY);
- }
goto done;
case WT_REF_SPLIT:
@@ -2073,7 +2032,8 @@ __rec_split_page_size_from_pct(
/*
* __wt_split_page_size --
* Split page size calculation: we don't want to repeatedly split every
- * time a new entry is added, so we split to a smaller-than-maximum page size.
+ * time a new entry is added, so we split to a smaller-than-maximum page
+ * size.
*/
uint32_t
__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
@@ -2396,7 +2356,7 @@ __rec_split_row_promote(
* the last key and smaller than the current key.
*/
max = r->last;
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ if (F_ISSET(r, WT_REC_UPDATE_RESTORE))
for (i = r->supd_next; i > 0; --i) {
supd = &r->supd[i - 1];
if (supd->ins == NULL)
@@ -2484,7 +2444,7 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
/*
* __rec_split --
* Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
- * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
*/
static int
__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
@@ -3157,27 +3117,13 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/*
* We may arrive here with no entries to write if the page was entirely
* empty or if nothing on the page was visible to us.
+ *
+ * Pages with skipped or not-yet-globally visible updates aren't really
+ * empty; otherwise, the page is truly empty and we will merge it into
+ * its parent during the parent's reconciliation.
*/
- if (r->entries == 0) {
- /*
- * Pages with skipped or not-yet-globally visible updates aren't
- * really empty; otherwise, the page is truly empty and we will
- * merge it into its parent during the parent's reconciliation.
- */
- if (r->supd_next == 0)
- return (0);
-
- /*
- * If using the save/restore eviction path, continue with the
- * write, the page will be restored after we finish.
- *
- * If using the lookaside table eviction path, we can't continue
- * (we need a page to be written, otherwise we won't ever find
- * the updates for future reads).
- */
- if (F_ISSET(r, WT_EVICT_LOOKASIDE))
- return (EBUSY);
- }
+ if (r->entries == 0 && r->supd_next == 0)
+ return (0);
/* Set the number of entries and size for the just finished chunk. */
r->cur_ptr->image.size =
@@ -3195,7 +3141,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/*
* __rec_supd_move --
* Move a saved WT_UPDATE list from the per-page cache to a specific
- * block's list.
+ * block's list.
*/
static int
__rec_supd_move(
@@ -3214,7 +3160,7 @@ __rec_supd_move(
/*
* __rec_split_write_supd --
* Check if we've saved updates that belong to this block, and move any
- * to the per-block structure.
+ * to the per-block structure.
*/
static int
__rec_split_write_supd(WT_SESSION_IMPL *session,
@@ -3329,7 +3275,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session,
* and we found updates that weren't globally visible when reconciling
* this page.
*/
- if (F_ISSET(r, WT_EVICT_LOOKASIDE) && multi->supd != NULL) {
+ if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) {
F_SET(dsk, WT_PAGE_LAS_UPDATE);
r->cache_write_lookaside = true;
}
@@ -3345,6 +3291,91 @@ __rec_split_write_header(WT_SESSION_IMPL *session,
}
/*
+ * __rec_split_write_reuse --
+ * Check if a previously written block can be reused.
+ */
+static bool
+__rec_split_write_reuse(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_MULTI *multi, WT_ITEM *image, bool last_block)
+{
+ WT_MULTI *multi_match;
+ WT_PAGE_MODIFY *mod;
+
+ mod = r->page->modify;
+
+ /*
+ * Don't bother calculating checksums for bulk loads, there's no reason
+ * to believe they'll be useful. Check because LSM does bulk-loads as
+ * part of normal operations and the check is cheap.
+ */
+ if (r->is_bulk_load)
+ return (false);
+
+ /*
+ * Calculating the checksum is the expensive part, try to avoid it.
+ *
+ * Ignore the last block of any reconciliation. Pages are written in the
+ * same block order every time, so the last block written for a page is
+ * unlikely to match any previously written block or block written in
+ * the future, (absent a point-update earlier in the page which didn't
+ * change the size of the on-page object in any way).
+ */
+ if (last_block)
+ return (false);
+
+ /*
+ * Quit if evicting with no previously written block to compare against.
+ * (In other words, if there's eviction pressure and the page was never
+ * written by a checkpoint, calculating a checksum is worthless.)
+ *
+ * Quit if evicting and a previous check failed, once there's a miss no
+ * future block will match.
+ */
+ if (F_ISSET(r, WT_REC_EVICT)) {
+ if (mod->rec_result != WT_PM_REC_MULTIBLOCK ||
+ mod->mod_multi_entries < r->multi_next)
+ return (false);
+ if (r->evict_matching_checksum_failed)
+ return (false);
+ }
+
+ /* Calculate the checksum for this block. */
+ multi->checksum = __wt_checksum(image->data, image->size);
+
+ /*
+ * Don't check for a block match when writing blocks during compaction,
+ * the whole idea is to move those blocks. Check after calculating the
+ * checksum, we don't distinguish between pages written solely as part
+ * of the compaction and pages written at around the same time, and so
+ * there's a possibility the calculated checksum will be useful in the
+ * future.
+ */
+ if (session->compact_state != WT_COMPACT_NONE)
+ return (false);
+
+ /*
+ * Pages are written in the same block order every time, only check the
+ * appropriate slot.
+ */
+ if (mod->rec_result != WT_PM_REC_MULTIBLOCK ||
+ mod->mod_multi_entries < r->multi_next)
+ return (false);
+
+ multi_match = &mod->mod_multi[r->multi_next - 1];
+ if (multi_match->size != multi->size ||
+ multi_match->checksum != multi->checksum) {
+ r->evict_matching_checksum_failed = true;
+ return (false);
+ }
+
+ multi_match->addr.reuse = 1;
+ multi->addr = multi_match->addr;
+
+ WT_STAT_DATA_INCR(session, rec_page_match);
+ return (true);
+}
+
+/*
* __rec_split_write --
* Write a disk block out for the split helper functions.
*/
@@ -3353,9 +3384,8 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block)
{
WT_BTREE *btree;
- WT_MULTI *multi, *multi_mod;
+ WT_MULTI *multi;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
size_t addr_size;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
#ifdef HAVE_DIAGNOSTIC
@@ -3364,7 +3394,6 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
btree = S2BT(session);
page = r->page;
- mod = page->modify;
#ifdef HAVE_DIAGNOSTIC
verify_image = true;
#endif
@@ -3422,7 +3451,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (last_block &&
r->multi_next == 1 && __rec_is_checkpoint(session, r)) {
- WT_ASSERT(session, r->supd == NULL);
+ WT_ASSERT(session, r->supd_next == 0);
if (compressed_image == NULL)
r->wrapup_checkpoint = &chunk->image;
@@ -3434,71 +3463,64 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r,
}
/*
- * If configured for an in-memory database, or using the save/restore
- * eviction path and we had to skip updates in order to build this disk
- * image, we can't actually write it. Instead, we will re-instantiate
- * the page using the disk image and any list of updates we skipped.
+ * If configured for an in-memory database, we can't actually write it.
+ * Instead, we will re-instantiate the page using the disk image and
+ * any list of updates we skipped.
*/
- if (F_ISSET(r, WT_EVICT_IN_MEMORY))
- goto copy_image;
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && multi->supd != NULL) {
- r->cache_write_restore = true;
+ if (F_ISSET(r, WT_REC_IN_MEMORY))
goto copy_image;
- }
/*
- * If we wrote this block before, re-use it. Pages get written in the
- * same block order every time, only check the appropriate slot. The
- * expensive part of this test is the checksum, only do that work when
- * there has been or will be a reconciliation of this page involving
- * split pages. This test isn't perfect: we're doing a checksum if a
- * previous reconciliation of the page split or if we will split this
- * time, but that test won't calculate a checksum on the first block
- * the first time the page splits.
+ * If there are saved updates, we are either doing update/restore
+ * eviction or lookaside eviction. Update/restore never writes the
+ * disk image.
+ *
+ * Lookaside does write disk images, but also needs to cope with the
+ * case where no updates could be written, which means there are no
+ * entries in the page image to write.
*/
- if (r->multi_next > 1 ||
- (mod->rec_result == WT_PM_REC_MULTIBLOCK &&
- mod->mod_multi != NULL)) {
- multi->checksum =
- __wt_checksum(chunk->image.data, chunk->image.size);
-
+ if (multi->supd != NULL &&
+ (F_ISSET(r, WT_REC_UPDATE_RESTORE) || chunk->entries == 0)) {
/*
- * One last check: don't reuse blocks if compacting, the reason
- * for compaction is to move blocks to different locations. We
- * do this check after calculating the checksums, hopefully the
- * next write can be skipped.
+ * If no entries were used, the page is empty and we can only
+ * restore updates against an empty row store leaf page.
+ * (Column store modify will attempt to allocate a zero-length
+ * array).
*/
- if (session->compact_state == WT_COMPACT_NONE &&
- mod->rec_result == WT_PM_REC_MULTIBLOCK &&
- mod->mod_multi_entries > r->multi_next) {
- multi_mod = &mod->mod_multi[r->multi_next - 1];
- if (multi_mod->size == multi->size &&
- multi_mod->checksum == multi->checksum) {
- multi_mod->addr.reuse = 1;
- multi->addr = multi_mod->addr;
-
- WT_STAT_DATA_INCR(session, rec_page_match);
- goto copy_image;
- }
- }
+ if (r->page->type != WT_PAGE_ROW_LEAF &&
+ chunk->entries == 0 && multi->supd != NULL)
+ return (EBUSY);
+
+ r->cache_write_restore = true;
+ goto update_las;
}
+ /*
+ * If we wrote this block before, re-use it. Prefer a checksum of the
+ * compressed image. It's an identical test and should be faster.
+ */
+ if (__rec_split_write_reuse(session, r, multi,
+ compressed_image == NULL ? &chunk->image : compressed_image,
+ last_block))
+ goto copy_image;
+
WT_RET(__wt_bt_write(session,
compressed_image == NULL ? &chunk->image : compressed_image,
- addr, &addr_size,
- false, F_ISSET(r, WT_CHECKPOINTING), compressed_image != NULL));
+ addr, &addr_size, false, F_ISSET(r, WT_REC_CHECKPOINT),
+ compressed_image != NULL));
#ifdef HAVE_DIAGNOSTIC
verify_image = false;
#endif
WT_RET(__wt_memdup(session, addr, addr_size, &multi->addr.addr));
multi->addr.size = (uint8_t)addr_size;
+update_las:
/*
* If using the lookaside table eviction path and we found updates that
* weren't globally visible when reconciling this page, copy them into
* the database's lookaside store.
*/
- if (F_ISSET(r, WT_EVICT_LOOKASIDE) && multi->supd != NULL)
+ if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL)
WT_RET(__rec_update_las(session, r, btree->id, multi));
copy_image:
@@ -3511,13 +3533,14 @@ copy_image:
__wt_verify_dsk_image(session,
"[reconcile-image]", chunk->image.data, 0, true) == 0);
#endif
+
/*
* If re-instantiating this page in memory (either because eviction
* wants to, or because we skipped updates to build the disk image),
* save a copy of the disk image.
*/
- if (F_ISSET(r, WT_EVICT_SCRUB) ||
- (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && multi->supd != NULL))
+ if (F_ISSET(r, WT_REC_SCRUB) ||
+ (F_ISSET(r, WT_REC_UPDATE_RESTORE) && multi->supd != NULL))
WT_RET(__wt_memdup(session,
chunk->image.data, chunk->image.size, &multi->disk_image));
@@ -3535,26 +3558,19 @@ __rec_update_las(WT_SESSION_IMPL *session,
WT_CURSOR *cursor;
WT_DECL_ITEM(key);
WT_DECL_RET;
- WT_ITEM las_addr, las_timestamp, las_value;
+ WT_ITEM las_timestamp, las_value;
WT_PAGE *page;
WT_SAVE_UPD *list;
WT_UPDATE *upd;
- uint64_t insert_cnt, las_counter;
+ uint64_t insert_cnt, las_counter, las_pageid;
uint32_t i, session_flags, slot;
uint8_t *p;
cursor = NULL;
- WT_CLEAR(las_addr);
WT_CLEAR(las_timestamp);
WT_CLEAR(las_value);
page = r->page;
- insert_cnt = 0;
-
- /*
- * We're writing lookaside records: start instantiating them on pages
- * we read (with the right flag set), and start sweeping the file.
- */
- __wt_las_set_written(session);
+ insert_cnt = las_pageid = 0;
__wt_las_cursor(session, &cursor, &session_flags);
@@ -3562,29 +3578,20 @@ __rec_update_las(WT_SESSION_IMPL *session,
WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
/*
- * Each key in the lookaside table is associated with a block, and those
- * blocks are freed and reallocated to other pages as pages in the tree
- * are modified and reconciled. We want to be sure we don't add records
- * to the lookaside table, then discard the block to which they apply,
- * then write a new block to the same address, and then apply the old
- * records to the new block when it's read. We don't want to clean old
- * records out of the lookaside table every time we free a block because
- * that happens a lot and would be costly; instead, we clean out the old
- * records when adding new records into the lookaside table. This works
- * because we only read from the lookaside table for pages marked with
- * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a
- * block with no lookaside records, so the lookaside table won't be
- * checked when the block is read, even if there are lookaside table
- * records matching that block. If we rewrite a block that has lookaside
- * records, we'll run this code, discarding any old records that might
- * exist.
- */
- WT_ERR(__wt_las_remove_block(
- session, cursor, btree_id, multi->addr.addr, multi->addr.size));
-
- /* Lookaside table key component: block address. */
- las_addr.data = multi->addr.addr;
- las_addr.size = multi->addr.size;
+ * Each key in the lookaside table is associated with a unique
+ * identifier, allocated sequentially per tree.
+ */
+ las_pageid = multi->las_pageid =
+ __wt_atomic_add64(&S2BT(session)->las_pageid, 1);
+
+ /* The zero page ID is reserved, check we don't see it. */
+ WT_ASSERT(session, las_pageid != 0);
+
+ /*
+ * Make sure there are no left over entries (e.g., from a handle
+ * reopen).
+ */
+ WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid));
/* Enter each update in the boundary's list into the lookaside store. */
for (las_counter = 0, i = 0,
@@ -3654,13 +3661,8 @@ __rec_update_las(WT_SESSION_IMPL *session,
continue;
}
-#ifdef HAVE_TIMESTAMPS
- las_timestamp.data = &list->onpage_timestamp;
- las_timestamp.size = WT_TIMESTAMP_SIZE;
-#endif
cursor->set_key(cursor,
- btree_id, &las_addr, ++las_counter,
- list->onpage_txn, &las_timestamp, key);
+ btree_id, las_pageid, ++las_counter, key);
#ifdef HAVE_TIMESTAMPS
las_timestamp.data = &upd->timestamp;
@@ -3680,9 +3682,9 @@ __rec_update_las(WT_SESSION_IMPL *session,
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
if (insert_cnt > 0) {
- (void)__wt_atomic_add64(
- &S2C(session)->las_record_cnt, insert_cnt);
- __rec_verbose_lookaside_write(session);
+ WT_STAT_CONN_INCRV(
+ session, cache_lookaside_entries, insert_cnt);
+ __rec_verbose_lookaside_write(session, btree_id, las_pageid);
}
__wt_scr_free(session, &key);
@@ -4368,7 +4370,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session,
/*
* __rec_col_var_helper --
* Create a column-store variable length record cell and write it onto a
- * page.
+ * page.
*/
static int
__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
@@ -4634,7 +4636,7 @@ record_loop: /*
* Assert the case.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ F_ISSET(r, WT_REC_UPDATE_RESTORE));
/*
* The on-page value will never be accessed,
@@ -4776,7 +4778,7 @@ compare: /*
if (ovfl_state == OVFL_UNUSED &&
vpack->raw != WT_CELL_VALUE_OVFL_RM)
WT_ERR(__wt_ovfl_remove(
- session, page, vpack, !F_ISSET(r, WT_EVICTING)));
+ session, page, vpack, F_ISSET(r, WT_REC_EVICT)));
}
/* Walk any append list. */
@@ -5356,7 +5358,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* Assert the case.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ F_ISSET(r, WT_REC_UPDATE_RESTORE));
/*
* If the key is also a removed overflow item,
@@ -5404,7 +5406,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
if (vpack != NULL &&
vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
WT_ERR(__wt_ovfl_remove(session,
- page, vpack, !F_ISSET(r, WT_EVICTING)));
+ page, vpack, F_ISSET(r, WT_REC_EVICT)));
switch (upd->type) {
case WT_UPDATE_DELETED:
@@ -5632,12 +5634,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+
if (upd == NULL) {
/*
* Look for an update. If nothing is visible and not in
* evict/restore, there's no work to do.
*/
- if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ if (!F_ISSET(r, WT_REC_UPDATE_RESTORE))
continue;
/*
@@ -5679,8 +5682,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
if (upd->size == 0)
val->len = 0;
else
- WT_RET(__rec_cell_build_val(
- session, r, upd->data, upd->size,
+ WT_RET(__rec_cell_build_val(session,
+ r, upd->data, upd->size,
(uint64_t)0));
break;
WT_ILLEGAL_VALUE(session);
@@ -5945,9 +5948,9 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* in memory because the latter can't handle update lists and
* splits can.
*/
- if (F_ISSET(r, WT_EVICT_IN_MEMORY) ||
- (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) &&
- r->multi->supd != NULL))
+ if (F_ISSET(r, WT_REC_IN_MEMORY) ||
+ (F_ISSET(r, WT_REC_UPDATE_RESTORE) &&
+ r->multi->supd_entries != 0))
goto split;
/*
@@ -5959,9 +5962,15 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
r->multi->addr.addr = NULL;
mod->mod_disk_image = r->multi->disk_image;
r->multi->disk_image = NULL;
+ mod->mod_replace_las_pageid = r->multi->las_pageid;
+#ifdef HAVE_TIMESTAMPS
+ __wt_timestamp_set(&mod->mod_replace_las_min_timestamp,
+ &r->min_saved_timestamp);
+#endif
+ r->multi->las_pageid = 0;
} else
WT_RET(__wt_bt_write(session, r->wrapup_checkpoint,
- NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING),
+ NULL, NULL, true, F_ISSET(r, WT_REC_CHECKPOINT),
r->wrapup_checkpoint_compressed));
mod->rec_result = WT_PM_REC_REPLACE;
@@ -6037,14 +6046,13 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
}
WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
-
return (ret);
}
/*
* __rec_cell_build_int_key --
* Process a key and return a WT_CELL structure and byte string to be
- * stored on a row-store internal page.
+ * stored on a row-store internal page.
*/
static int
__rec_cell_build_int_key(WT_SESSION_IMPL *session,
@@ -6081,7 +6089,7 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session,
/*
* __rec_cell_build_leaf_key --
* Process a key and return a WT_CELL structure and byte string to be
- * stored on a row-store leaf page.
+ * stored on a row-store leaf page.
*/
static int
__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
@@ -6184,7 +6192,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
/*
* __rec_cell_build_addr --
* Process an address reference and return a cell structure to be stored
- * on the page.
+ * on the page.
*/
static void
__rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
@@ -6219,7 +6227,7 @@ __rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r,
/*
* __rec_cell_build_val --
* Process a data item and return a WT_CELL structure and byte string to
- * be stored on the page.
+ * be stored on the page.
*/
static int
__rec_cell_build_val(WT_SESSION_IMPL *session,
@@ -6311,7 +6319,7 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session,
/* Write the buffer. */
addr = buf;
WT_ERR(__wt_bt_write(session, tmp,
- addr, &size, false, F_ISSET(r, WT_CHECKPOINTING), false));
+ addr, &size, false, F_ISSET(r, WT_REC_CHECKPOINT), false));
/*
* Track the overflow record (unless it's a bulk load, which
@@ -6460,7 +6468,7 @@ __rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/*
* __rec_dictionary_reset --
* Reset the dictionary when reconciliation restarts and when crossing a
- * page boundary (a potential split).
+ * page boundary (a potential split).
*/
static void
__rec_dictionary_reset(WT_RECONCILE *r)
@@ -6527,10 +6535,11 @@ __rec_dictionary_lookup(
/*
* __rec_verbose_lookaside_write --
* Create a verbose message to display once per checkpoint with details
- * about the cache state when performing a lookaside table write.
+ * about the cache state when performing a lookaside table write.
*/
static void
-__rec_verbose_lookaside_write(WT_SESSION_IMPL *session)
+__rec_verbose_lookaside_write(
+ WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid)
{
#ifdef HAVE_VERBOSE
WT_CONNECTION_IMPL *conn;
@@ -6560,14 +6569,19 @@ __rec_verbose_lookaside_write(WT_SESSION_IMPL *session)
(void)__wt_eviction_dirty_needed(session, &pct_dirty);
__wt_verbose(session, WT_VERB_LOOKASIDE,
- "Page reconciliation triggered lookaside write. "
- "Entries now in lookaside file: %" PRIu64 ", "
+ "Page reconciliation triggered lookaside write: "
+ "file ID %" PRIu32 ", page ID %" PRIu64 ". "
+ "Entries now in lookaside file: %" PRId64 ", "
"cache dirty: %" PRIu32 "%% , "
"cache use: %" PRIu32 "%%",
- conn->las_record_cnt, pct_dirty, pct_full);
+ las_id, las_pageid,
+ WT_STAT_READ(conn->stats, cache_lookaside_entries),
+ pct_dirty, pct_full);
}
}
#else
WT_UNUSED(session);
+ WT_UNUSED(las_id);
+ WT_UNUSED(las_pageid);
#endif
}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
index d3540cb1dab..cc32766c9dc 100644
--- a/src/third_party/wiredtiger/src/session/session_api.c
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -1992,11 +1992,14 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
F_SET(session, session_flags | WT_SESSION_INTERNAL);
/*
+ * Optionally acquire a lookaside table cursor (or clear caller's flag).
* Acquiring the lookaside table cursor requires various locks; we've
* seen problems in the past where deadlocks happened because sessions
* deadlocked getting the cursor late in the process. Be defensive,
* get it now.
*/
+ if (!F_ISSET(conn, WT_CONN_LAS_OPEN))
+ F_CLR(session, WT_SESSION_LOOKASIDE_CURSOR);
if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR) &&
(ret = __wt_las_cursor_open(session, &session->las_cursor)) != 0) {
wt_session = &session->iface;
diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c
index e0b1b6de1ea..58730b1505b 100644
--- a/src/third_party/wiredtiger/src/support/hex.c
+++ b/src/third_party/wiredtiger/src/support/hex.c
@@ -116,6 +116,12 @@ __wt_hex2byte(const u_char *from, u_char *to)
case '7': byte = 7 << 4; break;
case '8': byte = 8 << 4; break;
case '9': byte = 9 << 4; break;
+ case 'A': byte = 10 << 4; break;
+ case 'B': byte = 11 << 4; break;
+ case 'C': byte = 12 << 4; break;
+ case 'D': byte = 13 << 4; break;
+ case 'E': byte = 14 << 4; break;
+ case 'F': byte = 15 << 4; break;
case 'a': byte = 10 << 4; break;
case 'b': byte = 11 << 4; break;
case 'c': byte = 12 << 4; break;
@@ -137,6 +143,12 @@ __wt_hex2byte(const u_char *from, u_char *to)
case '7': byte |= 7; break;
case '8': byte |= 8; break;
case '9': byte |= 9; break;
+ case 'A': byte |= 10; break;
+ case 'B': byte |= 11; break;
+ case 'C': byte |= 12; break;
+ case 'D': byte |= 13; break;
+ case 'E': byte |= 14; break;
+ case 'F': byte |= 15; break;
case 'a': byte |= 10; break;
case 'b': byte |= 11; break;
case 'c': byte |= 12; break;
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 05b653a8c77..57dcd33c7f1 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -809,6 +809,7 @@ static const char * const __stats_connection_desc[] = {
"cache: internal pages evicted",
"cache: internal pages split during eviction",
"cache: leaf pages split during eviction",
+ "cache: lookaside table entries",
"cache: lookaside table insert calls",
"cache: lookaside table remove calls",
"cache: maximum bytes configured",
@@ -1138,6 +1139,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_eviction_internal = 0;
stats->cache_eviction_split_internal = 0;
stats->cache_eviction_split_leaf = 0;
+ /* not clearing cache_lookaside_entries */
stats->cache_lookaside_insert = 0;
stats->cache_lookaside_remove = 0;
/* not clearing cache_bytes_max */
@@ -1488,6 +1490,8 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, cache_eviction_split_internal);
to->cache_eviction_split_leaf +=
WT_STAT_READ(from, cache_eviction_split_leaf);
+ to->cache_lookaside_entries +=
+ WT_STAT_READ(from, cache_lookaside_entries);
to->cache_lookaside_insert +=
WT_STAT_READ(from, cache_lookaside_insert);
to->cache_lookaside_remove +=
diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c
index 59caaedf5cf..f5842bea572 100644
--- a/src/third_party/wiredtiger/src/support/thread_group.c
+++ b/src/third_party/wiredtiger/src/support/thread_group.c
@@ -141,7 +141,6 @@ __thread_group_resize(
conn = S2C(session);
thread = NULL;
- session_flags = 0;
__wt_verbose(session, WT_VERB_THREAD_GROUP,
"Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32
@@ -187,9 +186,10 @@ __thread_group_resize(
* started during recovery, before the lookaside table is
* created.
*/
+ session_flags = 0;
if (LF_ISSET(WT_THREAD_CAN_WAIT))
- session_flags = WT_SESSION_CAN_WAIT;
- if (F_ISSET(conn, WT_CONN_LAS_OPEN))
+ FLD_SET(session_flags, WT_SESSION_CAN_WAIT);
+ if (LF_ISSET(WT_THREAD_LOOKASIDE))
FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR);
WT_ERR(__wt_open_internal_session(conn, group->name,
false, session_flags, &thread->session));
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index ea5cd3390e2..c5c514c008b 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -593,6 +593,21 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
#endif
}
+#ifdef HAVE_TIMESTAMPS
+ /*
+ * Debugging checks on timestamps, if user requested them.
+ */
+ if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
+ !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
+ txn->mod_count != 0)
+ WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
+ "none set on this transaction");
+ if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
+ F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
+ txn->mod_count != 0)
+ WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
+ "timestamp set on this transaction");
+#endif
/*
* The default sync setting is inherited from the connection, but can
* be overridden by an explicit "sync" setting for this transaction.
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 9d5f0c1adc0..7d2bb62cdd1 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -289,7 +289,6 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
return (0);
-#ifdef HAVE_DIAGNOSTIC
/*
* We may have raced between starting the checkpoint transaction and
* some operation completing on the handle that updated the metadata
@@ -301,32 +300,26 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (!WT_IS_METADATA(session->dhandle)) {
WT_CURSOR *meta_cursor;
- bool metadata_race;
WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR));
WT_RET(__wt_metadata_cursor(session, &meta_cursor));
meta_cursor->set_key(meta_cursor, session->dhandle->name);
ret = __wt_curfile_insert_check(meta_cursor);
if (ret == WT_ROLLBACK) {
- metadata_race = true;
/*
- * Disable this check and assertion for now - it is
- * possible that a schema operation with a timestamp in
- * the future is in the metadata, but not part of the
- * the checkpoint now that checkpoints can be created
- * at the stable timestamp.
- * See WT-3559 for context on re-adding this assertion.
+ * If a create, drop or any other schema operation on a
+ * table is within a user transaction, the checkpoint can
+ * see the dhandle before the commit, which leads to the
+ * rollback error. Ignore this dhandle as part of this
+ * checkpoint by returning from here.
*/
-#if 0
- ret = 0;
-#endif
- } else
- metadata_race = false;
+ WT_TRET(__wt_metadata_cursor_release(session,
+ &meta_cursor));
+ return (0);
+ }
WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
WT_RET(ret);
- WT_ASSERT(session, !metadata_race);
}
-#endif
/*
* Decide whether the tree needs to be included in the checkpoint and
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 54634c03dfb..929aba30155 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -20,14 +20,15 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
WT_CURSOR *cursor;
WT_DECL_RET;
WT_DECL_TIMESTAMP(rollback_timestamp)
- WT_ITEM las_addr, las_key, las_timestamp;
+ WT_ITEM las_key, las_timestamp, las_value;
WT_TXN_GLOBAL *txn_global;
- uint64_t las_counter, las_txnid, remove_cnt;
+ uint64_t las_counter, las_pageid, las_total, las_txnid;
uint32_t las_id, session_flags;
+ uint8_t upd_type;
conn = S2C(session);
cursor = NULL;
- remove_cnt = 0;
+ las_total = 0;
session_flags = 0; /* [-Werror=maybe-uninitialized] */
WT_CLEAR(las_timestamp);
@@ -40,7 +41,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
txn_global = &conn->txn_global;
WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
__wt_timestamp_set(
- &rollback_timestamp, &txn_global->stable_timestamp));
+ &rollback_timestamp, &txn_global->stable_timestamp));
__wt_las_cursor(session, &cursor, &session_flags);
@@ -49,8 +50,8 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
/* Walk the file. */
for (; (ret = cursor->next(cursor)) == 0; ) {
- WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
- &las_txnid, &las_timestamp, &las_key));
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, &las_pageid, &las_counter, &las_key));
/* Check the file ID so we can skip durable tables */
if (las_id >= conn->stable_rollback_maxfile)
@@ -60,27 +61,23 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
if (__bit_test(conn->stable_rollback_bitstring, las_id))
continue;
+ WT_ERR(cursor->get_value(cursor,
+ &las_txnid, &las_timestamp, &upd_type, &las_value));
+
/*
* Entries with no timestamp will have a timestamp of zero,
* which will fail the following check and cause them to never
* be removed.
*/
if (__wt_timestamp_cmp(
- &rollback_timestamp, las_timestamp.data) < 0) {
+ &rollback_timestamp, las_timestamp.data) < 0)
WT_ERR(cursor->remove(cursor));
- ++remove_cnt;
- }
+ else
+ ++las_total;
}
WT_ERR_NOTFOUND_OK(ret);
err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
- /*
- * If there were races to remove records, we can over-count. Underflow
- * isn't fatal, but check anyway so we don't skew low over time.
- */
- if (remove_cnt > conn->las_record_cnt)
- conn->las_record_cnt = 0;
- else if (remove_cnt > 0)
- (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);
+ WT_STAT_CONN_SET(session, cache_lookaside_entries, las_total);
F_CLR(session, WT_SESSION_NO_CACHE);
@@ -303,6 +300,20 @@ __txn_rollback_to_stable_btree_walk(
}
/*
+ * __txn_rollback_eviction_drain --
+ * Drain eviction from a tree by turning exclusive eviction on, then off.
+ */
+static int
+__txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_UNUSED(cfg); /* Only present for the __wt_conn_btree_apply callback. */
+
+ WT_RET(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+ return (0);
+}
+
+/*
* __txn_rollback_to_stable_btree --
* Called for each open handle - choose to either skip or wipe the commits
*/
@@ -422,7 +433,19 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_RET;
conn = S2C(session);
- WT_RET(__txn_rollback_to_stable_check(session));
+
+ /*
+ * Mark that a rollback operation is in progress and wait for eviction
+ * to drain. This is necessary because lookaside eviction uses
+ * transactions and causes the check for a quiescent system to fail.
+ */
+ F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
+ WT_ERR(__wt_conn_btree_apply(session,
+ NULL, __txn_rollback_eviction_drain, NULL, cfg));
+
+ WT_ERR(__txn_rollback_to_stable_check(session));
+
+ F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
/*
* Allocate a non-durable btree bitstring. We increment the global
@@ -430,7 +453,7 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
* hence we need to add one here.
*/
conn->stable_rollback_maxfile = conn->next_file_id + 1;
- WT_RET(__bit_alloc(session,
+ WT_ERR(__bit_alloc(session,
conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring));
WT_ERR(__wt_conn_btree_apply(session,
NULL, __txn_rollback_to_stable_btree, NULL, cfg));
@@ -442,7 +465,9 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
* lookaside records should be removed.
*/
WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
-err: __wt_free(session, conn->stable_rollback_bitstring);
+
+err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
+ __wt_free(session, conn->stable_rollback_bitstring);
return (ret);
#endif
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index 2182a3924a5..8f90afeb8b4 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -107,27 +107,37 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session,
#if WT_TIMESTAMP_SIZE == 8
{
- static const u_char hextable[] = {
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 0, 0, 0, 0, 0, 0,
- 0, 10, 11, 12, 13, 14, 15, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 10, 11, 12, 13, 14, 15
+ static const int8_t hextable[] = {
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1
};
wt_timestamp_t ts;
size_t len;
- const char *hex;
-
- for (ts.val = 0, hex = cval->str, len = cval->len; len > 0; --len)
- ts.val = (ts.val << 4) | hextable[(int)*hex++];
+ int hex_val;
+ const char *hex_itr;
+
+ for (ts.val = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) {
+ if ((size_t)*hex_itr < WT_ELEMENTS(hextable))
+ hex_val = hextable[(size_t)*hex_itr++];
+ else
+ hex_val = -1;
+ if (hex_val < 0)
+ WT_RET_MSG(session, EINVAL,
+ "Failed to parse %s timestamp '%.*s'",
+ name, (int)cval->len, cval->str);
+ ts.val = (ts.val << 4) | (uint64_t)hex_val;
+ }
__wt_timestamp_set(timestamp, &ts);
}
#else
diff --git a/src/third_party/wiredtiger/test/fops/file.c b/src/third_party/wiredtiger/test/fops/file.c
index 60320ae3a38..118845ab805 100644
--- a/src/third_party/wiredtiger/test/fops/file.c
+++ b/src/third_party/wiredtiger/test/fops/file.c
@@ -39,6 +39,8 @@ obj_bulk(void)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
if ((ret = session->create(session, uri, config)) != 0)
if (ret != EEXIST && ret != EBUSY)
testutil_die(ret, "session.create");
@@ -51,6 +53,21 @@ obj_bulk(void)
} else if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
testutil_die(ret, "session.open_cursor bulk");
}
+
+ if (use_txn) {
+ /*
+ * The operations are performed concurrently, so the
+ * return value can be ENOENT, EBUSY or EINVAL. In these
+ * cases an error is set on the transaction opened by the
+ * session and the transaction has to be rolled back.
+ */
+ if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
+ ret = session->commit_transaction(session, NULL);
+ else
+ ret = session->rollback_transaction(session, NULL);
+ if (ret == EINVAL)
+ testutil_die(ret, "session.commit bulk");
+ }
testutil_check(session->close(session, NULL));
}
@@ -70,6 +87,8 @@ obj_bulk_unique(int force)
new_uri, sizeof(new_uri), "%s.%u", uri, ++uid));
testutil_check(pthread_rwlock_unlock(&single));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
testutil_check(session->create(session, new_uri, config));
__wt_yield();
@@ -89,6 +108,10 @@ obj_bulk_unique(int force)
if (ret != EBUSY)
testutil_die(ret, "session.drop: %s", new_uri);
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit bulk unique");
testutil_check(session->close(session, NULL));
}
@@ -101,12 +124,19 @@ obj_cursor(void)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
if ((ret =
session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) {
if (ret != ENOENT && ret != EBUSY)
testutil_die(ret, "session.open_cursor");
} else
testutil_check(cursor->close(cursor));
+
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit cursor");
testutil_check(session->close(session, NULL));
}
@@ -118,10 +148,16 @@ obj_create(void)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
if ((ret = session->create(session, uri, config)) != 0)
if (ret != EEXIST && ret != EBUSY)
testutil_die(ret, "session.create");
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create");
testutil_check(session->close(session, NULL));
}
@@ -140,13 +176,25 @@ obj_create_unique(int force)
new_uri, sizeof(new_uri), "%s.%u", uri, ++uid));
testutil_check(pthread_rwlock_unlock(&single));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
testutil_check(session->create(session, new_uri, config));
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create unique");
__wt_yield();
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
while ((ret = session->drop(
session, new_uri, force ? "force" : NULL)) != 0)
if (ret != EBUSY)
testutil_die(ret, "session.drop: %s", new_uri);
+ if (use_txn &&
+ (ret = session->commit_transaction(session, NULL)) != 0 &&
+ ret != EINVAL)
+ testutil_die(ret, "session.commit create unique");
testutil_check(session->close(session, NULL));
}
@@ -159,10 +207,26 @@ obj_drop(int force)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ if (use_txn)
+ testutil_check(session->begin_transaction(session, NULL));
if ((ret = session->drop(session, uri, force ? "force" : NULL)) != 0)
if (ret != ENOENT && ret != EBUSY)
testutil_die(ret, "session.drop");
+ if (use_txn) {
+ /*
+ * The operations are performed concurrently, so the
+ * return value can be ENOENT or EBUSY. In these cases
+ * an error is set on the transaction opened by the
+ * session and the transaction has to be rolled back.
+ */
+ if (ret != ENOENT && ret != EBUSY)
+ ret = session->commit_transaction(session, NULL);
+ else
+ ret = session->rollback_transaction(session, NULL);
+ if (ret == EINVAL)
+ testutil_die(ret, "session.commit drop");
+ }
testutil_check(session->close(session, NULL));
}
diff --git a/src/third_party/wiredtiger/test/fops/t.c b/src/third_party/wiredtiger/test/fops/t.c
index b6b80ba5db8..fcbbdcabd73 100644
--- a/src/third_party/wiredtiger/test/fops/t.c
+++ b/src/third_party/wiredtiger/test/fops/t.c
@@ -28,6 +28,7 @@
#include "thread.h"
+bool use_txn; /* Operations with user txn */
WT_CONNECTION *conn; /* WiredTiger connection */
pthread_rwlock_t single; /* Single thread */
u_int nops; /* Operations */
@@ -77,8 +78,9 @@ main(int argc, char *argv[])
nops = 1000;
nthreads = 10;
runs = 1;
+ use_txn = false;
config_open = working_dir = NULL;
- while ((ch = __wt_getopt(progname, argc, argv, "C:h:l:n:r:t:")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "C:h:l:n:r:t:x")) != EOF)
switch (ch) {
case 'C': /* wiredtiger_open config */
config_open = __wt_optarg;
@@ -102,6 +104,9 @@ main(int argc, char *argv[])
case 't':
nthreads = (u_int)atoi(__wt_optarg);
break;
+ case 'x':
+ use_txn = true;
+ break;
default:
return (usage());
}
@@ -245,7 +250,8 @@ usage(void)
{
fprintf(stderr,
"usage: %s "
- "[-C wiredtiger-config] [-l log] [-n ops] [-r runs] [-t threads]\n",
+ "[-C wiredtiger-config] [-l log] [-n ops] [-r runs] [-t threads] "
+ "[-x] \n",
progname);
fprintf(stderr, "%s",
"\t-C specify wiredtiger_open configuration arguments\n"
@@ -253,6 +259,7 @@ usage(void)
"\t-l specify a log file\n"
"\t-n set number of operations each thread does\n"
"\t-r set number of runs\n"
- "\t-t set number of threads\n");
+ "\t-t set number of threads\n"
+ "\t-x operations within user transaction \n");
return (EXIT_FAILURE);
}
diff --git a/src/third_party/wiredtiger/test/fops/thread.h b/src/third_party/wiredtiger/test/fops/thread.h
index f6b6bdffd63..0df36025be0 100644
--- a/src/third_party/wiredtiger/test/fops/thread.h
+++ b/src/third_party/wiredtiger/test/fops/thread.h
@@ -30,6 +30,7 @@
#include <signal.h>
+extern bool use_txn; /* Operations with user txn */
extern WT_CONNECTION *conn; /* WiredTiger connection */
extern u_int nops; /* Operations per thread */
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 81b7fa27f79..f35e71f58aa 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -287,6 +287,7 @@ void bdb_update(const void *, size_t, const void *, size_t);
WT_THREAD_RET alter(void *);
WT_THREAD_RET backup(void *);
+WT_THREAD_RET checkpoint(void *);
WT_THREAD_RET compact(void *);
void config_clear(void);
void config_error(void);
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index f4770465628..4fed18d12b4 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -76,7 +76,8 @@ wts_ops(int lastrun)
TINFO **tinfo_list, *tinfo, total;
WT_CONNECTION *conn;
WT_SESSION *session;
- wt_thread_t alter_tid, backup_tid, compact_tid, lrt_tid, timestamp_tid;
+ wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid;
+ wt_thread_t timestamp_tid;
int64_t fourths, quit_fourths, thread_ops;
uint32_t i;
bool running;
@@ -86,6 +87,7 @@ wts_ops(int lastrun)
session = NULL; /* -Wconditional-uninitialized */
memset(&alter_tid, 0, sizeof(alter_tid));
memset(&backup_tid, 0, sizeof(backup_tid));
+ memset(&checkpoint_tid, 0, sizeof(checkpoint_tid));
memset(&compact_tid, 0, sizeof(compact_tid));
memset(&lrt_tid, 0, sizeof(lrt_tid));
memset(&timestamp_tid, 0, sizeof(timestamp_tid));
@@ -173,6 +175,9 @@ wts_ops(int lastrun)
if (g.c_backups)
testutil_check(
__wt_thread_create(NULL, &backup_tid, backup, NULL));
+ if (g.c_checkpoints)
+ testutil_check(__wt_thread_create(
+ NULL, &checkpoint_tid, checkpoint, NULL));
if (g.c_compact)
testutil_check(
__wt_thread_create(NULL, &compact_tid, compact, NULL));
@@ -247,6 +252,8 @@ wts_ops(int lastrun)
testutil_check(__wt_thread_join(NULL, alter_tid));
if (g.c_backups)
testutil_check(__wt_thread_join(NULL, backup_tid));
+ if (g.c_checkpoints)
+ testutil_check(__wt_thread_join(NULL, checkpoint_tid));
if (g.c_compact)
testutil_check(__wt_thread_join(NULL, compact_tid));
if (!SINGLETHREADED && g.c_long_running_txn)
@@ -514,12 +521,11 @@ ops(void *arg)
WT_DECL_RET;
WT_ITEM *key, _key, *value, _value;
WT_SESSION *session;
- uint64_t ckpt_op, keyno, reset_op, session_op;
+ uint64_t keyno, reset_op, session_op;
uint32_t rnd;
u_int i, iso_config;
int dir;
- char *ckpt_config, ckpt_name[64];
- bool ckpt_available, intxn, positioned, readonly;
+ bool intxn, positioned, readonly;
tinfo = arg;
@@ -542,58 +548,61 @@ ops(void *arg)
session = NULL;
session_op = 0;
- /* Set the first operation where we'll perform checkpoint operations. */
- ckpt_op = g.c_checkpoints ? mmrand(&tinfo->rnd, 100, 10000) : 0;
- ckpt_available = false;
-
/* Set the first operation where we'll reset the session. */
reset_op = mmrand(&tinfo->rnd, 100, 10000);
for (intxn = false; !tinfo->quit; ++tinfo->ops) {
- /*
- * We can't checkpoint or swap sessions/cursors while in a
- * transaction, resolve any running transaction.
- */
- if (intxn &&
- (tinfo->ops == ckpt_op || tinfo->ops == session_op)) {
- commit_transaction(tinfo, session);
- intxn = false;
- }
-
- /* Open up a new session and cursors. */
- if (tinfo->ops == session_op ||
+ /* Periodically open up a new session and cursors. */
+ if (tinfo->ops > session_op ||
session == NULL || cursor == NULL) {
+ /*
+ * We can't swap sessions/cursors if in a transaction,
+ * resolve any running transaction.
+ */
+ if (intxn) {
+ commit_transaction(tinfo, session);
+ intxn = false;
+ }
+
if (session != NULL)
testutil_check(session->close(session, NULL));
-
testutil_check(
conn->open_session(conn, NULL, NULL, &session));
+ /* Pick the next session/cursor close/open. */
+ session_op += mmrand(&tinfo->rnd, 100, 5000);
+
/*
* 10% of the time, perform some read-only operations
* from a checkpoint.
*
- * Skip that if we are single-threaded and doing checks
- * against a Berkeley DB database, because that won't
- * work because the Berkeley DB database records won't
- * match the checkpoint. Also skip if we are using
- * LSM, because it doesn't support reads from
- * checkpoints.
+ * Skip if single-threaded and doing checks against a
+ * Berkeley DB database, that won't work because the
+ * Berkeley DB database won't match the checkpoint.
+ *
+ * Skip if we are using data-sources or LSM, they don't
+ * support reading from checkpoints.
*/
- if (!SINGLETHREADED && !DATASOURCE("lsm") &&
- ckpt_available && mmrand(&tinfo->rnd, 1, 10) == 1) {
+ if (!SINGLETHREADED && !DATASOURCE("helium") &&
+ !DATASOURCE("kvsbdb") && !DATASOURCE("lsm") &&
+ mmrand(&tinfo->rnd, 1, 10) == 1) {
/*
* open_cursor can return EBUSY if concurrent
* with a metadata operation, retry.
*/
while ((ret = session->open_cursor(session,
- g.uri, NULL, ckpt_name, &cursor)) == EBUSY)
+ g.uri, NULL,
+ "checkpoint=WiredTigerCheckpoint",
+ &cursor)) == EBUSY)
__wt_yield();
+ /*
+ * If the checkpoint hasn't been created yet,
+ * ignore the error.
+ */
+ if (ret == ENOENT)
+ continue;
testutil_check(ret);
- /* Pick the next session/cursor close/open. */
- session_op += 250;
-
/* Checkpoints are read-only. */
readonly = true;
} else {
@@ -608,75 +617,11 @@ ops(void *arg)
__wt_yield();
testutil_check(ret);
- /* Pick the next session/cursor close/open. */
- session_op += mmrand(&tinfo->rnd, 100, 5000);
-
/* Updates supported. */
readonly = false;
}
}
- /* Checkpoint the database. */
- if (tinfo->ops == ckpt_op && g.c_checkpoints) {
- /*
- * Checkpoints are single-threaded inside WiredTiger,
- * skip our checkpoint if another thread is already
- * doing one.
- */
- ret = pthread_rwlock_trywrlock(&g.checkpoint_lock);
- if (ret == EBUSY)
- goto skip_checkpoint;
- testutil_check(ret);
-
- /*
- * LSM and data-sources don't support named checkpoints
- * and we can't drop a named checkpoint while there's a
- * backup in progress, otherwise name the checkpoint 5%
- * of the time.
- */
- if (mmrand(&tinfo->rnd, 1, 20) != 1 ||
- DATASOURCE("helium") ||
- DATASOURCE("kvsbdb") || DATASOURCE("lsm") ||
- pthread_rwlock_trywrlock(&g.backup_lock) == EBUSY)
- ckpt_config = NULL;
- else {
- testutil_check(__wt_snprintf(
- ckpt_name, sizeof(ckpt_name),
- "name=thread-%d", tinfo->id));
- ckpt_config = ckpt_name;
- }
-
- ret = session->checkpoint(session, ckpt_config);
- /*
- * We may be trying to create a named checkpoint while
- * we hold a cursor open to the previous checkpoint.
- * Tolerate EBUSY.
- */
- if (ret != 0 && ret != EBUSY)
- testutil_die(ret, "%s",
- ckpt_config == NULL ? "" : ckpt_config);
- ret = 0;
-
- if (ckpt_config != NULL)
- testutil_check(
- pthread_rwlock_unlock(&g.backup_lock));
- testutil_check(
- pthread_rwlock_unlock(&g.checkpoint_lock));
-
- /* Rephrase the checkpoint name for cursor open. */
- if (ckpt_config == NULL)
- strcpy(ckpt_name,
- "checkpoint=WiredTigerCheckpoint");
- else
- testutil_check(__wt_snprintf(
- ckpt_name, sizeof(ckpt_name),
- "checkpoint=thread-%d", tinfo->id));
- ckpt_available = true;
-
-skip_checkpoint: /* Pick the next checkpoint operation. */
- ckpt_op += mmrand(&tinfo->rnd, 5000, 20000);
- }
-
/*
* Reset the session every now and then, just to make sure that
* operation gets tested. Note the test is not for equality, we
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index dc288ba4bc2..02ed0a2da60 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -169,7 +169,6 @@ main(int argc, char *argv[])
*/
testutil_check(pthread_rwlock_init(&g.append_lock, NULL));
testutil_check(pthread_rwlock_init(&g.backup_lock, NULL));
- testutil_check(pthread_rwlock_init(&g.checkpoint_lock, NULL));
testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid());
@@ -267,7 +266,6 @@ main(int argc, char *argv[])
testutil_check(pthread_rwlock_destroy(&g.append_lock));
testutil_check(pthread_rwlock_destroy(&g.backup_lock));
- testutil_check(pthread_rwlock_destroy(&g.checkpoint_lock));
testutil_check(pthread_rwlock_destroy(&g.death_lock));
config_clear();
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index 98af8e766f1..9ea44a29801 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -501,6 +501,86 @@ fclose_and_clear(FILE **fpp)
}
/*
+ * checkpoint --
+ * Periodically take a checkpoint until g.workers_finished is set.
+ */
+WT_THREAD_RET
+checkpoint(void *arg)
+{
+ WT_CONNECTION *conn;
+ WT_DECL_RET;
+ WT_SESSION *session;
+ u_int secs;
+ const char *ckpt_config;
+ char config_buf[64];
+ bool backup_locked;
+
+ (void)arg;
+ conn = g.wts_conn;
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+
+ for (secs = mmrand(NULL, 1, 10); !g.workers_finished;) {
+ if (secs > 0) {
+ __wt_sleep(1, 0);
+ --secs;
+ continue;
+ }
+
+ /*
+ * LSM and data-sources don't support named checkpoints. Also,
+ * don't attempt named checkpoints during a hot backup. It's
+ * OK to create named checkpoints during a hot backup, but we
+ * can't delete them, so repeating an already existing named
+ * checkpoint will fail when we can't drop the previous one.
+ */
+ ckpt_config = NULL;
+ backup_locked = false;
+ if (!DATASOURCE("helium") && !DATASOURCE("kvsbdb") &&
+ !DATASOURCE("lsm"))
+ switch (mmrand(NULL, 1, 20)) {
+ case 1:
+ /*
+ * 5% create a named checkpoint. Rotate between a
+ * few names to test multiple named checkpoints
+ * in the system.
+ */
+ ret = pthread_rwlock_trywrlock(&g.backup_lock);
+ if (ret == 0) {
+ backup_locked = true;
+ testutil_check(__wt_snprintf(
+ config_buf, sizeof(config_buf),
+ "name=mine.%" PRIu32,
+ mmrand(NULL, 1, 4)));
+ ckpt_config = config_buf;
+ } else if (ret != EBUSY)
+ testutil_check(ret);
+ break;
+ case 2:
+ /*
+ * 5% drop all named checkpoints.
+ */
+ ret = pthread_rwlock_trywrlock(&g.backup_lock);
+ if (ret == 0) {
+ backup_locked = true;
+ ckpt_config = "drop=(all)";
+ } else if (ret != EBUSY)
+ testutil_check(ret);
+ break;
+ }
+
+ testutil_check(session->checkpoint(session, ckpt_config));
+
+ if (backup_locked)
+ testutil_check(pthread_rwlock_unlock(&g.backup_lock));
+
+ secs = mmrand(NULL, 5, 40);
+ }
+
+ testutil_check(session->close(session, NULL));
+ return (WT_THREAD_RET_VALUE);
+}
+
+/*
* timestamp --
* Periodically update the oldest timestamp.
*/
diff --git a/src/third_party/wiredtiger/test/mciproject.yml b/src/third_party/wiredtiger/test/mciproject.yml
index 72022fe46ec..4b67299d14c 100644
--- a/src/third_party/wiredtiger/test/mciproject.yml
+++ b/src/third_party/wiredtiger/test/mciproject.yml
@@ -167,20 +167,6 @@ buildvariants:
- name: unit-test
- name: fops
-- name: solaris
- display_name: Solaris
- run_on:
- - solaris
- expansions:
- make_command: PATH=/opt/mongodbtoolchain/bin:$PATH gmake
- test_env_vars: LD_LIBRARY_PATH=`pwd`/.libs
- smp_command: -j $(kstat cpu | sort -u | grep -c "^module")
- configure_env_vars: PATH=/opt/mongodbtoolchain/bin:$PATH CFLAGS="-m64"
- tasks:
- - name: compile
- - name: unit-test
- - name: fops
-
- name: windows-64
display_name: Windows 64-bit
run_on:
diff --git a/src/third_party/wiredtiger/test/suite/test_assert01.py b/src/third_party/wiredtiger/test/suite/test_assert01.py
new file mode 100644
index 00000000000..3a4f8e4127a
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_assert01.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_assert01.py
+# Timestamps: assert commit settings
+#
+
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+def timestamp_str(t):
+ return '%x' % t # lowercase hex, no '0x' prefix, as passed to commit_timestamp=
+
+class test_assert01(wttest.WiredTigerTestCase, suite_subprocess):
+ base = 'assert01'
+ base_uri = 'file:' + base
+ uri_always = base_uri + '.always.wt'
+ uri_def = base_uri + '.def.wt'
+ uri_never = base_uri + '.never.wt'
+ uri_none = base_uri + '.none.wt'
+ cfg = 'key_format=S,value_format=S,'
+ cfg_always = 'assert=(commit_timestamp=always)'
+ cfg_def = ''
+ cfg_never = 'assert=(commit_timestamp=never)'
+ cfg_none = 'assert=(commit_timestamp=none)'
+
+ count = 1 # Suffix for the next key/value; also used as the commit timestamp.
+ #
+ # Insert one k/v pair with a commit timestamp and one without; use_ts is the
+ # table's assert setting and selects which of the two commits must fail.
+ #
+ def insert_check(self, uri, use_ts):
+ c = self.session.open_cursor(uri)
+ key = 'key' + str(self.count)
+ val = 'value' + str(self.count)
+
+ # Commit with a timestamp
+ self.session.begin_transaction()
+ self.session.timestamp_transaction(
+ 'commit_timestamp=' + timestamp_str(self.count))
+ c[key] = val
+ # All settings other than never should commit successfully
+ if (use_ts != 'never'):
+ self.session.commit_transaction()
+ else:
+ msg = "/timestamp set on this transaction/"
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:self.assertEquals(self.session.commit_transaction(),
+ 0), msg)
+ c.close()
+ self.count += 1
+
+ # Commit without a timestamp
+ key = 'key' + str(self.count)
+ val = 'value' + str(self.count)
+ c = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ c[key] = val
+ # All settings other than always should commit successfully
+ if (use_ts != 'always'):
+ self.session.commit_transaction()
+ else:
+ msg = "/none set on this transaction/"
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:self.assertEquals(self.session.commit_transaction(),
+ 0), msg)
+ self.count += 1
+ c.close()
+
+ def test_commit_timestamp(self):
+ #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build():
+ # self.skipTest('requires a timestamp and diagnostic build')
+ if not wiredtiger.timestamp_build():
+ self.skipTest('requires a timestamp build')
+
+ # Create one table per commit_timestamp assert setting, plus a default one
+ self.session.create(self.uri_always, self.cfg + self.cfg_always)
+ self.session.create(self.uri_def, self.cfg + self.cfg_def)
+ self.session.create(self.uri_never, self.cfg + self.cfg_never)
+ self.session.create(self.uri_none, self.cfg + self.cfg_none)
+
+ # Check inserting into each table
+ self.insert_check(self.uri_always, 'always')
+ self.insert_check(self.uri_def, 'none') # default table is checked as 'none'
+ self.insert_check(self.uri_never, 'never')
+ self.insert_check(self.uri_none, 'none')
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_assert02.py b/src/third_party/wiredtiger/test/suite/test_assert02.py
new file mode 100644
index 00000000000..d264273c3a0
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_assert02.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_assert02.py
+# Timestamps: assert read timestamp settings
+#
+
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+def timestamp_str(t):
+    """Return the timestamp t formatted as lower-case hexadecimal text."""
+    hex_text = '%x' % t
+    return hex_text
+
+class test_assert02(wttest.WiredTigerTestCase, suite_subprocess):
+    """
+    Timestamps: check the per-table assert=(read_timestamp=...) setting.
+    """
+
+    def _read_check(self, uris, txn_config, accepted, rejected, msg):
+        # Open a cursor on every table in uris, position each on 'key1', and
+        # read inside one transaction.  The transaction is begun with
+        # txn_config, or with no configuration when txn_config is None.
+        # Every table in accepted must return 'value1'; the search on the
+        # rejected table must raise a WiredTigerError matching msg.
+        cursors = dict((uri, self.session.open_cursor(uri)) for uri in uris)
+        for uri in uris:
+            cursors[uri].set_key('key1')
+
+        if txn_config is None:
+            self.session.begin_transaction()
+        else:
+            self.session.begin_transaction(txn_config)
+
+        # Run all the permitted searches first, then compare the values.
+        for uri in accepted:
+            cursors[uri].search()
+        for uri in accepted:
+            self.assertEqual(cursors[uri].get_value(), 'value1')
+
+        self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+            lambda: self.assertEqual(cursors[rejected].search(), 0), msg)
+        self.session.commit_transaction()
+
+        for uri in uris:
+            cursors[uri].close()
+
+    def test_read_timestamp(self):
+        #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build():
+        #    self.skipTest('requires a timestamp and diagnostic build')
+        if not wiredtiger.timestamp_build():
+            self.skipTest('requires a timestamp build')
+
+        # The base already ends with a dot, so the suffixes do not add one.
+        base = 'assert02.'
+        base_uri = 'file:' + base
+        uri_always = base_uri + 'always.wt'
+        uri_def = base_uri + 'def.wt'
+        uri_never = base_uri + 'never.wt'
+        uri_none = base_uri + 'none.wt'
+        uris = (uri_always, uri_def, uri_never, uri_none)
+
+        cfg = 'key_format=S,value_format=S'
+        cfg_always = cfg + ',assert=(read_timestamp=always)'
+        cfg_def = cfg
+        cfg_never = cfg + ',assert=(read_timestamp=never)'
+        cfg_none = cfg + ',assert=(read_timestamp=none)'
+
+        # Create one table per read_timestamp assert setting.
+        self.session.create(uri_always, cfg_always)
+        self.session.create(uri_def, cfg_def)
+        self.session.create(uri_never, cfg_never)
+        self.session.create(uri_none, cfg_none)
+
+        # Insert a data item at timestamp 1.  This should work for all.
+        cursors = [self.session.open_cursor(uri) for uri in uris]
+        self.session.begin_transaction()
+        self.session.timestamp_transaction(
+            'commit_timestamp=' + timestamp_str(1))
+        for c in cursors:
+            c['key1'] = 'value1'
+        self.session.commit_transaction()
+        for c in cursors:
+            c.close()
+
+        # Now that we have timestamped data, try reading with and without
+        # the timestamp.  With a read timestamp, only the 'never' table
+        # must reject the read.
+        self._read_check(uris, 'read_timestamp=' + timestamp_str(1),
+            (uri_always, uri_def, uri_none), uri_never,
+            "/timestamp set on this transaction/")
+
+        # Read in a transaction without a timestamp.  Only the 'always'
+        # table must reject the read.
+        self._read_check(uris, None,
+            (uri_never, uri_def, uri_none), uri_always,
+            "/none set on this transaction/")
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_assert03.py b/src/third_party/wiredtiger/test/suite/test_assert03.py
new file mode 100644
index 00000000000..36d4936a82e
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_assert03.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# test_assert03.py
+# Test changing assert setting via alter.
+#
+
+from suite_subprocess import suite_subprocess
+import wiredtiger, wttest
+
+class test_assert03(wttest.WiredTigerTestCase, suite_subprocess):
+    """
+    Test changing the commit_timestamp assert setting via alter.
+    """
+    conn_config = 'log=(enabled)'
+    base_uri = 'file:assert03.wt'
+    cfg = 'key_format=S,value_format=S'
+    always = 'assert=(commit_timestamp=always)'
+    never = 'assert=(commit_timestamp=never)'
+    none = 'assert=(commit_timestamp=none)'
+
+    def _insert_in_txn(self, key, value, expected_msg=None):
+        # Insert key/value into the table inside a transaction that has no
+        # commit timestamp.  When expected_msg is None the commit must
+        # succeed; otherwise the commit must raise a WiredTigerError
+        # matching expected_msg.
+        c = self.session.open_cursor(self.base_uri)
+        self.session.begin_transaction()
+        c[key] = value
+        if expected_msg is None:
+            self.session.commit_transaction()
+        else:
+            self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+                lambda: self.assertEqual(
+                self.session.commit_transaction(), 0), expected_msg)
+        c.close()
+
+    def test_assert03(self):
+
+        #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build():
+        #    self.skipTest('requires a timestamp and diagnostic build')
+        if not wiredtiger.timestamp_build():
+            self.skipTest('requires a timestamp build')
+
+        # Create a data item at the default setting
+        self.session.create(self.base_uri, self.cfg)
+        self._insert_in_txn('key0', 'value0')
+
+        # Now rotate through the alter settings, committing without a
+        # timestamp under each one.  The always setting should fail.
+        self.session.alter(self.base_uri, self.always)
+        self._insert_in_txn('key1', 'value1',
+            "/none set on this transaction/")
+
+        # The never and none settings should succeed.
+        self.session.alter(self.base_uri, self.never)
+        self._insert_in_txn('key2', 'value2')
+
+        self.session.alter(self.base_uri, self.none)
+        self._insert_in_txn('key3', 'value3')
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_backup07.py b/src/third_party/wiredtiger/test/suite/test_backup07.py
new file mode 100644
index 00000000000..8332815b0ca
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_backup07.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2017 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+import os, shutil
+from helper import compare_files
+from suite_subprocess import suite_subprocess
+from wtdataset import simple_key
+from wtscenario import make_scenarios
+
+# test_backup07.py
+# Test cursor backup with target URIs, logging and create during backup
+
+class test_backup07(wttest.WiredTigerTestCase, suite_subprocess):
+    """
+    Test cursor backup with logging and a table created during the backup.
+    """
+    dir='backup.dir'                    # Backup directory name
+    logmax="100K"
+    newuri="table:newtable"
+
+    pfx = 'test_backup'
+    scenarios = make_scenarios([
+        ('table', dict(uri='table:test',dsize=100,nops=100,nthreads=1)),
+    ])
+
+    # Create a large cache, otherwise this test runs quite slowly.
+    def conn_config(self):
+        return 'cache_size=1G,log=(archive=false,enabled,file_max=%s)' % \
+            self.logmax
+
+    # Take a full backup while a new table is created and populated, then
+    # check that the backup directory can be opened.
+    def test_backup07(self):
+        log2 = "WiredTigerLog.0000000002"
+
+        self.session.create(self.uri, "key_format=S,value_format=S")
+
+        # Insert small amounts of data at a time stopping just after we
+        # cross into log file 2. That way we can add more operations into
+        # log file 2 during the full backup.
+        loop = 0
+        c = self.session.open_cursor(self.uri)
+        while not os.path.exists(log2):
+            for i in range(0, self.nops):
+                num = i + (loop * self.nops)
+                key = 'key' + str(num)
+                val = 'value' + str(num)
+                c[key] = val
+            loop += 1
+        # Close this cursor before the name is reused for the new table
+        # below, otherwise the handle is never closed by this test.
+        c.close()
+
+        # Test a potential bug in full backups and creates.
+        # We allow creates during backup because the file doesn't exist
+        # when the backup metadata is created on cursor open and the newly
+        # created file is not in the cursor list.  However, if using logging
+        # and the create and inserts/updates appear in a log file copied,
+        # then currently there will be an error opening the backup directory.
+
+        # Open up the backup cursor, create and add data to a new table
+        # and then copy the files.
+        os.mkdir(self.dir)
+        bkup_c = self.session.open_cursor('backup:', None, None)
+
+        # Now create and populate the new table. Make sure the log records
+        # are on disk and will be copied to the backup.
+        self.session.create(self.newuri, "key_format=S,value_format=S")
+        c = self.session.open_cursor(self.newuri)
+        for i in range(0, self.nops):
+            key = 'key' + str(i)
+            val = 'value' + str(i)
+            c[key] = val
+        c.close()
+        self.session.log_flush('sync=on')
+
+        # Now copy the files returned by the backup cursor.  This will
+        # include the log file that has updates for the newly created table.
+        while True:
+            ret = bkup_c.next()
+            if ret != 0:
+                break
+            newfile = bkup_c.get_key()
+            sz = os.path.getsize(newfile)
+            self.pr('Copy from: ' + newfile + ' (' + str(sz) + ') to ' + self.dir)
+            shutil.copy(newfile, self.dir)
+        # The loop must have ended because the cursor ran out of files.
+        self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
+        bkup_c.close()
+
+        # After the full backup, open and recover the backup database.
+        # Make sure we properly recover even though the log file will have
+        # records for the newly created table file id.
+        backup_conn = self.wiredtiger_open(self.dir)
+        backup_conn.close()
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_las.py b/src/third_party/wiredtiger/test/suite/test_las.py
index d0bd1d108fa..52a0b2d7300 100644
--- a/src/third_party/wiredtiger/test/suite/test_las.py
+++ b/src/third_party/wiredtiger/test/suite/test_las.py
@@ -26,16 +26,53 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
+from helper import copy_wiredtiger_home
import wiredtiger, wttest
from wtdataset import SimpleDataSet
+def timestamp_str(t):
+    """Return the timestamp t formatted as lower-case hexadecimal text."""
+    hex_text = '%x' % t
+    return hex_text
+
# test_las.py
-# Smoke tests to ensure lookaside tables are working.
+# Smoke tests to ensure lookaside tables are working.
class test_las(wttest.WiredTigerTestCase):
# Force a small cache.
def conn_config(self):
return 'cache_size=1GB'
+    def large_updates(self, session, uri, value, ds, nrows, timestamp=False):
+        # Update a large number of records, we'll hang if the lookaside table
+        # isn't doing its thing.
+        #
+        # The keys written are ds.key(nrows + i) for i from 1 to 999999.
+        # When timestamp is true, each update runs in its own transaction
+        # and is committed at commit timestamp i + 1.
+        cursor = session.open_cursor(uri)
+        for i in range(1, 1000000):
+            if timestamp:
+                session.begin_transaction()
+            cursor.set_key(ds.key(nrows + i))
+            cursor.set_value(value)
+            self.assertEqual(cursor.update(), 0)
+            if timestamp:
+                session.commit_transaction('commit_timestamp=' + timestamp_str(i + 1))
+        cursor.close()
+
+    def durable_check(self, check_value, uri, ds, nrows):
+        # Checkpoint and backup so as to simulate recovery
+        self.session.checkpoint()
+        newdir = "BACKUP"
+        copy_wiredtiger_home('.', newdir, True)
+
+        # Read from the copy through its own connection and session.
+        conn = self.setUpConnectionOpen(newdir)
+        session = self.setUpSessionOpen(conn)
+        cursor = session.open_cursor(uri, None)
+        # Skip the initial rows, which were not updated: after nrows + 1
+        # calls to next() the cursor is on the record following them.
+        for i in range(0, nrows+1):
+            self.assertEqual(cursor.next(), 0)
+        # assertEqual reports both values if they differ.
+        self.assertEqual(check_value, cursor.get_value())
+        cursor.close()
+        session.close()
+        conn.close()
+
@wttest.longtest('lookaside table smoke test')
def test_las(self):
# Create a small table.
@@ -43,18 +80,49 @@ class test_las(wttest.WiredTigerTestCase):
nrows = 100
ds = SimpleDataSet(self, uri, nrows, key_format="S")
ds.populate()
+ bigvalue = "aaaaa" * 100
- # Take a snapshot.
+ # Initially load huge data
+ cursor = self.session.open_cursor(uri)
+ for i in range(1, 1000000):
+ cursor.set_key(ds.key(nrows + i))
+ cursor.set_value(bigvalue)
+ self.assertEquals(cursor.insert(), 0)
+ cursor.close()
+ self.session.checkpoint()
+
+ # Scenario: 1
+ # Check to see LAS working with old snapshot
+ bigvalue1 = "bbbbb" * 100
self.session.snapshot("name=xxx")
+ # Update the values in different session after snapshot
+ self.large_updates(self.session, uri, bigvalue1, ds, nrows)
+ # Check to see the value after recovery
+ self.durable_check(bigvalue1, uri, ds, nrows)
+ self.session.snapshot("drop=(all)")
- # Insert a large number of records, we'll hang if the lookaside table
- # isn't doing its thing.
- c = self.session.open_cursor(uri)
- bigvalue = "abcde" * 100
- for i in range(1, 1000000):
- c.set_key(ds.key(nrows + i))
- c.set_value(bigvalue)
- self.assertEquals(c.insert(), 0)
+ # Scenario: 2
+ # Check to see LAS working with old reader
+ bigvalue2 = "ccccc" * 100
+ session2 = self.conn.open_session()
+ session2.begin_transaction('isolation=snapshot')
+ self.large_updates(self.session, uri, bigvalue2, ds, nrows)
+ # Check to see the value after recovery
+ self.durable_check(bigvalue2, uri, ds, nrows)
+ session2.rollback_transaction()
+ session2.close()
+
+ # Scenario: 3
+ # Check to see LAS working with old timestamp
+ bigvalue3 = "ddddd" * 100
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1))
+ self.large_updates(self.session, uri, bigvalue3, ds, nrows, timestamp=True)
+ # Check that data can be seen only up to the stable_timestamp
+ self.durable_check(bigvalue2, uri, ds, nrows)
+
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(i + 1))
+ # Check to see latest data can be seen
+ self.durable_check(bigvalue3, uri, ds, nrows)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp01.py b/src/third_party/wiredtiger/test/suite/test_timestamp01.py
index c7a5df66ae0..09a264e2afd 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp01.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp01.py
@@ -61,12 +61,40 @@ class test_timestamp01(wttest.WiredTigerTestCase, suite_subprocess):
'commit_timestamp=' + timestamp_str(1 << 5000)),
'/too long/')
- # One is okay, as is 2**64 - 1
+ # Anything other than hexadecimal characters is not permitted
+ self.session.begin_transaction()
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(
+ 'commit_timestamp=' + timestamp_str(-1)),
+ '/Failed to parse commit timestamp/')
+
+ self.session.begin_transaction()
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(
+ 'commit_timestamp=' + 'a/78f'),
+ '/Failed to parse commit timestamp/')
+
+ self.session.begin_transaction()
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(
+ 'commit_timestamp=' + 'a`78f'),
+ '/Failed to parse commit timestamp/')
+
+ self.session.begin_transaction()
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.commit_transaction(
+ 'commit_timestamp=' + 'a{78f'),
+ '/Failed to parse commit timestamp/')
+
+ # One is okay, as is upper case hex and 2**64 - 1
self.session.begin_transaction()
self.session.commit_transaction(
'commit_timestamp=' + timestamp_str(1))
self.session.begin_transaction()
self.session.commit_transaction(
+ 'commit_timestamp=0A78F')
+ self.session.begin_transaction()
+ self.session.commit_transaction(
'commit_timestamp=' + timestamp_str(1 << 64 - 1))
if __name__ == '__main__':
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp02.py b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
index 31bea22ec66..f928dbc184f 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp02.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp02.py
@@ -38,12 +38,6 @@ from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
-def timestamp_ret_str(t):
- s = timestamp_str(t)
- if len(s) % 2 == 1:
- s = '0' + s
- return s
-
class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
tablename = 'test_timestamp02'
uri = 'table:' + tablename
@@ -98,7 +92,7 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
dict((k, 1) for k in orig_keys[:i+1]))
# Everything up to and including timestamp 100 has been committed.
- self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(100))
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100))
# Bump the oldest timestamp, we're not going back...
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(100))
@@ -111,11 +105,11 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess):
self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + 100))
# Everything up to and including timestamp 200 has been committed.
- self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(200))
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(200))
# Test that we can manually move the commit timestamp back
self.conn.set_timestamp('commit_timestamp=' + timestamp_str(150))
- self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(150))
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(150))
self.conn.set_timestamp('commit_timestamp=' + timestamp_str(200))
# Now the stable timestamp before we read.
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp03.py b/src/third_party/wiredtiger/test/suite/test_timestamp03.py
index 9caf597e6ed..1a2511ea6ee 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp03.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp03.py
@@ -39,12 +39,6 @@ from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
-def timestamp_ret_str(t):
- s = timestamp_str(t)
- if len(s) % 2 == 1:
- s = '0' + s
- return s
-
class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess):
table_ts_log = 'ts03_ts_logged'
table_ts_nolog = 'ts03_ts_nologged'
@@ -226,7 +220,7 @@ class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess):
self.table_nots_nolog, dict((k, self.value) for k in orig_keys))
# Bump the oldest_timestamp, we're not going back...
- self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(100))
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100))
old_ts = timestamp_str(100)
self.conn.set_timestamp('oldest_timestamp=' + old_ts)
self.conn.set_timestamp('stable_timestamp=' + old_ts)
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
index a52675daf8b..f7052448208 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py
@@ -37,12 +37,6 @@ from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
-def timestamp_ret_str(t):
- s = timestamp_str(t)
- if len(s) % 2 == 1:
- s = '0' + s
- return s
-
class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
table_ts_log = 'table:ts04_ts_logged'
table_ts_nolog = 'table:ts04_ts_nologged'
@@ -61,6 +55,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess):
('col_var', dict(empty=0, cacheSize='cache_size=20MB', extra_config=',key_format=r')),
('lsm', dict(empty=0, cacheSize='cache_size=31MB', extra_config=',type=lsm')),
('row', dict(empty=0, cacheSize='cache_size=20MB', extra_config='',)),
+ ('row-smallcache', dict(empty=0, cacheSize='cache_size=2MB', extra_config='',)),
]
scenarios = make_scenarios(conncfg, types)
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp05.py b/src/third_party/wiredtiger/test/suite/test_timestamp05.py
index d7131cb2004..f145184146c 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp05.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp05.py
@@ -39,12 +39,6 @@ from wtscenario import make_scenarios
def timestamp_str(t):
return '%x' % t
-def timestamp_ret_str(t):
- s = timestamp_str(t)
- if len(s) % 2 == 1:
- s = '0' + s
- return s
-
class test_timestamp05(wttest.WiredTigerTestCase, suite_subprocess):
uri = 'table:ts05'
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp07.py b/src/third_party/wiredtiger/test/suite/test_timestamp07.py
index 12b36bdc2f8..09547dba3a7 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp07.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp07.py
@@ -56,8 +56,8 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
nkeys = [
('100keys', dict(nkeys=100)),
-# ('500keys', dict(nkeys=500)),
-# ('1000keys', dict(nkeys=1000)),
+ ('500keys', dict(nkeys=500)),
+ ('1000keys', dict(nkeys=1000)),
]
scenarios = make_scenarios(types, conncfg, nkeys)
@@ -68,19 +68,20 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
value3 = u'\u0001\u0002cdef\u0007\u0004'
# Check that a cursor (optionally started in a new transaction), sees the
- # expected values.
- def check(self, session, txn_config, expected):
+ # expected value for a key
+ def check(self, session, txn_config, k, expected):
if txn_config:
session.begin_transaction(txn_config)
c = session.open_cursor(self.uri + self.tablename, None)
- actual = dict((k, v) for k, v in c if v != 0)
- self.assertTrue(actual == expected)
- # Search for the expected items as well as iterating
- for k, v in expected.iteritems():
- self.assertEqual(c[k], v, "for key " + str(k))
+ if not expected:
+ c.set_key(k)
+ self.assertEqual(c.search(), wiredtiger.WT_NOTFOUND)
+ else:
+ self.assertEqual(c[k], expected)
c.close()
if txn_config:
session.commit_transaction()
+
#
# Take a backup of the database and verify that the value we want to
# check exists in the tables the expected number of times.
@@ -168,12 +169,14 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
# Now check that we see the expected state when reading at each
# timestamp.
- for i, t in enumerate(orig_keys):
- self.check(self.session, 'read_timestamp=' + timestamp_str(t),
- dict((k, self.value) for k in orig_keys[:i+1]))
+ for k in orig_keys:
+ self.check(self.session, 'read_timestamp=' + timestamp_str(k),
+ k, self.value)
+ self.check(self.session, 'read_timestamp=' + timestamp_str(k),
+ k + 1, None)
# Bump the oldest timestamp, we're not going back...
- self.assertEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys))
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys))
self.oldts = timestamp_str(self.nkeys)
self.conn.set_timestamp('oldest_timestamp=' + self.oldts)
self.conn.set_timestamp('stable_timestamp=' + self.oldts)
@@ -201,12 +204,8 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
# Take a checkpoint using the given configuration. Then verify
# whether value2 appears in a copy of that data or not.
- valcnt2 = valcnt3 = self.nkeys
- valcnt = 0
- # If logging is disabled then value2 should not appear in logged table.
- if self.using_log == False:
- valcnt3 = 0
- self.ckpt_backup(self.value2, valcnt, valcnt2, valcnt3)
+ self.ckpt_backup(self.value2, 0, self.nkeys, self.nkeys if self.using_log else 0)
+
# Update the stable timestamp to the latest, but not the oldest
# timestamp and make sure we can see the data. Once the stable
# timestamp is moved we should see all keys with value2.
@@ -245,9 +244,7 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess):
# of that data or not. Both tables that are logged should see
# all the data regardless of timestamps. The table that is not
# logged should not see any of it.
- valcnt = 0
- valcnt2 = valcnt3 = self.nkeys
- self.backup_check(self.value3, valcnt, valcnt2, valcnt3)
+ self.backup_check(self.value3, 0, self.nkeys, self.nkeys)
if __name__ == '__main__':
wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp09.py b/src/third_party/wiredtiger/test/suite/test_timestamp09.py
index 41a6909cbef..b79521329e7 100644
--- a/src/third_party/wiredtiger/test/suite/test_timestamp09.py
+++ b/src/third_party/wiredtiger/test/suite/test_timestamp09.py
@@ -114,8 +114,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess):
# Oldest timestamp is 3 at the moment, trying to set it to an earlier
# timestamp is a no-op.
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
- self.assertEqual(int(self.conn.query_timestamp('get=oldest')),
- int(timestamp_str(3)))
+ self.assertTimestampsEqual(self.conn.query_timestamp('get=oldest'), timestamp_str(3))
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(3) +
',stable_timestamp=' + timestamp_str(3))
@@ -123,8 +122,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess):
# Stable timestamp is 5 at the moment, trying to set it to an earlier
# timestamp is a no-op.
self.conn.set_timestamp('stable_timestamp=' + timestamp_str(4))
- self.assertEqual(int(self.conn.query_timestamp('get=stable')),
- int(timestamp_str(5)))
+ self.assertTimestampsEqual(self.conn.query_timestamp('get=stable'), timestamp_str(5))
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(5))
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
diff --git a/src/third_party/wiredtiger/test/suite/test_txn14.py b/src/third_party/wiredtiger/test/suite/test_txn14.py
index 7579bbc8e54..2245f49ae85 100644
--- a/src/third_party/wiredtiger/test/suite/test_txn14.py
+++ b/src/third_party/wiredtiger/test/suite/test_txn14.py
@@ -93,10 +93,11 @@ class test_txn14(wttest.WiredTigerTestCase, suite_subprocess):
c.close()
self.session.log_flush(cfgarg)
if self.sync == 'background':
- # If doing a background flush, wait a few seconds. I have
- # seen an individual log file's fsync take more than a second
- # on some systems. So give it time to flush perhaps a few files.
- self.session.transaction_sync('timeout_ms=4000')
+ # If doing a background flush, wait 10 seconds. I have seen an
+ # individual log file's fsync take more than a second on some
+ # systems, and we've seen timeouts at lower levels on systems
+ # with slow I/O. So give it time to flush perhaps a few files.
+ self.session.transaction_sync('timeout_ms=10000')
self.simulate_crash_restart(".", "RESTART")
c = self.session.open_cursor(self.t1, None, None)
i = 0
diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py
index 1c95eb355ae..c654370718c 100644
--- a/src/third_party/wiredtiger/test/suite/wttest.py
+++ b/src/third_party/wiredtiger/test/suite/wttest.py
@@ -490,6 +490,12 @@ class WiredTigerTestCase(unittest.TestCase):
with self.expectedStderr(message):
self.assertRaises(exceptionType, expr)
+    def assertTimestampsEqual(self, ts1, ts2):
+        """
+        TestCase.assertEqual() for timestamps: both arguments are
+        hexadecimal strings and are compared by integer value, so leading
+        zeros and letter case do not matter.
+        """
+        first = int(ts1, 16)
+        second = int(ts2, 16)
+        self.assertEqual(first, second)
+
def exceptionToStderr(self, expr):
"""
Used by assertRaisesHavingMessage to convert an expression