diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-10-10 16:29:49 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-10-10 16:37:55 +1100 |
commit | 39998ac6928c4e7f3acd2f7ee2fc5fb4df056c18 (patch) | |
tree | c075233cd32c6ec0205af77db475836c0fba60e9 /src/third_party | |
parent | dd094ce1bc1fb424ccc6dd71939e5c7a30159e2e (diff) | |
download | mongo-39998ac6928c4e7f3acd2f7ee2fc5fb4df056c18.tar.gz |
Import wiredtiger: 0cd3d5bbd8a5c8779f1129c6754b4463403e788f from branch mongodb-3.6
ref: 6f561957cb..0cd3d5bbd8
for: 3.5.14
WT-3200 LSM bug: Failed lookup in bloom filter.
WT-3435 Lookaside eviction should be able to save unstable updates
WT-3453 Enhance lookaside table test coverage in Python suite
WT-3559 Detect when a checkpoint races with metadata changes
WT-3579 Enhance support for running wtperf workloads with workgen
WT-3582 Cache stuck full of internal pages
WT-3593 Add an API to enforce consistent use of timestamps (#3667)
WT-3599 reconciliation calculates block matching checksums too frequently.
WT-3600 timestamp API lets you set timestamps with invalid characters
WT-3612 Improve documentation of durability with backup cursors
WT-3613 test/format cache full with LSM
WT-3618 WT remove solaris from evergreen builds
WT-3620 POSIX thread attribute structures must be destroyed
WT-3621 Add test for full backups with concurrent table creation
WT-3622 Allow upper case hexadecimal timestamps
WT-3627 test_txn14.test_txn14.test_log_flush timeout
WT-3631 Convert timestamps to integers in Python tests before comparing
WT-3636 Account for page image sizes in cache consistently
WT-3638 format failure, update list without complete visible record
WT-3639 Test/format tried to drop named checkpoints during a hot backup
WT-3641 Track maximum timestamp used in each btree
WT-3642 Avoid lookaside reads for dead trees
Diffstat (limited to 'src/third_party')
95 files changed, 3140 insertions, 1644 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py index ed21fffe8dc..2d60e1522f5 100644 --- a/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py +++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/__init__.py @@ -88,5 +88,5 @@ except: shutil.rmtree('WT_TEST', True) os.mkdir('WT_TEST') -from .core import txn, extensions_config, op_group_transaction, op_log_like, op_multi_table +from .core import txn, extensions_config, op_append, op_group_transaction, op_log_like, op_multi_table, op_populate_with_range from .latency import workload_latency diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py index 2c8311c4ca7..a8977d9593e 100644 --- a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py +++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py @@ -28,7 +28,7 @@ # # runner/core.py # Core functions available to all runners -import glob, os +import glob, os, random from workgen import Key, Operation, OpList, Table, Transaction, Value # txn -- @@ -100,14 +100,73 @@ def extensions_config(exts): result = ',extensions=[' + ','.join(extfiles.values()) + ']' return result -def _op_multi_table_as_list(ops_arg, tables): +_PARETO_SHAPE = 1.5 +_BILLION = 1000000000 + +# Choose a value from a range of ints based on the pareto parameter +# The pareto value is interpreted as in wtperf, a number between 0 and 100. +def _choose_pareto(nrange, pareto): + rval = random.randint(0, _BILLION) + + # Use Pareto distribution to give 80/20 hot/cold values. + S1 = -1 / _PARETO_SHAPE + S2 = nrange * (pareto.param / 100.0) * (_PARETO_SHAPE - 1) + U = 1 - rval / (_BILLION * 1.0) + rval = (pow(U, S1) - 1) * S2 + if rval >= nrange: + rval = 0 + return int(rval) + +# Get the list of subordinate operations that are listed in the group. 
+# Generally, the op._optype == Operation.OP_NONE, it indicates that +# the operation contains a group of subordinates. +# +# XXX +# Note that this function should be called for all iteration, rather than: +# for o in op._group +# because a bug in SWIG versions <= 2.0.11 would cause the above fragment +# to produce a segmentation violation as described here: +# https://sourceforge.net/p/swig/mailman/message/32838320/ +def _op_get_group_list(op): + grouplist = op._group + result = [] + if grouplist != None: + result.extend(grouplist) + return result + +def _op_multi_table_as_list(ops_arg, tables, pareto_tables, multiplier): result = [] if ops_arg._optype != Operation.OP_NONE: - for table in tables: - result.append(Operation(ops_arg._optype, table, ops_arg._key, ops_arg._value)) + if pareto_tables <= 0: + for table in tables: + for i in range(0, multiplier): + result.append(Operation(ops_arg._optype, table, ops_arg._key, ops_arg._value)) + else: + # Use the multiplier unless the length of the list will be large. + # In any case, make sure there's at least a multiplier of 3, to + # give a chance to hit all/most of the tables. + ntables = len(tables) + count = ntables * multiplier + if count > 1000: + count = 1000 + mincount = ntables * 3 + if mincount > count: + count = mincount + for i in range(0, count): + tnum = _choose_pareto(ntables, pareto_tables) + # Modify the pareto value to make it more flat + # as tnum gets higher. Workgen knows how to handle + # a portion of a pareto range. 
+ table = tables[tnum] + key = Key(ops_arg._key) + key._pareto.range_low = (1.0 * i)/count + key._pareto.range_high = (1.0 * (i + 1))/count + result.append(Operation(ops_arg._optype, table, key, ops_arg._value)) else: - for op in ops._group: - result.extend(_op_multi_table_as_list(op, tables)) + for op in _op_get_group_list(ops_arg): + for o in _op_multi_table_as_list(op, tables, pareto_tables, \ + multiplier): + result.append(Operation(o)) return result # A convenient way to build a list of operations @@ -118,11 +177,52 @@ def op_append(op1, op2): op1 += op2 return op1 +# Require consistent use of pareto on the set of operations, +# that keeps our algorithm reasonably simple. +def _check_pareto(ops_arg, cur = 0): + if ops_arg._key != None and ops_arg._key._keytype == Key.KEYGEN_PARETO: + p = ops_arg._key._pareto + if cur != 0 and p != cur: + raise Exception('mixed pareto values for ops within a ' + \ + 'single thread not supported') + cur = p + if ops_arg._group != None: + for op in _op_get_group_list(ops_arg): + cur = _check_pareto(op, cur) + return cur + +_primes = [83, 89, 97, 101, 103, 107, 109, 113] + # Emulate wtperf's table_count option. Spread the given operations over -# a set of tables. -def op_multi_table(ops_arg, tables): +# a set of tables. For example, given 5 operations and 4 tables, we return +# a set of 20 operations for all possibilities. +# +# When we detect that pareto is used with a range partition, things get +# trickier, because we'll want a higher proportion of operations channelled +# to the first tables. Workgen only supports individual operations on a +# single table, so to get good Pareto distribution, we first expand the +# number in the total set of operations, and then choose a higher proportion +# of the tables. We need to expand the number of operations to make sure +# that the lower tables get some hits. While it's not perfect (without +# creating a huge multiplier) it's a reasonable approximation for most +# cases. 
Within each table's access, the pareto parameters have to be +# adjusted to account for each table's position in the total +# distribution. For example, the lowest priority table will have a much +# more even distribution. +def op_multi_table(ops_arg, tables, range_partition = False): ops = None - for op in _op_multi_table_as_list(ops_arg, tables): + multiplier = 1 + if range_partition: + pareto_tables = _check_pareto(ops_arg) + else: + pareto_tables = 0 + if pareto_tables != 0: + multiplier = _primes[random.randint(0, len(_primes) - 1)] + ops_list = _op_multi_table_as_list(ops_arg, tables, pareto_tables, \ + multiplier) + if pareto_tables != 0: + random.shuffle(ops_list) + for op in ops_list: ops = op_append(ops, op) return ops @@ -152,7 +252,7 @@ def op_log_like(op, log_table, ops_per_txn): op = txn(op) # txn for each action. else: oplist = [] - for op2 in op._group: + for op2 in _op_get_group_list(op): if op2._optype == Operation.OP_NONE: oplist.append(op_log_like(op2, log_table)) elif ops_per_txn == 0 and _optype_is_write(op2._optype): @@ -182,10 +282,8 @@ def op_group_transaction(ops_arg, ops_per_txn, txn_config): raise Exception('grouping transactions with multipliers not supported') oplist = [] - ops = None - nops = 0 txgroup = [] - for op in ops_arg._group: + for op in _op_get_group_list(ops_arg): if op.optype == Operation.OP_NONE: oplist.append(_op_transaction_list(txgroup, txn_config)) txgroup = [] @@ -199,3 +297,39 @@ def op_group_transaction(ops_arg, ops_per_txn, txn_config): oplist.append(_op_transaction_list(txgroup, txn_config)) ops_arg._group = OpList(oplist) return ops_arg + +# Populate using range partition with the random range. +# We will totally fill 0 or more tables (fill_tables), and 0 or +# 1 table will be partially filled. The rest (if any) will +# be completely unfilled, to be filled/accessed during +# the regular part of the run. 
+def op_populate_with_range(ops_arg, tables, icount, random_range, pop_threads): + table_count = len(tables) + entries_per_table = (icount + random_range) / table_count + if entries_per_table == 0: + # This can happen if table_count is huge relative to + # icount/random_range. Not really worth handling. + raise Exception('table_count > (icount + random_range), seems absurd') + if (icount + random_range) % table_count != 0: + # This situation is not handled well by our simple algorithm, + # we won't get exactly icount entries added during the populate. + raise Exception('(icount + random_range) is not evenly divisible by ' + + 'table_count') + if entries_per_table % pop_threads != 0: + # Another situation that is not handled exactly. + raise Exception('(icount + random_range) is not evenly divisible by ' + + 'populate_threads') + fill_tables = icount / entries_per_table + fill_per_thread = entries_per_table / pop_threads + ops = None + for i in range(0, fill_tables): + op = Operation(ops_arg) + op._table = tables[i] + ops = op_append(ops, op * fill_per_thread) + partial_fill = icount % entries_per_table + if partial_fill > 0: + fill_per_thread = partial_fill / pop_threads + op = Operation(ops_arg) + op._table = tables[fill_tables] + ops = op_append(ops, op * fill_per_thread) + return ops diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx index ce9debcca2f..31e21e6f6c9 100644 --- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx +++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx @@ -240,7 +240,8 @@ Context& Context::operator=(const Context &other) { } ContextInternal::ContextInternal() : _tint(), _table_names(), - _recno(NULL), _recno_alloced(0), _tint_last(0), _context_count(0) { + _table_runtime(NULL), _runtime_alloced(0), _tint_last(0), + _context_count(0) { uint32_t count; if ((count = workgen_atomic_add32(&context_count, 1)) != 1) THROW("multiple Contexts not supported"); @@ 
-248,20 +249,20 @@ ContextInternal::ContextInternal() : _tint(), _table_names(), } ContextInternal::~ContextInternal() { - if (_recno != NULL) - delete _recno; + if (_table_runtime != NULL) + delete _table_runtime; } int ContextInternal::create_all() { - if (_recno_alloced != _tint_last) { + if (_runtime_alloced != _tint_last) { // The array references are 1-based, we'll waste one entry. - uint64_t *new_recno = new uint64_t[_tint_last + 1]; - memcpy(new_recno, _recno, sizeof(uint64_t) * _recno_alloced); - memset(&new_recno[_recno_alloced], 0, - sizeof(uint64_t) * (_tint_last - _recno_alloced + 1)); - delete _recno; - _recno = new_recno; - _recno_alloced = _tint_last; + TableRuntime *new_table_runtime = new TableRuntime[_tint_last + 1]; + memcpy(new_table_runtime, _table_runtime, sizeof(uint64_t) * _runtime_alloced); + memset(&new_table_runtime[_runtime_alloced], 0, + sizeof(uint64_t) * (_tint_last - _runtime_alloced + 1)); + delete _table_runtime; + _table_runtime = new_table_runtime; + _runtime_alloced = _tint_last; } return (0); } @@ -301,7 +302,9 @@ int Monitor::run() { workgen_version(version, sizeof(version)); Stats prev_interval; while (!_stop) { - for (int i = 0; i < options->sample_interval && !_stop; i++) + int waitsecs = (first && options->warmup > 0) ? 
options->warmup : + options->sample_interval; + for (int i = 0; i < waitsecs && !_stop; i++) sleep(1); if (_stop) break; @@ -387,6 +390,22 @@ int Monitor::run() { return (0); } +ParetoOptions ParetoOptions::DEFAULT; +ParetoOptions::ParetoOptions(int param_arg) : param(param_arg), range_low(0.0), + range_high(1.0), _options() { + _options.add_int("param", param, + "0 is disabled, otherwise a range from 1 (most aggressive) to " + "100 (least aggressive)"); + _options.add_double("range_low", range_low, + "between 0.0 and 1.0, starting range of the pareto distribution"); + _options.add_double("range_high", range_high, + "between 0.0 and 1.0, ending range of the pareto distribution"); +} +ParetoOptions::ParetoOptions(const ParetoOptions &other) : + param(other.param), range_low(other.range_low), + range_high(other.range_high), _options(other._options) {} +ParetoOptions::~ParetoOptions() {} + ThreadRunner::ThreadRunner() : _errno(0), _exception(), _thread(NULL), _context(NULL), _icontext(NULL), _workload(NULL), _wrunner(NULL), _rand_state(NULL), @@ -536,9 +555,12 @@ void ThreadRunner::op_create_all(Operation *op, size_t &keysize, op->create_all(); if (op->_optype != Operation::OP_NONE) { - op->kv_compute_max(true); + op->kv_compute_max(true, false); if (OP_HAS_VALUE(op)) - op->kv_compute_max(false); + op->kv_compute_max(false, op->_table.options.random_value); + if (op->_key._keytype == Key::KEYGEN_PARETO && + op->_key._pareto.param == 0) + THROW("Key._pareto value must be set if KEYGEN_PARETO specified"); op->kv_size_buffer(true, keysize); op->kv_size_buffer(false, valuesize); @@ -575,17 +597,66 @@ void ThreadRunner::op_create_all(Operation *op, size_t &keysize, op_create_all(&*i, keysize, valuesize); } -uint64_t ThreadRunner::op_get_key_recno(Operation *op, tint_t tint) { + +#define PARETO_SHAPE 1.5 + +// Return a value within the interval [ 0, recno_max ) +// that is weighted toward lower numbers with pareto_param at 0 (the minimum), +// and more evenly distributed 
with pareto_param at 100 (the maximum). +// +static uint64_t +pareto_calculation(uint32_t randint, uint64_t recno_max, + ParetoOptions &pareto) { + double S1, S2, U; + uint32_t result; + double r; + + r = (double)randint; + if (pareto.range_high != 1.0 || pareto.range_low != 0.0) { + if (pareto.range_high <= pareto.range_low || + pareto.range_high > 1.0 || pareto.range_low < 0.0) + THROW("Pareto illegal range"); + r = (pareto.range_low * (double)UINT32_MAX) + + r * (pareto.range_high - pareto.range_low); + } + S1 = (-1 / PARETO_SHAPE); + S2 = recno_max * (pareto.param / 100.0) * (PARETO_SHAPE - 1); + U = 1 - r / (double)UINT32_MAX; // interval [0, 1) + result = (uint64_t)((pow(U, S1) - 1) * S2); + + // This Pareto calculation chooses out of range values less than 20% + // of the time, depending on pareto_param. For param of 0, it is + // never out of range, for param of 100, 19.2%. For the default + // pareto_param of 20, it will be out of range 2.7% of the time. + // Out of range values are channelled into the first key, + // making it "hot". Unfortunately, that means that using a higher + // param can get a lot lumped into the first bucket. + // + // XXX This matches the behavior of wtperf, we may consider instead + // retrying (modifying the random number) until we get a good value. + // + if (result > recno_max) + result = 0; + return (result); +} + +uint64_t ThreadRunner::op_get_key_recno(Operation *op, uint64_t range, + tint_t tint) { uint64_t recno_count; - uint32_t rand; + uint32_t rval; (void)op; - recno_count = _icontext->_recno[tint]; + if (range > 0) + recno_count = range; + else + recno_count = _icontext->_table_runtime[tint]._max_recno; if (recno_count == 0) // The file has no entries, returning 0 forces a WT_NOTFOUND return. return (0); - rand = workgen_random(_rand_state); - return (rand % recno_count + 1); // recnos are one-based. 
+ rval = workgen_random(_rand_state); + if (op->_key._keytype == Key::KEYGEN_PARETO) + rval = pareto_calculation(rval, recno_count, op->_key._pareto); + return (rval % recno_count + 1); // recnos are one-based. } int ThreadRunner::op_run(Operation *op) { @@ -594,12 +665,14 @@ int ThreadRunner::op_run(Operation *op) { WT_CURSOR *cursor; WT_DECL_RET; uint64_t recno; + uint64_t range; bool measure_latency, own_cursor; track = NULL; cursor = NULL; recno = 0; own_cursor = false; + range = op->_table.options.range; if (_throttle != NULL) { if (_throttle_ops >= _throttle_limit && !_in_transaction) { WT_ERR(_throttle->throttle(_throttle_ops, @@ -621,19 +694,24 @@ int ThreadRunner::op_run(Operation *op) { switch (op->_optype) { case Operation::OP_INSERT: track = &_stats.insert; - recno = workgen_atomic_add64(&_icontext->_recno[tint], 1); + if (op->_key._keytype == Key::KEYGEN_APPEND || + op->_key._keytype == Key::KEYGEN_AUTO) + recno = workgen_atomic_add64( + &_icontext->_table_runtime[tint]._max_recno, 1); + else + recno = op_get_key_recno(op, range, tint); break; case Operation::OP_REMOVE: track = &_stats.remove; - recno = op_get_key_recno(op, tint); + recno = op_get_key_recno(op, range, tint); break; case Operation::OP_SEARCH: track = &_stats.read; - recno = op_get_key_recno(op, tint); + recno = op_get_key_recno(op, range, tint); break; case Operation::OP_UPDATE: track = &_stats.update; - recno = op_get_key_recno(op, tint); + recno = op_get_key_recno(op, range, tint); break; case Operation::OP_NONE: recno = 0; @@ -651,6 +729,7 @@ int ThreadRunner::op_run(Operation *op) { track->track_latency() && (track->ops % _workload->options.sample_rate == 0); + VERBOSE(*this, "OP " << op->_optype << " " << op->_table._uri.c_str() << ", recno=" << recno); timespec start; if (measure_latency) workgen_epoch(&start); @@ -663,10 +742,13 @@ int ThreadRunner::op_run(Operation *op) { _in_transaction = true; } if (op->_optype != Operation::OP_NONE) { - op->kv_gen(true, recno, _keybuf); + 
op->kv_gen(true, 0, recno, _keybuf); cursor->set_key(cursor, _keybuf); if (OP_HAS_VALUE(op)) { - op->kv_gen(false, recno, _valuebuf); + uint32_t r = 0; + if (op->_table.options.random_value) + r = workgen_random(_rand_state); + op->kv_gen(false, r, recno, _valuebuf); cursor->set_value(cursor, _valuebuf); } switch (op->_optype) { @@ -969,7 +1051,7 @@ void Operation::get_static_counts(Stats &stats, int multiplier) { i->get_static_counts(stats, multiplier * _repeatgroup); } -void Operation::kv_compute_max(bool iskey) { +void Operation::kv_compute_max(bool iskey, bool has_random) { uint64_t max; int size; @@ -981,6 +1063,14 @@ void Operation::kv_compute_max(bool iskey) { THROW("Key.size too small for table '" << _table._uri << "'"); if (!iskey && size < 1) THROW("Value.size too small for table '" << _table._uri << "'"); + if (has_random) { + if (iskey) + THROW("Random keys not allowed"); + size -= RANDOMIZER_SIZE; + if (size < 1) + THROW("Value.size with random values too small for table '" + << _table._uri << "'"); + } if (size > 1) max = power64(10, (size - 1)) - 1; @@ -1006,7 +1096,8 @@ void Operation::kv_size_buffer(bool iskey, size_t &maxsize) const { } } -void Operation::kv_gen(bool iskey, uint64_t n, char *result) const { +void Operation::kv_gen(bool iskey, uint32_t randomizer, uint64_t n, + char *result) const { uint64_t max; int size; @@ -1015,6 +1106,12 @@ void Operation::kv_gen(bool iskey, uint64_t n, char *result) const { if (n > max) THROW((iskey ? 
"Key" : "Value") << " (" << n << ") too large for size (" << size << ")"); + if (randomizer != 0) { + randomizer %= 1000; + snprintf(result, 6, ":%3.3d:", randomizer); + n -= RANDOMIZER_SIZE; + result += RANDOMIZER_SIZE; + } workgen_u64_to_string_zf(n, result, size); } @@ -1338,14 +1435,20 @@ void Stats::track_latency(bool latency) { truncate.track_latency(latency); } -TableOptions::TableOptions() : key_size(0), value_size(0), _options() { +TableOptions::TableOptions() : key_size(0), value_size(0), + random_value(false), range(0), _options() { _options.add_int("key_size", key_size, "default size of the key, unless overridden by Key.size"); _options.add_int("value_size", value_size, "default size of the value, unless overridden by Value.size"); + _options.add_bool("random_value", random_value, + "generate random content for the value"); + _options.add_int("range", range, + "if zero, keys are inserted at the end and reads/updates are in the current range, if non-zero, inserts/reads/updates are at a random key between 0 and the given range"); } TableOptions::TableOptions(const TableOptions &other) : key_size(other.key_size), value_size(other.value_size), + random_value(other.random_value), range(other.range), _options(other._options) {} TableOptions::~TableOptions() {} @@ -1376,7 +1479,7 @@ TableInternal::~TableInternal() {} WorkloadOptions::WorkloadOptions() : max_latency(0), report_file("workload.stat"), report_interval(0), run_time(0), - sample_file("sample.json"), sample_interval(0), sample_rate(1), + sample_file("sample.json"), sample_interval(0), sample_rate(1), warmup(0), _options() { _options.add_int("max_latency", max_latency, "prints warning if any latency measured exceeds this number of " @@ -1399,6 +1502,8 @@ WorkloadOptions::WorkloadOptions() : max_latency(0), _options.add_int("sample_rate", sample_rate, "how often the latency of operations is measured. 
1 for every operation, " "2 for every second operation, 3 for every third operation etc."); + _options.add_int("warmup", warmup, + "how long to run the workload phase before starting measurements"); } WorkloadOptions::WorkloadOptions(const WorkloadOptions &other) : @@ -1569,7 +1674,8 @@ int WorkloadRunner::run_all() { workgen_epoch(&_start); timespec end = _start + options->run_time; - timespec next_report = _start + options->report_interval; + timespec next_report = _start + + ((options->warmup > 0) ? options->warmup : options->report_interval); // Start all threads if (options->sample_interval > 0) { @@ -1653,6 +1759,8 @@ int WorkloadRunner::run_all() { if (exception == NULL && !_trunners[i]._exception._str.empty()) exception = &_trunners[i]._exception; } + + workgen_epoch(&now); if (options->sample_interval > 0) { WT_TRET(pthread_join(monitor._handle, &status)); if (monitor._errno != 0) diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.h b/src/third_party/wiredtiger/bench/workgen/workgen.h index a12e4dc4c89..2a116e1c89e 100644 --- a/src/third_party/wiredtiger/bench/workgen/workgen.h +++ b/src/third_party/wiredtiger/bench/workgen/workgen.h @@ -171,6 +171,8 @@ struct Context { struct TableOptions { int key_size; int value_size; + bool random_value; + int range; TableOptions(); TableOptions(const TableOptions &other); @@ -179,6 +181,8 @@ struct TableOptions { void describe(std::ostream &os) const { os << "key_size " << key_size; os << ", value_size " << value_size; + os << ", random_value " << random_value; + os << ", range " << range; } std::string help() const { return _options.help(); } @@ -210,16 +214,46 @@ struct Table { #endif }; +struct ParetoOptions { + int param; + double range_low; + double range_high; + ParetoOptions(int param = 0); + ParetoOptions(const ParetoOptions &other); + ~ParetoOptions(); + + void describe(std::ostream &os) const { + os << "parameter " << param; + if (range_low != 0.0 || range_high != 1.0) { + os << "range [" << 
range_low << "-" << range_high << "]"; + } + } + + std::string help() const { return _options.help(); } + std::string help_description(const char *option_name) const { + return _options.help_description(option_name); } + std::string help_type(const char *option_name) const { + return _options.help_type(option_name); } + + static ParetoOptions DEFAULT; +private: + OptionsList _options; +}; + struct Key { typedef enum { KEYGEN_AUTO, KEYGEN_APPEND, KEYGEN_PARETO, KEYGEN_UNIFORM } KeyType; KeyType _keytype; int _size; + ParetoOptions _pareto; /* XXX specify more about key distribution */ - Key() : _keytype(KEYGEN_AUTO), _size(0) {} - Key(KeyType keytype, int size) : _keytype(keytype), _size(size) {} - Key(const Key &other) : _keytype(other._keytype), _size(other._size) {} + Key() : _keytype(KEYGEN_AUTO), _size(0), _pareto(ParetoOptions::DEFAULT) {} + Key(KeyType keytype, int size=0, + const ParetoOptions &pareto=ParetoOptions::DEFAULT) : + _keytype(keytype), _size(size), _pareto(pareto) {} + Key(const Key &other) : _keytype(other._keytype), _size(other._size), + _pareto(other._pareto) {} ~Key() {} void describe(std::ostream &os) const { @@ -273,8 +307,9 @@ struct Operation { Operation& operator=(const Operation &other); void create_all(); void get_static_counts(Stats &stats, int multiplier); - void kv_compute_max(bool); - void kv_gen(bool, uint64_t, char *) const; + void kv_compute_max(bool iskey, bool has_random); + void kv_gen(bool iskey, uint32_t randomizer, uint64_t n, + char *result) const; void kv_size_buffer(bool iskey, size_t &size) const; void size_check() const; #endif @@ -365,6 +400,7 @@ struct WorkloadOptions { int sample_interval; int sample_rate; std::string sample_file; + int warmup; WorkloadOptions(); WorkloadOptions(const WorkloadOptions &other); diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_int.h b/src/third_party/wiredtiger/bench/workgen/workgen_int.h index a8d008a3bc5..c7a5a7121e9 100644 --- 
a/src/third_party/wiredtiger/bench/workgen/workgen_int.h +++ b/src/third_party/wiredtiger/bench/workgen/workgen_int.h @@ -36,6 +36,8 @@ extern "C" { } #endif +#define RANDOMIZER_SIZE 5 /* ":000:" prefix */ + namespace workgen { // A 'tint' or ('table integer') is a unique small value integer @@ -126,7 +128,7 @@ struct ThreadRunner { int run(); void op_create_all(Operation *, size_t &keysize, size_t &valuesize); - uint64_t op_get_key_recno(Operation *, tint_t tint); + uint64_t op_get_key_recno(Operation *, uint64_t range, tint_t tint); void op_get_static_counts(Operation *, Stats &, int); int op_run(Operation *); @@ -153,11 +155,18 @@ struct Monitor { int run(); }; +struct TableRuntime { + uint64_t _max_recno; // highest recno allocated + bool _disjoint; // does key space have holes? + + TableRuntime() : _max_recno(0), _disjoint(0) {} +}; + struct ContextInternal { std::map<std::string, tint_t> _tint; // maps uri -> tint_t std::map<tint_t, std::string> _table_names; // reverse mapping - uint64_t *_recno; // # entries per tint_t - uint32_t _recno_alloced; // length of allocated _recno + TableRuntime *_table_runtime; // # entries per tint_t + uint32_t _runtime_alloced; // length of _table_runtime tint_t _tint_last; // last tint allocated // unique id per context, to work with multiple contexts, starts at 1. uint32_t _context_count; diff --git a/src/third_party/wiredtiger/bench/workgen/wtperf.py b/src/third_party/wiredtiger/bench/workgen/wtperf.py index 3a196fe7b57..2837be6d064 100644 --- a/src/third_party/wiredtiger/bench/workgen/wtperf.py +++ b/src/third_party/wiredtiger/bench/workgen/wtperf.py @@ -34,7 +34,7 @@ # See also the usage() function. 
# from __future__ import print_function -import os, sys, tempfile +import os, shutil, sys, tempfile def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) @@ -52,13 +52,15 @@ class Options(object): pass class Translator: - def __init__(self, filename, prefix, verbose): + def __init__(self, filename, prefix, verbose, homedir): self.filename = filename self.prefix = prefix self.verbose = verbose + self.homedir = homedir self.linenum = 0 - self.opts = {} - self.used_opts = {} + self.opts_map = {} + self.opts_used = {} + self.options = lambda: None # options behaves as an attribute dict self.has_error = False def error_file_line(self, fname, linenum, msg): @@ -70,15 +72,17 @@ class Translator: self.error_file_line(self.filename, self.linenum, msg) # Report an error and unwind the stack - def fatal_error(self, msg, errtype): + def fatal_error(self, msg, errtype = 'configuration error'): self.error(msg) raise TranslateException(errtype) - supported_opt_list = [ 'compression', 'conn_config', 'icount', - 'key_sz', 'log_like_table', + supported_opt_list = [ 'close_conn', 'compression', 'compact', + 'conn_config', 'create', 'icount', + 'key_sz', 'log_like_table', 'pareto', 'populate_ops_per_txn', 'populate_threads', - 'reopen_connection', - 'table_config', 'table_count', + 'random_range', 'random_value', 'range_partition', + 'readonly', 'reopen_connection', 'run_ops', + 'sess_config', 'table_config', 'table_count', 'threads', 'transaction_config', 'value_sz' ] def set_opt(self, optname, val): @@ -98,23 +102,32 @@ class Translator: v = int(val) # it might be an integer except ValueError: v = val # it's a string after all - self.opts[optname] = OptionValue(v, self.filename, self.linenum) + self.opts_map[optname] = OptionValue(v, self.filename, self.linenum) - def get_opt(self, optname, dfault): - if optname in self.opts: - ret = self.opts[optname] + def _get_opt(self, optname, dfault): + if optname in self.opts_map: + ret = self.opts_map[optname] self.filename = 
ret.filename self.linenum = ret.linenum - self.used_opts[optname] = 1 + self.opts_used[optname] = 1 return ret.value else: return dfault + def get_string_opt(self, optname, dfault): + v = self._get_opt(optname, dfault) + setattr(self.options, optname, v) + return v + def get_int_opt(self, optname, dfault): - return self.get_opt(optname, dfault) + 0 + v = self._get_opt(optname, dfault) + 0 + setattr(self.options, optname, v) + return v def get_boolean_opt(self, optname, dfault): - return not not self.get_opt(optname, dfault) + v = not not self._get_opt(optname, dfault) + setattr(self.options, optname, v) + return v # Split a string 'left_side=right_side' into two parts def split_assign(self, s): @@ -159,17 +172,33 @@ class Translator: def assign_str(self, left, right): return left + '=' + str(right) + '\n' - def add_operation_str(self, count, opname, multi): + def add_operation_str(self, count, opname, multi, pareto): result = '' tablename = 'tables[0]' if multi else 'table' if count > 1: result += str(count) + ' * ' if count > 0: - result += 'Operation(Operation.' + opname + ', ' + \ - tablename + ') + \\\n' + result += 'Operation(Operation.' + opname + ', ' + tablename + if pareto > 0: + result += ', Key(Key.KEYGEN_PARETO, 0, ParetoOptions(' + \ + str(pareto) + '))' + elif opname == 'OP_INSERT' and self.options.random_range != 0: + result += ', Key(Key.KEYGEN_UNIFORM)' + result += ') + \\\n' result += ' ' return result + def copy_config(self): + # Note: If we add the capability of setting options on the command + # line, we won't be able to do a simple copy. + config_save = os.path.join(self.homedir, 'CONFIG.wtperf') + suffix = 0 + while os.path.exists(config_save): + suffix += 1 + config_save = os.path.join(self.homedir, \ + 'CONFIG.wtperf.' + str(suffix)) + shutil.copyfile(self.filename, config_save) + # Wtperf's throttle is based on the number of regular operations, # not including log_like operations. 
Workgen counts all operations, # it doesn't treat log operations any differently. Adjust the throttle @@ -191,11 +220,13 @@ class Translator: return (new_throttle, comment) def parse_threads(self, threads_config): + opts = self.options tdecls = '' tlist = self.split_config_parens(threads_config) table_count = self.get_int_opt('table_count', 1) log_like_table = self.get_boolean_opt('log_like_table', False) - txn_config = self.get_opt('transaction_config', '') + txn_config = self.get_string_opt('transaction_config', '') + run_ops = self.get_int_opt('run_ops', -1) if log_like_table: tdecls += 'log_name = "table:log"\n' tdecls += 's.create(log_name, "key_format=S,value_format=S," +' + \ @@ -219,6 +250,7 @@ class Translator: topts.throttle = 0 topts.update = 0 topts.updates = 0 + topts.random_range = 0 for o in self.split_config_parens(t): (k, v) = self.split_assign(o) @@ -239,19 +271,41 @@ class Translator: if topts.inserts + topts.reads + topts.updates == 0: self.fatal_error('need read/insert/update/...', 'thread config error') + tdecls += 'ops = ' - tdecls += self.add_operation_str(topts.inserts, 'OP_INSERT', multi) - tdecls += self.add_operation_str(topts.reads, 'OP_SEARCH', multi) - tdecls += self.add_operation_str(topts.updates, 'OP_UPDATE', multi) + tdecls += self.add_operation_str(topts.inserts, 'OP_INSERT', + multi, opts.pareto) + tdecls += self.add_operation_str(topts.reads, 'OP_SEARCH', + multi, opts.pareto) + tdecls += self.add_operation_str(topts.updates, 'OP_UPDATE', + multi, opts.pareto) tdecls = tdecls.rstrip(' \n\\+') + '\n' + range_partition = opts.range_partition + + # Pareto with multiple tables is handled in op_multi_table. 
if multi: - tdecls += 'ops = op_multi_table(ops, tables)\n' + tdecls += 'ops = op_multi_table(ops, tables, ' + \ + str(range_partition) + ')\n' if topts.ops_per_txn > 0: tdecls += 'ops = op_group_transaction(ops, ' + \ str(topts.ops_per_txn) + ', "' + txn_config + '")\n' if log_like_table: tdecls += 'ops = op_log_like(ops, log_table, ' + \ str(topts.ops_per_txn) + ')\n' + if run_ops != -1: + if len(tlist) > 1: + self.fatal_error('run_ops currently supported with a ' + 'single type of thread') + tdecls += '\n' + if multi: + tdecls += \ + '# Note that op_multi_table has already multiplied\n' +\ + '# the number of operations by the number of tables.\n' + tdecls += 'ops = ops * (' + \ + str(run_ops) + ' / (' + str(topts.count) + \ + ' * table_count))' + \ + ' # run_ops = ' + str(run_ops) + \ + ', thread.count = ' + str(topts.count) + '\n' tdecls += thread_name + ' = Thread(ops)\n' if topts.throttle > 0: (throttle, comment) = self.calc_throttle(topts, log_like_table) @@ -273,6 +327,134 @@ class Translator: # An error has already been reported return None + def check_divisibility(self, icount, random_range, divisor_name, divisor): + if (icount + random_range) % divisor != 0: + if random_range == 0: + dividend = 'icount' + else: + dividend = '(icount + random_range)' + self.fatal_error(dividend + ' is not evenly divisible by ' + + divisor_name + ', this is not handled ' + + 'precisely by wtperf.py') + + def translate_table_create(self): + opts = self.options + s = '' + s += 'wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\\\n' + s += ' "exclusive=true,allocation_size=4kb," +\\\n' + s += ' "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"\n' + if opts.compression != '': + s += 'compress_table_config = "block_compressor=' + opts.compression + ',"\n' + else: + s += 'compress_table_config = ""\n' + s += 'table_config = "' + opts.table_config + '"\n' + s += 'tables = []\n' + s += 'table_count = ' + str(opts.table_count) + '\n' + if opts.table_count == 
1: + s += 'tname = "table:test.wt"\n' + indent = '' + else: + s += 'for i in range(0, table_count):\n' + s += ' tname = "table:test" + str(i) + ".wt"\n' + indent = ' ' + + s += indent + 'table = Table(tname)\n' + s += indent + 's.create(tname, wtperf_table_config +\\\n' + s += indent + ' compress_table_config + table_config)\n' + s += indent + 'table.options.key_size = ' + str(opts.key_sz) + '\n' + s += indent + 'table.options.value_size = ' + str(opts.value_sz) + '\n' + if opts.random_value: + s += indent + 'table.options.random_value = True\n' + if opts.random_range != 0: + # In wtperf, the icount plus random_range is the key range + table_range = (opts.random_range + opts.icount) / opts.table_count + s += indent + 'table.options.range = ' + str(table_range) + '\n' + s += indent + 'tables.append(table)\n' + return s + + def translate_populate(self): + opts = self.options + s = '\n' + if opts.icount == 0: + if opts.populate_threads != 0: + self.error("populate_threads > 0, icount == 0") + return '' + if opts.populate_threads == 0: + self.fatal_error('icount != 0 and populate_threads == 0: ' +\ + 'cannot populate entries with no threads') + s += 'populate_threads = ' + str(opts.populate_threads) + '\n' + s += 'icount = ' + str(opts.icount) + '\n' + need_ops_per_thread = True + + # Since we're separating the populating by table, and also + # into multiple threads, we currently require that + # (icount + random_range) is evenly divisible by table count + # and by number of populating threads. It's possible to handle + # the cases when this is not true, but it hardly seems worth + # the extra complexity. Also, these could be made into warnings, + # and actually create fewer entries than icount, but that could be + # confusing. 
+ self.check_divisibility(opts.icount, opts.random_range, + 'table_count', opts.table_count) + self.check_divisibility(opts.icount, opts.random_range, + '(populate_threads * table_count)', + opts.populate_threads * opts.table_count) + + if opts.table_count == 1: + s += 'pop_ops = Operation(Operation.OP_INSERT, table)\n' + elif opts.range_partition and opts.random_range > 0: + # Populating using a range partition is complex enough + # to handle in its own function. It does all the operations + # for the thread, so we don't need a multiplier at the end. + need_ops_per_thread = False + + s += 'random_range = ' + str(opts.random_range) + '\n' + s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n' + s += 'pop_ops = op_populate_with_range(pop_ops, tables, ' + \ + 'icount, random_range, populate_threads)\n' + else: + s += '# There are multiple tables to be filled during populate,\n' + s += '# the icount is split between them all.\n' + s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n' + s += 'pop_ops = op_multi_table(pop_ops, tables)\n' + + if need_ops_per_thread: + s += 'nops_per_thread = icount / (populate_threads * table_count)\n' + op_mult = ' * nops_per_thread' + else: + op_mult = '' + + pop_per_txn = opts.populate_ops_per_txn + if pop_per_txn > 0: + s += 'pop_ops = op_group_transaction(pop_ops, ' + \ + str(pop_per_txn) + ', "' + opts.transaction_config + '")\n' + s += 'pop_thread = Thread(pop_ops' + op_mult + ')\n' + s += 'pop_workload = Workload(context, populate_threads * pop_thread)\n' + if self.verbose > 0: + s += 'print("populate:")\n' + s += 'pop_workload.run(conn)\n' + + # If configured, compact to allow LSM merging to complete. We + # set an unlimited timeout because if we close the connection + # then any in-progress compact/merge is aborted. 
+ if opts.compact: + if opts.async_threads == 0: + self.fatal_error('unexpected value for async_threads') + s += '\n' + if self.verbose > 0: + s += 'print("compact after populate:")\n' + s += 'import time\n' + s += 'start_time = time.time()\n' + s += 'async_callback = WtperfAsyncCallback()\n' + s += 'for i in range(0, table_count):\n' + s += ' op = conn.async_new_op(tables[i]._uri, "timeout=0", async_callback)\n' + s += ' op.compact()\n' + s += 'conn.async_flush()\n' + s += 'print("compact completed in {} seconds".format(' + \ + 'time.time() - start_time))\n' + + return s + def translate_inner(self): workloadopts = '' with open(self.filename) as fin: @@ -286,19 +468,40 @@ class Translator: continue (key, val) = self.split_assign(line) if key in [ 'max_latency', 'report_file', 'report_interval', - 'run_time', 'sample_interval', 'sample_rate' ]: + 'run_time', 'sample_interval', 'sample_rate', + 'warmup' ]: workloadopts += 'workload.options.' + key + '=' + val + '\n' else: self.set_opt(key, val) - table_count = self.get_int_opt('table_count', 1) - conn_config = self.get_opt('conn_config', '') - table_config = self.get_opt('table_config', '') - key_sz = self.get_int_opt('key_sz', 20) - value_sz = self.get_int_opt('value_sz', 100) - reopen = self.get_boolean_opt('reopen_connection', False) - compression = self.get_opt('compression', '') - txn_config = self.get_opt('transaction_config', '') + conn_config = self.get_string_opt('conn_config', '') + sess_config = self.get_string_opt('sess_config', '') + create = self.get_boolean_opt('create', True) + reopen_connection = self.get_boolean_opt('reopen_connection', False) + readonly = self.get_boolean_opt('readonly', False) + close_conn = self.get_boolean_opt('close_conn', True) + compression = self.get_string_opt('compression', '') + self.get_int_opt('table_count', 1) + self.get_string_opt('table_config', '') + self.get_int_opt('key_sz', 20) + self.get_int_opt('value_sz', 100) + self.get_int_opt('icount', 0) + 
self.get_int_opt('populate_threads', 1) + self.get_int_opt('populate_ops_per_txn', 0) + self.get_boolean_opt('range_partition', False) + self.get_int_opt('random_range', 0) + self.get_boolean_opt('random_value', False) + self.get_string_opt('transaction_config', '') + self.get_boolean_opt('compact', False) + self.get_int_opt('async_threads', 0) + self.get_int_opt('pareto', 0) + opts = self.options + if opts.range_partition and opts.random_range == 0: + self.fatal_error('range_partition requires random_range to be set') + if opts.random_range > 0 and not opts.range_partition and \ + opts.table_count != 1: + self.fatal_error('random_range and multiple tables without ' + \ + 'range_partition is not supported') s = '#/usr/bin/env python\n' s += '# generated from ' + self.filename + '\n' @@ -307,93 +510,75 @@ class Translator: s += 'from wiredtiger import *\n' s += 'from workgen import *\n' s += '\n' + async_config = '' + if opts.compact and opts.async_threads == 0: + opts.async_threads = 2; + if opts.async_threads > 0: + # Assume the default of 1024 for the max ops, although we + # could bump that up to 4096 if needed. 
+ async_config = ',async=(enabled=true,threads=' + \ + str(opts.async_threads) + ')' + s += '# this can be further customized\n' + s += 'class WtperfAsyncCallback(AsyncCallback):\n' + s += ' def __init__(self):\n' + s += ' pass\n' + s += ' def notify_error(self, key, value, optype, desc):\n' + s += ' print("ERROR: async notify(" + str(key) + "," + \\\n' + s += ' str(value) + "," + str(optype) + "): " + desc)\n' + s += ' def notify(self, op, op_ret, flags):\n' + s += ' if op_ret != 0:\n' + s += ' self.notify_error(op._key, op._value,\\\n' + s += ' op._optype, wiredtiger_strerror(op_ret))\n' + s += ' return op_ret\n' + s += '\n' s += 'context = Context()\n' - s += 'conn_config = "' + conn_config + '"\n' + extra_config = '' + s += 'conn_config = ""\n' + + if async_config != '': + s += 'conn_config += ",' + async_config + '" # async config\n' + if conn_config != '': + s += 'conn_config += ",' + conn_config + '" # explicitly added\n' if compression != '': s += 'conn_config += extensions_config(["compressors/' + \ - compression + '"])\n' + compression + '"])\n' compression = 'block_compressor=' + compression + ',' - s += 'conn = wiredtiger_open("WT_TEST", "create," + conn_config)\n' - s += 's = conn.open_session()\n' + s += 'conn = wiredtiger_open("' + self.homedir + \ + '", "create," + conn_config)\n' + s += 's = conn.open_session("' + sess_config + '")\n' s += '\n' - s += 'wtperf_table_config = "key_format=S,value_format=S,type=lsm," +\\\n' - s += ' "exclusive=true,allocation_size=4kb," +\\\n' - s += ' "internal_page_max=64kb,leaf_page_max=4kb,split_pct=100,"\n' - s += 'compress_table_config = "' + compression + '"\n' - s += 'table_config = "' + table_config + '"\n' - if table_count == 1: - s += 'tname = "file:test.wt"\n' - s += 's.create(tname, wtperf_table_config +\\\n' - s += ' compress_table_config + table_config)\n' - s += 'table = Table(tname)\n' - s += 'table.options.key_size = ' + str(key_sz) + '\n' - s += 'table.options.value_size = ' + str(value_sz) + '\n' - 
else: - s += 'table_count = ' + str(table_count) + '\n' - s += 'tables = []\n' - s += 'for i in range(0, table_count):\n' - s += ' tname = "file:test" + str(i) + ".wt"\n' - s += ' s.create(tname, ' + \ - 'wtperf_table_config + ' + \ - 'compress_table_config + table_config)\n' - s += ' t = Table(tname)\n' - s += ' t.options.key_size = ' + str(key_sz) + '\n' - s += ' t.options.value_size = ' + str(value_sz) + '\n' - s += ' tables.append(t)\n' - s += '\n' - - icount = self.get_int_opt('icount', 0) - pop_thread = self.get_int_opt('populate_threads', 1) - pop_per_txn = self.get_int_opt('populate_ops_per_txn', 0) - if icount != 0: - if pop_thread == 0: - self.fatal_error('icount != 0 and populate_threads == 0: ' +\ - 'cannot populate entries with no threads') - elif pop_thread == 1: - mult = '' - else: - mult = str(pop_thread) + ' * ' - - # if there are multiple tables to be filled during populate, - # the icount is split between them all. - nops_per_thread = icount / (pop_thread * table_count) - if table_count == 1: - s += 'pop_ops = Operation(Operation.OP_INSERT, table)\n' - else: - s += 'pop_ops = Operation(Operation.OP_INSERT, tables[0])\n' - s += 'pop_ops = op_multi_table(pop_ops, tables)\n' - if pop_per_txn > 0: - s += 'pop_ops = op_group_transaction(pop_ops, ' + \ - str(pop_per_txn) + ', "' + txn_config + '")\n' - s += 'pop_thread = Thread(pop_ops * ' + str(nops_per_thread) + ')\n' - s += 'pop_workload = Workload(context, ' + mult + 'pop_thread)\n' - if self.verbose > 0: - s += 'print("populate:")\n' - s += 'pop_workload.run(conn)\n' - else: - if self.get_int_opt('populate_threads', 0) != 0: - self.error("populate_threads > 0, icount == 0") + s += self.translate_table_create() + if create: + s += self.translate_populate() - thread_config = self.get_opt('threads', '') + thread_config = self.get_string_opt('threads', '') if thread_config != '': (t_create, t_var) = self.parse_threads(thread_config) s += '\n' + t_create - if reopen: + if reopen_connection: s += '\n# 
reopen the connection\n' s += 'conn.close()\n' + if readonly: + 'conn_config += ",readonly=true"\n' s += 'conn = wiredtiger_open(' + \ - '"WT_TEST", "create," + conn_config)\n' + '"' + self.homedir + '", "create," + conn_config)\n' s += '\n' s += 'workload = Workload(context, ' + t_var + ')\n' s += workloadopts if self.verbose > 0: s += 'print("workload:")\n' - s += 'workload.run(conn)\n' - - for o in self.used_opts: - del self.opts[o] - if len(self.opts) != 0: - self.error('internal error, options not handled: ' + str(self.opts)) + s += 'workload.run(conn)\n\n' + s += 'latency_filename = "' + self.homedir + '/latency.out"\n' + s += 'latency.workload_latency(workload, latency_filename)\n' + + if close_conn: + s += 'conn.close()\n' + + for o in self.opts_used: + del self.opts_map[o] + if len(self.opts_map) != 0: + self.error('internal error, options not handled: ' + + str(self.opts_map)) return s def usage(): @@ -416,13 +601,17 @@ prefix = ( 'sys.path.append("' + runner_dir + '")\n\n') exit_status = 0 +homedir = 'WT_TEST' for arg in sys.argv[1:]: - if arg == '--python': + if arg == '--pydebug': + import pdb + pdb.set_trace() + elif arg == '--python': py_out = True elif arg == '--verbose' or arg == '-v': verbose += 1 elif arg.endswith('.wtperf'): - translator = Translator(arg, prefix, verbose) + translator = Translator(arg, prefix, verbose, homedir) pysrc = translator.translate() if translator.has_error: exit_status = 1 @@ -432,8 +621,20 @@ for arg in sys.argv[1:]: (outfd, tmpfile) = tempfile.mkstemp(suffix='.py') os.write(outfd, pysrc) os.close(outfd) - execfile(tmpfile) + # We make a copy of the configuration file in the home + # directory after the run, because the wiredtiger_open + # in the generated code will clean out the directory first. 
+ raised = None + try: + execfile(tmpfile) + except Exception, exception: + raised = exception + if not os.path.isdir(homedir): + os.makedirs(homedir) + translator.copy_config() os.remove(tmpfile) + if raised != None: + raise raised else: usage() sys.exit(1) diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf index de5299bbac1..8b56a86e022 100644 --- a/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf +++ b/src/third_party/wiredtiger/bench/wtperf/runners/500m-btree-80r20u.wtperf @@ -11,7 +11,7 @@ compression="snappy" # close_conn as false allows this test to close/finish faster, but if running # as the set, the next test will need to run recovery. close_conn=false -sess_config="isolation=snapshot +sess_config="isolation=snapshot" table_count=2 key_sz=40 value_sz=120 diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index dbd3dcbb233..32faec8709d 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -131,6 +131,20 @@ file_runtime_config = [ do not ever evict the object's pages from cache. Not compatible with LSM tables; see @ref tuning_cache_resident for more information''', type='boolean'), + Config('assert', '', r''' + enable enhanced checking. ''', + type='category', subconfig= [ + Config('commit_timestamp', 'none', r''' + verify that timestamps should 'always' or 'never' be used + on modifications with this table. Verification is 'none' + if mixed update use is allowed.''', + choices=['always','never','none']), + Config('read_timestamp', 'none', r''' + verify that timestamps should 'always' or 'never' be used + on reads with this table. Verification is 'none' + if mixed read use is allowed.''', + choices=['always','never','none']) + ], undoc=True), Config('log', '', r''' the transaction log configuration for this object. 
Only valid if log is enabled in ::wiredtiger_open''', diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index 8c0448b27c1..df897bcb91e 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -23,6 +23,7 @@ flags = { ], 'page_read' : [ 'READ_CACHE', + 'READ_LOOKASIDE', 'READ_NOTFOUND_OK', 'READ_NO_EMPTY', 'READ_NO_EVICT', @@ -35,14 +36,15 @@ flags = { 'READ_WONT_NEED', ], 'rec_write' : [ - 'CHECKPOINTING', - 'EVICTING', - 'EVICT_IN_MEMORY', - 'EVICT_INMEM_SPLIT', - 'EVICT_LOOKASIDE', - 'EVICT_SCRUB', - 'EVICT_UPDATE_RESTORE', - 'VISIBILITY_ERR', + 'REC_CHECKPOINT', + 'REC_EVICT', + 'REC_INMEM_SPLIT', + 'REC_IN_MEMORY', + 'REC_LOOKASIDE', + 'REC_SCRUB', + 'REC_UPDATE_RESTORE', + 'REC_VISIBILITY_ERR', + 'REC_VISIBLE_ALL', ], 'timing_stress_for_test' : [ 'TIMING_STRESS_CHECKPOINT_SLOW', @@ -102,6 +104,7 @@ flags = { 'CONN_CKPT_SYNC', 'CONN_CLOSING', 'CONN_CLOSING_NO_MORE_OPENS', + 'CONN_EVICTION_NO_LOOKASIDE', 'CONN_EVICTION_RUN', 'CONN_IN_MEMORY', 'CONN_LAS_OPEN', diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index dcaf975434f..b2f6cbec43e 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -58,6 +58,7 @@ WT_STAT_INCRV_BASE WT_STAT_WRITE WT_TIMEDIFF_US WT_TRET_ERROR_OK +WT_TXN_TIMESTAMP_FLAG_CHECK WT_UPDATE_SIZE WT_WITH_LOCK_NOWAIT WT_WITH_LOCK_WAIT diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index 06e7dccd943..24610b9ab14 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -254,6 +254,7 @@ connection_stats = [ CacheStat('cache_hazard_walks', 'hazard pointer check entries walked'), CacheStat('cache_inmem_split', 'in-memory page splits'), CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'), + 
CacheStat('cache_lookaside_entries', 'lookaside table entries', 'no_clear,no_scale'), CacheStat('cache_lookaside_insert', 'lookaside table insert calls'), CacheStat('cache_lookaside_remove', 'lookaside table remove calls'), CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'), diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index a0c6f87ceda..dcd9dd406df 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -209,9 +209,9 @@ cursor_ops(WT_SESSION *session) value.size = strlen("another value"); cursor->set_value(cursor, &value); /*! [Set the cursor's raw value] */ - } error_check(cursor->insert(cursor)); + } /*! [Return the next record] */ error_check(cursor->next(cursor)); diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 3ed326b1854..6c4f2ee7138 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "6f561957cb5606f504f9fe5a124c80386b210b1a", + "commit": "0cd3d5bbd8a5c8779f1129c6754b4463403e788f", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c index a39d50e68c1..6f4050b3eb6 100644 --- a/src/third_party/wiredtiger/src/bloom/bloom.c +++ b/src/third_party/wiredtiger/src/bloom/bloom.c @@ -274,6 +274,7 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash) WT_ASSERT(bloom->session, bloom->bitstring == NULL); /* Create a cursor on the first time through. */ + c = NULL; WT_ERR(__bloom_open_cursor(bloom, NULL)); c = bloom->c; @@ -301,6 +302,8 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash) err: /* Don't return WT_NOTFOUND from a failed search. 
*/ if (ret == WT_NOTFOUND) ret = WT_ERROR; + if (c != NULL) + (void)c->reset(c); __wt_err(bloom->session, ret, "Failed lookup in bloom filter"); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 51882a7e466..ee800ca80ee 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -334,7 +334,7 @@ __cursor_col_search( WT_DECL_RET; WT_WITH_PAGE_INDEX(session, - ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt)); + ret = __wt_col_search(session, cbt->iface.recno, leaf, cbt, false)); return (ret); } @@ -348,8 +348,8 @@ __cursor_row_search( { WT_DECL_RET; - WT_WITH_PAGE_INDEX(session, - ret = __wt_row_search(session, &cbt->iface.key, leaf, cbt, insert)); + WT_WITH_PAGE_INDEX(session, ret = __wt_row_search( + session, &cbt->iface.key, leaf, cbt, insert, false)); return (ret); } @@ -445,6 +445,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); + WT_RET(__wt_txn_search_check(session)); __cursor_state_save(cursor, &state); /* @@ -534,6 +535,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_STAT_CONN_INCR(session, cursor_search_near); WT_STAT_DATA_INCR(session, cursor_search_near); + WT_RET(__wt_txn_search_check(session)); __cursor_state_save(cursor, &state); /* diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 778adcc3dfd..f0388bd1f07 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -1124,6 +1124,9 @@ __debug_ref(WT_DBG *ds, WT_REF *ref) case WT_REF_LOCKED: state = "locked"; break; + case WT_REF_LOOKASIDE: + state = "lookaside"; + break; case WT_REF_MEM: state = "memory"; break; diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 
093192dbaa0..20e592d12bc 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -85,12 +85,6 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. - * - * Possible optimization: if the page is already deleted and the delete - * is visible to us (the delete has been committed), we could skip the - * page instead of instantiating it and figuring out there are no rows - * in the page. While that's a huge amount of work to no purpose, it's - * unclear optimizing for overlapping range deletes is worth the effort. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) @@ -164,6 +158,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) for (sleep_count = yield_count = 0;;) { switch (ref->state) { case WT_REF_DISK: + case WT_REF_LOOKASIDE: case WT_REF_READING: WT_ASSERT(session, 0); /* Impossible, assert */ break; diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 806a9770057..1aae991a407 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -316,8 +316,14 @@ __wt_free_ref( */ __wt_ref_addr_free(session, ref); - /* Free any page-deleted information. */ - if (ref->page_del != NULL) { + /* + * Free any lookaside or page-deleted information. We only expect a + * lookaside structure for lookaside references, but can see + * page-deleted information in other cases (such as WT_REF_MEM). 
+ */ + if (ref->state == WT_REF_LOOKASIDE) + __wt_free(session, ref->page_las); + else if (ref->page_del != NULL) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index fd52c53861a..4ab88cea01e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -398,6 +398,29 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) else btree->checksum = CKSUM_UNCOMPRESSED; + /* Debugging information */ + WT_RET(__wt_config_gets(session, + cfg, "assert.commit_timestamp", &cval)); + if (WT_STRING_MATCH("always", cval.str, cval.len)) { + FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS); + FLD_CLR(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER); + } else if (WT_STRING_MATCH("never", cval.str, cval.len)) { + FLD_SET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER); + FLD_CLR(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS); + } else + FLD_CLR(btree->assert_flags, + WT_ASSERT_COMMIT_TS_ALWAYS | WT_ASSERT_COMMIT_TS_NEVER); + WT_RET(__wt_config_gets(session, cfg, "assert.read_timestamp", &cval)); + if (WT_STRING_MATCH("always", cval.str, cval.len)) { + FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS); + FLD_CLR(btree->assert_flags, WT_ASSERT_READ_TS_NEVER); + } else if (WT_STRING_MATCH("never", cval.str, cval.len)) { + FLD_SET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER); + FLD_CLR(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS); + } else + FLD_CLR(btree->assert_flags, + WT_ASSERT_READ_TS_ALWAYS | WT_ASSERT_READ_TS_NEVER); + /* Huffman encoding */ WT_RET(__wt_btree_huffman_open(session)); @@ -549,7 +572,7 @@ __wt_btree_tree_open( * the allocated copy of the disk image on return, the in-memory object * steals it. */ - WT_ERR(__wt_page_inmem(session, NULL, dsk.data, dsk.memsize, + WT_ERR(__wt_page_inmem(session, NULL, dsk.data, WT_DATA_IN_ITEM(&dsk) ? 
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); dsk.mem = NULL; diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index ebd0eb0cb71..d65073a398f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -147,7 +147,7 @@ err: __wt_scr_free(session, &tmp); */ int __wt_ovfl_remove(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) + WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting) { /* * This function solves two problems in reconciliation. @@ -188,7 +188,7 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, * We only have to do this for checkpoints: in any eviction mode, there * can't be threads sitting in our update lists. */ - if (checkpoint) + if (!evicting) WT_RET(__ovfl_cache(session, page, unpack)); /* diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 5316b19a41e..d3df9f6bf78 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -127,8 +127,8 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { * Build in-memory page information. */ int -__wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, - const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) +__wt_page_inmem(WT_SESSION_IMPL *session, + WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) { WT_DECL_RET; WT_PAGE *page; @@ -196,8 +196,13 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, * Track the memory allocated to build this page so we can update the * cache statistics in a single call. If the disk image is in allocated * memory, start with that. 
+ * + * Accounting is based on the page-header's in-memory disk size instead + * of the buffer memory used to instantiate the page image even though + * the values might not match exactly, because that's the only value we + * have when discarding the page image and accounting needs to match. */ - size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? memsize : 0; + size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0; switch (page->type) { case WT_PAGE_COL_FIX: @@ -218,9 +223,10 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, WT_ILLEGAL_VALUE_ERR(session); } - /* Update the page's in-memory size and the cache statistics. */ + /* Update the page's cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); - __wt_cache_page_image_incr(session, dsk->mem_size); + if (LF_ISSET(WT_PAGE_DISK_ALLOC)) + __wt_cache_page_image_incr(session, dsk->mem_size); /* Link the new internal page to the parent. */ if (ref != NULL) { diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index f28c4e10594..268b040bd6e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -231,15 +231,17 @@ restart: /* for (i = 0; i < entries; ++i) { descent = pindex->index[__wt_random(&session->rnd) % entries]; - if (descent->state == WT_REF_MEM || - descent->state == WT_REF_DISK) + if (descent->state == WT_REF_DISK || + descent->state == WT_REF_LOOKASIDE || + descent->state == WT_REF_MEM) break; } if (i == entries) for (i = 0; i < entries; ++i) { descent = pindex->index[i]; - if (descent->state == WT_REF_MEM || - descent->state == WT_REF_DISK) + if (descent->state == WT_REF_DISK || + descent->state == WT_REF_LOOKASIDE || + descent->state == WT_REF_MEM) break; } if (i == entries || descent == NULL) { diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index edab3c8c217..ab8a8d7916b 100644 --- 
a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -8,72 +8,8 @@ #include "wt_internal.h" -static void __btree_verbose_lookaside_read(WT_SESSION_IMPL *); - -/* - * __wt_las_remove_block -- - * Remove all records matching a key prefix from the lookaside store. - */ -int -__wt_las_remove_block(WT_SESSION_IMPL *session, - WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) -{ - WT_DECL_RET; - WT_ITEM las_addr, las_key, las_timestamp; - uint64_t las_counter, las_txnid, remove_cnt; - uint32_t las_id; - int exact; - - remove_cnt = 0; - - /* - * Search for the block's unique prefix and step through all matching - * records, removing them. - */ - las_addr.data = addr; - las_addr.size = addr_size; - las_key.size = 0; - las_timestamp.size = 0; - cursor->set_key(cursor, btree_id, &las_addr, - (uint64_t)0, (uint32_t)0, &las_timestamp, &las_key); - if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) - ret = cursor->next(cursor); - for (; ret == 0; ret = cursor->next(cursor)) { - WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter, - &las_txnid, &las_timestamp, &las_key)); - - /* - * Confirm the search using the unique prefix; if not a match, - * we're done searching for records for this page. - */ - if (las_id != btree_id || - las_addr.size != addr_size || - memcmp(las_addr.data, addr, addr_size) != 0) - break; - - /* - * Cursor opened overwrite=true: won't return WT_NOTFOUND should - * another thread remove the record before we do, and the cursor - * remains positioned in that case. - */ - WT_ERR(cursor->remove(cursor)); - ++remove_cnt; - } - WT_ERR_NOTFOUND_OK(ret); - -err: /* - * If there were races to remove records, we can over-count. All - * arithmetic is signed, so underflow isn't fatal, but check anyway so - * we don't skew low over time. 
- */ - if (remove_cnt > S2C(session)->las_record_cnt) - S2C(session)->las_record_cnt = 0; - else if (remove_cnt > 0) - (void)__wt_atomic_sub64( - &S2C(session)->las_record_cnt, remove_cnt); - - return (ret); -} +static void __btree_verbose_lookaside_read( + WT_SESSION_IMPL *, uint32_t, uint64_t); /* * __col_instantiate -- @@ -88,13 +24,17 @@ __col_instantiate(WT_SESSION_IMPL *session, page = ref->page; - /* Discard any of the updates we don't need. */ + /* + * Discard any of the updates we don't need. + * + * Just free the memory: it hasn't been accounted for on the page yet. + */ if (updlist->next != NULL && (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL) - __wt_update_obsolete_free(session, page, upd); + __wt_free_update_list(session, upd); /* Search the page and add updates. */ - WT_RET(__wt_col_search(session, recno, ref, cbt)); + WT_RET(__wt_col_search(session, recno, ref, cbt, true)); WT_RET(__wt_col_modify( session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); @@ -113,13 +53,17 @@ __row_instantiate(WT_SESSION_IMPL *session, page = ref->page; - /* Discard any of the updates we don't need. */ + /* + * Discard any of the updates we don't need. + * + * Just free the memory: it hasn't been accounted for on the page yet. + */ if (updlist->next != NULL && (upd = __wt_update_obsolete_check(session, page, updlist)) != NULL) - __wt_update_obsolete_free(session, page, upd); + __wt_free_update_list(session, upd); /* Search the page and add updates. */ - WT_RET(__wt_row_search(session, key, ref, cbt, true)); + WT_RET(__wt_row_search(session, key, ref, cbt, true, true)); WT_RET(__wt_row_modify( session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); @@ -130,23 +74,21 @@ __row_instantiate(WT_SESSION_IMPL *session, * Instantiate lookaside update records in a recently read page. 
*/ static int -__las_page_instantiate(WT_SESSION_IMPL *session, - WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) +__las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_RET; - WT_DECL_TIMESTAMP(timestamp) - WT_ITEM las_addr, las_key, las_timestamp, las_value; + WT_ITEM las_key, las_timestamp, las_value; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; - uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; + uint64_t current_recno, las_counter, las_pageid, las_txnid, recno; uint32_t las_id, session_flags; + const uint8_t *p; uint8_t upd_type; int exact; - const uint8_t *p; cursor = NULL; page = ref->page; @@ -174,47 +116,29 @@ __las_page_instantiate(WT_SESSION_IMPL *session, * Search for the block's unique prefix, stepping through any matching * records. */ - las_addr.data = addr; - las_addr.size = addr_size; - las_timestamp.size = 0; - cursor->set_key(cursor, read_id, &las_addr, - (uint64_t)0, (uint32_t)0, &las_timestamp, &las_key); + cursor->set_key(cursor, + btree_id, ref->page_las->las_pageid, (uint64_t)0, &las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { - WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter, - &las_txnid, &las_timestamp, &las_key)); + WT_ERR(cursor->get_key(cursor, + &las_id, &las_pageid, &las_counter, &las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ - if (las_id != read_id || - las_addr.size != addr_size || - memcmp(las_addr.data, addr, addr_size) != 0) + if (las_id != btree_id || + las_pageid != ref->page_las->las_pageid) break; - /* - * If the on-page value has become globally visible, this record - * is no longer needed. 
- * - * Copy the timestamp from the cursor to avoid unaligned reads. - */ -#ifdef HAVE_TIMESTAMPS - WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); - memcpy(&timestamp, las_timestamp.data, las_timestamp.size); -#endif - if (__wt_txn_visible_all( - session, las_txnid, WT_TIMESTAMP_NULL(&timestamp))) - continue; - /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value(cursor, - &upd_txnid, &las_timestamp, &upd_type, &las_value)); + &las_txnid, &las_timestamp, &upd_type, &las_value)); WT_ERR(__wt_update_alloc( session, &las_value, &upd, &incr, upd_type)); total_incr += incr; - upd->txnid = upd_txnid; + upd->txnid = las_txnid; #ifdef HAVE_TIMESTAMPS WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); memcpy(&upd->timestamp, las_timestamp.data, las_timestamp.size); @@ -287,16 +211,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); - /* - * We've modified/dirtied the page, but that's not necessary and - * if we keep the page clean, it's easier to evict. We leave the - * lookaside table updates in place, so if we evict this page - * without dirtying it, any future instantiation of it will find - * the records it needs. If the page is dirtied before eviction, - * then we'll write any needed lookaside table records for the - * new location of the page. - */ - __wt_page_modify_clear(session, page); + /* Make sure the page is included in the next checkpoint. 
*/ + page->modify->first_dirty_txn = WT_TXN_FIRST; } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); @@ -384,12 +300,12 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) { struct timespec start, stop; WT_BTREE *btree; + WT_CURSOR *las_cursor; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; - const WT_PAGE_HEADER *dsk; size_t addr_size; - uint32_t previous_state; + uint32_t new_state, previous_state, session_flags; const uint8_t *addr; bool timer; @@ -404,26 +320,36 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) /* * Attempt to set the state to WT_REF_READING for normal reads, or - * WT_REF_LOCKED, for deleted pages. If successful, we've won the - * race, read the page. + * WT_REF_LOCKED, for deleted pages or pages with lookaside entries. + * If successful, we've won the race, read the page. */ - if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING)) - previous_state = WT_REF_DISK; - else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) - previous_state = WT_REF_DELETED; - else + switch (previous_state = ref->state) { + case WT_REF_DISK: + new_state = WT_REF_READING; + break; + case WT_REF_DELETED: + case WT_REF_LOOKASIDE: + new_state = WT_REF_LOCKED; + break; + default: + return (0); + } + if (!__wt_atomic_casv32(&ref->state, previous_state, new_state)) return (0); /* - * Get the address: if there is no address, the page was deleted, but a - * subsequent search or insert is forcing re-creation of the name space. + * Get the address: if there is no address, the page was deleted or had + * only lookaside entries, and a subsequent search or insert is forcing + * re-creation of the name space. 
*/ __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) { - WT_ASSERT(session, previous_state == WT_REF_DELETED); + WT_ASSERT(session, previous_state != WT_REF_DISK); WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; + if (previous_state == WT_REF_LOOKASIDE) + goto skip_read; goto done; } @@ -441,16 +367,18 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) WT_STAT_CONN_INCRV(session, cache_read_app_time, WT_TIMEDIFF_US(stop, start)); } - WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, - WT_DATA_IN_ITEM(&tmp) ? - WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); /* - * Clear the local reference to an allocated copy of the disk image on - * return; the page steals it, errors in this code should not free it. + * Build the in-memory version of the page. Clear our local reference to + * the allocated copy of the disk image on return, the in-memory object + * steals it. */ + WT_ERR(__wt_page_inmem(session, ref, tmp.data, + WT_DATA_IN_ITEM(&tmp) ? + WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); tmp.mem = NULL; +skip_read: /* * If reading for a checkpoint, there's no additional work to do, the * page on disk is correct as written. @@ -468,18 +396,31 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * We only care if the lookaside table is currently active, check that * before doing any work. 
*/ - dsk = tmp.data; - if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) { - __btree_verbose_lookaside_read(session); + if (previous_state == WT_REF_LOOKASIDE) { + WT_ASSERT(session, (ref->page->dsk == NULL || + F_ISSET(ref->page->dsk, WT_PAGE_LAS_UPDATE))); + + __btree_verbose_lookaside_read( + session, btree->id, ref->page_las->las_pageid); WT_STAT_CONN_INCR(session, cache_read_lookaside); WT_STAT_DATA_INCR(session, cache_read_lookaside); + WT_ERR(__las_page_instantiate(session, ref, btree->id)); - WT_ERR(__las_page_instantiate( - session, ref, btree->id, addr, addr_size)); + /* + * The page is instantiated so we no longer need the lookaside + * entries. Note that we are discarding updates so the page + * must be marked available even if these operations fail. + */ + __wt_las_cursor(session, &las_cursor, &session_flags); + WT_TRET(__wt_las_remove_block( + session, las_cursor, btree->id, ref->page_las->las_pageid)); + __wt_free(session, ref->page_las); + WT_TRET(__wt_las_cursor_close( + session, &las_cursor, session_flags)); } done: WT_PUBLISH(ref->state, WT_REF_MEM); - return (0); + return (ret); err: /* * If the function building an in-memory version of the page failed, @@ -512,7 +453,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_PAGE *page; uint64_t sleep_cnt, wait_cnt; int force_attempts; - bool busy, cache_work, evict_soon, stalled; + bool busy, cache_work, did_read, evict_soon, stalled; btree = S2BT(session); @@ -525,7 +466,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_STAT_DATA_INCR(session, cache_pages_requested); } - for (evict_soon = stalled = false, + for (did_read = evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: @@ -534,8 +475,26 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_DISK: - if (LF_ISSET(WT_READ_CACHE)) 
- return (WT_NOTFOUND); + case WT_REF_LOOKASIDE: + if (LF_ISSET(WT_READ_CACHE)) { + if (ref->state != WT_REF_LOOKASIDE) + return (WT_NOTFOUND); + if (!LF_ISSET(WT_READ_LOOKASIDE)) + return (WT_NOTFOUND); +#ifdef HAVE_TIMESTAMPS + /* + * Skip lookaside pages if reading as of a + * timestamp and all the updates are in the + * future. + */ + if (F_ISSET( + &session->txn, WT_TXN_HAS_TS_READ) && + __wt_timestamp_cmp( + &ref->page_las->min_timestamp, + &session->txn.read_timestamp) > 0) + return (WT_NOTFOUND); +#endif + } /* * The page isn't in memory, read it. If this thread is @@ -548,6 +507,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_RET(__page_read(session, ref)); /* + * We just read a page, don't evict it before we have a + * chance to use it. + */ + did_read = true; + + /* * If configured to not trash the cache, leave the page * generation unset, we'll set it before returning to * the oldest read generation, so the page is forcibly @@ -610,7 +575,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * the page's generation number. If eviction isn't being * done on this file, we're done. */ - if (LF_ISSET(WT_READ_NO_EVICT) || + if (did_read || LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || btree->evict_disabled > 0 || btree->lsm_primary) goto skip_evict; @@ -706,7 +671,8 @@ skip_evict: * performing a lookaside table read. 
*/ static void -__btree_verbose_lookaside_read(WT_SESSION_IMPL *session) +__btree_verbose_lookaside_read( + WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) { #ifdef HAVE_VERBOSE WT_CONNECTION_IMPL *conn; @@ -733,10 +699,14 @@ __btree_verbose_lookaside_read(WT_SESSION_IMPL *session) if (__wt_atomic_casv64(&conn->las_verb_gen_read, ckpt_gen_last, ckpt_gen_current)) { __wt_verbose(session, WT_VERB_LOOKASIDE, - "%s", "Read from lookaside file triggered."); + "Read from lookaside file triggered for " + "file ID %" PRIu32 ", page ID %" PRIu64, + las_id, las_pageid); } } #else WT_UNUSED(session); + WT_UNUSED(las_id); + WT_UNUSED(las_pageid); #endif } diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index 7f9693f22c0..c6d9253b2d3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -262,9 +262,12 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, * We need the first key from a leaf page. Leaf pages are relatively * complex (Huffman encoding, prefix compression, and so on), do the * work to instantiate the page and copy the first key to the buffer. + * + * Page flags are 0 because we aren't releasing the memory used to read + * the page into memory and we don't want page discard to free it. 
*/ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); - WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page)); + WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, &page)); ret = __wt_row_leaf_key_copy(session, page, &page->pg_row[0], key); __wt_page_out(session, &page); return (ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 4a43dd67ff6..e2da77348f0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -588,8 +588,12 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, * and copy the full keys, then free the page. We do this on * every leaf page, and if you need to speed up the salvage, * it's probably a great place to start. + * + * Page flags are 0 because we aren't releasing the memory used + * to read the page into memory and we don't want page discard + * to free it. */ - WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, 0, &page)); + WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page)); WT_ERR(__wt_row_leaf_key_copy(session, page, &page->pg_row[0], &trk->row_start)); WT_ERR(__wt_row_leaf_key_copy(session, @@ -1285,7 +1289,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); + WT_ERR(__wt_reconcile( + session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->pg_var = save_col_var; @@ -1735,10 +1740,13 @@ __slvg_row_trk_update_start( * Read and instantiate the WT_TRACK page (we don't have to verify the * page, nor do we have to be quiet on error, we've already read this * page successfully). + * + * Page flags are 0 because we aren't releasing the memory used to read + * the page into memory and we don't want page discard to free it. 
*/ WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk)); WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size)); - WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, 0, &page)); + WT_ERR(__wt_page_inmem(session, NULL, dsk->data, 0, &page)); /* * Walk the page, looking for a key sorting greater than the specified @@ -1998,7 +2006,8 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); + WT_ERR(__wt_reconcile( + session, ref, cookie, WT_REC_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->entries += skip_stop; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 1e76deb66d7..884ee9b5c8b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1385,10 +1385,12 @@ __split_multi_inmem( WT_DECL_RET; WT_PAGE *page; WT_SAVE_UPD *supd; - WT_UPDATE *upd; + WT_UPDATE *prev_upd, *upd; uint64_t recno; uint32_t i, slot; + WT_ASSERT(session, multi->las_pageid == 0); + /* * In 04/2016, we removed column-store record numbers from the WT_PAGE * structure, leading to hard-to-debug problems because we corrupt the @@ -1409,9 +1411,8 @@ __split_multi_inmem( * when discarding the original page, and our caller will discard the * allocated page on error, when discarding the allocated WT_REF. */ - WT_RET(__wt_page_inmem(session, ref, - multi->disk_image, ((WT_PAGE_HEADER *)multi->disk_image)->mem_size, - WT_PAGE_DISK_ALLOC, &page)); + WT_RET(__wt_page_inmem( + session, ref, multi->disk_image, WT_PAGE_DISK_ALLOC, &page)); multi->disk_image = NULL; /* @@ -1434,7 +1435,7 @@ __split_multi_inmem( __wt_btcur_open(&cbt); /* Re-create each modification we couldn't write. 
*/ - for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) + for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) { switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: @@ -1443,7 +1444,8 @@ __split_multi_inmem( recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ - WT_ERR(__wt_col_search(session, recno, ref, &cbt)); + WT_ERR(__wt_col_search( + session, recno, ref, &cbt, true)); /* Apply the modification. */ WT_ERR(__wt_col_modify(session, &cbt, @@ -1465,7 +1467,8 @@ __split_multi_inmem( } /* Search the page. */ - WT_ERR(__wt_row_search(session, key, ref, &cbt, true)); + WT_ERR(__wt_row_search( + session, key, ref, &cbt, true, true)); /* Apply the modification. */ WT_ERR(__wt_row_modify(session, @@ -1474,6 +1477,37 @@ __split_multi_inmem( WT_ILLEGAL_VALUE_ERR(session); } + /* + * Discard the update used to create the on-page disk image. + * This is not just a performance issue: if the update used to + * create the value for this on-page disk image was a modify, + * and it was applied to the previous on-page value to + * determine a value to write to this disk image, that update + * cannot be applied to the new on-page value without risking + * corruption. + */ + if (supd->onpage_upd != NULL) { + for (prev_upd = upd; prev_upd != NULL && + prev_upd->next != supd->onpage_upd; + prev_upd = prev_upd->next) + ; + /* + * If the on-page update was in fact a tombstone, there + * will be no value on the page. Don't throw the + * tombstone away: we may need it to correctly resolve + * modifications. + */ + if (supd->onpage_upd->type == WT_UPDATE_DELETED && + prev_upd != NULL) + prev_upd = prev_upd->next; + if (prev_upd != NULL) { + __wt_update_obsolete_free( + session, page, prev_upd->next); + prev_upd->next = NULL; + } + } + } + /* * When modifying the page we set the first dirty transaction to the * last transaction currently running. 
However, the updates we made @@ -1620,7 +1654,16 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, addr->type = multi->addr.type; WT_RET(__wt_memdup(session, multi->addr.addr, addr->size, &addr->addr)); - ref->state = WT_REF_DISK; + if (multi->las_pageid != 0) { + WT_RET(__wt_calloc_one(session, &ref->page_las)); + ref->page_las->las_pageid = multi->las_pageid; +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set(&ref->page_las->min_timestamp, + &multi->las_min_timestamp); +#endif + ref->state = WT_REF_LOOKASIDE; + } else + ref->state = WT_REF_DISK; } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 75f1c6ef930..02ff0a1a4be 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -62,6 +62,81 @@ __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __sync_dup_walk -- + * Duplicate a tree walk point. + */ +static inline int +__sync_dup_walk( + WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp) +{ + WT_REF *old; + bool busy; + + if ((old = *dupp) != NULL) { + *dupp = NULL; + WT_RET(__wt_page_release(session, old, flags)); + } + + /* It is okay to duplicate a walk before it starts. */ + if (walk == NULL || __wt_ref_is_root(walk)) { + *dupp = walk; + return (0); + } + + /* Get a duplicate hazard pointer. */ + for (;;) { +#ifdef HAVE_DIAGNOSTIC + WT_RET( + __wt_hazard_set(session, walk, &busy, __func__, __LINE__)); +#else + WT_RET(__wt_hazard_set(session, walk, &busy)); +#endif + /* + * We already have a hazard pointer, we should generally be able + * to get another one. We can get spurious busy errors (e.g., if + * eviction is attempting to lock the page. Keep trying: we have + * one hazard pointer so we should be able to get another one. + */ + if (!busy) + break; + __wt_yield(); + } + + *dupp = walk; + return (0); +} + +/* + * __sync_evict_page -- + * Attempt to evict a page during a checkpoint walk. 
+ */ +static int +__sync_evict_page(WT_SESSION_IMPL *session, WT_REF **walkp, uint32_t flags) +{ + WT_DECL_RET; + WT_REF *next, *to_evict; + + to_evict = *walkp; + next = NULL; + + /* + * Get the ref after the page we're trying to evicting. If the + * eviction is successful, the walk will continue from here. + */ + WT_RET(__sync_dup_walk(session, to_evict, flags, &next)); + WT_ERR(__wt_tree_walk(session, &next, flags)); + + WT_ERR(__wt_page_release_evict(session, to_evict)); + + /* Success: continue the walk at the next page. */ + *walkp = next; + return (0); + +err: WT_TRET(__wt_page_release(session, next, flags)); + return (ret); +} + +/* * __sync_file -- * Flush pages for a specific file. */ @@ -73,22 +148,23 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_REF *walk; + WT_REF *prev, *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id; uint32_t flags; - bool timer; + bool evict_failed, skip_walk, timer; conn = S2C(session); btree = S2BT(session); - walk = NULL; + prev = walk = NULL; txn = &session->txn; - saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; - flags = WT_READ_CACHE | WT_READ_NO_GEN; + evict_failed = skip_walk = false; + flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; + saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) __wt_epoch(session, &start); @@ -119,8 +195,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ oldest_id = __wt_txn_oldest_id(session); - flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; - for (walk = NULL;;) { + LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL); + for (;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; @@ -139,7 +215,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) leaf_bytes += 
page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, - walk, NULL, WT_CHECKPOINTING, NULL)); + walk, NULL, WT_REC_CHECKPOINT, NULL)); } } break; @@ -184,9 +260,19 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) btree->checkpointing = WT_CKPT_RUNNING; /* Write all dirty in-cache pages. */ - flags |= WT_READ_NO_EVICT; - for (walk = NULL;;) { - WT_ERR(__wt_tree_walk(session, &walk, flags)); + LF_SET(WT_READ_NO_EVICT); + + /* Read pages with lookaside entries and evict them asap. */ + LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED); + + for (;;) { + if (!skip_walk) { + WT_ERR(__sync_dup_walk( + session, walk, flags, &prev)); + WT_ERR(__wt_tree_walk(session, &walk, flags)); + } + skip_walk = false; + if (walk == NULL) break; @@ -221,8 +307,39 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) leaf_bytes += page->memory_footprint; ++leaf_pages; } + + /* + * If the page needs forced eviction, try to do that + * now. + * + * For eviction to have a chance, we first need to move + * the walk point to the next page checkpoint will + * visit. We want to avoid this code being too special + * purpose, so try to reuse the ordinary eviction path. + * + * If eviction succeeded, it steps to the next ref, so + * we have to skip the next walk. If eviction fails, + * remember so we don't retry it. + */ + if (!WT_PAGE_IS_INTERNAL(page) && + page->read_gen == WT_READGEN_OLDEST && + !evict_failed) { + if ((ret = __sync_evict_page( + session, &walk, flags)) == 0) { + evict_failed = false; + skip_walk = true; + } else { + walk = prev; + prev = NULL; + evict_failed = true; + } + WT_ERR_BUSY_OK(ret); + continue; + } + + evict_failed = false; WT_ERR(__wt_reconcile( - session, walk, NULL, WT_CHECKPOINTING, NULL)); + session, walk, NULL, WT_REC_CHECKPOINT, NULL)); } break; case WT_SYNC_CLOSE: @@ -244,8 +361,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } err: /* On error, clear any left-over tree walk. 
*/ - if (walk != NULL) - WT_TRET(__wt_page_release(session, walk, flags)); + WT_TRET(__wt_page_release(session, walk, flags)); + WT_TRET(__wt_page_release(session, prev, flags)); /* * If we got a snapshot in order to write pages, and there was no diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index fcc2336a3e5..b68c6b9c5c6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -472,6 +472,11 @@ restart: /* if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; + + /* Skip lookaside pages if not requested. */ + if (ref->state == WT_REF_LOOKASIDE && + !LF_ISSET(WT_READ_LOOKASIDE)) + break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 78ee367dc69..10bc3894a0d 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -62,7 +62,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, */ int __wt_col_search(WT_SESSION_IMPL *session, - uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) + uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore) { WT_BTREE *btree; WT_COL *cip; @@ -90,16 +90,15 @@ __wt_col_search(WT_SESSION_IMPL *session, /* * We may be searching only a single leaf page, not the full tree. In - * the normal case where the page links to a parent, check the page's + * the normal case where we are searching a tree, check the page's * parent keys before doing the full search, it's faster when the - * cursor is being re-positioned. (One case where the page doesn't - * have a parent is if it is being re-instantiated in memory as part - * of a split). + * cursor is being re-positioned. Skip this if the page is being + * re-instantiated in memory. 
*/ if (leaf != NULL) { WT_ASSERT(session, search_recno != WT_RECNO_OOB); - if (leaf->home != NULL) { + if (!restore) { WT_RET(__check_leaf_key_range( session, recno, leaf, cbt)); if (cbt->compare != 0) { diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 3a9a6eb0f9b..16081e841dc 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -205,7 +205,8 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, */ int __wt_row_search(WT_SESSION_IMPL *session, - WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) + WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, + bool insert, bool restore) { WT_BTREE *btree; WT_COLLATOR *collator; @@ -250,14 +251,13 @@ __wt_row_search(WT_SESSION_IMPL *session, /* * We may be searching only a single leaf page, not the full tree. In - * the normal case where the page links to a parent, check the page's + * the normal case where we are searching a tree, check the page's * parent keys before doing the full search, it's faster when the - * cursor is being re-positioned. (One case where the page doesn't - * have a parent is if it is being re-instantiated in memory as part - * of a split). + * cursor is being re-positioned. Skip this if the page is being + * re-instantiated in memory. 
*/ if (leaf != NULL) { - if (leaf->home != NULL) { + if (!restore) { WT_RET(__check_leaf_key_range( session, srch_key, leaf, cbt)); if (cbt->compare != 0) { diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index e1e47b9eecb..d9a5dbc2096 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -89,17 +89,24 @@ __wt_las_create(WT_SESSION_IMPL *session) WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); /* + * Flag that the lookaside table has been created (before creating the + * connection's lookaside table session, it checks before creating a + * lookaside table cursor. + */ + F_SET(conn, WT_CONN_LAS_OPEN); + + /* * Open a shared internal session used to access the lookaside table. * This session should never be tapped for eviction. */ session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION; - WT_RET(__wt_open_internal_session( + WT_ERR(__wt_open_internal_session( conn, "lookaside table", true, session_flags, &conn->las_session)); - /* Flag that the lookaside table has been created. */ - F_SET(conn, WT_CONN_LAS_OPEN); - return (0); + +err: F_CLR(conn, WT_CONN_LAS_OPEN); + return (ret); } /* @@ -127,38 +134,6 @@ __wt_las_destroy(WT_SESSION_IMPL *session) } /* - * __wt_las_set_written -- - * Flag that the lookaside table has been written. - */ -void -__wt_las_set_written(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - if (!conn->las_written) { - conn->las_written = true; - - /* - * Future page reads must deal with lookaside table records. - * No write could be cached until a future read might matter, - * the barrier is more documentation than requirement. - */ - WT_FULL_BARRIER(); - } -} - -/* - * __wt_las_is_written -- - * Return if the lookaside table has been written. 
- */ -bool -__wt_las_is_written(WT_SESSION_IMPL *session) -{ - return (S2C(session)->las_written); -} - -/* * __wt_las_cursor_open -- * Open a new lookaside table cursor. */ @@ -280,129 +255,48 @@ __wt_las_cursor_close( } /* - * __wt_las_sweep -- - * Sweep the lookaside table. + * __wt_las_remove_block -- + * Remove all records matching a key prefix from the lookaside store. */ int -__wt_las_sweep(WT_SESSION_IMPL *session) +__wt_las_remove_block(WT_SESSION_IMPL *session, + WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) { - WT_CONNECTION_IMPL *conn; - WT_CURSOR *cursor; WT_DECL_RET; - WT_DECL_TIMESTAMP(timestamp) - WT_ITEM *key; - WT_ITEM las_addr, las_key, las_timestamp; - uint64_t cnt, las_counter, las_txnid, remove_cnt; - uint32_t las_id, session_flags; - int notused; + WT_ITEM las_key; + uint64_t las_counter, las_pageid, remove_cnt; + uint32_t las_id; + int exact; - conn = S2C(session); - cursor = NULL; - key = &conn->las_sweep_key; remove_cnt = 0; - session_flags = 0; /* [-Werror=maybe-uninitialized] */ - - __wt_las_cursor(session, &cursor, &session_flags); /* - * If we're not starting a new sweep, position the cursor using the key - * from the last call (we don't care if we're before or after the key, - * just roughly in the same spot is fine). + * Search for the block's unique prefix and step through all matching + * records, removing them. */ - if (key->size != 0) { - __wt_cursor_set_raw_key(cursor, key); - ret = cursor->search_near(cursor, &notused); - - /* - * Don't search for the same key twice; if we don't set a new - * key below, it's because we've reached the end of the table - * and we want the next pass to start at the beginning of the - * table. Searching for the same key could leave us stuck at - * the end of the table, repeatedly checking the same rows. - */ - key->size = 0; - if (ret != 0) - goto srch_notfound; - } - - /* - * The sweep server wakes up every 10 seconds (by default), it's a slow - * moving thread. 
Try to review the entire lookaside table once every 5 - * minutes, or every 30 calls. - * - * The reason is because the lookaside table exists because we're seeing - * cache/eviction pressure (it allows us to trade performance and disk - * space for cache space), and it's likely lookaside blocks are being - * evicted, and reading them back in doesn't help things. A trickier, - * but possibly better, alternative might be to review all lookaside - * blocks in the cache in order to get rid of them, and slowly review - * lookaside blocks that have already been evicted. - */ - cnt = WT_MAX(100, conn->las_record_cnt / 30); - - /* Discard pages we read as soon as we're done with them. */ - F_SET(session, WT_SESSION_NO_CACHE); + las_key.size = 0; + cursor->set_key(cursor, btree_id, pageid, (uint64_t)0, &las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, &las_pageid, &las_counter, &las_key)); - /* Walk the file. */ - for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { /* - * If the loop terminates after completing a work unit, we will - * continue the table sweep next time. Get a local copy of the - * sweep key, we're going to reset the cursor; do so before - * calling cursor.remove, cursor.remove can discard our hazard - * pointer and the page could be evicted from underneath us. + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. Note that + * page ID zero is special: it is a wild card indicating that + * all pages in the tree should be removed. 
*/ - if (cnt == 1) { - WT_ERR(__wt_cursor_get_raw_key(cursor, key)); - if (!WT_DATA_IN_ITEM(key)) - WT_ERR(__wt_buf_set( - session, key, key->data, key->size)); - } + if (las_id != btree_id || + (pageid != 0 && las_pageid != pageid)) + break; - /* - * Cursor opened overwrite=true: won't return WT_NOTFOUND should - * another thread remove the record before we do, and the cursor - * remains positioned in that case. - */ - WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter, - &las_txnid, &las_timestamp, &las_key)); - - /* - * If the on-page record transaction ID associated with the - * record is globally visible, the record can be discarded. - * - * Copy the timestamp from the cursor to avoid unaligned reads. - */ -#ifdef HAVE_TIMESTAMPS - WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); - memcpy(&timestamp, las_timestamp.data, las_timestamp.size); -#endif - if (__wt_txn_visible_all( - session, las_txnid, WT_TIMESTAMP_NULL(&timestamp))) { - WT_ERR(cursor->remove(cursor)); - ++remove_cnt; - } + WT_ERR(cursor->remove(cursor)); + ++remove_cnt; } - -srch_notfound: WT_ERR_NOTFOUND_OK(ret); - if (0) { -err: __wt_buf_free(session, key); - } - - WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - - /* - * If there were races to remove records, we can over-count. Underflow - * isn't fatal, but check anyway so we don't skew low over time. 
- */ - if (remove_cnt > conn->las_record_cnt) - conn->las_record_cnt = 0; - else if (remove_cnt > 0) - (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt); - - F_CLR(session, WT_SESSION_NO_CACHE); - +err: WT_STAT_CONN_DECRV(session, cache_lookaside_entries, remove_cnt); return (ret); } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index a16ba6ba28c..d7f4f6fe148 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -202,6 +202,16 @@ static const WT_CONFIG_CHECK confchk_WT_CURSOR_reconfigure[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_assert_subconfigs[] = { + { "commit_timestamp", "string", + NULL, "choices=[\"always\",\"never\",\"none\"]", + NULL, 0 }, + { "read_timestamp", "string", + NULL, "choices=[\"always\",\"never\",\"none\"]", + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_WT_SESSION_create_log_subconfigs[] = { { "enabled", "boolean", NULL, NULL, NULL, 0 }, @@ -212,6 +222,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_alter[] = { { "access_pattern_hint", "string", NULL, "choices=[\"none\",\"random\",\"sequential\"]", NULL, 0 }, + { "assert", "category", + NULL, NULL, + confchk_assert_subconfigs, 2 }, { "cache_resident", "boolean", NULL, NULL, NULL, 0 }, { "log", "category", NULL, NULL, @@ -285,6 +298,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { NULL, "min=512B,max=128MB", NULL, 0 }, { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "assert", "category", + NULL, NULL, + confchk_assert_subconfigs, 2 }, { "block_allocation", "string", NULL, "choices=[\"first\",\"best\"]", NULL, 0 }, @@ -470,6 +486,9 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { NULL, "min=512B,max=128MB", NULL, 0 }, { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "assert", "category", + NULL, NULL, + 
confchk_assert_subconfigs, 2 }, { "block_allocation", "string", NULL, "choices=[\"first\",\"best\"]", NULL, 0 }, @@ -531,6 +550,9 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { NULL, "min=512B,max=128MB", NULL, 0 }, { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "assert", "category", + NULL, NULL, + confchk_assert_subconfigs, 2 }, { "block_allocation", "string", NULL, "choices=[\"first\",\"best\"]", NULL, 0 }, @@ -612,6 +634,9 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { NULL, "min=512B,max=128MB", NULL, 0 }, { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "assert", "category", + NULL, NULL, + confchk_assert_subconfigs, 2 }, { "block_allocation", "string", NULL, "choices=[\"first\",\"best\"]", NULL, 0 }, @@ -1180,8 +1205,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { confchk_WT_CURSOR_reconfigure, 2 }, { "WT_SESSION.alter", - "access_pattern_hint=none,cache_resident=false,log=(enabled=true)", - confchk_WT_SESSION_alter, 3 + "access_pattern_hint=none,assert=(commit_timestamp=none," + "read_timestamp=none),cache_resident=false,log=(enabled=true)", + confchk_WT_SESSION_alter, 4 }, { "WT_SESSION.begin_transaction", "isolation=,name=,priority=0,read_timestamp=,snapshot=,sync=", @@ -1205,6 +1231,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "WT_SESSION.create", "access_pattern_hint=none,allocation_size=4KB,app_metadata=," + "assert=(commit_timestamp=none,read_timestamp=none)," "block_allocation=best,block_compressor=,cache_resident=false," "checksum=uncompressed,colgroups=,collator=,columns=,dictionary=0" ",encryption=(keyid=,name=),exclusive=false,extractor=," @@ -1220,7 +1247,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=90,type=file,value_format=u", - confchk_WT_SESSION_create, 42 + confchk_WT_SESSION_create, 43 }, { "WT_SESSION.drop", 
"checkpoint_wait=true,force=false,lock_wait=true," @@ -1307,6 +1334,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "file.config", "access_pattern_hint=none,allocation_size=4KB,app_metadata=," + "assert=(commit_timestamp=none,read_timestamp=none)," "block_allocation=best,block_compressor=,cache_resident=false," "checksum=uncompressed,collator=,columns=,dictionary=0," "encryption=(keyid=,name=),format=btree,huffman_key=," @@ -1318,10 +1346,11 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=90,value_format=u", - confchk_file_config, 35 + confchk_file_config, 36 }, { "file.meta", "access_pattern_hint=none,allocation_size=4KB,app_metadata=," + "assert=(commit_timestamp=none,read_timestamp=none)," "block_allocation=best,block_compressor=,cache_resident=false," "checkpoint=,checkpoint_lsn=,checksum=uncompressed,collator=," "columns=,dictionary=0,encryption=(keyid=,name=),format=btree," @@ -1334,7 +1363,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," "value_format=u,version=(major=0,minor=0)", - confchk_file_meta, 39 + confchk_file_meta, 40 }, { "index.meta", "app_metadata=,collator=,columns=,extractor=,immutable=false," @@ -1343,6 +1372,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "lsm.meta", "access_pattern_hint=none,allocation_size=4KB,app_metadata=," + "assert=(commit_timestamp=none,read_timestamp=none)," "block_allocation=best,block_compressor=,cache_resident=false," "checksum=uncompressed,chunks=,collator=,columns=,dictionary=0," "encryption=(keyid=,name=),format=btree,huffman_key=," @@ -1358,7 +1388,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," 
"prefix_compression_min=4,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=90,value_format=u", - confchk_lsm_meta, 39 + confchk_lsm_meta, 40 }, { "table.meta", "app_metadata=,colgroups=,collator=,columns=,key_format=u," diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 284e7e9883b..55251491129 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1084,6 +1084,9 @@ err: /* WT_TRET(wt_session->close(wt_session, config)); } + /* Shut down transactions (wait for in-flight operations to complete. */ + WT_TRET(__wt_txn_global_shutdown(session)); + /* * Perform a system-wide checkpoint so that all tables are consistent * with each other. All transactions are resolved but ignore diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c index 5515eb026ca..625350cf3e6 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache.c @@ -300,6 +300,11 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) "cache server: exiting with %" PRIu64 " pages in " "memory and %" PRIu64 " pages evicted", cache->pages_inmem, cache->pages_evict); + if (cache->bytes_image != 0) + __wt_errx(session, + "cache server: exiting with %" PRIu64 " image bytes in " + "memory", + cache->bytes_image); if (cache->bytes_inmem != 0) __wt_errx(session, "cache server: exiting with %" PRIu64 " bytes in memory", diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c index a47524af2d7..d968d4e4b2b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_ckpt.c +++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c @@ -161,8 +161,11 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) * * Checkpoint does enough I/O it may be called upon to perform slow * operations for the block manager. 
+ * + * The checkpoint thread reads the lookaside table for outdated records, + * it gets its own cursor for that purpose. */ - session_flags = WT_SESSION_CAN_WAIT; + session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_LOOKASIDE_CURSOR; WT_RET(__wt_open_internal_session(conn, "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 56a37cf16eb..2606c9d083b 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -774,13 +774,14 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) __wt_session_close_cache(session); /* - * Close open data handles: first, everything but the metadata file (as - * closing a normal file may open and write the metadata file), then - * the metadata file. + * Close open data handles: first, everything apart from metadata and + * lookaside (as closing a normal file may write metadata and read + * lookaside entries). Then close whatever is left open. */ restart: TAILQ_FOREACH(dhandle, &conn->dhqh, q) { - if (WT_IS_METADATA(dhandle)) + if (WT_IS_METADATA(dhandle) || + strcmp(dhandle->name, WT_LAS_URI) == 0) continue; WT_WITH_DHANDLE(session, dhandle, @@ -789,6 +790,9 @@ restart: goto restart; } + /* Shut down the lookaside table after all eviction is complete. */ + WT_TRET(__wt_las_destroy(session)); + /* * Closing the files may have resulted in entries on our default * session's list of open data handles, specifically, we added the @@ -807,7 +811,7 @@ restart: if (session->meta_cursor != NULL) WT_TRET(session->meta_cursor->close(session->meta_cursor)); - /* Close the metadata file handle. */ + /* Close the remaining handles. 
*/ WT_TAILQ_SAFE_REMOVE_BEGIN(dhandle, &conn->dhqh, q, dhandle_tmp) { WT_WITH_DHANDLE(session, dhandle, WT_TRET(__wt_conn_dhandle_discard_single( diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 2865dc9e2fa..e72fa5c00a4 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -75,9 +75,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) wt_conn = &conn->iface; session = conn->default_session; - /* Shut down transactions (wait for in-flight operations to complete. */ - WT_TRET(__wt_txn_global_shutdown(session)); - /* Shut down the subsystems, ensuring workers see the state change. */ F_SET(conn, WT_CONN_CLOSING); WT_FULL_BARRIER(); @@ -111,9 +108,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); - /* Shut down the lookaside table, after all eviction is complete. */ - WT_TRET(__wt_las_destroy(session)); - /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c index 008aa6c08d8..a164e34fe33 100644 --- a/src/third_party/wiredtiger/src/conn/conn_sweep.c +++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c @@ -278,12 +278,10 @@ __sweep_server(void *arg) WT_DECL_RET; WT_SESSION_IMPL *session; time_t now; - uint64_t last_las_sweep_id, oldest_id; u_int dead_handles; session = arg; conn = S2C(session); - last_las_sweep_id = WT_TXN_NONE; /* * Sweep for dead and excess handles. @@ -302,26 +300,6 @@ __sweep_server(void *arg) WT_STAT_CONN_INCR(session, dh_sweeps); /* - * Sweep the lookaside table. If the lookaside table hasn't yet - * been written, there's no work to do. - * - * Don't sweep the lookaside table if the cache is stuck full. - * The sweep uses the cache and can exacerbate the problem. 
- * If we try to sweep when the cache is full or we aren't - * making progress in eviction, sweeping can wind up constantly - * bringing in and evicting pages from the lookaside table, - * which will stop the cache from moving into the stuck state. - */ - if (__wt_las_is_written(session) && - !__wt_cache_stuck(session)) { - oldest_id = __wt_txn_oldest_id(session); - if (WT_TXNID_LT(last_las_sweep_id, oldest_id)) { - WT_ERR(__wt_las_sweep(session)); - last_las_sweep_id = oldest_id; - } - } - - /* * Mark handles with a time of death, and report whether any * handles are marked dead. If sweep_idle_time is 0, handles * never become idle. @@ -403,14 +381,9 @@ __wt_sweep_create(WT_SESSION_IMPL *session) * Handle sweep does enough I/O it may be called upon to perform slow * operations for the block manager. * - * The sweep thread sweeps the lookaside table for outdated records, - * it gets its own cursor for that purpose. - * * Don't tap the sweep thread for eviction. */ session_flags = WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION; - if (F_ISSET(conn, WT_CONN_LAS_OPEN)) - session_flags |= WT_SESSION_LOOKASIDE_CURSOR; WT_RET(__wt_open_internal_session( conn, "sweep-server", true, session_flags, &conn->sweep_session)); session = conn->sweep_session; @@ -453,8 +426,5 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) conn->sweep_session = NULL; } - /* Discard any saved lookaside key. 
*/ - __wt_buf_free(session, &conn->las_sweep_key); - return (ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 9aa93ade372..22ba6d1dee1 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -263,14 +263,20 @@ __wt_curfile_insert_check(WT_CURSOR *cursor) WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; + int tret; cbt = (WT_CURSOR_BTREE *)cursor; + tret = 0; CURSOR_UPDATE_API_CALL_BTREE(cursor, session, update, cbt->btree); WT_ERR(__cursor_checkkey(cursor)); - ret = __wt_btcur_insert_check(cbt); + tret = __wt_btcur_insert_check(cbt); + /* + * Detecting a conflict should not cause transaction error. + */ err: CURSOR_UPDATE_API_END(session, ret); + WT_TRET(tret); return (ret); } diff --git a/src/third_party/wiredtiger/src/docs/backup.dox b/src/third_party/wiredtiger/src/docs/backup.dox index 91b15da9275..b952a975788 100644 --- a/src/third_party/wiredtiger/src/docs/backup.dox +++ b/src/third_party/wiredtiger/src/docs/backup.dox @@ -59,10 +59,12 @@ During the period the backup cursor is open, database checkpoints can be created, but no checkpoints can be deleted. This may result in significant file growth. -Additionally, if a crash occurs during the period the backup cursor is open and -logging is disabled, then the system will be restored to the most recent -checkpoint prior to the opening of the backup cursor, even if later database -checkpoints were created. +Additionally, if a crash occurs during the period the backup cursor is +open and logging is disabled (in other words, when depending on +checkpoints for durability), then the system will be restored to the +most recent checkpoint prior to the opening of the backup cursor, even +if later database checkpoints were completed. 
<b>Note this exception to +WiredTiger's checkpoint durability guarantees.</b> The following is a programmatic example of creating a backup: diff --git a/src/third_party/wiredtiger/src/docs/checkpoint.dox b/src/third_party/wiredtiger/src/docs/checkpoint.dox index ec28fea13c3..3d636cd17b6 100644 --- a/src/third_party/wiredtiger/src/docs/checkpoint.dox +++ b/src/third_party/wiredtiger/src/docs/checkpoint.dox @@ -22,6 +22,10 @@ configuration to ::wiredtiger_open. All transactional updates committed before a checkpoint are made durable by the checkpoint, therefore the frequency of checkpoints limits the volume of data that may be lost due to application or system failure. +<b>This guarantee has an exception:</b> If a crash occurs when a backup +cursor is open, then the system will be restored to the most recent +checkpoint prior to the opening of the backup cursor, even if later +database checkpoints were completed. Data sources that are involved in an exclusive operation when the checkpoint starts, including bulk load, verify or salvage, will be skipped diff --git a/src/third_party/wiredtiger/src/docs/transactions.dox b/src/third_party/wiredtiger/src/docs/transactions.dox index d9cc72dcf24..4ba6d5d2526 100644 --- a/src/third_party/wiredtiger/src/docs/transactions.dox +++ b/src/third_party/wiredtiger/src/docs/transactions.dox @@ -165,8 +165,8 @@ transaction timestamp functionality. Applications can assign explicit commit timestamps to transactions, then read "as of" a timestamp. Timestamps are communicated to WiredTiger using a -lower case hexadecimal encoding, so the encoded value can be twice as long as -the raw timestamp value. +hexadecimal encoding, so the encoded value can be twice as long as the raw +timestamp value. Setting a read timestamp in WT_SESSION::begin_transaction forces a transaction to run at snapshot isolation and ignore any commits with a newer timestamp. 
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 56638934305..f2a09a0a769 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -16,11 +16,15 @@ int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; + WT_CURSOR *las_cursor; + WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; + uint32_t session_flags, walk_flags; - btree = S2BT(session); + dhandle = session->dhandle; + btree = dhandle->handle; /* * We need exclusive access to the file, we're about to discard the root @@ -28,7 +32,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ WT_ASSERT(session, btree->evict_disabled > 0 || - !F_ISSET(session->dhandle, WT_DHANDLE_OPEN)); + !F_ISSET(dhandle, WT_DHANDLE_OPEN)); /* * We do discard objects without pages in memory. If that's the case, @@ -37,14 +41,39 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) if (btree->root.page == NULL) return (0); + walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT; + + /* + * If discarding a dead tree, remove any lookaside entries. This deals + * with the case where a tree is dropped with "force=true". It happens + * that we also force-drop the lookaside table itself: it can never + * participate in lookaside eviction, and we can't open a cursor on it + * as we are discarding it. + * + * We use the special page ID zero so that all lookaside entries for + * the tree are removed. 
+ */ + if (F_ISSET(dhandle, WT_DHANDLE_DEAD) && + F_ISSET(S2C(session), WT_CONN_LAS_OPEN) && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) { + WT_ASSERT(session, !WT_IS_METADATA(dhandle)); + + __wt_las_cursor(session, &las_cursor, &session_flags); + WT_TRET(__wt_las_remove_block( + session, las_cursor, btree->id, 0)); + WT_TRET(__wt_las_cursor_close( + session, &las_cursor, session_flags)); + WT_RET(ret); + } else + FLD_SET(walk_flags, WT_READ_LOOKASIDE); + /* Make sure the oldest transaction ID is up-to-date. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); /* Walk the tree, discarding pages. */ next_ref = NULL; - WT_ERR(__wt_tree_walk( - session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); + WT_ERR(__wt_tree_walk(session, &next_ref, walk_flags)); while ((ref = next_ref) != NULL) { page = ref->page; @@ -69,8 +98,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * error, retrying later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) - WT_ERR(__wt_reconcile( - session, ref, NULL, WT_EVICTING, NULL)); + WT_ERR(__wt_reconcile(session, ref, NULL, + WT_REC_EVICT | WT_REC_VISIBLE_ALL, NULL)); /* * We can't evict the page just returned to us (it marks our @@ -81,8 +110,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * the reconciliation, the next walk call could miss a page in * the tree. */ - WT_ERR(__wt_tree_walk(session, - &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); + WT_ERR(__wt_tree_walk(session, &next_ref, walk_flags)); switch (syncop) { case WT_SYNC_CLOSE: @@ -96,7 +124,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * Discard the page regardless of whether it is dirty. 
*/ WT_ASSERT(session, - F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || __wt_page_can_evict(session, ref, NULL)); __wt_ref_out(session, ref); break; @@ -111,7 +139,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) err: /* On error, clear any left-over tree walk. */ if (next_ref != NULL) WT_TRET(__wt_page_release( - session, next_ref, WT_READ_NO_EVICT)); + session, next_ref, walk_flags)); } return (ret); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 28d7bd2f1fa..8dd48738735 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -460,6 +460,7 @@ int __wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + uint32_t session_flags; conn = S2C(session); @@ -471,10 +472,12 @@ __wt_evict_create(WT_SESSION_IMPL *session) * Create the eviction thread group. * Set the group size to the maximum allowed sessions. */ + session_flags = WT_THREAD_CAN_WAIT | + WT_THREAD_LOOKASIDE | WT_THREAD_PANIC_FAIL; WT_RET(__wt_thread_group_create(session, &conn->evict_threads, "eviction-server", conn->evict_threads_min, conn->evict_threads_max, - WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_chk, - __wt_evict_thread_run, __wt_evict_thread_stop)); + session_flags, __wt_evict_thread_chk, __wt_evict_thread_run, + __wt_evict_thread_stop)); #if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* @@ -1874,6 +1877,24 @@ __evict_walk_file(WT_SESSION_IMPL *session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; + /* + * If application threads are blocked waiting for eviction (so + * we are going to consider lookaside), and the only thing + * preventing a clean page from being evicted is that it + * contains historical data, mark it dirty so we can do + * lookaside eviction. 
+ */ + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD | + WT_CACHE_EVICT_DIRTY_HARD) && + !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE) && + !modified && page->modify != NULL && + !__wt_txn_visible_all(session, page->modify->rec_max_txn, + WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) { + __wt_page_only_modify_set(session, page); + modified = true; + goto fast; + } + /* Skip clean pages if appropriate. */ if (!modified && !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) continue; @@ -1905,14 +1926,19 @@ __evict_walk_file(WT_SESSION_IMPL *session, goto fast; /* - * If the oldest transaction hasn't changed since the last time - * this page was written, it's unlikely we can make progress. - * Similarly, if the most recent update on the page is not yet - * globally visible, eviction will fail. These heuristics - * attempt to avoid repeated attempts to evict the same page. + * If there are active transaction and oldest transaction + * hasn't changed since the last time this page was written, + * it's unlikely we can make progress. Similarly, if the most + * recent update on the page is not yet globally visible, + * eviction will fail. This heuristic avoids repeated attempts + * to evict the same page. + * + * We skip this for the lookaside table because updates there + * can be evicted as soon as they are committed. 
*/ mod = page->modify; - if (modified && txn_global->current != txn_global->oldest_id && + if (modified && !F_ISSET(btree, WT_BTREE_LOOKASIDE) && + txn_global->current != txn_global->oldest_id && (mod->last_eviction_id == __wt_txn_oldest_id(session) || !__wt_txn_visible_all(session, mod->update_txn, NULL))) continue; @@ -2424,6 +2450,7 @@ static int __verbose_dump_cache_single(WT_SESSION_IMPL *session, uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) { + WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_PAGE *page; WT_REF *next_walk; @@ -2469,11 +2496,12 @@ __verbose_dump_cache_single(WT_SESSION_IMPL *session, } dhandle = session->dhandle; - if (dhandle->checkpoint == NULL) - WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name)); - else - WT_RET(__wt_msg(session, "%s(checkpoint=%s):", - dhandle->name, dhandle->checkpoint)); + btree = dhandle->handle; + WT_RET(__wt_msg(session, "%s(%s%s)%s%s:", + dhandle->name, dhandle->checkpoint != NULL ? "checkpoint=" : "", + dhandle->checkpoint != NULL ? dhandle->checkpoint : "<live>", + btree->evict_disabled != 0 ? "eviction disabled" : "", + btree->evict_disabled_open ? " at open" : "")); if (intl_pages != 0) WT_RET(__wt_msg(session, "internal: " diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index ada1c39ddcf..7536e3593e8 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -10,7 +10,7 @@ static int __evict_page_clean_update(WT_SESSION_IMPL *, WT_REF *, bool); static int __evict_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, bool); -static int __evict_review(WT_SESSION_IMPL *, WT_REF *, uint32_t *, bool); +static int __evict_review(WT_SESSION_IMPL *, WT_REF *, bool, uint32_t *); /* * __evict_exclusive_clear -- @@ -127,9 +127,6 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) conn = S2C(session); - /* Checkpoints should never do eviction. 
*/ - WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session)); - /* Enter the eviction generation. */ __wt_session_gen_enter(session, WT_GEN_EVICT); @@ -146,13 +143,13 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * to make this check for clean pages, too: while unlikely eviction * would choose an internal page with children, it's not disallowed. */ - WT_ERR(__evict_review(session, ref, &flags, closing)); + WT_ERR(__evict_review(session, ref, closing, &flags)); /* * If there was an in-memory split, the tree has been left in the state * we want: there is nothing more to do. */ - if (LF_ISSET(WT_EVICT_INMEM_SPLIT)) + if (LF_ISSET(WT_REC_INMEM_SPLIT)) goto done; /* Count evictions of internal pages during normal operation. */ @@ -312,9 +309,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * of the page, if we're forced to "read" into that namespace, * we'll instantiate a new page instead of trying to read from * the backing store. - * - * Publish: a barrier to ensure the structure fields are set - * before the state change makes the page available to readers. */ __wt_ref_out(session, ref); ref->addr = NULL; @@ -353,19 +347,37 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ - WT_RET(__wt_calloc_one(session, &addr)); - *addr = mod->mod_replace; - mod->mod_replace.addr = NULL; - mod->mod_replace.size = 0; - ref->addr = addr; + if (mod->mod_replace.addr == NULL) + ref->addr = NULL; + else { + WT_RET(__wt_calloc_one(session, &addr)); + *addr = mod->mod_replace; + mod->mod_replace.addr = NULL; + mod->mod_replace.size = 0; + ref->addr = addr; + } /* * Eviction wants to keep this page if we have a disk image, * re-instantiate the page in memory, else discard the page. 
*/ if (mod->mod_disk_image == NULL) { - __wt_ref_out(session, ref); - WT_PUBLISH(ref->state, WT_REF_DISK); + if (mod->mod_replace_las_pageid != 0) { + WT_RET( + __wt_calloc_one(session, &ref->page_las)); + ref->page_las->las_pageid = + mod->mod_replace_las_pageid; +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set( + &ref->page_las->min_timestamp, + &mod->mod_replace_las_min_timestamp); +#endif + __wt_ref_out(session, ref); + WT_PUBLISH(ref->state, WT_REF_LOOKASIDE); + } else { + __wt_ref_out(session, ref); + WT_PUBLISH(ref->state, WT_REF_DISK); + } } else { /* * The split code works with WT_MULTI structures, build @@ -413,7 +425,7 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) */ static int __evict_review( - WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *flagsp, bool closing) + WT_SESSION_IMPL *session, WT_REF *ref, bool closing, uint32_t *flagsp) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -423,7 +435,9 @@ __evict_review( bool lookaside_retry, *lookaside_retryp, modified; conn = S2C(session); - flags = WT_EVICTING; + flags = WT_REC_EVICT; + if (!WT_SESSION_IS_CHECKPOINT(session)) + LF_SET(WT_REC_VISIBLE_ALL); *flagsp = flags; /* @@ -502,7 +516,7 @@ __evict_review( * the page stays in memory and the tree is left in the desired * state: avoid the usual cleanup. 
*/ - if (LF_ISSET(WT_EVICT_INMEM_SPLIT)) + if (LF_ISSET(WT_REC_INMEM_SPLIT)) return (__wt_split_insert(session, ref)); } @@ -545,22 +559,27 @@ __evict_review( lookaside_retryp = NULL; if (closing) - LF_SET(WT_VISIBILITY_ERR); - else if (!WT_PAGE_IS_INTERNAL(page)) { + LF_SET(WT_REC_VISIBILITY_ERR); + else if (!WT_PAGE_IS_INTERNAL(page) && + !F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE)) { if (F_ISSET(conn, WT_CONN_IN_MEMORY)) - LF_SET(WT_EVICT_IN_MEMORY | - WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE); + LF_SET(WT_REC_IN_MEMORY | + WT_REC_SCRUB | WT_REC_UPDATE_RESTORE); else { - LF_SET(WT_EVICT_UPDATE_RESTORE); + if (!WT_SESSION_IS_CHECKPOINT(session)) { + LF_SET(WT_REC_UPDATE_RESTORE); - if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) - LF_SET(WT_EVICT_SCRUB); + if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) + LF_SET(WT_REC_SCRUB); + } /* * Check if reconciliation suggests trying the * lookaside table. */ - lookaside_retryp = &lookaside_retry; + if (__wt_cache_aggressive(session) && + !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)) + lookaside_retryp = &lookaside_retry; } } @@ -574,9 +593,9 @@ __evict_review( * table, allowing the eviction of pages we'd otherwise have to retain * in cache to support older readers. */ - if (ret == EBUSY && lookaside_retry && __wt_cache_stuck(session)) { - LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE); - LF_SET(WT_EVICT_LOOKASIDE); + if (ret == EBUSY && lookaside_retry) { + LF_CLR(WT_REC_SCRUB | WT_REC_UPDATE_RESTORE); + LF_SET(WT_REC_LOOKASIDE); ret = __wt_reconcile(session, ref, NULL, flags, NULL); } @@ -584,6 +603,18 @@ __evict_review( WT_RET(ret); /* + * If attempting eviction in service of a checkpoint, we may + * successfully reconcile but then find that there are updates on the + * page too new to evict. Give up in that case: checkpoint will + * reconcile the page normally. 
+ */ + if (WT_SESSION_IS_CHECKPOINT(session) && !__wt_page_is_modified(page) && + !LF_ISSET(WT_REC_LOOKASIDE) && + !__wt_txn_visible_all(session, page->modify->rec_max_txn, + WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))) + return (EBUSY); + + /* * Success: assert the page is clean or reconciliation was configured * for update/restore. If the page is clean, assert that reconciliation * was configured for a lookaside table, or it's not a durable object @@ -591,10 +622,10 @@ __evict_review( * visible. */ WT_ASSERT(session, - !__wt_page_is_modified(page) || LF_ISSET(WT_EVICT_UPDATE_RESTORE)); + !__wt_page_is_modified(page) || LF_ISSET(WT_REC_UPDATE_RESTORE)); WT_ASSERT(session, __wt_page_is_modified(page) || - LF_ISSET(WT_EVICT_LOOKASIDE) || + LF_ISSET(WT_REC_LOOKASIDE) || F_ISSET(S2BT(session), WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, page->modify->rec_max_txn, WT_TIMESTAMP_NULL(&page->modify->rec_max_timestamp))); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 60ed31b64e8..3eb951f81ac 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -39,9 +39,15 @@ } while (0) /* An API call wrapped in a transaction if necessary. 
*/ +#ifdef HAVE_TIMESTAMPS +#define WT_TXN_TIMESTAMP_FLAG_CHECK(s) __wt_txn_timestamp_flags((s)) +#else +#define WT_TXN_TIMESTAMP_FLAG_CHECK(s) +#endif #define TXN_API_CALL(s, h, n, bt, config, cfg) do { \ bool __autotxn = false; \ API_CALL(s, h, n, bt, config, cfg); \ + WT_TXN_TIMESTAMP_FLAG_CHECK(s); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) @@ -50,6 +56,7 @@ #define TXN_API_CALL_NOCONF(s, h, n, dh) do { \ bool __autotxn = false; \ API_CALL_NOCONF(s, h, n, dh); \ + WT_TXN_TIMESTAMP_FLAG_CHECK(s); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index f0d810281c2..486ab7562a1 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -165,13 +165,13 @@ struct __wt_ovfl_reuse { * Lookaside table support: when a page is being reconciled for eviction and has * updates that might be required by earlier readers in the system, the updates * are written into a lookaside table, and restored as necessary if the page is - * read. The key is a unique marker for the page (a file ID plus an address), - * a counter (used to ensure the update records remain in the original order), - * the on-page item's transaction ID and timestamp (so we can discard any - * update records from the lookaside table once the on-page item's transaction - * is globally visible), and the page key (byte-string for row-store, record - * number for column-store). The value is the WT_UPDATE structure's - * transaction ID, update size and value. + * read. 
+ * + * The key is a unique marker for the page (a file ID plus a page ID), a + * counter (used to ensure the update records remain in the original order), + * and the record's key (byte-string for row-store, record number for + * column-store). The value is the WT_UPDATE structure's transaction ID, + * timestamp, update type and value. * * As the key for the lookaside table is different for row- and column-store, we * store both key types in a WT_ITEM, building/parsing them in the code, because @@ -182,7 +182,7 @@ struct __wt_ovfl_reuse { * the row-store key is relatively large. */ #define WT_LAS_FORMAT \ - "key_format=" WT_UNCHECKED_STRING(IuQQuu) \ + "key_format=" WT_UNCHECKED_STRING(IQQu) \ ",value_format=" WT_UNCHECKED_STRING(QuBu) /* @@ -239,11 +239,19 @@ struct __wt_page_modify { * re-instantiate the page in memory. */ void *disk_image; + + /* The page has lookaside entries. */ + uint64_t las_pageid; + WT_DECL_TIMESTAMP(las_min_timestamp) } r; #undef mod_replace #define mod_replace u1.r.replace #undef mod_disk_image #define mod_disk_image u1.r.disk_image +#undef mod_replace_las_pageid +#define mod_replace_las_pageid u1.r.las_pageid +#undef mod_replace_las_min_timestamp +#define mod_replace_las_min_timestamp u1.r.las_min_timestamp struct { /* Multiple replacement blocks */ struct __wt_multi { @@ -274,8 +282,7 @@ struct __wt_page_modify { struct __wt_save_upd { WT_INSERT *ins; /* Insert list reference */ WT_ROW *ripcip; /* Original on-page reference */ - uint64_t onpage_txn; - WT_DECL_TIMESTAMP(onpage_timestamp) + WT_UPDATE *onpage_upd; } *supd; uint32_t supd_entries; @@ -289,6 +296,9 @@ struct __wt_page_modify { WT_ADDR addr; uint32_t size; uint32_t checksum; + + uint64_t las_pageid; + WT_DECL_TIMESTAMP(las_min_timestamp) } *multi; uint32_t multi_entries; /* Multiple blocks element count */ } m; @@ -659,6 +669,10 @@ struct __wt_page { * thread that set the page to WT_REF_LOCKED has exclusive access, no * other thread may use the WT_REF until the state is 
changed. * + * WT_REF_LOOKASIDE: + * The page is on disk (as per WT_REF_DISK) and has entries in the + * lookaside table that must be applied before the page can be read. + * * WT_REF_MEM: * Set by a reading thread once the page has been read from disk; the page * is in the cache and the page reference is OK. @@ -696,10 +710,20 @@ struct __wt_page { * Related information for fast-delete, on-disk pages. */ struct __wt_page_deleted { - volatile uint64_t txnid; /* Transaction ID */ + volatile uint64_t txnid; /* Transaction ID */ WT_DECL_TIMESTAMP(timestamp) - WT_UPDATE **update_list; /* List of updates for abort */ + WT_UPDATE **update_list; /* List of updates for abort */ +}; + +/* + * WT_PAGE_LOOKASIDE -- + * Related information for on-disk pages with lookaside entries. + */ +struct __wt_page_lookaside { + uint64_t las_pageid; /* Page ID in lookaside */ + WT_DECL_TIMESTAMP(min_timestamp) /* Oldest timestamp in + lookaside for the page */ }; /* @@ -718,12 +742,13 @@ struct __wt_ref { WT_PAGE * volatile home; /* Reference page */ volatile uint32_t pindex_hint; /* Reference page index hint */ -#define WT_REF_DISK 0 /* Page is on disk */ -#define WT_REF_DELETED 1 /* Page is on disk, but deleted */ -#define WT_REF_LOCKED 2 /* Page locked for exclusive access */ -#define WT_REF_MEM 3 /* Page is in cache and valid */ -#define WT_REF_READING 4 /* Page being read */ -#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */ +#define WT_REF_DISK 0 /* Page is on disk */ +#define WT_REF_DELETED 1 /* Page is on disk, but deleted */ +#define WT_REF_LOCKED 2 /* Page locked for exclusive access */ +#define WT_REF_LOOKASIDE 3 /* Page is on disk with lookaside */ +#define WT_REF_MEM 4 /* Page is in cache and valid */ +#define WT_REF_READING 5 /* Page being read */ +#define WT_REF_SPLIT 6 /* Parent page split (WT_REF dead) */ volatile uint32_t state; /* Page state */ /* @@ -745,7 +770,10 @@ struct __wt_ref { #undef ref_ikey #define ref_ikey key.ikey - WT_PAGE_DELETED *page_del; /* 
Deleted on-disk page information */ + union { + WT_PAGE_DELETED *page_del; /* Deleted page information */ + WT_PAGE_LOOKASIDE *page_las; /* Lookaside information */ + }; }; /* * WT_REF_SIZE is the expected structure size -- we verify the build to ensure diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index 8184d606022..7dc9b4a11a7 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -97,6 +97,12 @@ struct __wt_btree { uint64_t maxmempage; /* In-memory page max size */ uint64_t splitmempage; /* In-memory split trigger size */ +#define WT_ASSERT_COMMIT_TS_ALWAYS 0x0001 +#define WT_ASSERT_COMMIT_TS_NEVER 0x0002 +#define WT_ASSERT_READ_TS_ALWAYS 0x0004 +#define WT_ASSERT_READ_TS_NEVER 0x0008 + uint32_t assert_flags; /* Debugging assertion information */ + void *huffman_key; /* Key huffman encoding */ void *huffman_value; /* Value huffman encoding */ @@ -128,6 +134,7 @@ struct __wt_btree { u_int rec_multiblock_max; /* Maximum blocks written for a page */ uint64_t last_recno; /* Column-store last record number */ + uint64_t las_pageid; /* Lookaside table page ID counter */ WT_REF root; /* Root page reference */ bool modified; /* If the tree ever modified */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 4d6844e10cc..3b196dca673 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1303,19 +1303,20 @@ __wt_page_can_evict( */ if (__wt_leaf_page_can_split(session, page)) { if (evict_flagsp != NULL) - FLD_SET(*evict_flagsp, WT_EVICT_INMEM_SPLIT); + FLD_SET(*evict_flagsp, WT_REC_INMEM_SPLIT); return (true); } modified = __wt_page_is_modified(page); /* - * If the file is being checkpointed, we can't evict dirty pages: - * if we write a page and free the previous version of the page, that + * If the file is being checkpointed, 
other threads can't evict dirty + * pages: if a page is written and the previous version freed, that * previous version might be referenced by an internal page already - * been written in the checkpoint, leaving the checkpoint inconsistent. + * written in the checkpoint, leaving the checkpoint inconsistent. */ - if (modified && btree->checkpointing != WT_CKPT_OFF) { + if (modified && btree->checkpointing != WT_CKPT_OFF && + !WT_SESSION_IS_CHECKPOINT(session)) { WT_STAT_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_DATA_INCR(session, cache_eviction_checkpoint); return (false); diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index e5593357347..1d7b6142685 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -365,10 +365,6 @@ struct __wt_connection_impl { */ WT_SPINLOCK las_lock; /* Lookaside table spinlock */ WT_SESSION_IMPL *las_session; /* Lookaside table session */ - bool las_written; /* Lookaside table has been written */ - - WT_ITEM las_sweep_key; /* Sweep server's saved key */ - uint64_t las_record_cnt;/* Count of lookaside records */ /* * The "lookaside_activity" verbose messages are throttled to once per diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 362acc71c0f..23897a05dfb 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -152,14 +152,13 @@ extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE 
*page); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC @@ -186,7 +185,7 @@ extern int __wt_tree_walk_count(WT_SESSION_IMPL *session, WT_REF **refp, uint64_ extern int __wt_tree_walk_custom_skip( WT_SESSION_IMPL *session, WT_REF **refp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_tree_walk_skip( WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t search_recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -200,16 +199,14 @@ extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void 
__wt_las_stats_update(WT_SESSION_IMPL *session); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern void __wt_las_set_written(WT_SESSION_IMPL *session); -extern bool __wt_las_is_written(WT_SESSION_IMPL *session); extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_las_sweep(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_checksum_init(void); extern void __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index ccb32900dc4..65b4ce34752 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -2,32 +2,26 @@ * DO NOT EDIT: automatically built by dist/flags.py. 
* flags section: BEGIN */ -#define WT_CHECKPOINTING 0x00000001 #define WT_CONN_CACHE_POOL 0x00000001 #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_CLOSING 0x00000004 #define WT_CONN_CLOSING_NO_MORE_OPENS 0x00000008 -#define WT_CONN_EVICTION_RUN 0x00000010 -#define WT_CONN_IN_MEMORY 0x00000020 -#define WT_CONN_LAS_OPEN 0x00000040 -#define WT_CONN_LEAK_MEMORY 0x00000080 -#define WT_CONN_LSM_MERGE 0x00000100 -#define WT_CONN_PANIC 0x00000200 -#define WT_CONN_READONLY 0x00000400 -#define WT_CONN_RECOVERING 0x00000800 -#define WT_CONN_SERVER_ASYNC 0x00001000 -#define WT_CONN_SERVER_CHECKPOINT 0x00002000 -#define WT_CONN_SERVER_LOG 0x00004000 -#define WT_CONN_SERVER_LSM 0x00008000 -#define WT_CONN_SERVER_STATISTICS 0x00010000 -#define WT_CONN_SERVER_SWEEP 0x00020000 -#define WT_CONN_WAS_BACKUP 0x00040000 -#define WT_EVICTING 0x00000002 -#define WT_EVICT_INMEM_SPLIT 0x00000004 -#define WT_EVICT_IN_MEMORY 0x00000008 -#define WT_EVICT_LOOKASIDE 0x00000010 -#define WT_EVICT_SCRUB 0x00000020 -#define WT_EVICT_UPDATE_RESTORE 0x00000040 +#define WT_CONN_EVICTION_NO_LOOKASIDE 0x00000010 +#define WT_CONN_EVICTION_RUN 0x00000020 +#define WT_CONN_IN_MEMORY 0x00000040 +#define WT_CONN_LAS_OPEN 0x00000080 +#define WT_CONN_LEAK_MEMORY 0x00000100 +#define WT_CONN_LSM_MERGE 0x00000200 +#define WT_CONN_PANIC 0x00000400 +#define WT_CONN_READONLY 0x00000800 +#define WT_CONN_RECOVERING 0x00001000 +#define WT_CONN_SERVER_ASYNC 0x00002000 +#define WT_CONN_SERVER_CHECKPOINT 0x00004000 +#define WT_CONN_SERVER_LOG 0x00008000 +#define WT_CONN_SERVER_LSM 0x00010000 +#define WT_CONN_SERVER_STATISTICS 0x00020000 +#define WT_CONN_SERVER_SWEEP 0x00040000 +#define WT_CONN_WAS_BACKUP 0x00080000 #define WT_LOGSCAN_FIRST 0x00000001 #define WT_LOGSCAN_FROM_CKP 0x00000002 #define WT_LOGSCAN_ONE 0x00000004 @@ -38,16 +32,26 @@ #define WT_LOG_FSYNC 0x00000008 #define WT_LOG_SYNC_ENABLED 0x00000010 #define WT_READ_CACHE 0x00000001 -#define WT_READ_NOTFOUND_OK 0x00000002 -#define WT_READ_NO_EMPTY 
0x00000004 -#define WT_READ_NO_EVICT 0x00000008 -#define WT_READ_NO_GEN 0x00000010 -#define WT_READ_NO_WAIT 0x00000020 -#define WT_READ_PREV 0x00000040 -#define WT_READ_RESTART_OK 0x00000080 -#define WT_READ_SKIP_INTL 0x00000100 -#define WT_READ_TRUNCATE 0x00000200 -#define WT_READ_WONT_NEED 0x00000400 +#define WT_READ_LOOKASIDE 0x00000002 +#define WT_READ_NOTFOUND_OK 0x00000004 +#define WT_READ_NO_EMPTY 0x00000008 +#define WT_READ_NO_EVICT 0x00000010 +#define WT_READ_NO_GEN 0x00000020 +#define WT_READ_NO_WAIT 0x00000040 +#define WT_READ_PREV 0x00000080 +#define WT_READ_RESTART_OK 0x00000100 +#define WT_READ_SKIP_INTL 0x00000200 +#define WT_READ_TRUNCATE 0x00000400 +#define WT_READ_WONT_NEED 0x00000800 +#define WT_REC_CHECKPOINT 0x00000001 +#define WT_REC_EVICT 0x00000002 +#define WT_REC_INMEM_SPLIT 0x00000004 +#define WT_REC_IN_MEMORY 0x00000008 +#define WT_REC_LOOKASIDE 0x00000010 +#define WT_REC_SCRUB 0x00000020 +#define WT_REC_UPDATE_RESTORE 0x00000040 +#define WT_REC_VISIBILITY_ERR 0x00000080 +#define WT_REC_VISIBLE_ALL 0x00000100 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 @@ -118,7 +122,6 @@ #define WT_VERB_VERIFY 0x10000000 #define WT_VERB_VERSION 0x20000000 #define WT_VERB_WRITE 0x40000000 -#define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END * DO NOT EDIT: automatically built by dist/flags.py. 
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i index 5b14bb24730..871ccf63be8 100644 --- a/src/third_party/wiredtiger/src/include/mutex.i +++ b/src/third_party/wiredtiger/src/include/mutex.i @@ -113,11 +113,15 @@ static inline int __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) { #if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE + WT_DECL_RET; pthread_mutexattr_t attr; WT_RET(pthread_mutexattr_init(&attr)); - WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP)); - WT_RET(pthread_mutex_init(&t->lock, &attr)); + ret = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + if (ret == 0) + ret = pthread_mutex_init(&t->lock, &attr); + WT_TRET(pthread_mutexattr_destroy(&attr)); + WT_RET(ret); #else WT_RET(pthread_mutex_init(&t->lock, NULL)); #endif diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index 9ab4c12f0d0..bae5fc8cc04 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -296,7 +296,9 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_CHECKPOINT); \ __wt_spin_unlock(session, &__conn->checkpoint_lock); \ } \ + __wt_yield(); \ op; \ + __wt_yield(); \ if (__checkpoint_locked) { \ __wt_spin_lock(session, &__conn->checkpoint_lock); \ F_SET(session, WT_SESSION_LOCKED_CHECKPOINT); \ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index bd69cc36405..bea436e05e2 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -77,9 +77,6 @@ struct __wt_session_impl { enum { WT_COMPACT_NONE=0, WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state; - /* - * Lookaside table cursor, sweep and eviction worker threads only. 
- */ WT_CURSOR *las_cursor; /* Lookaside table cursor */ WT_CURSOR *meta_cursor; /* Metadata file */ diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index c7110c245c7..922b211bec4 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -341,6 +341,7 @@ struct __wt_connection_stats { int64_t cache_eviction_internal; int64_t cache_eviction_split_internal; int64_t cache_eviction_split_leaf; + int64_t cache_lookaside_entries; int64_t cache_lookaside_insert; int64_t cache_lookaside_remove; int64_t cache_bytes_max; diff --git a/src/third_party/wiredtiger/src/include/thread_group.h b/src/third_party/wiredtiger/src/include/thread_group.h index 7375f9dfd87..97eda6ab674 100644 --- a/src/third_party/wiredtiger/src/include/thread_group.h +++ b/src/third_party/wiredtiger/src/include/thread_group.h @@ -23,8 +23,9 @@ struct __wt_thread { */ #define WT_THREAD_ACTIVE 0x01 /* thread is active or paused */ #define WT_THREAD_CAN_WAIT 0x02 /* WT_SESSION_CAN_WAIT */ -#define WT_THREAD_PANIC_FAIL 0x04 /* panic if the thread fails */ -#define WT_THREAD_RUN 0x08 /* thread is running */ +#define WT_THREAD_LOOKASIDE 0x04 /* open lookaside cursor */ +#define WT_THREAD_PANIC_FAIL 0x08 /* panic if the thread fails */ +#define WT_THREAD_RUN 0x10 /* thread is running */ uint32_t flags; /* diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 69481409aaf..6b78c78a5cd 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -246,17 +246,19 @@ struct __wt_txn { WT_ITEM *ckpt_snapshot; bool full_ckpt; -#define WT_TXN_AUTOCOMMIT 0x001 -#define WT_TXN_ERROR 0x002 -#define WT_TXN_HAS_ID 0x004 -#define WT_TXN_HAS_SNAPSHOT 0x008 -#define WT_TXN_HAS_TS_COMMIT 0x010 -#define WT_TXN_HAS_TS_READ 0x020 -#define WT_TXN_NAMED_SNAPSHOT 0x040 -#define WT_TXN_PUBLIC_TS_COMMIT 0x080 -#define 
WT_TXN_PUBLIC_TS_READ 0x100 -#define WT_TXN_READONLY 0x200 -#define WT_TXN_RUNNING 0x400 -#define WT_TXN_SYNC_SET 0x800 +#define WT_TXN_AUTOCOMMIT 0x00001 +#define WT_TXN_ERROR 0x00002 +#define WT_TXN_HAS_ID 0x00004 +#define WT_TXN_HAS_SNAPSHOT 0x00008 +#define WT_TXN_HAS_TS_COMMIT 0x00010 +#define WT_TXN_HAS_TS_READ 0x00020 +#define WT_TXN_NAMED_SNAPSHOT 0x00040 +#define WT_TXN_PUBLIC_TS_COMMIT 0x00080 +#define WT_TXN_PUBLIC_TS_READ 0x00100 +#define WT_TXN_READONLY 0x00200 +#define WT_TXN_RUNNING 0x00400 +#define WT_TXN_SYNC_SET 0x00800 +#define WT_TXN_TS_COMMIT_ALWAYS 0x01000 +#define WT_TXN_TS_COMMIT_NEVER 0x02000 uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index e53ab6a69ee..26dcd01fe5e 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -10,6 +10,26 @@ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session); static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); #ifdef HAVE_TIMESTAMPS +/* + * __wt_txn_timestamp_flags -- + * Set txn related timestamp flags. + */ +static inline void +__wt_txn_timestamp_flags(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + if (session->dhandle == NULL) + return; + btree = S2BT(session); + if (btree == NULL) + return; + if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_ALWAYS)) + F_SET(&session->txn, WT_TXN_TS_COMMIT_ALWAYS); + if (FLD_ISSET(btree->assert_flags, WT_ASSERT_COMMIT_TS_NEVER)) + F_SET(&session->txn, WT_TXN_TS_COMMIT_NEVER); +} + #if WT_TIMESTAMP_SIZE == 8 #define WT_WITH_TIMESTAMP_READLOCK(session, l, e) e @@ -635,6 +655,37 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) } /* + * __wt_txn_search_check -- + * Check if the current transaction can search. 
+ */ +static inline int +__wt_txn_search_check(WT_SESSION_IMPL *session) +{ +#ifdef HAVE_TIMESTAMPS + WT_BTREE *btree; + WT_TXN *txn; + + txn = &session->txn; + btree = S2BT(session); + /* + * If the user says a table should always use a read timestamp, + * verify this transaction has one. Same if it should never have + * a read timestamp. + */ + if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_ALWAYS) && + !F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + WT_RET_MSG(session, EINVAL, "read_timestamp required and " + "none set on this transaction"); + if (FLD_ISSET(btree->assert_flags, WT_ASSERT_READ_TS_NEVER) && + F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + WT_RET_MSG(session, EINVAL, "no read_timestamp required and " + "timestamp set on this transaction"); +#endif + WT_UNUSED(session); + return (0); +} + +/* * __wt_txn_update_check -- * Check if the current transaction can update an item. */ diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 1e526edaedc..830850f102b 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -4841,454 +4841,456 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1082 /*! cache: leaf pages split during eviction */ #define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1083 +/*! cache: lookaside table entries */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_ENTRIES 1084 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1084 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1085 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1085 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1086 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1086 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1087 /*! 
cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1087 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1088 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1088 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1089 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1089 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1090 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1090 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1091 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1091 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1092 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1092 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1093 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1093 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1094 /*! cache: pages evicted because they exceeded the in-memory maximum count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1094 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1095 /*! * cache: pages evicted because they exceeded the in-memory maximum time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1095 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_TIME 1096 /*! cache: pages evicted because they had chains of deleted items count */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1096 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1097 /*! * cache: pages evicted because they had chains of deleted items time * (usecs) */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1097 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE_TIME 1098 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1098 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1099 /*! 
cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1099 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1100 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1100 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1101 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1101 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1102 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1102 +#define WT_STAT_CONN_CACHE_READ 1103 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1103 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1104 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1104 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1105 /*! cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1105 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1106 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1106 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1107 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1107 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1108 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1108 +#define WT_STAT_CONN_CACHE_WRITE 1109 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1109 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1110 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1110 +#define WT_STAT_CONN_CACHE_OVERHEAD 1111 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1111 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1112 /*! 
cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1112 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1113 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1113 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1114 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1114 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1115 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1115 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1116 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1116 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1117 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1117 +#define WT_STAT_CONN_COND_AUTO_WAIT 1118 /*! connection: detected system time went backwards */ -#define WT_STAT_CONN_TIME_TRAVEL 1118 +#define WT_STAT_CONN_TIME_TRAVEL 1119 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1119 +#define WT_STAT_CONN_FILE_OPEN 1120 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1120 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1121 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1121 +#define WT_STAT_CONN_MEMORY_FREE 1122 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1122 +#define WT_STAT_CONN_MEMORY_GROW 1123 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1123 +#define WT_STAT_CONN_COND_WAIT 1124 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1124 +#define WT_STAT_CONN_RWLOCK_READ 1125 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1125 +#define WT_STAT_CONN_RWLOCK_WRITE 1126 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1126 +#define WT_STAT_CONN_FSYNC_IO 1127 /*! 
connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1127 +#define WT_STAT_CONN_READ_IO 1128 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1128 +#define WT_STAT_CONN_WRITE_IO 1129 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1129 +#define WT_STAT_CONN_CURSOR_CREATE 1130 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1130 +#define WT_STAT_CONN_CURSOR_INSERT 1131 /*! cursor: cursor modify calls */ -#define WT_STAT_CONN_CURSOR_MODIFY 1131 +#define WT_STAT_CONN_CURSOR_MODIFY 1132 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1132 +#define WT_STAT_CONN_CURSOR_NEXT 1133 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1133 +#define WT_STAT_CONN_CURSOR_PREV 1134 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1134 +#define WT_STAT_CONN_CURSOR_REMOVE 1135 /*! cursor: cursor reserve calls */ -#define WT_STAT_CONN_CURSOR_RESERVE 1135 +#define WT_STAT_CONN_CURSOR_RESERVE 1136 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1136 +#define WT_STAT_CONN_CURSOR_RESET 1137 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1137 +#define WT_STAT_CONN_CURSOR_RESTART 1138 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1138 +#define WT_STAT_CONN_CURSOR_SEARCH 1139 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1139 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1140 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1140 +#define WT_STAT_CONN_CURSOR_UPDATE 1141 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1141 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1142 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1142 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1143 /*! 
data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1143 +#define WT_STAT_CONN_DH_SWEEP_REF 1144 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1144 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1145 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1145 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1146 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1146 +#define WT_STAT_CONN_DH_SWEEP_TOD 1147 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1147 +#define WT_STAT_CONN_DH_SWEEPS 1148 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1148 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1149 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1149 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1150 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1150 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1151 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1151 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1152 /*! lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1152 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1153 /*! * lock: dhandle lock application thread time waiting for the dhandle * lock (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1153 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_APPLICATION 1154 /*! * lock: dhandle lock internal thread time waiting for the dhandle lock * (usecs) */ -#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1154 +#define WT_STAT_CONN_LOCK_DHANDLE_WAIT_INTERNAL 1155 /*! 
lock: dhandle read lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1155 +#define WT_STAT_CONN_LOCK_DHANDLE_READ_COUNT 1156 /*! lock: dhandle write lock acquisitions */ -#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1156 +#define WT_STAT_CONN_LOCK_DHANDLE_WRITE_COUNT 1157 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1157 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1158 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1158 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1159 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1159 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1160 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1160 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1161 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1161 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1162 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1162 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1163 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1163 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1164 /*! * lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1164 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1165 /*! lock: table read lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1165 +#define WT_STAT_CONN_LOCK_TABLE_READ_COUNT 1166 /*! lock: table write lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1166 +#define WT_STAT_CONN_LOCK_TABLE_WRITE_COUNT 1167 /*! 
log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1167 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1168 /*! log: force checkpoint calls slept */ -#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1168 +#define WT_STAT_CONN_LOG_FORCE_CKPT_SLEEP 1169 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1169 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1170 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1170 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1171 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1171 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1172 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1172 +#define WT_STAT_CONN_LOG_FLUSH 1173 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1173 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1174 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1174 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1175 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1175 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1176 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1176 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1177 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1177 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1178 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1178 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1179 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1179 +#define WT_STAT_CONN_LOG_SCANS 1180 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1180 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1181 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1181 +#define WT_STAT_CONN_LOG_WRITE_LSN 1182 /*! 
log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1182 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1183 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1183 +#define WT_STAT_CONN_LOG_SYNC 1184 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1184 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1185 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1185 +#define WT_STAT_CONN_LOG_SYNC_DIR 1186 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1186 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1187 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1187 +#define WT_STAT_CONN_LOG_WRITES 1188 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1188 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1189 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1189 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1190 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1190 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1191 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1191 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1192 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1192 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1193 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1193 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1194 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1194 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1195 /*! log: slot close lost race */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1195 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_RACE 1196 /*! log: slot close unbuffered waits */ -#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1196 +#define WT_STAT_CONN_LOG_SLOT_CLOSE_UNBUF 1197 /*! 
log: slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1197 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1198 /*! log: slot join atomic update races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1198 +#define WT_STAT_CONN_LOG_SLOT_RACES 1199 /*! log: slot join calls atomic updates raced */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1199 +#define WT_STAT_CONN_LOG_SLOT_YIELD_RACE 1200 /*! log: slot join calls did not yield */ -#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1200 +#define WT_STAT_CONN_LOG_SLOT_IMMEDIATE 1201 /*! log: slot join calls found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1201 +#define WT_STAT_CONN_LOG_SLOT_YIELD_CLOSE 1202 /*! log: slot join calls slept */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1202 +#define WT_STAT_CONN_LOG_SLOT_YIELD_SLEEP 1203 /*! log: slot join calls yielded */ -#define WT_STAT_CONN_LOG_SLOT_YIELD 1203 +#define WT_STAT_CONN_LOG_SLOT_YIELD 1204 /*! log: slot join found active slot closed */ -#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1204 +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1205 /*! log: slot joins yield time (usecs) */ -#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1205 +#define WT_STAT_CONN_LOG_SLOT_YIELD_DURATION 1206 /*! log: slot transitions unable to find free slot */ -#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1206 +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1207 /*! log: slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1207 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1208 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1208 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1209 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1209 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1210 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1210 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1211 /*! 
log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1211 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1212 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1212 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1213 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1213 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1214 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1214 +#define WT_STAT_CONN_REC_PAGES 1215 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1215 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1216 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1216 +#define WT_STAT_CONN_REC_PAGE_DELETE 1217 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1217 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1218 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1218 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1219 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1219 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1220 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1220 +#define WT_STAT_CONN_SESSION_OPEN 1221 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1221 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1222 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1222 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1223 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1223 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1224 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1224 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1225 /*! 
session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1225 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1226 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1226 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1227 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1227 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1228 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1228 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1229 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1229 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1230 /*! session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1230 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1231 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1231 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1232 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1232 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1233 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1233 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1234 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1234 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1235 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1235 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1236 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1236 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1237 /*! 
session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1237 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1238 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1238 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1239 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1239 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1240 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1240 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1241 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1241 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1242 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1242 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1243 /*! thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1243 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1244 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1244 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1245 /*! * thread-yield: connection close blocked waiting for transaction state * stabilization */ -#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1245 +#define WT_STAT_CONN_TXN_RELEASE_BLOCKED 1246 /*! thread-yield: connection close yielded for lsm manager shutdown */ -#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1246 +#define WT_STAT_CONN_CONN_CLOSE_BLOCKED_LSM 1247 /*! thread-yield: data handle lock yielded */ -#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1247 +#define WT_STAT_CONN_DHANDLE_LOCK_BLOCKED 1248 /*! * thread-yield: get reference for page index and slot time sleeping * (usecs) */ -#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1248 +#define WT_STAT_CONN_PAGE_INDEX_SLOT_REF_BLOCKED 1249 /*! 
thread-yield: log server sync yielded for log write */ -#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1249 +#define WT_STAT_CONN_LOG_SERVER_SYNC_BLOCKED 1250 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1250 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1251 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1251 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1252 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1252 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1253 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1253 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1254 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1254 +#define WT_STAT_CONN_PAGE_SLEEP 1255 /*! * thread-yield: page delete rollback time sleeping for state change * (usecs) */ -#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1255 +#define WT_STAT_CONN_PAGE_DEL_ROLLBACK_BLOCKED 1256 /*! thread-yield: page reconciliation yielded due to child modification */ -#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1256 +#define WT_STAT_CONN_CHILD_MODIFY_BLOCKED_PAGE 1257 /*! * thread-yield: tree descend one level yielded for split page index * update */ -#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1257 +#define WT_STAT_CONN_TREE_DESCEND_BLOCKED 1258 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1258 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1259 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1259 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1260 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1260 +#define WT_STAT_CONN_TXN_BEGIN 1261 /*! 
transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1261 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1262 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1262 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1263 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1263 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1264 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1264 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1265 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1265 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1266 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1266 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1267 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1267 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1268 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1268 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1269 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1269 +#define WT_STAT_CONN_TXN_CHECKPOINT 1270 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1270 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1271 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1271 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1272 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1272 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1273 /*! 
* transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1273 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1274 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1274 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1275 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1275 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1276 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1276 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1277 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1277 +#define WT_STAT_CONN_TXN_SYNC 1278 /*! transaction: transactions commit timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1278 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_HEAD 1279 /*! transaction: transactions commit timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1279 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_INSERTS 1280 /*! transaction: transactions commit timestamp queue length */ -#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1280 +#define WT_STAT_CONN_TXN_COMMIT_QUEUE_LEN 1281 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1281 +#define WT_STAT_CONN_TXN_COMMIT 1282 /*! transaction: transactions read timestamp queue inserts to head */ -#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1282 +#define WT_STAT_CONN_TXN_READ_QUEUE_HEAD 1283 /*! transaction: transactions read timestamp queue inserts total */ -#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1283 +#define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1284 /*! transaction: transactions read timestamp queue length */ -#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1284 +#define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1285 /*! 
transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1285 +#define WT_STAT_CONN_TXN_ROLLBACK 1286 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1286 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1287 /*! * @} diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 84617dfcab8..b25ed08e30f 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -258,6 +258,8 @@ struct __wt_page_header; typedef struct __wt_page_header WT_PAGE_HEADER; struct __wt_page_index; typedef struct __wt_page_index WT_PAGE_INDEX; +struct __wt_page_lookaside; + typedef struct __wt_page_lookaside WT_PAGE_LOOKASIDE; struct __wt_page_modify; typedef struct __wt_page_modify WT_PAGE_MODIFY; struct __wt_process; diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index a42fbbe511b..95d025247a6 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -102,8 +102,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) static void __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) { - int i; - /* * Stop any new work units being added. The barrier is necessary * because we rely on the state change being visible before checking @@ -118,8 +116,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) * we know a user is holding a reference to the tree, so exclusive * access is not available. */ - for (i = 0; - lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1); ++i) { + while (lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1)) { /* * Remove any work units from the manager queues. 
Do this step * repeatedly in case a work unit was in the process of being @@ -133,10 +130,8 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) * other schema level operations will return EBUSY, even though * we're dropping the schema lock here. */ - if (i % WT_THOUSAND == 0) - WT_WITHOUT_LOCKS(session, - __wt_lsm_manager_clear_tree(session, lsm_tree)); - __wt_yield(); + WT_WITHOUT_LOCKS(session, + __wt_lsm_manager_clear_tree(session, lsm_tree)); } } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 879913bccec..05e5fe5b07e 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -320,11 +320,12 @@ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { + WT_BTREE *btree; WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; - bool flush_set, release_btree; + bool flush_set, release_dhandle; - flush_set = release_btree = false; + flush_set = release_dhandle = false; /* * If the chunk is already checkpointed, make sure it is also evicted. @@ -374,7 +375,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * take a long time. */ WT_ERR(__wt_session_get_dhandle(session, chunk->uri, NULL, NULL, 0)); - release_btree = true; + release_dhandle = true; /* * Set read-uncommitted: we have already checked that all of the updates @@ -407,9 +408,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); - release_btree = false; - WT_ERR(__wt_session_release_dhandle(session)); - /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); @@ -429,6 +427,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); + /* + * Enable eviction on the live chunk so it doesn't block the cache. 
+ * Future reads should direct to the on-disk chunk anyway. + */ + btree = session->dhandle->handle; + if (btree->evict_disabled_open) { + btree->evict_disabled_open = false; + __wt_evict_file_exclusive_off(session); + } + + release_dhandle = false; + WT_ERR(__wt_session_release_dhandle(session)); + WT_PUBLISH(chunk->flushing, 0); flush_set = false; @@ -448,7 +459,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); - if (release_btree) + if (release_dhandle) WT_TRET(__wt_session_release_dhandle(session)); return (ret); diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c index 5d0295d94ce..533d2a0ab08 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c @@ -26,8 +26,11 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) pthread_condattr_t condattr; WT_ERR(pthread_condattr_init(&condattr)); - WT_ERR(pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)); - WT_ERR(pthread_cond_init(&cond->cond, &condattr)); + ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC); + if (ret == 0) + ret = pthread_cond_init(&cond->cond, &condattr); + WT_TRET(pthread_condattr_destroy(&condattr)); + WT_ERR(ret); } #else WT_ERR(pthread_cond_init(&cond->cond, NULL)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 4cb5ae12e5b..af43a56f877 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -45,13 +45,13 @@ typedef struct { uint64_t last_running; WT_DECL_TIMESTAMP(stable_timestamp) - /* Track the page's maximum transaction. */ + /* Track the page's min/maximum transactions. 
*/ uint64_t max_txn; WT_DECL_TIMESTAMP(max_timestamp) + WT_DECL_TIMESTAMP(min_saved_timestamp) - uint64_t update_mem_all; /* Total update memory size */ - uint64_t update_mem_saved; /* Saved update memory size */ - uint64_t update_mem_uncommitted;/* Uncommitted update memory size */ + bool update_uncommitted; /* An update was uncommitted */ + bool update_used; /* An update could be used */ /* * When we can't mark the page clean (for example, checkpoint found some @@ -154,8 +154,6 @@ typedef struct { */ struct __rec_chunk { /* - * Current and minimum boundaries. - * * The recno and entries fields are the starting record number * of the split chunk (for column-store splits), and the number * of entries in the split chunk. @@ -193,8 +191,8 @@ typedef struct { size_t min_space_avail; /* - * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and - * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each + * Saved update list, supporting the WT_REC_UPDATE_RESTORE and + * WT_REC_LOOKASIDE configurations. While reviewing updates for each * page, we save WT_UPDATE lists here, and then move them to per-block * areas as the blocks are defined. */ @@ -220,7 +218,14 @@ typedef struct { * There's some trickiness here, see the code for comments on how * these fields work. */ - bool cell_zero; /* Row-store internal page 0th key */ + bool cell_zero; /* Row-store internal page 0th key */ + + /* + * We calculate checksums to find previously written identical blocks, + * but once a match fails during an eviction, there's no point trying + * again. 
+ */ + bool evict_matching_checksum_failed; /* * WT_DICTIONARY -- @@ -324,7 +329,7 @@ static int __rec_split_write( static int __rec_update_las( WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_MULTI *); static int __rec_write_check_complete( - WT_SESSION_IMPL *, WT_RECONCILE *, bool *); + WT_SESSION_IMPL *, WT_RECONCILE *, int, bool *); static void __rec_write_page_status(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( @@ -335,7 +340,8 @@ static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int); static int __rec_dictionary_lookup( WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **); static void __rec_dictionary_reset(WT_RECONCILE *); -static void __rec_verbose_lookaside_write(WT_SESSION_IMPL *); +static void __rec_verbose_lookaside_write( + WT_SESSION_IMPL *, uint32_t, uint64_t); /* * __wt_reconcile -- @@ -361,9 +367,21 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, __wt_verbose(session, WT_VERB_RECONCILE, "%p reconcile %s (%s%s%s)", (void *)ref, __wt_page_type_string(page->type), - LF_ISSET(WT_EVICTING) ? "evict" : "checkpoint", - LF_ISSET(WT_EVICT_LOOKASIDE) ? ", lookaside" : "", - LF_ISSET(WT_EVICT_UPDATE_RESTORE) ? ", update/restore" : ""); + LF_ISSET(WT_REC_EVICT) ? "evict" : "checkpoint", + LF_ISSET(WT_REC_LOOKASIDE) ? ", lookaside" : "", + LF_ISSET(WT_REC_UPDATE_RESTORE) ? ", update/restore" : ""); + + /* + * Sanity check flags. + * + * We can only do update/restore eviction when the version that ends up + * in the page image is the oldest one any reader could need. + * Otherwise we would need to keep updates in memory that go back older + * than the version in the disk image, and since modify operations + * aren't idempotent, that is problematic. + */ + WT_ASSERT(session, !LF_ISSET(WT_REC_UPDATE_RESTORE) || + LF_ISSET(WT_REC_VISIBLE_ALL)); /* We shouldn't get called with a clean page, that's an error. 
*/ WT_ASSERT(session, __wt_page_is_modified(page)); @@ -380,7 +398,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_LOCK(session, page); oldest_id = __wt_txn_oldest_id(session); - if (LF_ISSET(WT_EVICTING)) + if (LF_ISSET(WT_REC_EVICT)) mod->last_eviction_id = oldest_id; #ifdef HAVE_DIAGNOSTIC @@ -426,9 +444,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_ILLEGAL_VALUE_SET(session); } - /* Checks for a successful reconciliation. */ - if (ret == 0) - ret = __rec_write_check_complete(session, r, lookaside_retryp); + /* Check for a successful reconciliation. */ + WT_TRET(__rec_write_check_complete(session, r, ret, lookaside_retryp)); /* Wrap up the page reconciliation. */ if (ret == 0 && (ret = __rec_write_wrapup(session, r, page)) == 0) @@ -442,7 +459,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, /* Update statistics. */ WT_STAT_CONN_INCR(session, rec_pages); WT_STAT_DATA_INCR(session, rec_pages); - if (LF_ISSET(WT_EVICTING)) { + if (LF_ISSET(WT_REC_EVICT)) { WT_STAT_CONN_INCR(session, rec_pages_eviction); WT_STAT_DATA_INCR(session, rec_pages_eviction); } @@ -478,14 +495,16 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(session->block_manager_cleanup(session)); WT_TRET(__rec_destroy_session(session)); + } - /* - * We track removed overflow objects in case there's a reader - * in transit when they're removed. Any form of eviction locks - * out readers, we can discard them all. - */ + /* + * We track removed overflow objects in case there's a reader in + * transit when they're removed. Any form of eviction locks out + * readers, we can discard them all. + */ + if (LF_ISSET(WT_REC_EVICT)) __wt_ovfl_discard_remove(session, page); - } + WT_RET(ret); /* @@ -531,7 +550,7 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) * drain lookaside table reconciliations, and this isn't a problem for * most workloads. 
*/ - if (!F_ISSET(r, WT_EVICT_LOOKASIDE)) + if (!F_ISSET(r, WT_REC_LOOKASIDE)) return (false); if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) return (false); @@ -549,7 +568,7 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) */ static int __rec_write_check_complete( - WT_SESSION_IMPL *session, WT_RECONCILE *r, bool *lookaside_retryp) + WT_SESSION_IMPL *session, WT_RECONCILE *r, int tret, bool *lookaside_retryp) { /* * Tests in this function are lookaside tests and tests to decide if @@ -558,7 +577,7 @@ __rec_write_check_complete( * checks for in-memory eviction because a small cache can force us to * rewrite every possible page. */ - if (F_ISSET(r, WT_EVICT_IN_MEMORY)) + if (F_ISSET(r, WT_REC_IN_MEMORY)) return (0); /* @@ -569,17 +588,26 @@ __rec_write_check_complete( return (EBUSY); /* - * Eviction can configure lookaside table reconciliation, consider if - * it's worth giving up this reconciliation attempt and falling back to - * using the lookaside table. We continue with evict/restore if - * switching to the lookaside doesn't make sense for any reason: we - * won't retry an evict/restore reconciliation until/unless the - * transactional system moves forward, so at worst it's a single wasted - * effort. + * Fall back to lookaside eviction during checkpoints if a page can't + * be evicted. + */ + if (tret == EBUSY && lookaside_retryp != NULL && + !F_ISSET(r, WT_REC_UPDATE_RESTORE) && !r->update_uncommitted) + *lookaside_retryp = true; + + /* Don't continue if we have already given up. */ + WT_RET(tret); + + /* + * Check if this reconciliation attempt is making progress. If there's + * any sign of progress, don't fall back to the lookaside table. * - * First, check if the lookaside table is a possible alternative. + * Check if the current reconciliation split, in which case we'll + * likely get to write at least one of the blocks. 
If we've created a + * page image for a page that previously didn't have one, or we had a + * page image and it is now empty, that's also progress. */ - if (lookaside_retryp == NULL) + if (r->multi_next > 1) return (0); /* @@ -590,38 +618,20 @@ __rec_write_check_complete( * If no updates were saved, eviction will succeed without needing to * restore anything. */ - if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE) || r->supd == NULL) - return (0); - - /* - * Check if this reconciliation attempt is making progress. If there's - * any sign of progress, don't fall back to the lookaside table. - * - * Check if the current reconciliation split, in which case we'll likely - * get to write at least one of the blocks. - */ - if (r->multi_next > 1) + if (!F_ISSET(r, WT_REC_UPDATE_RESTORE) || lookaside_retryp == NULL || + (r->multi_next == 1 && r->multi->supd_entries == 0)) return (0); /* * Check if the current reconciliation applied some updates, in which * case evict/restore should gain us some space. - */ - if (r->update_mem_saved != r->update_mem_all) - return (0); - - /* + * * Check if lookaside eviction is possible. If any of the updates we - * saw were uncommitted, the lookaside table cannot be used: it only - * helps with older readers preventing eviction. + * saw were uncommitted, the lookaside table cannot be used. */ - if (r->update_mem_uncommitted != 0) + if (r->update_used || r->update_uncommitted) return (0); - /* - * The current evict/restore approach shows no signs of being useful, - * lookaside is possible, suggest the lookaside table. - */ *lookaside_retryp = true; return (EBUSY); } @@ -665,8 +675,8 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) * eviction path. 
*/ WT_ASSERT(session, - !F_ISSET(r, WT_EVICTING) || - F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); + !F_ISSET(r, WT_REC_EVICT) || + F_ISSET(r, WT_REC_UPDATE_RESTORE)); } else { /* * Track the page's maximum transaction ID (used to decide if @@ -685,7 +695,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) * about the maximum transaction ID of current updates in the * tree, and checkpoint visits every dirty page in the tree. */ - if (F_ISSET(r, WT_EVICTING)) { + if (!F_ISSET(r, WT_REC_EVICT)) { if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) btree->rec_max_txn = r->max_txn; #ifdef HAVE_TIMESTAMPS @@ -707,7 +717,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) __wt_cache_dirty_decr(session, page); else - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); } } @@ -903,49 +913,50 @@ __rec_init(WT_SESSION_IMPL *session, #endif /* + * When operating on the lookaside table, we should never try + * update/restore or lookaside eviction. + */ + WT_ASSERT(session, !F_ISSET(btree, WT_BTREE_LOOKASIDE) || + !LF_ISSET(WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE)); + + /* * Lookaside table eviction is configured when eviction gets aggressive, * adjust the flags for cases we don't support. + * + * We don't yet support fixed-length column-store combined with the + * lookaside table. It's not hard to do, but the underlying function + * that reviews which updates can be written to the evicted page and + * which updates need to be written to the lookaside table needs access + * to the original value from the page being evicted, and there's no + * code path for that in the case of fixed-length column-store objects. + * (Row-store and variable-width column-store objects provide a + * reference to the unpacked on-page cell for this purpose, but there + * isn't an on-page cell for fixed-length column-store objects.) For + * now, turn it off. 
*/ - if (LF_ISSET(WT_EVICT_LOOKASIDE)) { - /* - * Saving lookaside table updates into the lookaside table won't - * work. - */ - if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) - LF_CLR(WT_EVICT_LOOKASIDE); + if (page->type == WT_PAGE_COL_FIX) + LF_CLR(WT_REC_LOOKASIDE); - /* - * We don't yet support fixed-length column-store combined with - * the lookaside table. It's not hard to do, but the underlying - * function that reviews which updates can be written to the - * evicted page and which updates need to be written to the - * lookaside table needs access to the original value from the - * page being evicted, and there's no code path for that in the - * case of fixed-length column-store objects. (Row-store and - * variable-width column-store objects provide a reference to - * the unpacked on-page cell for this purpose, but there isn't - * an on-page cell for fixed-length column-store objects.) For - * now, turn it off. - */ - if (page->type == WT_PAGE_COL_FIX) - LF_CLR(WT_EVICT_LOOKASIDE); + /* + * Check for a lookaside table and checkpoint collision, and if we find + * one, turn off the lookaside file (we've gone to all the effort of + * getting exclusive access to the page, might as well try and evict + * it). + */ + if (LF_ISSET(WT_REC_LOOKASIDE) && __rec_las_checkpoint_test(session, r)) + LF_CLR(WT_REC_LOOKASIDE); - /* - * Check for a lookaside table and checkpoint collision, and if - * we find one, turn off the lookaside file (we've gone to all - * the effort of getting exclusive access to the page, might as - * well try and evict it). - */ - if (__rec_las_checkpoint_test(session, r)) - LF_CLR(WT_EVICT_LOOKASIDE); - } r->flags = flags; - /* Track the page's maximum transaction ID. */ + /* Track the page's min/maximum transaction */ r->max_txn = WT_TXN_NONE; +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set_zero(&r->max_timestamp); + __wt_timestamp_set_inf(&r->min_saved_timestamp); +#endif - /* Track if all updates were skipped. 
*/ - r->update_mem_all = r->update_mem_saved = r->update_mem_uncommitted = 0; + /* Track if updates were used and/or uncommitted. */ + r->update_used = r->update_uncommitted = false; /* Track if the page can be marked clean. */ r->leave_dirty = false; @@ -973,6 +984,8 @@ __rec_init(WT_SESSION_IMPL *session, r->wrapup_checkpoint = NULL; r->wrapup_checkpoint_compressed = false; + r->evict_matching_checksum_failed = false; + /* * Dictionary compression only writes repeated values once. We grow * the dictionary as necessary, always using the largest size we've @@ -1032,7 +1045,7 @@ __rec_init(WT_SESSION_IMPL *session, /* * __rec_cleanup -- * Clean up after a reconciliation run, except for structures cached - * across runs. + * across runs. */ static void __rec_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r) @@ -1113,19 +1126,13 @@ __rec_destroy_session(WT_SESSION_IMPL *session) */ static int __rec_update_save(WT_SESSION_IMPL *session, - WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *upd) + WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_UPDATE *onpage_upd) { WT_RET(__wt_realloc_def( session, &r->supd_allocated, r->supd_next + 1, &r->supd)); r->supd[r->supd_next].ins = ins; r->supd[r->supd_next].ripcip = ripcip; - r->supd[r->supd_next].onpage_txn = - upd == NULL ? WT_TXN_NONE : upd->txnid; -#ifdef HAVE_TIMESTAMPS - if (upd != NULL) - __wt_timestamp_set( - &r->supd[r->supd_next].onpage_timestamp, &upd->timestamp); -#endif + r->supd[r->supd_next].onpage_upd = onpage_upd; ++r->supd_next; return (0); } @@ -1136,7 +1143,7 @@ __rec_update_save(WT_SESSION_IMPL *session, */ static int __rec_append_orig_value(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) + WT_PAGE *page, WT_UPDATE *first_upd, WT_CELL_UNPACK *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; @@ -1147,7 +1154,7 @@ __rec_append_orig_value(WT_SESSION_IMPL *session, * If at least one self-contained update is globally visible, we're * done. 
*/ - for (upd = upd_list; upd != NULL; upd = upd->next) + for (upd = first_upd; upd != NULL; upd = upd->next) if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) return (0); @@ -1180,7 +1187,7 @@ __rec_append_orig_value(WT_SESSION_IMPL *session, * * Append the new entry to the update list. */ - for (upd = upd_list; upd->next != NULL; upd = upd->next) + for (upd = first_upd; upd->next != NULL; upd = upd->next) ; WT_PUBLISH(upd->next, append); __wt_cache_page_inmem_incr(session, page, size); @@ -1192,138 +1199,114 @@ err: __wt_scr_free(session, &tmp); /* * __rec_txn_read -- * Return the update in a list that should be written (or NULL if none can - * be written). + * be written). */ static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { WT_BTREE *btree; - WT_DECL_TIMESTAMP(max_timestamp) WT_PAGE *page; - WT_UPDATE *upd, *upd_list; - size_t update_mem; + WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; + wt_timestamp_t *timestampp; uint64_t max_txn, txnid; - bool skipped; + bool all_visible, uncommitted; *updp = NULL; btree = S2BT(session); page = r->page; + first_ts_upd = first_txn_upd = NULL; + max_txn = WT_TXN_NONE; + uncommitted = false; /* * If called with a WT_INSERT item, use its WT_UPDATE list (which must * exist), otherwise check for an on-page row-store WT_UPDATE list * (which may not exist). Return immediately if the item has no updates. 
*/ - if (ins == NULL) { - if ((upd_list = WT_ROW_UPDATE(page, ripcip)) == NULL) - return (0); - } else - upd_list = ins->upd; + if (ins != NULL) + first_upd = ins->upd; + else if ((first_upd = WT_ROW_UPDATE(page, ripcip)) == NULL) + return (0); - skipped = false; - update_mem = 0; - max_txn = WT_TXN_NONE; -#ifdef HAVE_TIMESTAMPS - __wt_timestamp_set_zero(&max_timestamp); -#endif + for (upd = first_upd; upd != NULL; upd = upd->next) { + if ((txnid = upd->txnid) == WT_TXN_ABORTED) + continue; - if (F_ISSET(r, WT_EVICTING)) { - /* Discard obsolete updates. */ - if ((upd = __wt_update_obsolete_check( - session, page, upd_list->next)) != NULL) - __wt_update_obsolete_free(session, page, upd); + /* + * Track the first update in the chain that is not aborted and + * the maximum transaction ID. + */ + if (first_txn_upd == NULL) + first_txn_upd = upd; - for (upd = upd_list; upd != NULL; upd = upd->next) { - /* Track the total memory in the update chain. */ - update_mem += WT_UPDATE_MEMSIZE(upd); + /* Track the largest transaction ID seen. */ + if (WT_TXNID_LT(max_txn, txnid)) + max_txn = txnid; - if ((txnid = upd->txnid) == WT_TXN_ABORTED) - continue; + /* + * Check whether the update was committed before reconciliation + * started. The global commit point can move forward during + * reconciliation so we use a cached copy to avoid races when a + * concurrent transaction commits or rolls back while we are + * examining its updates. + */ + if (WT_TXNID_LE(r->last_running, txnid)) + uncommitted = r->update_uncommitted = true; - /* - * Track the largest/smallest transaction IDs on the - * list. - */ - if (WT_TXNID_LT(max_txn, txnid)) - max_txn = txnid; + /* + * Find the first update we can use. + * + * Update/restore eviction can handle any update (including + * uncommitted updates). Lookaside eviction can save any + * committed update. Regular eviction checks that the maximum + * transaction ID and timestamp seen are stable. 
+ * + * Use the first committed entry we find in the lookaside + * table. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE) && !uncommitted) { + *updp = upd; + break; + } + if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? + !__wt_txn_upd_visible_all(session, upd) : + !__wt_txn_upd_visible(session, upd)) { /* - * Find the first update we can use. - * - * Check whether the update was committed before - * reconciliation started. The global commit point can - * move forward during reconciliation so we use a - * cached copy to avoid races when a concurrent - * transaction commits or rolls back while we are - * examining its updates. - * - * Lookaside eviction can cope with any committed - * update. Other eviction modes check that the maximum - * transaction ID and timestamp seen are stable. - * - * When reconciling for eviction, track whether any - * uncommitted updates are found. + * Rare case: when applications run at low isolation + * levels, update/restore eviction may see a stable + * update followed by an uncommitted update. Give up + * in that case: we need to discard updates from the + * stable update and older for correctness and we can't + * discard an uncommitted update. */ - if (WT_TXNID_LE(r->last_running, txnid)) { - skipped = true; - continue; - } - - if (*updp == NULL) - *updp = upd; + if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && + *updp != NULL && uncommitted) + return (EBUSY); -#ifdef HAVE_TIMESTAMPS - /* Track min/max timestamps. */ - if (__wt_timestamp_cmp( - &upd->timestamp, &max_timestamp) > 0) - __wt_timestamp_set( - &max_timestamp, &upd->timestamp); -#endif + continue; } - } else - for (upd = upd_list; upd != NULL; upd = upd->next) { - if ((txnid = upd->txnid) == WT_TXN_ABORTED) - continue; - /* Track the largest transaction ID on the list. */ - if (WT_TXNID_LT(max_txn, txnid)) - max_txn = txnid; + if (*updp == NULL) + *updp = upd; - /* - * Find the first update we can use. - * - * Checkpoint can only write updates visible as of its - * snapshot. 
- * - * When reconciling for a checkpoint, track whether any - * updates were skipped on the way to finding the first - * visible update. - */ - if (*updp == NULL) { - if (__wt_txn_upd_visible(session, upd)) - *updp = upd; - else - skipped = true; - } - } +#ifdef HAVE_TIMESTAMPS + /* Track the first update with non-zero timestamp. */ + if (first_ts_upd == NULL && + !__wt_timestamp_iszero(&upd->timestamp)) + first_ts_upd = upd; +#endif + } /* Reconciliation should never see an aborted or reserved update. */ WT_ASSERT(session, *updp == NULL || ((*updp)->txnid != WT_TXN_ABORTED && (*updp)->type != WT_UPDATE_RESERVED)); - r->update_mem_all += update_mem; - - /* - * If all of the updates were aborted, quit. This test is not strictly - * necessary because the above loop exits with skipped not set and the - * maximum transaction left at its initial value of WT_TXN_NONE, so - * the test below will be branch true and return, but it's cheap and a - * little more explicit, and makes Coverity happy. - */ - if (max_txn == WT_TXN_NONE) + /* If all of the updates were aborted, quit. */ + if (first_txn_upd == NULL) return (0); /* @@ -1334,140 +1317,104 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (WT_TXNID_LT(r->max_txn, max_txn)) r->max_txn = max_txn; + #ifdef HAVE_TIMESTAMPS - if (__wt_timestamp_cmp(&r->max_timestamp, &max_timestamp) < 0) - __wt_timestamp_set(&r->max_timestamp, &max_timestamp); + if (first_ts_upd != NULL && + __wt_timestamp_cmp(&r->max_timestamp, &first_ts_upd->timestamp) < 0) + __wt_timestamp_set(&r->max_timestamp, &first_ts_upd->timestamp); #endif /* - * If there are no skipped updates and all updates are globally visible, - * the page can be marked clean and we're done, regardless if evicting - * or checkpointing. - * - * We have to check both: the oldest transaction ID may have moved while - * we were scanning the update list, so it is possible to find a skipped - * update, but then find all updates are stable at the end of the scan. 
- * - * Skip the visibility check for the lookaside table as a special-case, - * we know there are no older readers of that table. + * The checkpoint transaction is special. Make sure we never write + * (metadata) updates from a checkpoint in a concurrent session. */ - if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || - __wt_txn_visible_all(session, - max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) { - /* - * The checkpoint transaction is special. Make sure we never - * write (metadata) updates from a checkpoint in a concurrent - * session. - */ - WT_ASSERT(session, *updp == NULL || - (*updp)->txnid != - S2C(session)->txn_global.checkpoint_state.id || - WT_SESSION_IS_CHECKPOINT(session)); + WT_ASSERT(session, *updp == NULL || (*updp)->txnid == WT_TXN_NONE || + (*updp)->txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_SESSION_IS_CHECKPOINT(session)); - goto check_original_value; - } + /* + * If there are no skipped updates, record that we're making progress. + */ + if (*updp == first_txn_upd) + r->update_used = true; /* - * In some cases, there had better not be skipped updates or updates not - * yet globally visible. + * Check if all updates on the page are visible. If not, it must stay + * dirty unless we are saving updates to the lookaside table. + * + * Updates can be out of transaction ID order (but not out of timestamp + * order), so we track the maximum transaction ID and the newest update + * with a timestamp (if any). */ - if (F_ISSET(r, WT_VISIBILITY_ERR)) +#ifdef HAVE_TIMESTAMPS + timestampp = first_ts_upd == NULL ? NULL : &first_ts_upd->timestamp; +#else + WT_UNUSED(first_ts_upd); + timestampp = NULL; +#endif + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) + all_visible = !uncommitted; + else + all_visible = *updp == first_txn_upd && + (F_ISSET(r, WT_REC_VISIBLE_ALL) ? 
+ __wt_txn_visible_all(session, max_txn, timestampp) : + __wt_txn_visible(session, max_txn, timestampp)); + + if (all_visible) + goto check_original_value; + + if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, - "reconciliation error, uncommitted update or update not " - "globally visible"); + "reconciliation error, update not visible"); + if (!F_ISSET(r, WT_REC_LOOKASIDE)) + r->leave_dirty = true; /* * If not trying to evict the page, we know what we'll write and we're - * done. Because some updates were skipped or are not globally visible, - * the page can't be marked clean. + * done. */ - if (!F_ISSET(r, WT_EVICTING)) { - r->leave_dirty = true; + if (!F_ISSET(r, WT_REC_EVICT)) goto check_original_value; - } /* - * Evicting with either uncommitted changes or not-yet-globally-visible - * changes. There are two ways to continue, the save/restore eviction - * path or the lookaside table eviction path. Both cannot be configured - * because the paths track different information. The save/restore path - * can handle both uncommitted and not-yet-globally-visible changes, by - * evicting most of the page and then creating a new, smaller page into - * which we re-instantiate those changes. The lookaside table path can - * only handle not-yet-globally-visible changes by writing those changes - * into the lookaside table and restoring them on demand if and when the - * page is read back into memory. + * We are attempting eviction with changes that are not yet stable + * (i.e. globally visible). There are two ways to continue, the + * save/restore eviction path or the lookaside table eviction path. + * Both cannot be configured because the paths track different + * information. The update/restore path can handle uncommitted changes, + * by evicting most of the page and then creating a new, smaller page + * to which we re-attach those changes. 
Lookaside eviction writes + * changes into the lookaside table and restores them on demand if and + * when the page is read back into memory. * * Both paths are configured outside of reconciliation: the save/restore - * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is - * the WT_EVICT_LOOKASIDE flag. + * path is the WT_REC_UPDATE_RESTORE flag, the lookaside table path is + * the WT_REC_LOOKASIDE flag. */ - if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE)) + if (!F_ISSET(r, WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE) && + !F_ISSET(btree, WT_BTREE_LOOKASIDE)) return (EBUSY); - if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + if (uncommitted && !F_ISSET(r, WT_REC_UPDATE_RESTORE)) return (EBUSY); /* - * Track the memory required by the update chain. - * - * A page with no uncommitted (skipped) updates, that can't be evicted - * because some updates aren't yet globally visible, can be evicted by - * writing previous versions of the updates to the lookaside file. That - * test is just checking if the skipped updates memory is zero. - * - * If that's not possible (there are skipped updates), we can rewrite - * the pages in-memory, but we don't want to unless there's memory to - * recover. That test is comparing the memory we'd recover to the memory - * we'd have to re-instantiate as part of the rewrite. + * The order of the updates on the list matters, we can't move only the + * unresolved updates, move the entire update list. */ - r->update_mem_saved += update_mem; - if (skipped) - r->update_mem_uncommitted += update_mem; + WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); #ifdef HAVE_TIMESTAMPS - /* - * Don't allow lookaside eviction with updates newer than the stable - * timestamp. Also don't recommend lookaside eviction in that case. 
- */ - if (__wt_timestamp_cmp(&max_timestamp, &r->stable_timestamp) > 0) { - if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) - return (EBUSY); - - if (!skipped) - r->update_mem_uncommitted += update_mem; + /* Track the oldest saved timestamp for lookaside. */ + if (F_ISSET(r, WT_REC_LOOKASIDE)) { + for (upd = first_upd; upd->next != NULL; upd = upd->next) + ; + if (__wt_timestamp_cmp( + &r->min_saved_timestamp, &upd->timestamp) > 0) + __wt_timestamp_set( + &r->min_saved_timestamp, &upd->timestamp); } #endif - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { - /* - * The save/restore eviction path. - * - * Clear the returned update, it's not needed. If there's an - * on-page key/value pair to which the update list applies, our - * caller writes it to the disk image. If an insert/append list, - * our caller can ignore the key/value pair (everything needed - * is in the update list), or in the case of row-store, write - * the key to the disk image to split up the insert/append list. - */ - *updp = NULL; - - /* The page can't be marked clean. */ - r->leave_dirty = true; - } - - /* - * The order of the updates on the list matters, we can't move only the - * unresolved updates, move the entire update list. - * - * If we skipped updates, the transaction value is never used. If we - * didn't skip updates, the list of updates are eventually written to - * the lookaside table, and associated with each update record is the - * transaction ID of the update we wrote in the reconciled page; once - * that transaction ID is globally visible, we know we no longer need - * the lookaside table records, allowing them to be discarded. - */ - WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); - check_original_value: /* * Returning an update means the original on-page value might be lost, @@ -1477,10 +1424,11 @@ check_original_value: * record that will be physically removed once it's no longer needed. 
*/ if (*updp != NULL && - (F_ISSET(r, WT_EVICT_LOOKASIDE) || - (vpack != NULL && + (F_ISSET(r, WT_REC_LOOKASIDE) || + (*updp != NULL && vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) - WT_RET(__rec_append_orig_value(session, page, *updp, vpack)); + WT_RET( + __rec_append_orig_value(session, page, first_upd, vpack)); return (0); } @@ -1488,7 +1436,7 @@ check_original_value: /* * WT_CHILD_RELEASE, WT_CHILD_RELEASE_ERR -- * Macros to clean up during internal-page reconciliation, releasing the - * hazard pointer we're holding on child pages. + * hazard pointer we're holding on child pages. */ #define WT_CHILD_RELEASE(session, hazard, ref) do { \ if (hazard) { \ @@ -1534,7 +1482,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * * In some cases, there had better not be any updates we can't see. */ - if (F_ISSET(r, WT_VISIBILITY_ERR) && page_del != NULL && + if (F_ISSET(r, WT_REC_VISIBILITY_ERR) && page_del != NULL && !__wt_txn_visible(session, page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))) WT_PANIC_RET(session, EINVAL, @@ -1600,7 +1548,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, * if subsequently read (we wouldn't know which transactions should see * the original page and which should see the deleted page). */ - if (F_ISSET(r, WT_EVICTING)) + if (F_ISSET(r, WT_REC_EVICT)) return (EBUSY); /* @@ -1683,10 +1631,9 @@ __rec_child_modify(WT_SESSION_IMPL *session, * pages in an evicted page's subtree fails the eviction * attempt. */ - if (F_ISSET(r, WT_EVICTING)) { - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) return (EBUSY); - } /* * If called during checkpoint, the child is being @@ -1700,6 +1647,20 @@ __rec_child_modify(WT_SESSION_IMPL *session, */ break; + case WT_REF_LOOKASIDE: + /* + * On disk, with lookaside updates. 
+ * + * We should never be here during eviction, active + * child pages in an evicted page's subtree fails the + * eviction attempt. + */ + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) + return (EBUSY); + + goto done; + case WT_REF_MEM: /* * In memory. @@ -1708,10 +1669,9 @@ __rec_child_modify(WT_SESSION_IMPL *session, * pages in an evicted page's subtree fails the eviction * attempt. */ - if (F_ISSET(r, WT_EVICTING)) { - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) return (EBUSY); - } /* * If called during checkpoint, acquire a hazard pointer @@ -1739,10 +1699,9 @@ __rec_child_modify(WT_SESSION_IMPL *session, * pages in an evicted page's subtree fails the eviction * attempt. */ - if (F_ISSET(r, WT_EVICTING)) { - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + WT_ASSERT(session, !F_ISSET(r, WT_REC_EVICT)); + if (F_ISSET(r, WT_REC_EVICT)) return (EBUSY); - } goto done; case WT_REF_SPLIT: @@ -2073,7 +2032,8 @@ __rec_split_page_size_from_pct( /* * __wt_split_page_size -- * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. + * time a new entry is added, so we split to a smaller-than-maximum page + * size. */ uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) @@ -2396,7 +2356,7 @@ __rec_split_row_promote( * the last key and smaller than the current key. */ max = r->last; - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + if (F_ISSET(r, WT_REC_UPDATE_RESTORE)) for (i = r->supd_next; i > 0; --i) { supd = &r->supd[i - 1]; if (supd->ins == NULL) @@ -2484,7 +2444,7 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) /* * __rec_split -- * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper" - * has 3 doubled letters in a row? Sweet-tooth does, too.) + * has 3 doubled letters in a row? Sweet-tooth does, too.) 
*/ static int __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) @@ -3157,27 +3117,13 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * We may arrive here with no entries to write if the page was entirely * empty or if nothing on the page was visible to us. + * + * Pages with skipped or not-yet-globally visible updates aren't really + * empty; otherwise, the page is truly empty and we will merge it into + * its parent during the parent's reconciliation. */ - if (r->entries == 0) { - /* - * Pages with skipped or not-yet-globally visible updates aren't - * really empty; otherwise, the page is truly empty and we will - * merge it into its parent during the parent's reconciliation. - */ - if (r->supd_next == 0) - return (0); - - /* - * If using the save/restore eviction path, continue with the - * write, the page will be restored after we finish. - * - * If using the lookaside table eviction path, we can't continue - * (we need a page to be written, otherwise we won't ever find - * the updates for future reads). - */ - if (F_ISSET(r, WT_EVICT_LOOKASIDE)) - return (EBUSY); - } + if (r->entries == 0 && r->supd_next == 0) + return (0); /* Set the number of entries and size for the just finished chunk. */ r->cur_ptr->image.size = @@ -3195,7 +3141,7 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * __rec_supd_move -- * Move a saved WT_UPDATE list from the per-page cache to a specific - * block's list. + * block's list. */ static int __rec_supd_move( @@ -3214,7 +3160,7 @@ __rec_supd_move( /* * __rec_split_write_supd -- * Check if we've saved updates that belong to this block, and move any - * to the per-block structure. + * to the per-block structure. */ static int __rec_split_write_supd(WT_SESSION_IMPL *session, @@ -3329,7 +3275,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session, * and we found updates that weren't globally visible when reconciling * this page. 
*/ - if (F_ISSET(r, WT_EVICT_LOOKASIDE) && multi->supd != NULL) { + if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) { F_SET(dsk, WT_PAGE_LAS_UPDATE); r->cache_write_lookaside = true; } @@ -3345,6 +3291,91 @@ __rec_split_write_header(WT_SESSION_IMPL *session, } /* + * __rec_split_write_reuse -- + * Check if a previously written block can be reused. + */ +static bool +__rec_split_write_reuse(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_MULTI *multi, WT_ITEM *image, bool last_block) +{ + WT_MULTI *multi_match; + WT_PAGE_MODIFY *mod; + + mod = r->page->modify; + + /* + * Don't bother calculating checksums for bulk loads, there's no reason + * to believe they'll be useful. Check because LSM does bulk-loads as + * part of normal operations and the check is cheap. + */ + if (r->is_bulk_load) + return (false); + + /* + * Calculating the checksum is the expensive part, try to avoid it. + * + * Ignore the last block of any reconciliation. Pages are written in the + * same block order every time, so the last block written for a page is + * unlikely to match any previously written block or block written in + * the future, (absent a point-update earlier in the page which didn't + * change the size of the on-page object in any way). + */ + if (last_block) + return (false); + + /* + * Quit if evicting with no previously written block to compare against. + * (In other words, if there's eviction pressure and the page was never + * written by a checkpoint, calculating a checksum is worthless.) + * + * Quit if evicting and a previous check failed, once there's a miss no + * future block will match. + */ + if (F_ISSET(r, WT_REC_EVICT)) { + if (mod->rec_result != WT_PM_REC_MULTIBLOCK || + mod->mod_multi_entries < r->multi_next) + return (false); + if (r->evict_matching_checksum_failed) + return (false); + } + + /* Calculate the checksum for this block. 
*/ + multi->checksum = __wt_checksum(image->data, image->size); + + /* + * Don't check for a block match when writing blocks during compaction, + * the whole idea is to move those blocks. Check after calculating the + * checksum, we don't distinguish between pages written solely as part + * of the compaction and pages written at around the same time, and so + * there's a possibility the calculated checksum will be useful in the + * future. + */ + if (session->compact_state != WT_COMPACT_NONE) + return (false); + + /* + * Pages are written in the same block order every time, only check the + * appropriate slot. + */ + if (mod->rec_result != WT_PM_REC_MULTIBLOCK || + mod->mod_multi_entries < r->multi_next) + return (false); + + multi_match = &mod->mod_multi[r->multi_next - 1]; + if (multi_match->size != multi->size || + multi_match->checksum != multi->checksum) { + r->evict_matching_checksum_failed = true; + return (false); + } + + multi_match->addr.reuse = 1; + multi->addr = multi_match->addr; + + WT_STAT_DATA_INCR(session, rec_page_match); + return (true); +} + +/* * __rec_split_write -- * Write a disk block out for the split helper functions. 
*/ @@ -3353,9 +3384,8 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_CHUNK *chunk, WT_ITEM *compressed_image, bool last_block) { WT_BTREE *btree; - WT_MULTI *multi, *multi_mod; + WT_MULTI *multi; WT_PAGE *page; - WT_PAGE_MODIFY *mod; size_t addr_size; uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; #ifdef HAVE_DIAGNOSTIC @@ -3364,7 +3394,6 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, btree = S2BT(session); page = r->page; - mod = page->modify; #ifdef HAVE_DIAGNOSTIC verify_image = true; #endif @@ -3422,7 +3451,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (last_block && r->multi_next == 1 && __rec_is_checkpoint(session, r)) { - WT_ASSERT(session, r->supd == NULL); + WT_ASSERT(session, r->supd_next == 0); if (compressed_image == NULL) r->wrapup_checkpoint = &chunk->image; @@ -3434,71 +3463,64 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, } /* - * If configured for an in-memory database, or using the save/restore - * eviction path and we had to skip updates in order to build this disk - * image, we can't actually write it. Instead, we will re-instantiate - * the page using the disk image and any list of updates we skipped. + * If configured for an in-memory database, we can't actually write it. + * Instead, we will re-instantiate the page using the disk image and + * any list of updates we skipped. */ - if (F_ISSET(r, WT_EVICT_IN_MEMORY)) - goto copy_image; - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && multi->supd != NULL) { - r->cache_write_restore = true; + if (F_ISSET(r, WT_REC_IN_MEMORY)) goto copy_image; - } /* - * If we wrote this block before, re-use it. Pages get written in the - * same block order every time, only check the appropriate slot. The - * expensive part of this test is the checksum, only do that work when - * there has been or will be a reconciliation of this page involving - * split pages. 
This test isn't perfect: we're doing a checksum if a - * previous reconciliation of the page split or if we will split this - * time, but that test won't calculate a checksum on the first block - * the first time the page splits. + * If there are saved updates, we are either doing update/restore + * eviction or lookaside eviction. Update/restore never writes the + * disk image. + * + * Lookaside does write disk images, but also needs to cope with the + * case where no updates could be written, which means there are no + * entries in the page image to write. */ - if (r->multi_next > 1 || - (mod->rec_result == WT_PM_REC_MULTIBLOCK && - mod->mod_multi != NULL)) { - multi->checksum = - __wt_checksum(chunk->image.data, chunk->image.size); - + if (multi->supd != NULL && + (F_ISSET(r, WT_REC_UPDATE_RESTORE) || chunk->entries == 0)) { /* - * One last check: don't reuse blocks if compacting, the reason - * for compaction is to move blocks to different locations. We - * do this check after calculating the checksums, hopefully the - * next write can be skipped. + * If no entries were used, the page is empty and we can only + * restore updates against an empty row store leaf page. + * (Column store modify will attempt to allocate a zero-length + * array). */ - if (session->compact_state == WT_COMPACT_NONE && - mod->rec_result == WT_PM_REC_MULTIBLOCK && - mod->mod_multi_entries > r->multi_next) { - multi_mod = &mod->mod_multi[r->multi_next - 1]; - if (multi_mod->size == multi->size && - multi_mod->checksum == multi->checksum) { - multi_mod->addr.reuse = 1; - multi->addr = multi_mod->addr; - - WT_STAT_DATA_INCR(session, rec_page_match); - goto copy_image; - } - } + if (r->page->type != WT_PAGE_ROW_LEAF && + chunk->entries == 0 && multi->supd != NULL) + return (EBUSY); + + r->cache_write_restore = true; + goto update_las; } + /* + * If we wrote this block before, re-use it. Prefer a checksum of the + * compressed image. It's an identical test and should be faster. 
+ */ + if (__rec_split_write_reuse(session, r, multi, + compressed_image == NULL ? &chunk->image : compressed_image, + last_block)) + goto copy_image; + WT_RET(__wt_bt_write(session, compressed_image == NULL ? &chunk->image : compressed_image, - addr, &addr_size, - false, F_ISSET(r, WT_CHECKPOINTING), compressed_image != NULL)); + addr, &addr_size, false, F_ISSET(r, WT_REC_CHECKPOINT), + compressed_image != NULL)); #ifdef HAVE_DIAGNOSTIC verify_image = false; #endif WT_RET(__wt_memdup(session, addr, addr_size, &multi->addr.addr)); multi->addr.size = (uint8_t)addr_size; +update_las: /* * If using the lookaside table eviction path and we found updates that * weren't globally visible when reconciling this page, copy them into * the database's lookaside store. */ - if (F_ISSET(r, WT_EVICT_LOOKASIDE) && multi->supd != NULL) + if (F_ISSET(r, WT_REC_LOOKASIDE) && multi->supd != NULL) WT_RET(__rec_update_las(session, r, btree->id, multi)); copy_image: @@ -3511,13 +3533,14 @@ copy_image: __wt_verify_dsk_image(session, "[reconcile-image]", chunk->image.data, 0, true) == 0); #endif + /* * If re-instantiating this page in memory (either because eviction * wants to, or because we skipped updates to build the disk image), * save a copy of the disk image. 
*/ - if (F_ISSET(r, WT_EVICT_SCRUB) || - (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && multi->supd != NULL)) + if (F_ISSET(r, WT_REC_SCRUB) || + (F_ISSET(r, WT_REC_UPDATE_RESTORE) && multi->supd != NULL)) WT_RET(__wt_memdup(session, chunk->image.data, chunk->image.size, &multi->disk_image)); @@ -3535,26 +3558,19 @@ __rec_update_las(WT_SESSION_IMPL *session, WT_CURSOR *cursor; WT_DECL_ITEM(key); WT_DECL_RET; - WT_ITEM las_addr, las_timestamp, las_value; + WT_ITEM las_timestamp, las_value; WT_PAGE *page; WT_SAVE_UPD *list; WT_UPDATE *upd; - uint64_t insert_cnt, las_counter; + uint64_t insert_cnt, las_counter, las_pageid; uint32_t i, session_flags, slot; uint8_t *p; cursor = NULL; - WT_CLEAR(las_addr); WT_CLEAR(las_timestamp); WT_CLEAR(las_value); page = r->page; - insert_cnt = 0; - - /* - * We're writing lookaside records: start instantiating them on pages - * we read (with the right flag set), and start sweeping the file. - */ - __wt_las_set_written(session); + insert_cnt = las_pageid = 0; __wt_las_cursor(session, &cursor, &session_flags); @@ -3562,29 +3578,20 @@ __rec_update_las(WT_SESSION_IMPL *session, WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); /* - * Each key in the lookaside table is associated with a block, and those - * blocks are freed and reallocated to other pages as pages in the tree - * are modified and reconciled. We want to be sure we don't add records - * to the lookaside table, then discard the block to which they apply, - * then write a new block to the same address, and then apply the old - * records to the new block when it's read. We don't want to clean old - * records out of the lookaside table every time we free a block because - * that happens a lot and would be costly; instead, we clean out the old - * records when adding new records into the lookaside table. 
This works - * because we only read from the lookaside table for pages marked with - * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a - * block with no lookaside records, so the lookaside table won't be - * checked when the block is read, even if there are lookaside table - * records matching that block. If we rewrite a block that has lookaside - * records, we'll run this code, discarding any old records that might - * exist. - */ - WT_ERR(__wt_las_remove_block( - session, cursor, btree_id, multi->addr.addr, multi->addr.size)); - - /* Lookaside table key component: block address. */ - las_addr.data = multi->addr.addr; - las_addr.size = multi->addr.size; + * Each key in the lookaside table is associated with a unique + * identifier, allocated sequentially per tree. + */ + las_pageid = multi->las_pageid = + __wt_atomic_add64(&S2BT(session)->las_pageid, 1); + + /* The zero page ID is reserved, check we don't see it. */ + WT_ASSERT(session, las_pageid != 0); + + /* + * Make sure there are no left over entries (e.g., from a handle + * reopen). + */ + WT_ERR(__wt_las_remove_block(session, cursor, btree_id, las_pageid)); /* Enter each update in the boundary's list into the lookaside store. 
*/ for (las_counter = 0, i = 0, @@ -3654,13 +3661,8 @@ __rec_update_las(WT_SESSION_IMPL *session, continue; } -#ifdef HAVE_TIMESTAMPS - las_timestamp.data = &list->onpage_timestamp; - las_timestamp.size = WT_TIMESTAMP_SIZE; -#endif cursor->set_key(cursor, - btree_id, &las_addr, ++las_counter, - list->onpage_txn, &las_timestamp, key); + btree_id, las_pageid, ++las_counter, key); #ifdef HAVE_TIMESTAMPS las_timestamp.data = &upd->timestamp; @@ -3680,9 +3682,9 @@ __rec_update_las(WT_SESSION_IMPL *session, err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); if (insert_cnt > 0) { - (void)__wt_atomic_add64( - &S2C(session)->las_record_cnt, insert_cnt); - __rec_verbose_lookaside_write(session); + WT_STAT_CONN_INCRV( + session, cache_lookaside_entries, insert_cnt); + __rec_verbose_lookaside_write(session, btree_id, las_pageid); } __wt_scr_free(session, &key); @@ -4368,7 +4370,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, /* * __rec_col_var_helper -- * Create a column-store variable length record cell and write it onto a - * page. + * page. */ static int __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, @@ -4634,7 +4636,7 @@ record_loop: /* * Assert the case. */ WT_ASSERT(session, - F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); + F_ISSET(r, WT_REC_UPDATE_RESTORE)); /* * The on-page value will never be accessed, @@ -4776,7 +4778,7 @@ compare: /* if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM) WT_ERR(__wt_ovfl_remove( - session, page, vpack, !F_ISSET(r, WT_EVICTING))); + session, page, vpack, F_ISSET(r, WT_REC_EVICT))); } /* Walk any append list. */ @@ -5356,7 +5358,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * Assert the case. 
*/ WT_ASSERT(session, - F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); + F_ISSET(r, WT_REC_UPDATE_RESTORE)); /* * If the key is also a removed overflow item, @@ -5404,7 +5406,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, if (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) WT_ERR(__wt_ovfl_remove(session, - page, vpack, !F_ISSET(r, WT_EVICTING))); + page, vpack, F_ISSET(r, WT_REC_EVICT))); switch (upd->type) { case WT_UPDATE_DELETED: @@ -5632,12 +5634,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + if (upd == NULL) { /* * Look for an update. If nothing is visible and not in * evict/restore, there's no work to do. */ - if (!F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + if (!F_ISSET(r, WT_REC_UPDATE_RESTORE)) continue; /* @@ -5679,8 +5682,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (upd->size == 0) val->len = 0; else - WT_RET(__rec_cell_build_val( - session, r, upd->data, upd->size, + WT_RET(__rec_cell_build_val(session, + r, upd->data, upd->size, (uint64_t)0)); break; WT_ILLEGAL_VALUE(session); @@ -5945,9 +5948,9 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * in memory because the latter can't handle update lists and * splits can. 
*/ - if (F_ISSET(r, WT_EVICT_IN_MEMORY) || - (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && - r->multi->supd != NULL)) + if (F_ISSET(r, WT_REC_IN_MEMORY) || + (F_ISSET(r, WT_REC_UPDATE_RESTORE) && + r->multi->supd_entries != 0)) goto split; /* @@ -5959,9 +5962,15 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->multi->addr.addr = NULL; mod->mod_disk_image = r->multi->disk_image; r->multi->disk_image = NULL; + mod->mod_replace_las_pageid = r->multi->las_pageid; +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set(&mod->mod_replace_las_min_timestamp, + &r->min_saved_timestamp); +#endif + r->multi->las_pageid = 0; } else WT_RET(__wt_bt_write(session, r->wrapup_checkpoint, - NULL, NULL, true, F_ISSET(r, WT_CHECKPOINTING), + NULL, NULL, true, F_ISSET(r, WT_REC_CHECKPOINT), r->wrapup_checkpoint_compressed)); mod->rec_result = WT_PM_REC_REPLACE; @@ -6037,14 +6046,13 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) } WT_TRET(__wt_ovfl_track_wrapup_err(session, page)); - return (ret); } /* * __rec_cell_build_int_key -- * Process a key and return a WT_CELL structure and byte string to be - * stored on a row-store internal page. + * stored on a row-store internal page. */ static int __rec_cell_build_int_key(WT_SESSION_IMPL *session, @@ -6081,7 +6089,7 @@ __rec_cell_build_int_key(WT_SESSION_IMPL *session, /* * __rec_cell_build_leaf_key -- * Process a key and return a WT_CELL structure and byte string to be - * stored on a row-store leaf page. + * stored on a row-store leaf page. */ static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *session, @@ -6184,7 +6192,7 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session, /* * __rec_cell_build_addr -- * Process an address reference and return a cell structure to be stored - * on the page. + * on the page. 
*/ static void __rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, @@ -6219,7 +6227,7 @@ __rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* * __rec_cell_build_val -- * Process a data item and return a WT_CELL structure and byte string to - * be stored on the page. + * be stored on the page. */ static int __rec_cell_build_val(WT_SESSION_IMPL *session, @@ -6311,7 +6319,7 @@ __rec_cell_build_ovfl(WT_SESSION_IMPL *session, /* Write the buffer. */ addr = buf; WT_ERR(__wt_bt_write(session, tmp, - addr, &size, false, F_ISSET(r, WT_CHECKPOINTING), false)); + addr, &size, false, F_ISSET(r, WT_REC_CHECKPOINT), false)); /* * Track the overflow record (unless it's a bulk load, which @@ -6460,7 +6468,7 @@ __rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* * __rec_dictionary_reset -- * Reset the dictionary when reconciliation restarts and when crossing a - * page boundary (a potential split). + * page boundary (a potential split). */ static void __rec_dictionary_reset(WT_RECONCILE *r) @@ -6527,10 +6535,11 @@ __rec_dictionary_lookup( /* * __rec_verbose_lookaside_write -- * Create a verbose message to display once per checkpoint with details - * about the cache state when performing a lookaside table write. + * about the cache state when performing a lookaside table write. */ static void -__rec_verbose_lookaside_write(WT_SESSION_IMPL *session) +__rec_verbose_lookaside_write( + WT_SESSION_IMPL *session, uint32_t las_id, uint64_t las_pageid) { #ifdef HAVE_VERBOSE WT_CONNECTION_IMPL *conn; @@ -6560,14 +6569,19 @@ __rec_verbose_lookaside_write(WT_SESSION_IMPL *session) (void)__wt_eviction_dirty_needed(session, &pct_dirty); __wt_verbose(session, WT_VERB_LOOKASIDE, - "Page reconciliation triggered lookaside write. " - "Entries now in lookaside file: %" PRIu64 ", " + "Page reconciliation triggered lookaside write" + "file ID %" PRIu32 ", page ID %" PRIu64 ". 
" + "Entries now in lookaside file: %" PRId64 ", " "cache dirty: %" PRIu32 "%% , " "cache use: %" PRIu32 "%%", - conn->las_record_cnt, pct_dirty, pct_full); + las_id, las_pageid, + WT_STAT_READ(conn->stats, cache_lookaside_entries), + pct_dirty, pct_full); } } #else WT_UNUSED(session); + WT_UNUSED(las_id); + WT_UNUSED(las_pageid); #endif } diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index d3540cb1dab..cc32766c9dc 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1992,11 +1992,14 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, F_SET(session, session_flags | WT_SESSION_INTERNAL); /* + * Optionally acquire a lookaside table cursor (or clear caller's flag). * Acquiring the lookaside table cursor requires various locks; we've * seen problems in the past where deadlocks happened because sessions * deadlocked getting the cursor late in the process. Be defensive, * get it now. 
*/ + if (!F_ISSET(conn, WT_CONN_LAS_OPEN)) + F_CLR(session, WT_SESSION_LOOKASIDE_CURSOR); if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR) && (ret = __wt_las_cursor_open(session, &session->las_cursor)) != 0) { wt_session = &session->iface; diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c index e0b1b6de1ea..58730b1505b 100644 --- a/src/third_party/wiredtiger/src/support/hex.c +++ b/src/third_party/wiredtiger/src/support/hex.c @@ -116,6 +116,12 @@ __wt_hex2byte(const u_char *from, u_char *to) case '7': byte = 7 << 4; break; case '8': byte = 8 << 4; break; case '9': byte = 9 << 4; break; + case 'A': byte = 10 << 4; break; + case 'B': byte = 11 << 4; break; + case 'C': byte = 12 << 4; break; + case 'D': byte = 13 << 4; break; + case 'E': byte = 14 << 4; break; + case 'F': byte = 15 << 4; break; case 'a': byte = 10 << 4; break; case 'b': byte = 11 << 4; break; case 'c': byte = 12 << 4; break; @@ -137,6 +143,12 @@ __wt_hex2byte(const u_char *from, u_char *to) case '7': byte |= 7; break; case '8': byte |= 8; break; case '9': byte |= 9; break; + case 'A': byte |= 10; break; + case 'B': byte |= 11; break; + case 'C': byte |= 12; break; + case 'D': byte |= 13; break; + case 'E': byte |= 14; break; + case 'F': byte |= 15; break; case 'a': byte |= 10; break; case 'b': byte |= 11; break; case 'c': byte |= 12; break; diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 05b653a8c77..57dcd33c7f1 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -809,6 +809,7 @@ static const char * const __stats_connection_desc[] = { "cache: internal pages evicted", "cache: internal pages split during eviction", "cache: leaf pages split during eviction", + "cache: lookaside table entries", "cache: lookaside table insert calls", "cache: lookaside table remove calls", "cache: maximum bytes configured", @@ -1138,6 +1139,7 
@@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_internal = 0; stats->cache_eviction_split_internal = 0; stats->cache_eviction_split_leaf = 0; + /* not clearing cache_lookaside_entries */ stats->cache_lookaside_insert = 0; stats->cache_lookaside_remove = 0; /* not clearing cache_bytes_max */ @@ -1488,6 +1490,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, cache_eviction_split_internal); to->cache_eviction_split_leaf += WT_STAT_READ(from, cache_eviction_split_leaf); + to->cache_lookaside_entries += + WT_STAT_READ(from, cache_lookaside_entries); to->cache_lookaside_insert += WT_STAT_READ(from, cache_lookaside_insert); to->cache_lookaside_remove += diff --git a/src/third_party/wiredtiger/src/support/thread_group.c b/src/third_party/wiredtiger/src/support/thread_group.c index 59caaedf5cf..f5842bea572 100644 --- a/src/third_party/wiredtiger/src/support/thread_group.c +++ b/src/third_party/wiredtiger/src/support/thread_group.c @@ -141,7 +141,6 @@ __thread_group_resize( conn = S2C(session); thread = NULL; - session_flags = 0; __wt_verbose(session, WT_VERB_THREAD_GROUP, "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32 @@ -187,9 +186,10 @@ __thread_group_resize( * started during recovery, before the lookaside table is * created. 
*/ + session_flags = 0; if (LF_ISSET(WT_THREAD_CAN_WAIT)) - session_flags = WT_SESSION_CAN_WAIT; - if (F_ISSET(conn, WT_CONN_LAS_OPEN)) + FLD_SET(session_flags, WT_SESSION_CAN_WAIT); + if (LF_ISSET(WT_THREAD_LOOKASIDE)) FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR); WT_ERR(__wt_open_internal_session(conn, group->name, false, session_flags, &thread->session)); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index ea5cd3390e2..c5c514c008b 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -593,6 +593,21 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) #endif } +#ifdef HAVE_TIMESTAMPS + /* + * Debugging checks on timestamps, if user requested them. + */ + if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && + !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_ERR_MSG(session, EINVAL, "commit_timestamp required and " + "none set on this transaction"); + if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && + F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and " + "timestamp set on this transaction"); +#endif /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. 
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 9d5f0c1adc0..7d2bb62cdd1 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -289,7 +289,6 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) return (0); -#ifdef HAVE_DIAGNOSTIC /* * We may have raced between starting the checkpoint transaction and * some operation completing on the handle that updated the metadata @@ -301,32 +300,26 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) */ if (!WT_IS_METADATA(session->dhandle)) { WT_CURSOR *meta_cursor; - bool metadata_race; WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); WT_RET(__wt_metadata_cursor(session, &meta_cursor)); meta_cursor->set_key(meta_cursor, session->dhandle->name); ret = __wt_curfile_insert_check(meta_cursor); if (ret == WT_ROLLBACK) { - metadata_race = true; /* - * Disable this check and assertion for now - it is - * possible that a schema operation with a timestamp in - * the future is in the metadata, but not part of the - * the checkpoint now that checkpoints can be created - * at the stable timestamp. - * See WT-3559 for context on re-adding this assertion. + * If create or drop or any schema operation of a table + * is with in an user transaction then checkpoint can + * see the dhandle before the commit, which will lead + * to the rollback error. We will ignore this dhandle as + * part of this checkpoint by returning from here. 
*/ -#if 0 - ret = 0; -#endif - } else - metadata_race = false; + WT_TRET(__wt_metadata_cursor_release(session, + &meta_cursor)); + return (0); + } WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); WT_RET(ret); - WT_ASSERT(session, !metadata_race); } -#endif /* * Decide whether the tree needs to be included in the checkpoint and diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 54634c03dfb..929aba30155 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -20,14 +20,15 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) WT_CURSOR *cursor; WT_DECL_RET; WT_DECL_TIMESTAMP(rollback_timestamp) - WT_ITEM las_addr, las_key, las_timestamp; + WT_ITEM las_key, las_timestamp, las_value; WT_TXN_GLOBAL *txn_global; - uint64_t las_counter, las_txnid, remove_cnt; + uint64_t las_counter, las_pageid, las_total, las_txnid; uint32_t las_id, session_flags; + uint8_t upd_type; conn = S2C(session); cursor = NULL; - remove_cnt = 0; + las_total = 0; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_CLEAR(las_timestamp); @@ -40,7 +41,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) txn_global = &conn->txn_global; WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, __wt_timestamp_set( - &rollback_timestamp, &txn_global->stable_timestamp)); + &rollback_timestamp, &txn_global->stable_timestamp)); __wt_las_cursor(session, &cursor, &session_flags); @@ -49,8 +50,8 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) /* Walk the file. 
*/ for (; (ret = cursor->next(cursor)) == 0; ) { - WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter, - &las_txnid, &las_timestamp, &las_key)); + WT_ERR(cursor->get_key(cursor, + &las_id, &las_pageid, &las_counter, &las_key)); /* Check the file ID so we can skip durable tables */ if (las_id >= conn->stable_rollback_maxfile) @@ -60,27 +61,23 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) if (__bit_test(conn->stable_rollback_bitstring, las_id)) continue; + WT_ERR(cursor->get_value(cursor, + &las_txnid, &las_timestamp, &upd_type, &las_value)); + /* * Entries with no timestamp will have a timestamp of zero, * which will fail the following check and cause them to never * be removed. */ if (__wt_timestamp_cmp( - &rollback_timestamp, las_timestamp.data) < 0) { + &rollback_timestamp, las_timestamp.data) < 0) WT_ERR(cursor->remove(cursor)); - ++remove_cnt; - } + else + ++las_total; } WT_ERR_NOTFOUND_OK(ret); err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - /* - * If there were races to remove records, we can over-count. Underflow - * isn't fatal, but check anyway so we don't skew low over time. - */ - if (remove_cnt > conn->las_record_cnt) - conn->las_record_cnt = 0; - else if (remove_cnt > 0) - (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt); + WT_STAT_CONN_SET(session, cache_lookaside_entries, las_total); F_CLR(session, WT_SESSION_NO_CACHE); @@ -303,6 +300,20 @@ __txn_rollback_to_stable_btree_walk( } /* + * __txn_rollback_eviction_drain -- + * Wait for eviction to drain from a tree. 
+ */ +static int +__txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_UNUSED(cfg); + + WT_RET(__wt_evict_file_exclusive_on(session)); + __wt_evict_file_exclusive_off(session); + return (0); +} + +/* * __txn_rollback_to_stable_btree -- * Called for each open handle - choose to either skip or wipe the commits */ @@ -422,7 +433,19 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) WT_DECL_RET; conn = S2C(session); - WT_RET(__txn_rollback_to_stable_check(session)); + + /* + * Mark that a rollback operation is in progress and wait for eviction + * to drain. This is necessary because lookaside eviction uses + * transactions and causes the check for a quiescent system to fail. + */ + F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + WT_ERR(__wt_conn_btree_apply(session, + NULL, __txn_rollback_eviction_drain, NULL, cfg)); + + WT_ERR(__txn_rollback_to_stable_check(session)); + + F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); /* * Allocate a non-durable btree bitstring. We increment the global @@ -430,7 +453,7 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) * hence we need to add one here. */ conn->stable_rollback_maxfile = conn->next_file_id + 1; - WT_RET(__bit_alloc(session, + WT_ERR(__bit_alloc(session, conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring)); WT_ERR(__wt_conn_btree_apply(session, NULL, __txn_rollback_to_stable_btree, NULL, cfg)); @@ -442,7 +465,9 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) * lookaside records should be removed. 
*/ WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session)); -err: __wt_free(session, conn->stable_rollback_bitstring); + +err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + __wt_free(session, conn->stable_rollback_bitstring); return (ret); #endif } diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 2182a3924a5..8f90afeb8b4 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -107,27 +107,37 @@ __wt_txn_parse_timestamp(WT_SESSION_IMPL *session, #if WT_TIMESTAMP_SIZE == 8 { - static const u_char hextable[] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 0, 0, 0, 0, 0, 0, - 0, 10, 11, 12, 13, 14, 15, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 10, 11, 12, 13, 14, 15 + static const int8_t hextable[] = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1 }; wt_timestamp_t ts; size_t len; - const char *hex; - - for (ts.val = 0, hex = cval->str, len = cval->len; len > 0; --len) - ts.val = (ts.val << 4) | hextable[(int)*hex++]; + int hex_val; + const char *hex_itr; + + for (ts.val = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) { + if ((size_t)*hex_itr < WT_ELEMENTS(hextable)) + hex_val = hextable[(size_t)*hex_itr++]; + else + hex_val = -1; + if (hex_val < 0) + WT_RET_MSG(session, EINVAL, + "Failed to parse %s timestamp '%.*s'", + name, (int)cval->len, 
cval->str); + ts.val = (ts.val << 4) | (uint64_t)hex_val; + } __wt_timestamp_set(timestamp, &ts); } #else diff --git a/src/third_party/wiredtiger/test/fops/file.c b/src/third_party/wiredtiger/test/fops/file.c index 60320ae3a38..118845ab805 100644 --- a/src/third_party/wiredtiger/test/fops/file.c +++ b/src/third_party/wiredtiger/test/fops/file.c @@ -39,6 +39,8 @@ obj_bulk(void) testutil_check(conn->open_session(conn, NULL, NULL, &session)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); if ((ret = session->create(session, uri, config)) != 0) if (ret != EEXIST && ret != EBUSY) testutil_die(ret, "session.create"); @@ -51,6 +53,21 @@ obj_bulk(void) } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL) testutil_die(ret, "session.open_cursor bulk"); } + + if (use_txn) { + /* + * As the operations are being performed concurrently, + * return value can be ENOENT, EBUSY or EINVAL will set + * error to transaction opened by session. In these + * cases the transaction has to be aborted. 
+ */ + if (ret != ENOENT && ret != EBUSY && ret != EINVAL) + ret = session->commit_transaction(session, NULL); + else + ret = session->rollback_transaction(session, NULL); + if (ret == EINVAL) + testutil_die(ret, "session.commit bulk"); + } testutil_check(session->close(session, NULL)); } @@ -70,6 +87,8 @@ obj_bulk_unique(int force) new_uri, sizeof(new_uri), "%s.%u", uri, ++uid)); testutil_check(pthread_rwlock_unlock(&single)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); testutil_check(session->create(session, new_uri, config)); __wt_yield(); @@ -89,6 +108,10 @@ obj_bulk_unique(int force) if (ret != EBUSY) testutil_die(ret, "session.drop: %s", new_uri); + if (use_txn && + (ret = session->commit_transaction(session, NULL)) != 0 && + ret != EINVAL) + testutil_die(ret, "session.commit bulk unique"); testutil_check(session->close(session, NULL)); } @@ -101,12 +124,19 @@ obj_cursor(void) testutil_check(conn->open_session(conn, NULL, NULL, &session)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); if ((ret = session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) { if (ret != ENOENT && ret != EBUSY) testutil_die(ret, "session.open_cursor"); } else testutil_check(cursor->close(cursor)); + + if (use_txn && + (ret = session->commit_transaction(session, NULL)) != 0 && + ret != EINVAL) + testutil_die(ret, "session.commit cursor"); testutil_check(session->close(session, NULL)); } @@ -118,10 +148,16 @@ obj_create(void) testutil_check(conn->open_session(conn, NULL, NULL, &session)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); if ((ret = session->create(session, uri, config)) != 0) if (ret != EEXIST && ret != EBUSY) testutil_die(ret, "session.create"); + if (use_txn && + (ret = session->commit_transaction(session, NULL)) != 0 && + ret != EINVAL) + testutil_die(ret, "session.commit create"); testutil_check(session->close(session, NULL)); } @@ -140,13 +176,25 @@ obj_create_unique(int 
force) new_uri, sizeof(new_uri), "%s.%u", uri, ++uid)); testutil_check(pthread_rwlock_unlock(&single)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); testutil_check(session->create(session, new_uri, config)); + if (use_txn && + (ret = session->commit_transaction(session, NULL)) != 0 && + ret != EINVAL) + testutil_die(ret, "session.commit create unique"); __wt_yield(); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); while ((ret = session->drop( session, new_uri, force ? "force" : NULL)) != 0) if (ret != EBUSY) testutil_die(ret, "session.drop: %s", new_uri); + if (use_txn && + (ret = session->commit_transaction(session, NULL)) != 0 && + ret != EINVAL) + testutil_die(ret, "session.commit create unique"); testutil_check(session->close(session, NULL)); } @@ -159,10 +207,26 @@ obj_drop(int force) testutil_check(conn->open_session(conn, NULL, NULL, &session)); + if (use_txn) + testutil_check(session->begin_transaction(session, NULL)); if ((ret = session->drop(session, uri, force ? "force" : NULL)) != 0) if (ret != ENOENT && ret != EBUSY) testutil_die(ret, "session.drop"); + if (use_txn) { + /* + * As the operations are being performed concurrently, + * return value can be ENOENT or EBUSY will set + * error to transaction opened by session. In these + * cases the transaction has to be aborted. 
+ */ + if (ret != ENOENT && ret != EBUSY) + ret = session->commit_transaction(session, NULL); + else + ret = session->rollback_transaction(session, NULL); + if (ret == EINVAL) + testutil_die(ret, "session.commit drop"); + } testutil_check(session->close(session, NULL)); } diff --git a/src/third_party/wiredtiger/test/fops/t.c b/src/third_party/wiredtiger/test/fops/t.c index b6b80ba5db8..fcbbdcabd73 100644 --- a/src/third_party/wiredtiger/test/fops/t.c +++ b/src/third_party/wiredtiger/test/fops/t.c @@ -28,6 +28,7 @@ #include "thread.h" +bool use_txn; /* Operations with user txn */ WT_CONNECTION *conn; /* WiredTiger connection */ pthread_rwlock_t single; /* Single thread */ u_int nops; /* Operations */ @@ -77,8 +78,9 @@ main(int argc, char *argv[]) nops = 1000; nthreads = 10; runs = 1; + use_txn = false; config_open = working_dir = NULL; - while ((ch = __wt_getopt(progname, argc, argv, "C:h:l:n:r:t:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "C:h:l:n:r:t:x")) != EOF) switch (ch) { case 'C': /* wiredtiger_open config */ config_open = __wt_optarg; @@ -102,6 +104,9 @@ main(int argc, char *argv[]) case 't': nthreads = (u_int)atoi(__wt_optarg); break; + case 'x': + use_txn = true; + break; default: return (usage()); } @@ -245,7 +250,8 @@ usage(void) { fprintf(stderr, "usage: %s " - "[-C wiredtiger-config] [-l log] [-n ops] [-r runs] [-t threads]\n", + "[-C wiredtiger-config] [-l log] [-n ops] [-r runs] [-t threads] " + "[-x] \n", progname); fprintf(stderr, "%s", "\t-C specify wiredtiger_open configuration arguments\n" @@ -253,6 +259,7 @@ usage(void) "\t-l specify a log file\n" "\t-n set number of operations each thread does\n" "\t-r set number of runs\n" - "\t-t set number of threads\n"); + "\t-t set number of threads\n" + "\t-x operations within user transaction \n"); return (EXIT_FAILURE); } diff --git a/src/third_party/wiredtiger/test/fops/thread.h b/src/third_party/wiredtiger/test/fops/thread.h index f6b6bdffd63..0df36025be0 100644 --- 
a/src/third_party/wiredtiger/test/fops/thread.h +++ b/src/third_party/wiredtiger/test/fops/thread.h @@ -30,6 +30,7 @@ #include <signal.h> +extern bool use_txn; /* Operations with user txn */ extern WT_CONNECTION *conn; /* WiredTiger connection */ extern u_int nops; /* Operations per thread */ diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 81b7fa27f79..f35e71f58aa 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -287,6 +287,7 @@ void bdb_update(const void *, size_t, const void *, size_t); WT_THREAD_RET alter(void *); WT_THREAD_RET backup(void *); +WT_THREAD_RET checkpoint(void *); WT_THREAD_RET compact(void *); void config_clear(void); void config_error(void); diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index f4770465628..4fed18d12b4 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -76,7 +76,8 @@ wts_ops(int lastrun) TINFO **tinfo_list, *tinfo, total; WT_CONNECTION *conn; WT_SESSION *session; - wt_thread_t alter_tid, backup_tid, compact_tid, lrt_tid, timestamp_tid; + wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid; + wt_thread_t timestamp_tid; int64_t fourths, quit_fourths, thread_ops; uint32_t i; bool running; @@ -86,6 +87,7 @@ wts_ops(int lastrun) session = NULL; /* -Wconditional-uninitialized */ memset(&alter_tid, 0, sizeof(alter_tid)); memset(&backup_tid, 0, sizeof(backup_tid)); + memset(&checkpoint_tid, 0, sizeof(checkpoint_tid)); memset(&compact_tid, 0, sizeof(compact_tid)); memset(&lrt_tid, 0, sizeof(lrt_tid)); memset(&timestamp_tid, 0, sizeof(timestamp_tid)); @@ -173,6 +175,9 @@ wts_ops(int lastrun) if (g.c_backups) testutil_check( __wt_thread_create(NULL, &backup_tid, backup, NULL)); + if (g.c_checkpoints) + testutil_check(__wt_thread_create( + NULL, &checkpoint_tid, checkpoint, 
NULL)); if (g.c_compact) testutil_check( __wt_thread_create(NULL, &compact_tid, compact, NULL)); @@ -247,6 +252,8 @@ wts_ops(int lastrun) testutil_check(__wt_thread_join(NULL, alter_tid)); if (g.c_backups) testutil_check(__wt_thread_join(NULL, backup_tid)); + if (g.c_checkpoints) + testutil_check(__wt_thread_join(NULL, checkpoint_tid)); if (g.c_compact) testutil_check(__wt_thread_join(NULL, compact_tid)); if (!SINGLETHREADED && g.c_long_running_txn) @@ -514,12 +521,11 @@ ops(void *arg) WT_DECL_RET; WT_ITEM *key, _key, *value, _value; WT_SESSION *session; - uint64_t ckpt_op, keyno, reset_op, session_op; + uint64_t keyno, reset_op, session_op; uint32_t rnd; u_int i, iso_config; int dir; - char *ckpt_config, ckpt_name[64]; - bool ckpt_available, intxn, positioned, readonly; + bool intxn, positioned, readonly; tinfo = arg; @@ -542,58 +548,61 @@ ops(void *arg) session = NULL; session_op = 0; - /* Set the first operation where we'll perform checkpoint operations. */ - ckpt_op = g.c_checkpoints ? mmrand(&tinfo->rnd, 100, 10000) : 0; - ckpt_available = false; - /* Set the first operation where we'll reset the session. */ reset_op = mmrand(&tinfo->rnd, 100, 10000); for (intxn = false; !tinfo->quit; ++tinfo->ops) { - /* - * We can't checkpoint or swap sessions/cursors while in a - * transaction, resolve any running transaction. - */ - if (intxn && - (tinfo->ops == ckpt_op || tinfo->ops == session_op)) { - commit_transaction(tinfo, session); - intxn = false; - } - - /* Open up a new session and cursors. */ - if (tinfo->ops == session_op || + /* Periodically open up a new session and cursors. */ + if (tinfo->ops > session_op || session == NULL || cursor == NULL) { + /* + * We can't swap sessions/cursors if in a transaction, + * resolve any running transaction. 
+ */ + if (intxn) { + commit_transaction(tinfo, session); + intxn = false; + } + if (session != NULL) testutil_check(session->close(session, NULL)); - testutil_check( conn->open_session(conn, NULL, NULL, &session)); + /* Pick the next session/cursor close/open. */ + session_op += mmrand(&tinfo->rnd, 100, 5000); + /* * 10% of the time, perform some read-only operations * from a checkpoint. * - * Skip that if we are single-threaded and doing checks - * against a Berkeley DB database, because that won't - * work because the Berkeley DB database records won't - * match the checkpoint. Also skip if we are using - * LSM, because it doesn't support reads from - * checkpoints. + * Skip if single-threaded and doing checks against a + * Berkeley DB database, that won't work because the + * Berkeley DB database won't match the checkpoint. + * + * Skip if we are using data-sources or LSM, they don't + * support reading from checkpoints. */ - if (!SINGLETHREADED && !DATASOURCE("lsm") && - ckpt_available && mmrand(&tinfo->rnd, 1, 10) == 1) { + if (!SINGLETHREADED && !DATASOURCE("helium") && + !DATASOURCE("kvsbdb") && !DATASOURCE("lsm") && + mmrand(&tinfo->rnd, 1, 10) == 1) { /* * open_cursor can return EBUSY if concurrent * with a metadata operation, retry. */ while ((ret = session->open_cursor(session, - g.uri, NULL, ckpt_name, &cursor)) == EBUSY) + g.uri, NULL, + "checkpoint=WiredTigerCheckpoint", + &cursor)) == EBUSY) __wt_yield(); + /* + * If the checkpoint hasn't been created yet, + * ignore the error. + */ + if (ret == ENOENT) + continue; testutil_check(ret); - /* Pick the next session/cursor close/open. */ - session_op += 250; - /* Checkpoints are read-only. */ readonly = true; } else { @@ -608,75 +617,11 @@ ops(void *arg) __wt_yield(); testutil_check(ret); - /* Pick the next session/cursor close/open. */ - session_op += mmrand(&tinfo->rnd, 100, 5000); - /* Updates supported. */ readonly = false; } } - /* Checkpoint the database. 
*/ - if (tinfo->ops == ckpt_op && g.c_checkpoints) { - /* - * Checkpoints are single-threaded inside WiredTiger, - * skip our checkpoint if another thread is already - * doing one. - */ - ret = pthread_rwlock_trywrlock(&g.checkpoint_lock); - if (ret == EBUSY) - goto skip_checkpoint; - testutil_check(ret); - - /* - * LSM and data-sources don't support named checkpoints - * and we can't drop a named checkpoint while there's a - * backup in progress, otherwise name the checkpoint 5% - * of the time. - */ - if (mmrand(&tinfo->rnd, 1, 20) != 1 || - DATASOURCE("helium") || - DATASOURCE("kvsbdb") || DATASOURCE("lsm") || - pthread_rwlock_trywrlock(&g.backup_lock) == EBUSY) - ckpt_config = NULL; - else { - testutil_check(__wt_snprintf( - ckpt_name, sizeof(ckpt_name), - "name=thread-%d", tinfo->id)); - ckpt_config = ckpt_name; - } - - ret = session->checkpoint(session, ckpt_config); - /* - * We may be trying to create a named checkpoint while - * we hold a cursor open to the previous checkpoint. - * Tolerate EBUSY. - */ - if (ret != 0 && ret != EBUSY) - testutil_die(ret, "%s", - ckpt_config == NULL ? "" : ckpt_config); - ret = 0; - - if (ckpt_config != NULL) - testutil_check( - pthread_rwlock_unlock(&g.backup_lock)); - testutil_check( - pthread_rwlock_unlock(&g.checkpoint_lock)); - - /* Rephrase the checkpoint name for cursor open. */ - if (ckpt_config == NULL) - strcpy(ckpt_name, - "checkpoint=WiredTigerCheckpoint"); - else - testutil_check(__wt_snprintf( - ckpt_name, sizeof(ckpt_name), - "checkpoint=thread-%d", tinfo->id)); - ckpt_available = true; - -skip_checkpoint: /* Pick the next checkpoint operation. */ - ckpt_op += mmrand(&tinfo->rnd, 5000, 20000); - } - /* * Reset the session every now and then, just to make sure that * operation gets tested. 
Note the test is not for equality, we diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index dc288ba4bc2..02ed0a2da60 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -169,7 +169,6 @@ main(int argc, char *argv[]) */ testutil_check(pthread_rwlock_init(&g.append_lock, NULL)); testutil_check(pthread_rwlock_init(&g.backup_lock, NULL)); - testutil_check(pthread_rwlock_init(&g.checkpoint_lock, NULL)); testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid()); @@ -267,7 +266,6 @@ main(int argc, char *argv[]) testutil_check(pthread_rwlock_destroy(&g.append_lock)); testutil_check(pthread_rwlock_destroy(&g.backup_lock)); - testutil_check(pthread_rwlock_destroy(&g.checkpoint_lock)); testutil_check(pthread_rwlock_destroy(&g.death_lock)); config_clear(); diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 98af8e766f1..9ea44a29801 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -501,6 +501,86 @@ fclose_and_clear(FILE **fpp) } /* + * checkpoint -- + * Periodically take a checkpoint + */ +WT_THREAD_RET +checkpoint(void *arg) +{ + WT_CONNECTION *conn; + WT_DECL_RET; + WT_SESSION *session; + u_int secs; + const char *ckpt_config; + char config_buf[64]; + bool backup_locked; + + (void)arg; + conn = g.wts_conn; + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + + for (secs = mmrand(NULL, 1, 10); !g.workers_finished;) { + if (secs > 0) { + __wt_sleep(1, 0); + --secs; + continue; + } + + /* + * LSM and data-sources don't support named checkpoints. Also, + * don't attempt named checkpoints during a hot backup. 
It's + * OK to create named checkpoints during a hot backup, but we + * can't delete them, so repeating an already existing named + * checkpoint will fail when we can't drop the previous one. + */ + ckpt_config = NULL; + backup_locked = false; + if (!DATASOURCE("helium") && !DATASOURCE("kvsbdb") && + !DATASOURCE("lsm")) + switch (mmrand(NULL, 1, 20)) { + case 1: + /* + * 5% create a named snapshot. Rotate between a + * few names to test multiple named snapshots in + * the system. + */ + ret = pthread_rwlock_trywrlock(&g.backup_lock); + if (ret == 0) { + backup_locked = true; + testutil_check(__wt_snprintf( + config_buf, sizeof(config_buf), + "name=mine.%" PRIu32, + mmrand(NULL, 1, 4))); + ckpt_config = config_buf; + } else if (ret != EBUSY) + testutil_check(ret); + break; + case 2: + /* + * 5% drop all named snapshots. + */ + ret = pthread_rwlock_trywrlock(&g.backup_lock); + if (ret == 0) { + backup_locked = true; + ckpt_config = "drop=(all)"; + } else if (ret != EBUSY) + testutil_check(ret); + break; + } + + testutil_check(session->checkpoint(session, ckpt_config)); + + if (backup_locked) + testutil_check(pthread_rwlock_unlock(&g.backup_lock)); + + secs = mmrand(NULL, 5, 40); + } + + testutil_check(session->close(session, NULL)); + return (WT_THREAD_RET_VALUE); +} + +/* * timestamp -- * Periodically update the oldest timestamp. 
*/ diff --git a/src/third_party/wiredtiger/test/mciproject.yml b/src/third_party/wiredtiger/test/mciproject.yml index 72022fe46ec..4b67299d14c 100644 --- a/src/third_party/wiredtiger/test/mciproject.yml +++ b/src/third_party/wiredtiger/test/mciproject.yml @@ -167,20 +167,6 @@ buildvariants: - name: unit-test - name: fops -- name: solaris - display_name: Solaris - run_on: - - solaris - expansions: - make_command: PATH=/opt/mongodbtoolchain/bin:$PATH gmake - test_env_vars: LD_LIBRARY_PATH=`pwd`/.libs - smp_command: -j $(kstat cpu | sort -u | grep -c "^module") - configure_env_vars: PATH=/opt/mongodbtoolchain/bin:$PATH CFLAGS="-m64" - tasks: - - name: compile - - name: unit-test - - name: fops - - name: windows-64 display_name: Windows 64-bit run_on: diff --git a/src/third_party/wiredtiger/test/suite/test_assert01.py b/src/third_party/wiredtiger/test/suite/test_assert01.py new file mode 100644 index 00000000000..3a4f8e4127a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_assert01.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_assert01.py +# Timestamps: assert commit settings +# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_assert01(wttest.WiredTigerTestCase, suite_subprocess): + base = 'assert01' + base_uri = 'file:' + base + uri_always = base_uri + '.always.wt' + uri_def = base_uri + '.def.wt' + uri_never = base_uri + '.never.wt' + uri_none = base_uri + '.none.wt' + cfg = 'key_format=S,value_format=S,' + cfg_always = 'assert=(commit_timestamp=always)' + cfg_def = '' + cfg_never = 'assert=(commit_timestamp=never)' + cfg_none = 'assert=(commit_timestamp=none)' + + count = 1 + # + # Commit a k/v pair making sure that it detects an error if needed, when + # used with and without a commit timestamp. 
+ # + def insert_check(self, uri, use_ts): + c = self.session.open_cursor(uri) + key = 'key' + str(self.count) + val = 'value' + str(self.count) + + # Commit with a timestamp + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(self.count)) + c[key] = val + # All settings other than never should commit successfully + if (use_ts != 'never'): + self.session.commit_transaction() + else: + msg = "/timestamp set on this transaction/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.assertEquals(self.session.commit_transaction(), + 0), msg) + c.close() + self.count += 1 + + # Commit without a timestamp + key = 'key' + str(self.count) + val = 'value' + str(self.count) + c = self.session.open_cursor(uri) + self.session.begin_transaction() + c[key] = val + # All settings other than always should commit successfully + if (use_ts != 'always'): + self.session.commit_transaction() + else: + msg = "/none set on this transaction/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.assertEquals(self.session.commit_transaction(), + 0), msg) + self.count += 1 + c.close() + + def test_commit_timestamp(self): + #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build(): + # self.skipTest('requires a timestamp and diagnostic build') + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + # Create a data item at a timestamp + self.session.create(self.uri_always, self.cfg + self.cfg_always) + self.session.create(self.uri_def, self.cfg + self.cfg_def) + self.session.create(self.uri_never, self.cfg + self.cfg_never) + self.session.create(self.uri_none, self.cfg + self.cfg_none) + + # Check inserting into each table + self.insert_check(self.uri_always, 'always') + self.insert_check(self.uri_def, 'none') + self.insert_check(self.uri_never, 'never') + self.insert_check(self.uri_none, 'none') + +if __name__ == '__main__': + wttest.run() diff 
--git a/src/third_party/wiredtiger/test/suite/test_assert02.py b/src/third_party/wiredtiger/test/suite/test_assert02.py new file mode 100644 index 00000000000..d264273c3a0 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_assert02.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_assert02.py +# Timestamps: assert read timestamp settings +# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +def timestamp_str(t): + return '%x' % t + +class test_assert02(wttest.WiredTigerTestCase, suite_subprocess): + def test_read_timestamp(self): + #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build(): + # self.skipTest('requires a timestamp and diagnostic build') + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + base = 'assert02.' + base_uri = 'file:' + base + uri_always = base_uri + '.always.wt' + uri_def = base_uri + '.def.wt' + uri_never = base_uri + '.never.wt' + uri_none = base_uri + '.none.wt' + + cfg = 'key_format=S,value_format=S' + cfg_always = cfg + ',assert=(read_timestamp=always)' + cfg_def = cfg + cfg_never = cfg + ',assert=(read_timestamp=never)' + cfg_none = cfg + ',assert=(read_timestamp=none)' + + # Create a data item at a timestamp + self.session.create(uri_always, cfg_always) + self.session.create(uri_def, cfg_def) + self.session.create(uri_never, cfg_never) + self.session.create(uri_none, cfg_none) + + # Insert a data item at timestamp 1. This should work for all. + c_always = self.session.open_cursor(uri_always) + c_def = self.session.open_cursor(uri_def) + c_never = self.session.open_cursor(uri_never) + c_none = self.session.open_cursor(uri_none) + self.session.begin_transaction() + self.session.timestamp_transaction( + 'commit_timestamp=' + timestamp_str(1)) + c_always['key1'] = 'value1' + c_def['key1'] = 'value1' + c_never['key1'] = 'value1' + c_none['key1'] = 'value1' + self.session.commit_transaction() + c_always.close() + c_def.close() + c_never.close() + c_none.close() + + # Now that we have a timestamped data, try reading with and without + # the timestamp. 
+ c_always = self.session.open_cursor(uri_always) + c_def = self.session.open_cursor(uri_def) + c_never = self.session.open_cursor(uri_never) + c_none = self.session.open_cursor(uri_none) + + c_always.set_key('key1') + c_def.set_key('key1') + c_never.set_key('key1') + c_none.set_key('key1') + + self.session.begin_transaction('read_timestamp=' + timestamp_str(1)) + c_always.search() + c_def.search() + c_none.search() + self.assertEqual(c_always.get_value(), 'value1') + self.assertEqual(c_def.get_value(), 'value1') + self.assertEqual(c_none.get_value(), 'value1') + + msg = "/timestamp set on this transaction/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.assertEquals(c_never.search(), 0), msg) + self.session.commit_transaction() + c_always.close() + c_def.close() + c_never.close() + c_none.close() + + # Read in a transaction without a timestamp. + c_always = self.session.open_cursor(uri_always) + c_def = self.session.open_cursor(uri_def) + c_never = self.session.open_cursor(uri_never) + c_none = self.session.open_cursor(uri_none) + + c_always.set_key('key1') + c_def.set_key('key1') + c_never.set_key('key1') + c_none.set_key('key1') + + self.session.begin_transaction() + c_never.search() + c_def.search() + c_none.search() + self.assertEqual(c_never.get_value(), 'value1') + self.assertEqual(c_def.get_value(), 'value1') + self.assertEqual(c_none.get_value(), 'value1') + + msg = "/none set on this transaction/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.assertEquals(c_always.search(), 0), msg) + self.session.commit_transaction() + c_always.close() + c_def.close() + c_never.close() + c_none.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_assert03.py b/src/third_party/wiredtiger/test/suite/test_assert03.py new file mode 100644 index 00000000000..36d4936a82e --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_assert03.py @@ -0,0 +1,86 @@ 
+#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_assert03.py +# Test changing assert setting via alter. 
+# + +from suite_subprocess import suite_subprocess +import wiredtiger, wttest + +class test_assert03(wttest.WiredTigerTestCase, suite_subprocess): + conn_config = 'log=(enabled)' + base_uri = 'file:assert03.wt' + cfg = 'key_format=S,value_format=S' + always = 'assert=(commit_timestamp=always)' + never = 'assert=(commit_timestamp=never)' + none = 'assert=(commit_timestamp=none)' + + def test_assert03(self): + + #if not wiredtiger.timestamp_build() or not wiredtiger.diagnostic_build(): + # self.skipTest('requires a timestamp and diagnostic build') + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + # Create a data item at the default setting + self.session.create(self.base_uri, self.cfg) + c = self.session.open_cursor(self.base_uri) + self.session.begin_transaction() + c['key0'] = 'value0' + self.session.commit_transaction() + c.close() + + # Now rotate through the alter settings and verify the data. + # The always setting should fail. + self.session.alter(self.base_uri, self.always) + c = self.session.open_cursor(self.base_uri) + self.session.begin_transaction() + c['key1'] = 'value1' + msg = "/none set on this transaction/" + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:self.assertEquals(self.session.commit_transaction(), 0), msg) + c.close() + + # The never and none settings should succeed. 
+ self.session.alter(self.base_uri, self.never) + c = self.session.open_cursor(self.base_uri) + self.session.begin_transaction() + c['key2'] = 'value2' + self.session.commit_transaction() + c.close() + + self.session.alter(self.base_uri, self.none) + c = self.session.open_cursor(self.base_uri) + self.session.begin_transaction() + c['key3'] = 'value3' + self.session.commit_transaction() + c.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_backup07.py b/src/third_party/wiredtiger/test/suite/test_backup07.py new file mode 100644 index 00000000000..8332815b0ca --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_backup07.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2017 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +import os, shutil +from helper import compare_files +from suite_subprocess import suite_subprocess +from wtdataset import simple_key +from wtscenario import make_scenarios + +# test_backup07.py +# Test cursor backup with target URIs, logging and create during backup + +class test_backup07(wttest.WiredTigerTestCase, suite_subprocess): + dir='backup.dir' # Backup directory name + logmax="100K" + newuri="table:newtable" + + pfx = 'test_backup' + scenarios = make_scenarios([ + ('table', dict(uri='table:test',dsize=100,nops=100,nthreads=1)), + ]) + + # Create a large cache, otherwise this test runs quite slowly. + def conn_config(self): + return 'cache_size=1G,log=(archive=false,enabled,file_max=%s)' % \ + self.logmax + + # Run background inserts while running checkpoints and incremental backups + # repeatedly. + def test_backup07(self): + log2 = "WiredTigerLog.0000000002" + + self.session.create(self.uri, "key_format=S,value_format=S") + + # Insert small amounts of data at a time stopping just after we + # cross into log file 2. That way we can add more operations into + # log file 2 during the full backup. + loop = 0 + c = self.session.open_cursor(self.uri) + while not os.path.exists(log2): + for i in range(0, self.nops): + num = i + (loop * self.nops) + key = 'key' + str(num) + val = 'value' + str(num) + c[key] = val + loop += 1 + + # Test a potential bug in full backups and creates. + # We allow creates during backup because the file doesn't exist + # when the backup metadata is created on cursor open and the newly + # created file is not in the cursor list. 
However, if using logging + # and the create and inserts/updates appear in a log file copied, + # then currently there will be an error opening the backup directory. + + # Open up the backup cursor, create and add data to a new table + # and then copy the files. + os.mkdir(self.dir) + bkup_c = self.session.open_cursor('backup:', None, None) + + # Now create and populate the new table. Make sure the log records + # are on disk and will be copied to the backup. + self.session.create(self.newuri, "key_format=S,value_format=S") + c = self.session.open_cursor(self.newuri) + for i in range(0, self.nops): + key = 'key' + str(i) + val = 'value' + str(i) + c[key] = val + c.close() + self.session.log_flush('sync=on') + + # Now copy the files returned by the backup cursor. This will + # include the log file that has updates for the newly created table. + while True: + ret = bkup_c.next() + if ret != 0: + break + newfile = bkup_c.get_key() + sz = os.path.getsize(newfile) + self.pr('Copy from: ' + newfile + ' (' + str(sz) + ') to ' + self.dir) + shutil.copy(newfile, self.dir) + self.assertEqual(ret, wiredtiger.WT_NOTFOUND) + bkup_c.close() + + # After the full backup, open and recover the backup database. + # Make sure we properly recover even though the log file will have + # records for the newly created table file id. + backup_conn = self.wiredtiger_open(self.dir) + backup_conn.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_las.py b/src/third_party/wiredtiger/test/suite/test_las.py index d0bd1d108fa..52a0b2d7300 100644 --- a/src/third_party/wiredtiger/test/suite/test_las.py +++ b/src/third_party/wiredtiger/test/suite/test_las.py @@ -26,16 +26,53 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. 
+from helper import copy_wiredtiger_home import wiredtiger, wttest from wtdataset import SimpleDataSet +def timestamp_str(t): + return '%x' % t + # test_las.py -# Smoke tests to ensure lookaside tables are working. +# Smoke tests to ensure lookaside tables are working. class test_las(wttest.WiredTigerTestCase): # Force a small cache. def conn_config(self): return 'cache_size=1GB' + def large_updates(self, session, uri, value, ds, nrows, timestamp=False): + # Insert a large number of records, we'll hang if the lookaside table + # isn't doing its thing. + cursor = session.open_cursor(uri) + for i in range(1, 1000000): + if timestamp == True: + session.begin_transaction() + cursor.set_key(ds.key(nrows + i)) + cursor.set_value(value) + self.assertEquals(cursor.update(), 0) + if timestamp == True: + session.commit_transaction('commit_timestamp=' + timestamp_str(i + 1)) + cursor.close() + + def durable_check(self, check_value, uri, ds, nrows): + # Checkpoint and backup so as to simulate recovery + self.session.checkpoint() + newdir = "BACKUP" + copy_wiredtiger_home('.', newdir, True) + + conn = self.setUpConnectionOpen(newdir) + session = self.setUpSessionOpen(conn) + cursor = session.open_cursor(uri, None) + # Skip the initial rows, which were not updated + for i in range(0, nrows+1): + self.assertEquals(cursor.next(), 0) + #print "Check value : " + str(check_value) + #print "value : " + str(cursor.get_value()) + self.assertTrue(check_value == cursor.get_value()) + cursor.close() + session.close() + conn.close() + @wttest.longtest('lookaside table smoke test') def test_las(self): # Create a small table. @@ -43,18 +80,49 @@ class test_las(wttest.WiredTigerTestCase): nrows = 100 ds = SimpleDataSet(self, uri, nrows, key_format="S") ds.populate() + bigvalue = "aaaaa" * 100 - # Take a snapshot. 
+ # Initially load huge data + cursor = self.session.open_cursor(uri) + for i in range(1, 1000000): + cursor.set_key(ds.key(nrows + i)) + cursor.set_value(bigvalue) + self.assertEquals(cursor.insert(), 0) + cursor.close() + self.session.checkpoint() + + # Scenario: 1 + # Check to see LAS working with old snapshot + bigvalue1 = "bbbbb" * 100 self.session.snapshot("name=xxx") + # Update the values in different session after snapshot + self.large_updates(self.session, uri, bigvalue1, ds, nrows) + # Check to see the value after recovery + self.durable_check(bigvalue1, uri, ds, nrows) + self.session.snapshot("drop=(all)") - # Insert a large number of records, we'll hang if the lookaside table - # isn't doing its thing. - c = self.session.open_cursor(uri) - bigvalue = "abcde" * 100 - for i in range(1, 1000000): - c.set_key(ds.key(nrows + i)) - c.set_value(bigvalue) - self.assertEquals(c.insert(), 0) + # Scenario: 2 + # Check to see LAS working with old reader + bigvalue2 = "ccccc" * 100 + session2 = self.conn.open_session() + session2.begin_transaction('isolation=snapshot') + self.large_updates(self.session, uri, bigvalue2, ds, nrows) + # Check to see the value after recovery + self.durable_check(bigvalue2, uri, ds, nrows) + session2.rollback_transaction() + session2.close() + + # Scenario: 3 + # Check to see LAS working with old timestamp + bigvalue3 = "ddddd" * 100 + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(1)) + self.large_updates(self.session, uri, bigvalue3, ds, nrows, timestamp=True) + # Check to see data can be see only till the stable_timestamp + self.durable_check(bigvalue2, uri, ds, nrows) + + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(i + 1)) + # Check to see latest data can be seen + self.durable_check(bigvalue3, uri, ds, nrows) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp01.py b/src/third_party/wiredtiger/test/suite/test_timestamp01.py index 
c7a5df66ae0..09a264e2afd 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp01.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp01.py @@ -61,12 +61,40 @@ class test_timestamp01(wttest.WiredTigerTestCase, suite_subprocess): 'commit_timestamp=' + timestamp_str(1 << 5000)), '/too long/') - # One is okay, as is 2**64 - 1 + # Anything other than lower case hexadecimal characters is not permitted + self.session.begin_transaction() + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.commit_transaction( + 'commit_timestamp=' + timestamp_str(-1)), + '/Failed to parse commit timestamp/') + + self.session.begin_transaction() + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.commit_transaction( + 'commit_timestamp=' + 'a/78f'), + '/Failed to parse commit timestamp/') + + self.session.begin_transaction() + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.commit_transaction( + 'commit_timestamp=' + 'a`78f'), + '/Failed to parse commit timestamp/') + + self.session.begin_transaction() + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.commit_transaction( + 'commit_timestamp=' + 'a{78f'), + '/Failed to parse commit timestamp/') + + # One is okay, as is upper case hex and 2**64 - 1 self.session.begin_transaction() self.session.commit_transaction( 'commit_timestamp=' + timestamp_str(1)) self.session.begin_transaction() self.session.commit_transaction( + 'commit_timestamp=0A78F') + self.session.begin_transaction() + self.session.commit_transaction( 'commit_timestamp=' + timestamp_str(1 << 64 - 1)) if __name__ == '__main__': diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp02.py b/src/third_party/wiredtiger/test/suite/test_timestamp02.py index 31bea22ec66..f928dbc184f 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp02.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp02.py @@ -38,12 +38,6 @@ 
from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t -def timestamp_ret_str(t): - s = timestamp_str(t) - if len(s) % 2 == 1: - s = '0' + s - return s - class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess): tablename = 'test_timestamp02' uri = 'table:' + tablename @@ -98,7 +92,7 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess): dict((k, 1) for k in orig_keys[:i+1])) # Everything up to and including timestamp 100 has been committed. - self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(100)) + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100)) # Bump the oldest timestamp, we're not going back... self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(100)) @@ -111,11 +105,11 @@ class test_timestamp02(wttest.WiredTigerTestCase, suite_subprocess): self.session.commit_transaction('commit_timestamp=' + timestamp_str(k + 100)) # Everything up to and including timestamp 200 has been committed. - self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(200)) + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(200)) # Test that we can manually move the commit timestamp back self.conn.set_timestamp('commit_timestamp=' + timestamp_str(150)) - self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(150)) + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(150)) self.conn.set_timestamp('commit_timestamp=' + timestamp_str(200)) # Now the stable timestamp before we read. 
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp03.py b/src/third_party/wiredtiger/test/suite/test_timestamp03.py index 9caf597e6ed..1a2511ea6ee 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp03.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp03.py @@ -39,12 +39,6 @@ from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t -def timestamp_ret_str(t): - s = timestamp_str(t) - if len(s) % 2 == 1: - s = '0' + s - return s - class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess): table_ts_log = 'ts03_ts_logged' table_ts_nolog = 'ts03_ts_nologged' @@ -226,7 +220,7 @@ class test_timestamp03(wttest.WiredTigerTestCase, suite_subprocess): self.table_nots_nolog, dict((k, self.value) for k in orig_keys)) # Bump the oldest_timestamp, we're not going back... - self.assertEqual(self.conn.query_timestamp(), timestamp_ret_str(100)) + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100)) old_ts = timestamp_str(100) self.conn.set_timestamp('oldest_timestamp=' + old_ts) self.conn.set_timestamp('stable_timestamp=' + old_ts) diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index a52675daf8b..f7052448208 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -37,12 +37,6 @@ from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t -def timestamp_ret_str(t): - s = timestamp_str(t) - if len(s) % 2 == 1: - s = '0' + s - return s - class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): table_ts_log = 'table:ts04_ts_logged' table_ts_nolog = 'table:ts04_ts_nologged' @@ -61,6 +55,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): ('col_var', dict(empty=0, cacheSize='cache_size=20MB', extra_config=',key_format=r')), ('lsm', dict(empty=0, cacheSize='cache_size=31MB', 
extra_config=',type=lsm')), ('row', dict(empty=0, cacheSize='cache_size=20MB', extra_config='',)), + ('row-smallcache', dict(empty=0, cacheSize='cache_size=2MB', extra_config='',)), ] scenarios = make_scenarios(conncfg, types) diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp05.py b/src/third_party/wiredtiger/test/suite/test_timestamp05.py index d7131cb2004..f145184146c 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp05.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp05.py @@ -39,12 +39,6 @@ from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t -def timestamp_ret_str(t): - s = timestamp_str(t) - if len(s) % 2 == 1: - s = '0' + s - return s - class test_timestamp05(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:ts05' diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp07.py b/src/third_party/wiredtiger/test/suite/test_timestamp07.py index 12b36bdc2f8..09547dba3a7 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp07.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp07.py @@ -56,8 +56,8 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): nkeys = [ ('100keys', dict(nkeys=100)), -# ('500keys', dict(nkeys=500)), -# ('1000keys', dict(nkeys=1000)), + ('500keys', dict(nkeys=500)), + ('1000keys', dict(nkeys=1000)), ] scenarios = make_scenarios(types, conncfg, nkeys) @@ -68,19 +68,20 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): value3 = u'\u0001\u0002cdef\u0007\u0004' # Check that a cursor (optionally started in a new transaction), sees the - # expected values. 
- def check(self, session, txn_config, expected): + # expected value for a key + def check(self, session, txn_config, k, expected): if txn_config: session.begin_transaction(txn_config) c = session.open_cursor(self.uri + self.tablename, None) - actual = dict((k, v) for k, v in c if v != 0) - self.assertTrue(actual == expected) - # Search for the expected items as well as iterating - for k, v in expected.iteritems(): - self.assertEqual(c[k], v, "for key " + str(k)) + if not expected: + c.set_key(k) + self.assertEqual(c.search(), wiredtiger.WT_NOTFOUND) + else: + self.assertEqual(c[k], expected) c.close() if txn_config: session.commit_transaction() + # # Take a backup of the database and verify that the value we want to # check exists in the tables the expected number of times. @@ -168,12 +169,14 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # Now check that we see the expected state when reading at each # timestamp. - for i, t in enumerate(orig_keys): - self.check(self.session, 'read_timestamp=' + timestamp_str(t), - dict((k, self.value) for k in orig_keys[:i+1])) + for k in orig_keys: + self.check(self.session, 'read_timestamp=' + timestamp_str(k), + k, self.value) + self.check(self.session, 'read_timestamp=' + timestamp_str(k), + k + 1, None) # Bump the oldest timestamp, we're not going back... - self.assertEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys)) + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(self.nkeys)) self.oldts = timestamp_str(self.nkeys) self.conn.set_timestamp('oldest_timestamp=' + self.oldts) self.conn.set_timestamp('stable_timestamp=' + self.oldts) @@ -201,12 +204,8 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # Take a checkpoint using the given configuration. Then verify # whether value2 appears in a copy of that data or not. - valcnt2 = valcnt3 = self.nkeys - valcnt = 0 - # If logging is disabled then value2 should not appear in logged table. 
- if self.using_log == False: - valcnt3 = 0 - self.ckpt_backup(self.value2, valcnt, valcnt2, valcnt3) + self.ckpt_backup(self.value2, 0, self.nkeys, self.nkeys if self.using_log else 0) + # Update the stable timestamp to the latest, but not the oldest # timestamp and make sure we can see the data. Once the stable # timestamp is moved we should see all keys with value2. @@ -245,9 +244,7 @@ class test_timestamp07(wttest.WiredTigerTestCase, suite_subprocess): # of that data or not. Both tables that are logged should see # all the data regardless of timestamps. The table that is not # logged should not see any of it. - valcnt = 0 - valcnt2 = valcnt3 = self.nkeys - self.backup_check(self.value3, valcnt, valcnt2, valcnt3) + self.backup_check(self.value3, 0, self.nkeys, self.nkeys) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp09.py b/src/third_party/wiredtiger/test/suite/test_timestamp09.py index 41a6909cbef..b79521329e7 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp09.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp09.py @@ -114,8 +114,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): # Oldest timestamp is 3 at the moment, trying to set it to an earlier # timestamp is a no-op. self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) - self.assertEqual(int(self.conn.query_timestamp('get=oldest')), - int(timestamp_str(3))) + self.assertTimestampsEqual(self.conn.query_timestamp('get=oldest'), timestamp_str(3)) self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(3) + ',stable_timestamp=' + timestamp_str(3)) @@ -123,8 +122,7 @@ class test_timestamp09(wttest.WiredTigerTestCase, suite_subprocess): # Stable timestamp is 5 at the moment, trying to set it to an earlier # timestamp is a no-op. 
self.conn.set_timestamp('stable_timestamp=' + timestamp_str(4)) - self.assertEqual(int(self.conn.query_timestamp('get=stable')), - int(timestamp_str(5))) + self.assertTimestampsEqual(self.conn.query_timestamp('get=stable'), timestamp_str(5)) self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(5)) self.assertRaisesWithMessage(wiredtiger.WiredTigerError, diff --git a/src/third_party/wiredtiger/test/suite/test_txn14.py b/src/third_party/wiredtiger/test/suite/test_txn14.py index 7579bbc8e54..2245f49ae85 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn14.py +++ b/src/third_party/wiredtiger/test/suite/test_txn14.py @@ -93,10 +93,11 @@ class test_txn14(wttest.WiredTigerTestCase, suite_subprocess): c.close() self.session.log_flush(cfgarg) if self.sync == 'background': - # If doing a background flush, wait a few seconds. I have - # seen an individual log file's fsync take more than a second - # on some systems. So give it time to flush perhaps a few files. - self.session.transaction_sync('timeout_ms=4000') + # If doing a background flush, wait 10 seconds. I have seen an + # individual log file's fsync take more than a second on some + # systems, and we've seen timeouts at lower levels on systems + # with slow I/O. So give it time to flush perhaps a few files. 
+ self.session.transaction_sync('timeout_ms=10000') self.simulate_crash_restart(".", "RESTART") c = self.session.open_cursor(self.t1, None, None) i = 0 diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py index 1c95eb355ae..c654370718c 100644 --- a/src/third_party/wiredtiger/test/suite/wttest.py +++ b/src/third_party/wiredtiger/test/suite/wttest.py @@ -490,6 +490,12 @@ class WiredTigerTestCase(unittest.TestCase): with self.expectedStderr(message): self.assertRaises(exceptionType, expr) + def assertTimestampsEqual(self, ts1, ts2): + """ + TestCase.assertEqual() for timestamps + """ + self.assertEqual(int(ts1, 16), int(ts2, 16)) + def exceptionToStderr(self, expr): """ Used by assertRaisesHavingMessage to convert an expression |