-rw-r--r--  src/third_party/wiredtiger/bench/workgen/runner/example_prepare.py | 78
-rwxr-xr-x  src/third_party/wiredtiger/bench/workgen/runner/runner/core.py | 4
-rw-r--r--  src/third_party/wiredtiger/bench/workgen/workgen.cxx | 172
-rw-r--r--  src/third_party/wiredtiger/bench/workgen/workgen.h | 19
-rw-r--r--  src/third_party/wiredtiger/bench/workgen/workgen_int.h | 31
-rw-r--r--  src/third_party/wiredtiger/dist/s_string.ok | 3
-rw-r--r--  src/third_party/wiredtiger/examples/c/ex_backup.c | 2
-rw-r--r--  src/third_party/wiredtiger/import.data | 2
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ext.c | 1
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c | 18
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_split.c | 141
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy.c | 626
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c | 9
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_key.c | 18
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c | 17
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_log.c | 72
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_reconfig.c | 8
-rw-r--r--  src/third_party/wiredtiger/src/docs/durability.dox | 11
-rw-r--r--  src/third_party/wiredtiger/src/docs/eviction.dox | 107
-rw-r--r--  src/third_party/wiredtiger/src/docs/programming.dox | 1
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_lru.c | 6
-rw-r--r--  src/third_party/wiredtiger/src/history/hs.c | 120
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h | 7
-rw-r--r--  src/third_party/wiredtiger/src/include/connection.h | 2
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h | 7
-rw-r--r--  src/third_party/wiredtiger/src/include/reconcile.h | 6
-rw-r--r--  src/third_party/wiredtiger/src/include/session.h | 12
-rw-r--r--  src/third_party/wiredtiger/src/include/time.i | 9
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h | 10
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.i | 8
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger.in | 14
-rw-r--r--  src/third_party/wiredtiger/src/include/wt_internal.h | 2
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c | 27
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fs.c | 4
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 122
-rw-r--r--  src/third_party/wiredtiger/src/reconcile/rec_write.c | 31
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 37
-rw-r--r--  src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c | 21
-rwxr-xr-x  src/third_party/wiredtiger/test/evergreen.yml | 2
-rwxr-xr-x  src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh | 31
-rw-r--r--  src/third_party/wiredtiger/test/format/backup.c | 2
-rw-r--r--  src/third_party/wiredtiger/test/format/config.c | 135
-rw-r--r--  src/third_party/wiredtiger/test/format/config_compat.c | 2
-rw-r--r--  src/third_party/wiredtiger/test/format/format.h | 6
-rw-r--r--  src/third_party/wiredtiger/test/format/t.c | 8
-rw-r--r--  src/third_party/wiredtiger/test/format/wts.c | 12
-rwxr-xr-x  src/third_party/wiredtiger/test/suite/test_bug023.py | 102
-rw-r--r--  src/third_party/wiredtiger/test/suite/test_hs06.py | 9
-rwxr-xr-x  src/third_party/wiredtiger/test/suite/test_rollback_to_stable06.py | 3
49 files changed, 1325 insertions, 772 deletions
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/example_prepare.py b/src/third_party/wiredtiger/bench/workgen/runner/example_prepare.py
new file mode 100644
index 00000000000..4520f2cb787
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/workgen/runner/example_prepare.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+from runner import *
+from wiredtiger import *
+from workgen import *
+
+conn = wiredtiger_open("WT_TEST", "create,cache_size=500MB")
+s = conn.open_session()
+tname = "table:test"
+config = "key_format=S,value_format=S,"
+s.create(tname, config)
+table = Table(tname)
+table.options.key_size = 20
+table.options.value_size = 10
+
+context = Context()
+op = Operation(Operation.OP_INSERT, table)
+thread = Thread(op * 5000)
+pop_workload = Workload(context, thread)
+print('populate:')
+pop_workload.run(conn)
+
+opread = Operation(Operation.OP_SEARCH, table)
+read_txn = txn(opread * 10, 'read_timestamp')
+# read_timestamp_lag is the lag to the read_timestamp from current time
+read_txn.transaction.read_timestamp_lag = 5
+treader = Thread(read_txn)
+
+opwrite = Operation(Operation.OP_INSERT, table)
+write_txn = txn(opwrite * 10, 'isolation=snapshot')
+# use_prepare_timestamp - Prepare the transaction, then commit it with commit and durable timestamps.
+write_txn.transaction.use_prepare_timestamp = True
+twriter = Thread(write_txn)
+
+opupdate = Operation(Operation.OP_UPDATE, table)
+update_txn = txn(opupdate * 10, 'isolation=snapshot')
+# use_commit_timestamp - Commit the transaction with commit_timestamp.
+update_txn.transaction.use_commit_timestamp = True
+tupdate = Thread(update_txn)
+
+workload = Workload(context, 10 * twriter + 10 * tupdate + 10 * treader)
+workload.options.run_time = 50
+workload.options.report_interval=500
+# oldest_timestamp_lag - Number of seconds the oldest_timestamp lags behind the current time.
+workload.options.oldest_timestamp_lag=30
+# stable_timestamp_lag - Number of seconds the stable_timestamp lags behind the current time.
+workload.options.stable_timestamp_lag=10
+# timestamp_advance is the number of seconds to wait before moving oldest and stable timestamp.
+workload.options.timestamp_advance=1
+print('transactional prepare workload:')
+workload.run(conn)
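
The example above exercises both layers of the new timestamp support: per-transaction settings (read_timestamp_lag, use_commit_timestamp, use_prepare_timestamp) and the workload-wide oldest_timestamp_lag, stable_timestamp_lag and timestamp_advance options that drive a background timestamp thread. A minimal sketch of the arithmetic behind those lags, assuming (as workgen's WorkgenTimeStamp helpers do) that timestamps are microseconds since the Unix epoch; the helper name below is illustrative only:

# Sketch only: mirrors WorkgenTimeStamp::get_timestamp_lag() in workgen.
import time

def timestamp_lag_us(lag_seconds):
    # Current epoch time in microseconds, minus the requested lag in microseconds.
    return int(time.time() * 1000000) - int(lag_seconds * 1000000)

# With the settings used above, at any instant the runner aims for:
read_ts = timestamp_lag_us(5)      # per-transaction read_timestamp, 5 seconds behind now
stable_ts = timestamp_lag_us(10)   # stable_timestamp, 10 seconds behind now
oldest_ts = timestamp_lag_us(30)   # oldest_timestamp, 30 seconds behind now
assert oldest_ts <= stable_ts <= read_ts
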
diff --git a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
index ae3de8efa64..158a65d1fbd 100755
--- a/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
+++ b/src/third_party/wiredtiger/bench/workgen/runner/runner/core.py
@@ -35,7 +35,7 @@ from workgen import Key, Operation, OpList, Table, Transaction, Value
# Put the operation (and any suboperations) within a transaction.
def txn(op, config=None):
t = Transaction(config)
- op._transaction = t
+ op.transaction = t
return op
# sleep --
@@ -301,7 +301,7 @@ def _op_transaction_list(oplist, txn_config):
def op_group_transaction(ops_arg, ops_per_txn, txn_config):
if ops_arg != Operation.OP_NONE:
return txn(ops_arg, txn_config)
- if ops_arg._transaction != None:
+ if ops_arg.transaction != None:
raise Exception('nested transactions not supported')
if ops_arg._repeatgroup != None:
raise Exception('grouping transactions with multipliers not supported')
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.cxx b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
index 2347cf9d6b5..ca0cd4b308d 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.cxx
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.cxx
@@ -35,7 +35,6 @@
#include "wiredtiger.h"
#include "workgen.h"
#include "workgen_int.h"
-#include "workgen_time.h"
extern "C" {
// Include some specific WT files, as some files included by wt_internal.h
// have some C-ism's that don't work in C++.
@@ -43,12 +42,11 @@ extern "C" {
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
-#include <unistd.h>
#include <errno.h>
-#include <math.h>
#include "error.h"
#include "misc.h"
}
+#define BUF_SIZE 100
#define LATENCY_US_BUCKETS 1000
#define LATENCY_MS_BUCKETS 1000
@@ -102,6 +100,12 @@ extern "C" {
namespace workgen {
+
+struct WorkloadRunnerConnection {
+ WorkloadRunner *runner;
+ WT_CONNECTION *connection;
+};
+
// The number of contexts. Normally there is one context created, but it will
// be possible to use several eventually. More than one is not yet
// implemented, but we must at least guard against the caller creating more
@@ -118,6 +122,48 @@ static void *thread_runner_main(void *arg) {
return (NULL);
}
+static void *thread_workload(void *arg) {
+
+ WorkloadRunnerConnection *runnerConnection = (WorkloadRunnerConnection *) arg;
+ WorkloadRunner *runner = runnerConnection->runner;
+ WT_CONNECTION *connection = runnerConnection->connection;
+
+ try {
+ runner->increment_timestamp(connection);
+ } catch (WorkgenException &wge) {
+ std::cerr << "Exception while incrementing timestamp." << std::endl;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Periodically set oldest_timestamp and stable_timestamp, each lagging the current time by its
+ * configured amount, then sleep for "timestamp_advance" seconds; repeat until stopping is set to true.
+ */
+int WorkloadRunner::increment_timestamp(WT_CONNECTION *conn) {
+ char buf[BUF_SIZE];
+ uint64_t time_us;
+
+ while (!stopping)
+ {
+ if (_workload->options.oldest_timestamp_lag > 0) {
+ time_us = WorkgenTimeStamp::get_timestamp_lag(_workload->options.oldest_timestamp_lag);
+ sprintf(buf, "oldest_timestamp=%" PRIu64, time_us);
+ conn->set_timestamp(conn, buf);
+ }
+
+ if (_workload->options.stable_timestamp_lag > 0) {
+ time_us = WorkgenTimeStamp::get_timestamp_lag(_workload->options.stable_timestamp_lag);
+ sprintf(buf, "stable_timestamp=%" PRIu64, time_us);
+ conn->set_timestamp(conn, buf);
+ }
+
+ WorkgenTimeStamp::sleep(_workload->options.timestamp_advance);
+ }
+ return 0;
+}
+
static void *monitor_main(void *arg) {
Monitor *monitor = (Monitor *)arg;
try {
@@ -715,6 +761,9 @@ int ThreadRunner::op_run(Operation *op) {
uint64_t recno;
uint64_t range;
bool measure_latency, own_cursor, retry_op;
+ timespec start_time;
+ uint64_t time_us;
+ char buf[BUF_SIZE];
track = NULL;
cursor = NULL;
@@ -795,6 +844,7 @@ int ThreadRunner::op_run(Operation *op) {
timespec start;
if (measure_latency)
workgen_epoch(&start);
+
// Whether or not we are measuring latency, we track how many operations
// are in progress, or that complete.
if (track != NULL)
@@ -814,11 +864,22 @@ int ThreadRunner::op_run(Operation *op) {
}
// Retry on rollback until success.
while (retry_op) {
- if (op->_transaction != NULL) {
+ if (op->transaction != NULL) {
if (_in_transaction)
THROW("nested transactions not supported");
- WT_ERR(_session->begin_transaction(_session,
- op->_transaction->_begin_config.c_str()));
+ if (op->transaction->use_commit_timestamp && op->transaction->use_prepare_timestamp)
+ {
+                THROW("Only one of use_prepare_timestamp and use_commit_timestamp can be set.");
+ }
+ if (op->transaction->read_timestamp_lag > 0) {
+ uint64_t read = WorkgenTimeStamp::get_timestamp_lag(op->transaction->read_timestamp_lag);
+ sprintf(buf, "%s=%" PRIu64, op->transaction->_begin_config.c_str(), read);
+ }
+ else {
+ sprintf(buf, "%s", op->transaction->_begin_config.c_str());
+ }
+ WT_ERR(_session->begin_transaction(_session, buf));
+
_in_transaction = true;
}
if (op->is_table_op()) {
@@ -899,12 +960,28 @@ int ThreadRunner::op_run(Operation *op) {
err:
if (own_cursor)
WT_TRET(cursor->close(cursor));
- if (op->_transaction != NULL) {
- if (ret != 0 || op->_transaction->_rollback)
+ if (op->transaction != NULL) {
+ if (ret != 0 || op->transaction->_rollback)
WT_TRET(_session->rollback_transaction(_session, NULL));
- else if (_in_transaction)
- ret = _session->commit_transaction(_session,
- op->_transaction->_commit_config.c_str());
+ else if (_in_transaction) {
+ // Set prepare, commit and durable timestamp if prepare is set.
+ if (op->transaction->use_prepare_timestamp) {
+ time_us = WorkgenTimeStamp::get_timestamp();
+ sprintf(buf, "prepare_timestamp=%" PRIu64, time_us);
+ ret = _session->prepare_transaction(_session, buf);
+ sprintf(buf, "commit_timestamp=%" PRIu64 ",durable_timestamp=%" PRIu64, time_us, time_us);
+ ret = _session->commit_transaction(_session, buf);
+ }
+ else if (op->transaction->use_commit_timestamp) {
+ uint64_t commit_time_us = WorkgenTimeStamp::get_timestamp();
+ sprintf(buf, "commit_timestamp=%" PRIu64, commit_time_us);
+ ret = _session->commit_transaction(_session, buf);
+ }
+ else {
+ ret = _session->commit_transaction(_session,
+ op->transaction->_commit_config.c_str());
+ }
+ }
_in_transaction = false;
}
return (ret);
@@ -1077,27 +1154,27 @@ void Thread::describe(std::ostream &os) const {
Operation::Operation() :
_optype(OP_NONE), _internal(NULL), _table(), _key(), _value(), _config(),
- _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
+ transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
}
Operation::Operation(OpType optype, Table table, Key key, Value value) :
_optype(optype), _internal(NULL), _table(table), _key(key), _value(value),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
+ _config(), transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
Operation::Operation(OpType optype, Table table, Key key) :
_optype(optype), _internal(NULL), _table(table), _key(key), _value(),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
+ _config(), transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
Operation::Operation(OpType optype, Table table) :
_optype(optype), _internal(NULL), _table(table), _key(), _value(),
- _config(), _transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
+ _config(), transaction(NULL), _group(NULL), _repeatgroup(0), _timed(0.0) {
init_internal(NULL);
size_check();
}
@@ -1105,22 +1182,22 @@ Operation::Operation(OpType optype, Table table) :
Operation::Operation(const Operation &other) :
_optype(other._optype), _internal(NULL), _table(other._table),
_key(other._key), _value(other._value), _config(other._config),
- _transaction(other._transaction), _group(other._group),
+ transaction(other.transaction), _group(other._group),
_repeatgroup(other._repeatgroup), _timed(other._timed) {
- // Creation and destruction of _group and _transaction is managed
+ // Creation and destruction of _group and transaction is managed
// by Python.
init_internal(other._internal);
}
Operation::Operation(OpType optype, const char *config) :
_optype(optype), _internal(NULL), _table(), _key(), _value(),
- _config(config), _transaction(NULL), _group(NULL), _repeatgroup(0),
+ _config(config), transaction(NULL), _group(NULL), _repeatgroup(0),
_timed(0.0) {
init_internal(NULL);
}
Operation::~Operation() {
- // Creation and destruction of _group, _transaction is managed by Python.
+ // Creation and destruction of _group, transaction is managed by Python.
delete _internal;
}
@@ -1129,7 +1206,7 @@ Operation& Operation::operator=(const Operation &other) {
_table = other._table;
_key = other._key;
_value = other._value;
- _transaction = other._transaction;
+ transaction = other.transaction;
_group = other._group;
_repeatgroup = other._repeatgroup;
_timed = other._timed;
@@ -1184,7 +1261,7 @@ void Operation::init_internal(OperationInternal *other) {
bool Operation::combinable() const {
return (_group != NULL && _repeatgroup == 1 && _timed == 0.0 &&
- _transaction == NULL && _config == "");
+ transaction == NULL && _config == "");
}
void Operation::create_all() {
@@ -1203,9 +1280,9 @@ void Operation::describe(std::ostream &os) const {
}
if (!_config.empty())
os << ", '" << _config << "'";
- if (_transaction != NULL) {
+ if (transaction != NULL) {
os << ", [";
- _transaction->describe(os);
+ transaction->describe(os);
os << "]";
}
if (_timed != 0.0)
@@ -1439,7 +1516,7 @@ int SleepOperationInternal::run(ThreadRunner *runner, WT_SESSION *session)
uint64_t SleepOperationInternal::sync_time_us() const
{
- return (secs_us(_sleepvalue));
+ return (secs_us(_sleepvalue));
}
void TableOperationInternal::parse_config(const std::string &config)
@@ -1857,7 +1934,8 @@ TableInternal::~TableInternal() {}
WorkloadOptions::WorkloadOptions() : max_latency(0),
report_file("workload.stat"), report_interval(0), run_time(0),
sample_file("monitor.json"), sample_interval_ms(0), sample_rate(1),
- warmup(0), _options() {
+ warmup(0), oldest_timestamp_lag(0.0), stable_timestamp_lag(0.0),
+ timestamp_advance(0.0), _options() {
_options.add_int("max_latency", max_latency,
"prints warning if any latency measured exceeds this number of "
"milliseconds. Requires sample_interval to be configured.");
@@ -1881,6 +1959,13 @@ WorkloadOptions::WorkloadOptions() : max_latency(0),
"2 for every second operation, 3 for every third operation etc.");
_options.add_int("warmup", warmup,
"how long to run the workload phase before starting measurements");
+ _options.add_double("oldest_timestamp_lag", oldest_timestamp_lag,
+ "how much lag to the oldest timestamp from epoch time");
+ _options.add_double("stable_timestamp_lag", stable_timestamp_lag,
+      "how much lag to the stable timestamp from epoch time");
+ _options.add_double("timestamp_advance", timestamp_advance,
+      "how many seconds to wait before moving oldest and stable "
+ "timestamp forward");
}
WorkloadOptions::WorkloadOptions(const WorkloadOptions &other) :
@@ -1917,13 +2002,12 @@ Workload& Workload::operator=(const Workload &other) {
int Workload::run(WT_CONNECTION *conn) {
WorkloadRunner runner(this);
-
return (runner.run(conn));
}
WorkloadRunner::WorkloadRunner(Workload *workload) :
_workload(workload), _trunners(workload->_threads.size()),
- _report_out(&std::cout), _start() {
+ _report_out(&std::cout), _start(), stopping(false) {
ts_clear(_start);
}
WorkloadRunner::~WorkloadRunner() {}
@@ -1934,6 +2018,9 @@ int WorkloadRunner::run(WT_CONNECTION *conn) {
std::ofstream report_out;
_wt_home = conn->get_home(conn);
+
+ if ( (options->oldest_timestamp_lag > 0 || options->stable_timestamp_lag > 0) && options->timestamp_advance < 0 )
+ THROW("Workload.options.timestamp_advance must be positive if either Workload.options.oldest_timestamp_lag or Workload.options.stable_timestamp_lag is set");
if (options->sample_interval_ms > 0 && options->sample_rate <= 0)
THROW("Workload.options.sample_rate must be positive");
if (!options->report_file.empty()) {
@@ -1944,7 +2031,7 @@ int WorkloadRunner::run(WT_CONNECTION *conn) {
WT_ERR(create_all(conn, _workload->_context));
WT_ERR(open_all());
WT_ERR(ThreadRunner::cross_check(_trunners));
- WT_ERR(run_all());
+ WT_ERR(run_all(conn));
err:
//TODO: (void)close_all();
_report_out = &std::cout;
@@ -2031,16 +2118,18 @@ void WorkloadRunner::final_report(timespec &totalsecs) {
out << "Run completed: " << totalsecs << " seconds" << std::endl;
}
-int WorkloadRunner::run_all() {
+int WorkloadRunner::run_all(WT_CONNECTION *conn) {
void *status;
std::vector<pthread_t> thread_handles;
Stats counts(false);
WorkgenException *exception;
WorkloadOptions *options = &_workload->options;
+ WorkloadRunnerConnection *runnerConnection;
Monitor monitor(*this);
std::ofstream monitor_out;
std::ofstream monitor_json;
std::ostream &out = *_report_out;
+ pthread_t time_thandle;
WT_DECL_RET;
for (size_t i = 0; i < _trunners.size(); i++)
@@ -2086,6 +2175,22 @@ int WorkloadRunner::run_all() {
thread_handles.push_back(thandle);
}
+ // Start Timestamp increment thread
+ if (options->oldest_timestamp_lag > 0 || options->stable_timestamp_lag > 0) {
+
+ runnerConnection = new WorkloadRunnerConnection();
+ runnerConnection->runner = this;
+ runnerConnection->connection = conn;
+
+ if ((ret = pthread_create(&time_thandle, NULL, thread_workload,
+ runnerConnection)) != 0) {
+ std::cerr << "pthread_create failed err=" << ret << std::endl;
+ std::cerr << "Stopping Time threads." << std::endl;
+ (void)pthread_join(time_thandle, &status);
+ delete runnerConnection;
+ }
+ }
+
// Treat warmup separately from report interval so that if we have a
// warmup period we clear and ignore stats after it ends.
if (options->warmup != 0)
@@ -2132,6 +2237,9 @@ int WorkloadRunner::run_all() {
_trunners[i]._stop = true;
if (options->sample_interval_ms > 0)
monitor._stop = true;
+ if (options->oldest_timestamp_lag > 0 || options->stable_timestamp_lag > 0) {
+ stopping = true;
+ }
// wait for all threads
exception = NULL;
@@ -2146,6 +2254,12 @@ int WorkloadRunner::run_all() {
exception = &_trunners[i]._exception;
}
+ // Wait for the time increment thread
+ if (options->oldest_timestamp_lag > 0 || options->stable_timestamp_lag > 0) {
+ WT_TRET(pthread_join(time_thandle, &status));
+ delete runnerConnection;
+ }
+
workgen_epoch(&now);
if (options->sample_interval_ms > 0) {
WT_TRET(pthread_join(monitor._handle, &status));
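
The WorkloadRunner::increment_timestamp() loop added above periodically advances the connection's oldest and stable timestamps, keeping each at its configured lag behind the current time. A rough Python equivalent, assuming an open wiredtiger connection conn, an opts object carrying the workload options and a threading.Event as the stop flag (all illustrative, mirroring the decimal formatting used by the C++ code):

# Sketch of the background timestamp thread's loop; not the workgen implementation.
import threading, time

def increment_timestamp(conn, opts, stopping):
    while not stopping.is_set():
        now_us = int(time.time() * 1000000)
        if opts.oldest_timestamp_lag > 0:
            conn.set_timestamp('oldest_timestamp=%d' %
                               (now_us - int(opts.oldest_timestamp_lag * 1000000)))
        if opts.stable_timestamp_lag > 0:
            conn.set_timestamp('stable_timestamp=%d' %
                               (now_us - int(opts.stable_timestamp_lag * 1000000)))
        time.sleep(opts.timestamp_advance)
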
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen.h b/src/third_party/wiredtiger/bench/workgen/workgen.h
index 382ca65dcfc..b963cf3d47e 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen.h
@@ -292,7 +292,7 @@ struct Operation {
Key _key;
Value _value;
std::string _config;
- Transaction *_transaction;
+ Transaction *transaction;
std::vector<Operation> *_group;
int _repeatgroup;
double _timed;
@@ -386,11 +386,15 @@ struct Thread {
struct Transaction {
bool _rollback;
+ bool use_commit_timestamp;
+ bool use_prepare_timestamp;
std::string _begin_config;
std::string _commit_config;
+ double read_timestamp_lag;
- Transaction(const char *_config = NULL) : _rollback(false),
- _begin_config(_config == NULL ? "" : _config), _commit_config() {}
+ Transaction(const char *_config = NULL) : _rollback(false), use_commit_timestamp(false), use_prepare_timestamp(false), _begin_config(_config == NULL ? "" : _config), _commit_config(),
+ read_timestamp_lag(0.0)
+ {}
void describe(std::ostream &os) const {
os << "Transaction: ";
@@ -399,6 +403,12 @@ struct Transaction {
os << "begin_config: " << _begin_config;
if (!_commit_config.empty())
os << ", commit_config: " << _commit_config;
+ if (use_commit_timestamp)
+ os << "(use_commit_timestamp) ";
+ if (use_prepare_timestamp)
+ os << "(use_prepare_timestamp) ";
+ if (read_timestamp_lag)
+ os << "(read_timestamp_lag)";
}
};
@@ -414,6 +424,9 @@ struct WorkloadOptions {
int sample_rate;
std::string sample_file;
int warmup;
+ double oldest_timestamp_lag;
+ double stable_timestamp_lag;
+ double timestamp_advance;
WorkloadOptions();
WorkloadOptions(const WorkloadOptions &other);
diff --git a/src/third_party/wiredtiger/bench/workgen/workgen_int.h b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
index ca93e5c2733..d5ed99c8c53 100644
--- a/src/third_party/wiredtiger/bench/workgen/workgen_int.h
+++ b/src/third_party/wiredtiger/bench/workgen/workgen_int.h
@@ -30,11 +30,12 @@
#include <vector>
#include <map>
#include <set>
-#ifndef SWIG
extern "C" {
+#include <unistd.h>
#include "workgen_func.h"
+#include <math.h>
}
-#endif
+#include "workgen_time.h"
namespace workgen {
@@ -46,6 +47,28 @@ typedef uint32_t tint_t;
struct ThreadRunner;
struct WorkloadRunner;
+struct WorkgenTimeStamp {
+ WorkgenTimeStamp() {}
+
+ static uint64_t get_timestamp_lag(double seconds) {
+ timespec start_time;
+ workgen_epoch(&start_time);
+
+ return (ts_us(start_time) - secs_us(seconds));
+ }
+
+ static void sleep(double seconds) {
+ usleep(ceil(secs_us(seconds)));
+ }
+
+ static uint64_t get_timestamp() {
+ timespec start_time;
+ workgen_epoch(&start_time);
+
+ return (ts_us(start_time));
+ }
+};
+
// A exception generated by the workgen classes. Methods generally return an
// int errno, so this is useful primarily for notifying the caller about
// failures in constructors.
@@ -250,10 +273,12 @@ struct WorkloadRunner {
std::ostream *_report_out;
std::string _wt_home;
timespec _start;
+ bool stopping;
WorkloadRunner(Workload *);
~WorkloadRunner();
int run(WT_CONNECTION *conn);
+ int increment_timestamp(WT_CONNECTION *conn);
private:
int close_all();
@@ -263,7 +288,7 @@ private:
int open_all();
void open_report_file(std::ofstream &, const char *, const char *);
void report(time_t, time_t, Stats *stats);
- int run_all();
+ int run_all(WT_CONNECTION *conn);
WorkloadRunner(const WorkloadRunner &); // disallowed
WorkloadRunner& operator=(const WorkloadRunner &other); // disallowed
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 9070134e8e2..6314cd62004 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -43,6 +43,7 @@ Barack
BerkeleyDB
Bitfield
Bitwise
+Blqr
Brueckner
Bsearch
Btree
@@ -958,7 +959,6 @@ lookaside
lookup
lookups
lossy
-lqr
lqrt
lr
lrt
@@ -975,6 +975,7 @@ lz
lzo
mT
madvise
+majmin
majorp
malloc
mappable
diff --git a/src/third_party/wiredtiger/examples/c/ex_backup.c b/src/third_party/wiredtiger/examples/c/ex_backup.c
index d9f410461ee..30d8ee4dd88 100644
--- a/src/third_party/wiredtiger/examples/c/ex_backup.c
+++ b/src/third_party/wiredtiger/examples/c/ex_backup.c
@@ -217,7 +217,9 @@ take_incr_backup(WT_SESSION *session, int i)
* With an incremental cursor, we want to truncate on the backup cursor to archive the logs.
* Only do this if the copy process was entirely successful.
*/
+ /*! [Truncate a backup cursor] */
error_check(session->truncate(session, "log:", cursor, NULL, NULL));
+ /*! [Truncate a backup cursor] */
error_check(cursor->close(cursor));
}
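
The new snippet markers above lift the truncate-on-a-backup-cursor call into the documentation. The same log-archiving step is available from the Python API; a sketch, assuming an open session and that the log files named by the cursor have already been copied successfully:

# Sketch: archive logs after a successful incremental (log-based) backup copy.
cursor = session.open_cursor('backup:', None, 'target=("log:")')
# ... copy each log file named by the cursor ...
# Truncating on the backup cursor tells WiredTiger the copied logs may be removed.
session.truncate('log:', cursor, None, None)
cursor.close()
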
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 51a6056c6fa..c7fdaec04b5 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "5b5d7988560a20dbe827a1fa16c0481e6c0e2821"
+ "commit": "da6c25fee0c5c9b0376df0dc19caa40a553cc5a2"
}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
index 3f11cbe5496..e8d100d6df7 100644
--- a/src/third_party/wiredtiger/src/block/block_ext.c
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -1185,6 +1185,7 @@ __wt_block_extlist_write(
dsk = tmp->mem;
memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
dsk->type = WT_PAGE_BLOCK_MANAGER;
+ dsk->version = WT_PAGE_VERSION_TS;
/* Fill the page's data. */
p = WT_BLOCK_HEADER_BYTE(dsk);
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index 63f8113eceb..0026a9dafdd 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -794,7 +794,8 @@ err:
* Dump information about a key and/or value.
*/
int
-__wt_debug_key_value(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *value)
+__wt_debug_key_value(
+ WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno, uint64_t rle, WT_CELL_UNPACK *value)
{
WT_DBG *ds, _ds;
WT_DECL_RET;
@@ -803,17 +804,16 @@ __wt_debug_key_value(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *val
WT_ERR(__debug_config(session, ds, NULL));
- if (key != NULL)
+ if (key == NULL)
+ WT_ERR(ds->f(ds, "\tK {%" PRIu64 " %" PRIu64 "}", recno, rle));
+ else
WT_ERR(__debug_item_key(ds, "K", key->data, key->size));
- if (value != NULL) {
- WT_ERR(__debug_time_pairs(
- ds, "T", value->start_ts, value->start_txn, value->stop_ts, value->stop_txn));
- WT_ERR(__debug_cell_data(ds, NULL, value != NULL ? value->type : 0, "V", value));
- }
+ WT_ERR(__debug_time_pairs(
+ ds, "T", value->start_ts, value->start_txn, value->stop_ts, value->stop_txn));
+ WT_ERR(__debug_cell_data(ds, NULL, value != NULL ? value->type : 0, "V", value));
err:
- WT_RET(__debug_wrapup(ds));
- return (ret);
+ return (__debug_wrapup(ds));
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 7e369fcaa0f..e2bdb70a078 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1388,7 +1388,7 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_SAVE_UPD *supd;
- WT_UPDATE *upd;
+ WT_UPDATE *prev_onpage, *upd;
uint64_t recno;
uint32_t i, slot;
@@ -1423,9 +1423,13 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
if (!WT_READGEN_EVICT_SOON(orig->read_gen))
page->read_gen = orig->read_gen;
- /* If there are no updates to apply to the page, we're done. */
+ /*
+ * If there are no updates to apply to the page, we're done. Otherwise, there are updates we
+ * need to restore.
+ */
if (multi->supd_entries == 0)
return (0);
+ WT_ASSERT(session, multi->supd_restore);
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
@@ -1435,11 +1439,49 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
/* Re-create each modification we couldn't write. */
for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
+ /* Ignore update chains that don't need to be restored. */
+ if (!supd->restore)
+ continue;
+
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
+ upd = orig->modify->mod_row_update[slot];
+ } else
+ upd = supd->ins->upd;
+
+ /* We shouldn't restore an empty update chain. */
+ WT_ASSERT(session, upd != NULL);
+
+ /*
+ * Truncate the onpage value and the older versions moved to the history store. We can't
+         * truncate the updates for in-memory databases and fixed-length column stores as they don't
+         * support the history store. We can't free the truncated updates here as we may still fail.
+ * If we fail, we will append them back to their original update chains. Truncate before we
+ * restore them to ensure the size of the page is correct.
+ */
+ if (supd->onpage_upd != NULL && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY) &&
+ orig->type != WT_PAGE_COL_FIX) {
+ /*
+ * We have decided to restore this update chain so it must have newer updates than the
+ * onpage value on it.
+ */
+ WT_ASSERT(session, upd != supd->onpage_upd);
+ /*
+ * Move the pointer to the position before the onpage value and truncate all the updates
+ * starting from the onpage value.
+ */
+ for (prev_onpage = upd;
+ prev_onpage->next != NULL && prev_onpage->next != supd->onpage_upd;
+ prev_onpage = prev_onpage->next)
+ ;
+ WT_ASSERT(session, prev_onpage->next == supd->onpage_upd);
+ prev_onpage->next = NULL;
+ }
+
switch (orig->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
/* Build a key. */
- upd = supd->ins->upd;
recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
@@ -1450,14 +1492,9 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
- if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->ripcip);
- upd = orig->modify->mod_row_update[slot];
-
+ if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(session, orig, supd->ripcip, key, false));
- } else {
- upd = supd->ins->upd;
-
+ else {
key->data = WT_INSERT_KEY(supd->ins);
key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
@@ -1506,42 +1543,78 @@ err:
/*
* __split_multi_inmem_final --
- * Discard moved update lists from the original page.
+ * Discard moved update lists from the original page and free the updates written to the data
+ * store and the history store.
*/
static void
-__split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
+__split_multi_inmem_final(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi)
{
WT_SAVE_UPD *supd;
uint32_t i, slot;
+ /* If we have saved updates, we must have decided to restore them to the new page. */
+ WT_ASSERT(session, multi->supd_entries == 0 || multi->supd_restore);
+
/*
* We successfully created new in-memory pages. For error-handling reasons, we've left the
* update chains referenced by both the original and new pages. We're ready to discard the
- * original page, terminate the original page's reference to any update list we moved.
+ * original page, terminate the original page's reference to any update list we moved and free
+ * the updates written to the data store and the history store.
*/
- for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
- switch (orig->type) {
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_VAR:
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
+ /* We have finished restoration. Discard the update chains that aren't restored. */
+ if (!supd->restore)
+ continue;
+
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
+ orig->modify->mod_row_update[slot] = NULL;
+ } else
supd->ins->upd = NULL;
- break;
- case WT_PAGE_ROW_LEAF:
- if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->ripcip);
- orig->modify->mod_row_update[slot] = NULL;
- } else
- supd->ins->upd = NULL;
- break;
- }
+
+ /* Free the updates written to the data store and the history store. */
+ if (supd->onpage_upd != NULL && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY) &&
+ orig->type != WT_PAGE_COL_FIX)
+ __wt_free_update_list(session, &supd->onpage_upd);
+ }
}
/*
* __split_multi_inmem_fail --
- * Discard allocated pages after failure.
+ * Discard allocated pages after failure and append the onpage values back to the original
+ * update chains.
*/
static void
-__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref)
+__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT_REF *ref)
{
+ WT_SAVE_UPD *supd;
+ WT_UPDATE *upd;
+ uint32_t i, slot;
+
+ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && orig->type != WT_PAGE_COL_FIX)
+ /* Append the onpage values back to the original update chains. */
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
+ /*
+ * We don't need to do anything for update chains that are not restored, or restored
+ * without an onpage value.
+ */
+ if (!supd->restore || supd->onpage_upd == NULL)
+ continue;
+
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
+ upd = orig->modify->mod_row_update[slot];
+ } else
+ upd = supd->ins->upd;
+
+ WT_ASSERT(session, upd != NULL);
+
+ for (; upd->next != NULL && upd->next != supd->onpage_upd; upd = upd->next)
+ ;
+ if (upd->next == NULL)
+ upd->next = supd->onpage_upd;
+ }
+
/*
* We failed creating new in-memory pages. For error-handling reasons, we've left the update
* chains referenced by both the original and new pages. Discard the newly allocated WT_REF
@@ -1579,6 +1652,10 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_R
/* If closing the file, there better not be any saved updates. */
WT_ASSERT(session, !closing || multi->supd == NULL);
+ /* If we don't have a disk image, we can't restore the saved updates. */
+ WT_ASSERT(
+ session, multi->disk_image != NULL || (multi->supd_entries == 0 && !multi->supd_restore));
+
/* Verify any disk image we have. */
WT_ASSERT(session, multi->disk_image == NULL ||
__wt_verify_dsk_image(
@@ -2064,7 +2141,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
* Finalize the move, discarding moved update lists from the original page.
*/
for (i = 0; i < new_entries; ++i)
- __split_multi_inmem_final(page, &mod->mod_multi[i]);
+ __split_multi_inmem_final(session, page, &mod->mod_multi[i]);
/*
* Pages with unresolved changes are not marked clean in reconciliation, do it now, then discard
@@ -2076,7 +2153,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (0) {
err:
for (i = 0; i < new_entries; ++i)
- __split_multi_inmem_fail(session, page, ref_new[i]);
+ __split_multi_inmem_fail(session, page, &mod->mod_multi[i], ref_new[i]);
}
__wt_free(session, ref_new);
@@ -2199,7 +2276,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
*
* Finalize the move, discarding moved update lists from the original page.
*/
- __split_multi_inmem_final(page, multi);
+ __split_multi_inmem_final(session, page, multi);
/*
* Discard the original page.
@@ -2222,6 +2299,6 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
return (0);
err:
- __split_multi_inmem_fail(session, page, new);
+ __split_multi_inmem_fail(session, page, multi, new);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 229d7d51ce0..5335d04fb93 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -13,11 +13,12 @@
* prettier.
*/
typedef struct {
- uint64_t record_total; /* Total record count */
+ uint64_t records_so_far; /* Records seen so far */
WT_ITEM *max_key; /* Largest key */
WT_ITEM *max_addr; /* Largest key page */
+#define WT_VERIFY_PROGRESS_INTERVAL 100
uint64_t fcnt; /* Progress counter */
/* Configuration options passed in. */
@@ -29,7 +30,6 @@ typedef struct {
bool dump_history;
bool dump_layout;
bool dump_pages;
- bool hs_verify;
/* Page layout information. */
uint64_t depth, depth_internal[100], depth_leaf[100];
@@ -38,14 +38,10 @@ typedef struct {
} WT_VSTUFF;
static void __verify_checkpoint_reset(WT_VSTUFF *);
-static int __verify_col_var_page_hs(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
-static int __verify_key_hs(WT_SESSION_IMPL *, WT_ITEM *, WT_CELL_UNPACK *, WT_VSTUFF *);
-static int __verify_page_cell(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK *, WT_VSTUFF *);
+static int __verify_page_content(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK *, WT_VSTUFF *);
static int __verify_row_int_key_order(
WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *);
static int __verify_row_leaf_key_order(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
-static int __verify_row_leaf_page_hs(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
-static const char *__verify_timestamp_to_pretty_string(wt_timestamp_t, char *ts_string);
static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_CELL_UNPACK *, WT_VSTUFF *);
static int __verify_ts_stable_cmp(
WT_SESSION_IMPL *, WT_ITEM *, WT_REF *, uint32_t, wt_timestamp_t, wt_timestamp_t, WT_VSTUFF *);
@@ -160,188 +156,6 @@ __dump_layout(WT_SESSION_IMPL *session, WT_VSTUFF *vs)
}
/*
- * __verify_col_var_page_hs --
- * Verify a page against the history store.
- */
-static int
-__verify_col_var_page_hs(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
-{
- WT_CELL *cell;
- WT_CELL_UNPACK *unpack, _unpack;
- WT_COL *cip;
- WT_DECL_ITEM(key);
- WT_DECL_RET;
- WT_PAGE *page;
- uint64_t recno, rle;
- uint32_t i;
- uint8_t *p;
-
- page = ref->page;
- recno = ref->ref_recno;
- unpack = &_unpack;
-
- /* Ensure enough room for a column-store key without checking. */
- WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
-
- WT_COL_FOREACH (page, cip, i) {
- p = key->mem;
- WT_ERR(__wt_vpack_uint(&p, 0, recno));
- key->size = WT_PTRDIFF(p, key->data);
-
- cell = WT_COL_PTR(page, cip);
- __wt_cell_unpack(session, page, cell, unpack);
- rle = __wt_cell_rle(unpack);
-
-#ifdef HAVE_DIAGNOSTIC
- /* Optionally dump historical time pairs and values in debug mode. */
- if (vs->dump_history) {
- WT_ERR(__wt_msg(session, "\tK {%" PRIu64 " %" PRIu64 "}", recno, rle));
- WT_ERR(__wt_debug_key_value(session, NULL, unpack));
- }
-#endif
-
- WT_ERR(__verify_key_hs(session, key, unpack, vs));
- recno += rle;
- }
-
-err:
- __wt_scr_free(session, &key);
-
- return (ret);
-}
-
-/*
- * __verify_row_leaf_page_hs --
- * Verify a page against the history store.
- */
-static int
-__verify_row_leaf_page_hs(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
-{
- WT_CELL_UNPACK *unpack, _unpack;
- WT_DECL_ITEM(key);
- WT_DECL_RET;
- WT_PAGE *page;
- WT_ROW *rip;
- uint32_t i;
-
- page = ref->page;
- unpack = &_unpack;
-
- WT_RET(__wt_scr_alloc(session, 256, &key));
-
- WT_ROW_FOREACH (page, rip, i) {
- WT_ERR(__wt_row_leaf_key(session, page, rip, key, false));
- __wt_row_leaf_value_cell(session, page, rip, NULL, unpack);
-
-#ifdef HAVE_DIAGNOSTIC
- /* Optionally dump historical time pairs and values in debug mode. */
- if (vs->dump_history)
- WT_ERR(__wt_debug_key_value(session, key, unpack));
-#endif
-
- WT_ERR(__verify_key_hs(session, key, unpack, vs));
- }
-
-err:
- __wt_scr_free(session, &key);
- return (ret);
-}
-
-/*
- * __verify_key_hs --
- * Verify a key against the history store. The unpack denotes the data store's timestamp range
- * information and is used for verifying timestamp range overlaps.
- */
-static int
-__verify_key_hs(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *unpack, WT_VSTUFF *vs)
-{
- WT_BTREE *btree;
- WT_CURSOR *hs_cursor;
- WT_DECL_RET;
- wt_timestamp_t newer_start_ts, older_start_ts, older_stop_ts;
- uint64_t hs_counter;
- uint32_t hs_btree_id, session_flags;
- int cmp, exact;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool is_owner;
-
- btree = S2BT(session);
- hs_cursor = NULL;
- hs_btree_id = btree->id;
- /*
- * Set the data store timestamp and transactions to initiate timestamp range verification. Since
- * transaction-ids are wiped out on start, we could possibly have a start txn-id of WT_TXN_NONE,
- * in which case we initialize our newest with the max txn-id.
- */
- newer_start_ts = unpack->start_ts;
- session_flags = 0;
- older_stop_ts = 0;
- is_owner = false;
-
- /*
- * Open a history store cursor positioned at the end of the data store key (the newest record)
- * and iterate backwards until we reach a different key or btree.
- */
- WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
- hs_cursor = session->hs_cursor;
- hs_cursor->set_key(hs_cursor, hs_btree_id, key, WT_TS_MAX, UINT64_MAX);
- WT_ERR(hs_cursor->search_near(hs_cursor, &exact));
-
- /* If we jumped to the next key, go back to the previous key. */
- if (exact > 0)
- WT_ERR(hs_cursor->prev(hs_cursor));
-
- for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
- WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp1, &older_start_ts, &hs_counter));
-
- if (hs_btree_id != btree->id)
- break;
-
- WT_ERR(__wt_compare(session, NULL, vs->tmp1, key, &cmp));
- if (cmp != 0)
- break;
-
-#ifdef HAVE_DIAGNOSTIC
- /* Optionally dump historical time pairs and values in debug mode. */
- if (vs->dump_history)
- WT_ERR(__wt_debug_cursor_hs(session, hs_cursor));
-#else
- WT_UNUSED(vs);
-#endif
-
- /* Verify that the newer record's start is later than the older record's stop. */
- if (newer_start_ts < older_stop_ts) {
- WT_ERR_MSG(session, WT_ERROR,
- "In the Btree %" PRIu32
- ", Key %s has a overlap of "
- "timestamp ranges between history store stop timestamp %s being "
- "newer than a more recent timestamp range having start timestamp %s",
- hs_btree_id,
- __wt_buf_set_printable(session, vs->tmp1->data, vs->tmp1->size, vs->tmp1),
- __verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]),
- __verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1]));
- }
- /*
- * Since we are iterating from newer to older, the current older record becomes the newer
- * for the next round of verification.
- */
- newer_start_ts = older_start_ts;
-
- WT_ERR(__verify_ts_stable_cmp(session, key, NULL, 0, older_start_ts, older_stop_ts, vs));
- }
- WT_ERR_NOTFOUND_OK(ret);
-
-err:
- /* It is okay to have not found the key. */
- if (ret == WT_NOTFOUND)
- ret = 0;
-
- WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
-
- return (ret);
-}
-
-/*
* __wt_verify --
* Verify a file.
*/
@@ -355,21 +169,24 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_RET;
WT_VSTUFF *vs, _vstuff;
size_t root_addr_size;
+ uint32_t session_flags;
uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
const char *name;
- bool bm_start, quit;
+ bool bm_start, is_owner, quit;
btree = S2BT(session);
bm = btree->bm;
ckptbase = NULL;
+ session_flags = 0; /* -Wuninitialized */
name = session->dhandle->name;
bm_start = false;
+ is_owner = false; /* -Wuninitialized */
WT_CLEAR(_vstuff);
vs = &_vstuff;
WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
- WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &vs->tmp1));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));
@@ -393,6 +210,9 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
}
WT_ERR(ret);
+ /* Open a history store cursor. */
+ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
+
/* Inform the underlying block manager we're verifying. */
WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
bm_start = true;
@@ -433,11 +253,16 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
*/
memset(&addr_unpack, 0, sizeof(addr_unpack));
addr_unpack.newest_start_durable_ts = ckpt->start_durable_ts;
- addr_unpack.oldest_start_ts = ckpt->oldest_start_ts;
- addr_unpack.oldest_start_txn = ckpt->oldest_start_txn;
addr_unpack.newest_stop_durable_ts = ckpt->stop_durable_ts;
+ addr_unpack.oldest_start_ts = ckpt->oldest_start_ts;
addr_unpack.newest_stop_ts = ckpt->newest_stop_ts;
- addr_unpack.newest_stop_txn = ckpt->newest_stop_txn;
+ if (ckpt->write_gen > S2C(session)->base_write_gen) {
+ addr_unpack.oldest_start_txn = ckpt->oldest_start_txn;
+ addr_unpack.newest_stop_txn = ckpt->newest_stop_txn;
+ } else {
+ addr_unpack.oldest_start_txn = WT_TXN_NONE;
+ addr_unpack.newest_stop_txn = WT_TXN_MAX;
+ }
addr_unpack.raw = WT_CELL_ADDR_INT;
/* Verify the tree. */
@@ -478,6 +303,8 @@ err:
if (bm_start)
WT_TRET(bm->verify_end(bm, session));
+ WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
+
/* Discard the list of checkpoints. */
if (ckptbase != NULL)
__wt_meta_ckptlist_free(session, &ckptbase);
@@ -506,7 +333,7 @@ __verify_checkpoint_reset(WT_VSTUFF *vs)
vs->max_addr->size = 0;
/* Record total is per checkpoint, reset the record count. */
- vs->record_total = 0;
+ vs->records_so_far = 0;
/* Tree depth. */
vs->depth = 1;
@@ -573,132 +400,6 @@ __verify_addr_ts(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *unpack,
}
/*
- * __wt_verify_history_store_tree --
- * Verify the history store. There can't be an entry in the history store without having the
- * latest value for the respective key in the data store. If given a uri, limit the verification
- * to the corresponding btree.
- */
-int
-__wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
-{
- WT_CURSOR *cursor, *data_cursor;
- WT_DECL_ITEM(hs_key);
- WT_DECL_ITEM(prev_hs_key);
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- wt_timestamp_t hs_start_ts;
- uint64_t hs_counter;
- uint32_t btree_id, btree_id_given_uri, session_flags, prev_btree_id;
- int exact, cmp;
- char *uri_itr;
- bool is_owner;
-
- cursor = data_cursor = NULL;
- session_flags = 0;
- btree_id_given_uri = 0; /* [-Wconditional-uninitialized] */
- prev_btree_id = 0; /* [-Wconditional-uninitialized] */
- is_owner = false; /* [-Wconditional-uninitialized] */
- uri_itr = NULL;
-
- WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
- WT_ERR(__wt_scr_alloc(session, 0, &prev_hs_key));
-
- WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
- cursor = session->hs_cursor;
-
- /*
- * If a uri has been provided, limit verification to the corresponding btree by jumping to the
- * first record for that btree in the history store. Otherwise scan the whole history store.
- */
- if (uri != NULL) {
- ret = __wt_metadata_uri_to_btree_id(session, uri, &btree_id_given_uri);
- if (ret != 0)
- WT_ERR_MSG(session, ret, "Unable to locate the URI %s in the metadata file", uri);
-
- /*
- * Position the cursor at the first record of the specified btree, or one after. It is
- * possible there are no records in the history store for this btree.
- */
- cursor->set_key(cursor, btree_id_given_uri, hs_key, 0, 0, 0, 0);
- ret = cursor->search_near(cursor, &exact);
- if (ret == 0 && exact < 0)
- ret = cursor->next(cursor);
- } else
- ret = cursor->next(cursor);
-
- /* We have the history store cursor positioned at the first record that we want to verify. */
- for (; ret == 0; ret = cursor->next(cursor)) {
- WT_ERR(cursor->get_key(cursor, &btree_id, hs_key, &hs_start_ts, &hs_counter));
-
- /* When limiting our verification to a uri, bail out if the btree-id doesn't match. */
- if (uri != NULL && btree_id != btree_id_given_uri)
- break;
-
- /*
- * Keep track of the previous comparison. The history store is stored in order, so we can
- * avoid redundant comparisons. Previous btree ID isn't set, until data cursor is open.
- */
- if (data_cursor == NULL || (prev_btree_id != btree_id)) {
- /*
- * Check whether this btree-id exists in the metadata. We do that by finding what uri
- * this btree belongs to. Using this URI, verify the history store key with the data
- * store.
- */
- if (data_cursor != NULL) {
- WT_ERR(data_cursor->close(data_cursor));
- /* Setting data_cursor to null, to avoid double free */
- data_cursor = NULL;
- }
- /*
- * Using the btree-id find the metadata entry and extract the URI for this btree. Don't
- * forget to free the copy of the URI returned.
- */
- __wt_free(session, uri_itr);
- ret = __wt_metadata_btree_id_to_uri(session, btree_id, &uri_itr);
- if (ret != 0) {
- WT_ERR(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR_MSG(session, ret, "Unable to find btree-id %" PRIu32
- " in the metadata file for the associated "
- "history store key %s",
- btree_id, __wt_buf_set_printable(session, hs_key->data, hs_key->size, tmp));
- }
-
- WT_ERR(__wt_open_cursor(session, uri_itr, NULL, NULL, &data_cursor));
- F_SET(data_cursor, WT_CURSOR_RAW_OK);
- } else {
- WT_ERR(__wt_compare(session, NULL, hs_key, prev_hs_key, &cmp));
- if (cmp == 0)
- continue;
- }
- WT_ERR(__wt_buf_set(session, prev_hs_key, hs_key->data, hs_key->size));
- prev_btree_id = btree_id;
-
- data_cursor->set_key(data_cursor, hs_key);
- ret = data_cursor->search(data_cursor);
- if (ret == WT_NOTFOUND) {
- WT_ERR(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR_MSG(session, WT_NOTFOUND,
- "In the URI %s, the associated history store key %s cannot be found in the data "
- "store",
- uri_itr, __wt_buf_set_printable(session, hs_key->data, hs_key->size, tmp));
- }
- WT_ERR(ret);
- }
- WT_ERR_NOTFOUND_OK(ret);
-err:
- if (data_cursor != NULL)
- WT_TRET(data_cursor->close(data_cursor));
- if (cursor != NULL)
- WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
-
- __wt_scr_free(session, &hs_key);
- __wt_scr_free(session, &prev_hs_key);
- __wt_scr_free(session, &tmp);
- __wt_free(session, uri_itr);
- return (ret);
-}
-
-/*
* __verify_tree --
* Verify a tree, recursively descending through it in depth-first fashion. The page argument
* was physically verified (so we know it's correctly formed), and the in-memory version built.
@@ -708,23 +409,15 @@ static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack, WT_VSTUFF *vs)
{
WT_BM *bm;
- WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
- WT_COL *cip;
WT_DECL_RET;
WT_PAGE *page;
WT_REF *child_ref;
- uint64_t recno;
- uint32_t entry, i;
- bool enable_hs_verify;
+ uint32_t entry;
bm = S2BT(session)->bm;
- page = ref->page;
-
- /* Temporarily disable as MongoDB tests are timing out. Re-enable with WT-5796. */
- enable_hs_verify = false;
-
unpack = &_unpack;
+ page = ref->page;
__wt_verbose(session, WT_VERB_VERIFY, "%s %s", __verify_addr_string(session, ref, vs->tmp1),
__wt_page_type_string(page->type));
@@ -740,30 +433,24 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack
else
++vs->depth_leaf[WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
-/*
- * The page's physical structure was verified when it was read into
- * memory by the read server thread, and then the in-memory version
- * of the page was built. Now we make sure the page and tree are
- * logically consistent.
- *
- * !!!
- * The problem: (1) the read server has to build the in-memory version
- * of the page because the read server is the thread that flags when
- * any thread can access the page in the tree; (2) we can't build the
- * in-memory version of the page until the physical structure is known
- * to be OK, so the read server has to verify at least the physical
- * structure of the page; (3) doing complete page verification requires
- * reading additional pages (for example, overflow keys imply reading
- * overflow pages in order to test the key's order in the page); (4)
- * the read server cannot read additional pages because it will hang
- * waiting on itself. For this reason, we split page verification
- * into a physical verification, which allows the in-memory version
- * of the page to be built, and then a subsequent logical verification
- * which happens here.
- *
- * Report progress occasionally.
- */
-#define WT_VERIFY_PROGRESS_INTERVAL 100
+ /*
+ * The page's physical structure was verified when it was read into memory by the read server
+ * thread, and then the in-memory version of the page was built. Now we make sure the page and
+ * tree are logically consistent.
+ *
+ * !!!
+ * The problem: (1) the read server has to build the in-memory version of the page because the
+ * read server is the thread that flags when any thread can access the page in the tree; (2) we
+ * can't build the in-memory version of the page until the physical structure is known to be OK,
+ * so the read server has to verify at least the physical structure of the page; (3) doing
+ * complete page verification requires reading additional pages (for example, overflow keys
+ * imply reading overflow pages in order to test the key's order in the page); (4) the read
+ * server cannot read additional pages because it will hang waiting on itself. For this reason,
+ * we split page verification into a physical verification, which allows the in-memory version
+ * of the page to be built, and then a subsequent logical verification which happens here.
+ *
+ * Report progress occasionally.
+ */
if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
WT_RET(__wt_progress(session, NULL, vs->fcnt));
@@ -775,36 +462,15 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack
WT_RET(__wt_debug_page(session, NULL, ref, NULL));
#endif
- /*
- * Column-store key order checks: check the page's record number and then update the total
- * record count.
- */
+ /* Column-store key order checks: check the page's record number. */
switch (page->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
- recno = ref->ref_recno;
- goto recno_chk;
case WT_PAGE_COL_VAR:
- recno = ref->ref_recno;
-recno_chk:
- if (recno != vs->record_total + 1)
+ if (ref->ref_recno != vs->records_so_far + 1)
WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64
" when the expected starting record is %" PRIu64,
- __verify_addr_string(session, ref, vs->tmp1), recno, vs->record_total + 1);
- break;
- }
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- vs->record_total += page->entries;
- break;
- case WT_PAGE_COL_VAR:
- recno = 0;
- WT_COL_FOREACH (page, cip, i) {
- cell = WT_COL_PTR(page, cip);
- __wt_cell_unpack(session, page, cell, unpack);
- recno += __wt_cell_rle(unpack);
- }
- vs->record_total += recno;
+ __verify_addr_string(session, ref, vs->tmp1), ref->ref_recno, vs->records_so_far + 1);
break;
}
@@ -818,21 +484,17 @@ recno_chk:
break;
}
- /*
- * History store checks. Ensure continuity between the data store and history store based on
- * keys in leaf/var pages.
- *
- * Temporarily disable as MongoDB tests are timing out. Re-enable with WT-5796.
- */
- if (enable_hs_verify) {
- switch (page->type) {
- case WT_PAGE_ROW_LEAF:
- WT_RET(__verify_row_leaf_page_hs(session, ref, vs));
- break;
- case WT_PAGE_COL_VAR:
- WT_RET(__verify_col_var_page_hs(session, ref, vs));
- break;
- }
+ /* Check page content, additionally updating the variable-length column-store record count. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ vs->records_so_far += page->entries;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_page_content(session, ref, addr_unpack, vs));
+ break;
}
/* Compare the address type against the page type. */
@@ -862,19 +524,6 @@ celltype_err:
break;
}
- /*
- * Check overflow pages and timestamps. Done in one function as both checks require walking the
- * page cells and we don't want to do it twice.
- */
- switch (page->type) {
- case WT_PAGE_COL_INT:
- case WT_PAGE_COL_VAR:
- case WT_PAGE_ROW_INT:
- case WT_PAGE_ROW_LEAF:
- WT_RET(__verify_page_cell(session, ref, addr_unpack, vs));
- break;
- }
-
/* Check tree connections and recursively descend the tree. */
switch (page->type) {
case WT_PAGE_COL_INT:
@@ -886,14 +535,14 @@ celltype_err:
* than the total records reviewed to this point.
*/
++entry;
- if (child_ref->ref_recno != vs->record_total + 1) {
+ if (child_ref->ref_recno != vs->records_so_far + 1) {
WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32
" of the column internal page at "
"%s is %" PRIu64
" and the expected "
"starting record number is %" PRIu64,
entry, __verify_addr_string(session, child_ref, vs->tmp1), child_ref->ref_recno,
- vs->record_total + 1);
+ vs->records_so_far + 1);
}
/* Unpack the address block and check timestamps */
@@ -1068,6 +717,20 @@ __verify_overflow(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_siz
}
/*
+ * __verify_timestamp_to_pretty_string --
+ *     Convert a timestamp to a pretty string, using the existing timestamp-to-string function.
+ */
+static const char *
+__verify_timestamp_to_pretty_string(wt_timestamp_t ts, char *ts_string)
+{
+ if (ts == WT_TS_MAX)
+ return ("WT_TS_MAX");
+ if (ts == WT_TS_NONE)
+ return ("WT_TS_NONE");
+ return (__wt_timestamp_to_string(ts, ts_string));
+}
+
+/*
* __verify_ts_addr_cmp --
* Do a cell timestamp check against the parent.
*/
@@ -1168,54 +831,123 @@ __verify_txn_addr_cmp(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t cell_num,
}
/*
- * __verify_timestamp_to_pretty_string --
- * Convert a timestamp to a pretty string, utilizes existing timestamp to string function.
+ * __verify_key_hs --
+ * Verify a key against the history store. The unpack denotes the data store's timestamp range
+ * information and is used for verifying timestamp range overlaps.
*/
-static const char *
-__verify_timestamp_to_pretty_string(wt_timestamp_t ts, char *ts_string)
+static int
+__verify_key_hs(
+ WT_SESSION_IMPL *session, WT_ITEM *tmp1, wt_timestamp_t newer_start_ts, WT_VSTUFF *vs)
{
- const char *ts_bp;
+#ifdef WT_VERIFY_VALIDATE_HISTORY_STORE
+ WT_BTREE *btree;
+ WT_CURSOR *hs_cursor;
+ WT_DECL_RET;
+ wt_timestamp_t older_start_ts, older_stop_ts;
+ uint64_t hs_counter;
+ uint32_t hs_btree_id;
+ int cmp, exact;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
- switch (ts) {
- case WT_TS_MAX:
- ts_bp = "WT_TS_MAX";
- break;
- case WT_TS_NONE:
- ts_bp = "WT_TS_NONE";
- break;
- default:
- ts_bp = __wt_timestamp_to_string(ts, ts_string);
- break;
+ btree = S2BT(session);
+ hs_btree_id = btree->id;
+
+ /*
+ * Set the data store timestamp and transactions to initiate timestamp range verification. Since
+ * transaction-ids are wiped out on start, we could possibly have a start txn-id of WT_TXN_NONE,
+ * in which case we initialize our newest with the max txn-id.
+ */
+ older_stop_ts = 0;
+
+ /*
+ * Open a history store cursor positioned at the end of the data store key (the newest record)
+ * and iterate backwards until we reach a different key or btree.
+ */
+ hs_cursor = session->hs_cursor;
+ hs_cursor->set_key(hs_cursor, hs_btree_id, tmp1, WT_TS_MAX, WT_TXN_MAX);
+ ret = hs_cursor->search_near(hs_cursor, &exact);
+
+ /* If we jumped to the next key, go back to the previous key. */
+ if (ret == 0 && exact > 0)
+ ret = hs_cursor->prev(hs_cursor);
+
+ for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
+ WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter));
+
+ if (hs_btree_id != btree->id)
+ break;
+
+ WT_RET(__wt_compare(session, NULL, tmp1, vs->tmp2, &cmp));
+ if (cmp != 0)
+ break;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Optionally dump historical time pairs and values in debug mode. */
+ if (vs->dump_history)
+ WT_RET(__wt_debug_cursor_hs(session, hs_cursor));
+#endif
+
+        /* Verify the newer record's start timestamp is no earlier than the older record's stop. */
+ if (newer_start_ts < older_stop_ts) {
+ WT_RET_MSG(session, WT_ERROR,
+ "key %s has a overlap of timestamp ranges between history store stop timestamp %s "
+ "being newer than a more recent timestamp range having start timestamp %s",
+ __wt_buf_set_printable(session, tmp1->data, tmp1->size, vs->tmp2),
+ __verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]),
+ __verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1]));
+ }
+ WT_RET(__verify_ts_stable_cmp(session, tmp1, NULL, 0, older_start_ts, older_stop_ts, vs));
+
+ /*
+ * Since we are iterating from newer to older, the current older record becomes the newer
+ * for the next round of verification.
+ */
+ newer_start_ts = older_start_ts;
}
- return ts_bp;
+
+ return (ret == WT_NOTFOUND ? 0 : ret);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(tmp1);
+ WT_UNUSED(newer_start_ts);
+ WT_UNUSED(vs);
+ return (0);
+#endif
}
/*
- * __verify_page_cell --
- * Verify the cells on the page.
+ * __verify_page_content --
+ * Verify the page's content.
*/
static int
-__verify_page_cell(
+__verify_page_content(
WT_SESSION_IMPL *session, WT_REF *ref, WT_CELL_UNPACK *addr_unpack, WT_VSTUFF *vs)
{
WT_BTREE *btree;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
+ WT_PAGE *page;
const WT_PAGE_HEADER *dsk;
+ WT_ROW *rip;
+ uint64_t recno, rle;
uint32_t cell_num;
+ uint8_t *p;
char ts_string[2][WT_TS_INT_STRING_SIZE];
bool found_ovfl;
+ btree = S2BT(session);
+ page = ref->page;
+ rip = page->pg_row;
+ recno = ref->ref_recno;
+ found_ovfl = false;
+
/*
* If a tree is empty (just created), it won't have a disk image; if there is no disk image,
* we're done.
*/
- if ((dsk = ref->page->dsk) == NULL)
+ if ((dsk = page->dsk) == NULL)
return (0);
- btree = S2BT(session);
- found_ovfl = false;
-
/* Walk the page, tracking timestamps and verifying overflow pages. */
cell_num = 0;
WT_CELL_FOREACH_BEGIN (session, btree, dsk, unpack) {
@@ -1347,6 +1079,36 @@ __verify_page_cell(
session, NULL, ref, cell_num - 1, unpack.start_ts, unpack.stop_ts, vs));
break;
}
+
+ /*
+         * Verify key-associated history-store entries and, in debug mode, optionally dump
+         * historical time pairs and values.
+ */
+ if (page->type == WT_PAGE_ROW_LEAF) {
+ if (unpack.type != WT_CELL_KEY && unpack.type != WT_CELL_KEY_OVFL)
+ continue;
+
+ WT_RET(__wt_row_leaf_key(session, page, rip++, vs->tmp1, false));
+ WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs));
+
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_history)
+ WT_RET(__wt_debug_key_value(session, vs->tmp1, WT_RECNO_OOB, 0, &unpack));
+#endif
+ } else if (page->type == WT_PAGE_COL_VAR) {
+ rle = __wt_cell_rle(&unpack);
+ p = vs->tmp1->mem;
+ WT_RET(__wt_vpack_uint(&p, 0, recno));
+ vs->tmp1->size = WT_PTRDIFF(p, vs->tmp1->mem);
+ WT_RET(__verify_key_hs(session, vs->tmp1, unpack.start_ts, vs));
+
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_history)
+ WT_RET(__wt_debug_key_value(session, NULL, recno, rle, &unpack));
+#endif
+ recno += rle;
+ vs->records_so_far += rle;
+ }
}
WT_CELL_FOREACH_END;
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
index fb8cff2d9a6..d3ff920c9eb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -119,6 +119,15 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_H
if (dsk->unused != 0)
WT_RET_VRFY(session, "page at %s has non-zero unused page header bytes", tag);
+ /* Check the page version. */
+ switch (dsk->version) {
+ case WT_PAGE_VERSION_ORIG:
+ case WT_PAGE_VERSION_TS:
+ break;
+ default:
+ WT_RET_VRFY(session, "page at %s has an invalid version of %" PRIu8, tag, dsk->version);
+ }
+
/*
* Any bytes after the data chunk should be nul bytes; ignore if the size is 0, that allows easy
* checking of disk images where we don't have the size.
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
index aec356f1a90..8db18398831 100644
--- a/src/third_party/wiredtiger/src/btree/row_key.c
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -136,9 +136,6 @@ __wt_row_leaf_key_work(
WT_IKEY *ikey;
WT_ROW *rip, *jump_rip;
size_t size;
-#ifdef HAVE_DIAGNOSTIC
- uint32_t current, start;
-#endif
u_int last_prefix;
int jump_slot_offset, slot_offset;
void *copy;
@@ -163,9 +160,6 @@ __wt_row_leaf_key_work(
size = 0; /* -Werror=maybe-uninitialized */
direction = BACKWARD;
-#ifdef HAVE_DIAGNOSTIC
- __wt_seconds32(session, &start);
-#endif
for (slot_offset = 0;;) {
if (0) {
switch_and_jump:
@@ -178,18 +172,6 @@ switch_and_jump:
slot_offset = jump_slot_offset;
}
copy = WT_ROW_KEY_COPY(rip);
-#ifdef HAVE_DIAGNOSTIC
- /*
- * Debugging added to detect and gather information for rare hang, WT-5043. Detect and abort
- * if the current function call or operation takes too long (and 5 minutes is an eternity).
- */
- __wt_seconds32(session, &current);
- WT_ERR_ASSERT(session, (current - start) < WT_MINUTE * 5, EINVAL,
- "call tracking for WT-5043: %s took longer than 5 minutes", __func__);
- WT_ERR_ASSERT(session,
- (session->op_5043_seconds == 0 || (current - session->op_5043_seconds) < WT_MINUTE * 5),
- EINVAL, "operation tracking for WT-5043: %s took longer than 5 minutes", session->name);
-#endif
/*
* Figure out what the key looks like.
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
index 30c58f3571a..7f63b65c03d 100644
--- a/src/third_party/wiredtiger/src/conn/conn_api.c
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -2390,8 +2390,16 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
/*
* Set compatibility versions early so that any subsystem sees it. Call after we own the
- * database so that we can know if the database is new or not.
+ * database so that we can know if the database is new or not. Compatibility testing needs to
+ * know if salvage has been set, so parse that early.
*/
+ WT_ERR(__wt_config_gets(session, cfg, "salvage", &cval));
+ if (cval.val) {
+ if (F_ISSET(conn, WT_CONN_READONLY))
+ WT_ERR_MSG(session, EINVAL, "Readonly configuration incompatible with salvage");
+ F_SET(conn, WT_CONN_SALVAGE);
+ }
+
WT_ERR(__wt_conn_compat_config(session, cfg, false));
/*
@@ -2579,13 +2587,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval));
conn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);
- WT_ERR(__wt_config_gets(session, cfg, "salvage", &cval));
- if (cval.val) {
- if (F_ISSET(conn, WT_CONN_READONLY))
- WT_ERR_MSG(session, EINVAL, "Readonly configuration incompatible with salvage");
- F_SET(conn, WT_CONN_SALVAGE);
- }
-
WT_ERR(__wt_conn_statistics_config(session, cfg));
WT_ERR(__wt_lsm_manager_config(session, cfg));
WT_ERR(__wt_sweep_config(session, cfg));
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index f09d5130aae..0010a55198a 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -79,6 +79,48 @@ __logmgr_force_archive(WT_SESSION_IMPL *session, uint32_t lognum)
}
/*
+ * __logmgr_set_majmin --
+ *     Set the required log version for the given major/minor compatibility pair. Wrapper for
+ *     setting the required minimum and maximum fields in the connection.
+ */
+static void
+__logmgr_set_majmin(uint16_t req_major, uint16_t req_minor, uint16_t *log_req)
+{
+ /*
+     * Set the required log version, if a compatibility version was configured.
+ */
+ if (req_major != WT_CONN_COMPAT_NONE) {
+ if (req_major == WT_LOG_V5_MAJOR)
+ *log_req = WT_LOG_VERSION;
+ else if (req_major == WT_LOG_V4_MAJOR)
+ if (req_minor == WT_LOG_V4_MINOR)
+ *log_req = 4;
+ else if (req_minor > WT_LOG_V2_MINOR)
+ *log_req = 3;
+ else
+ *log_req = 2;
+ else
+ *log_req = 1;
+ }
+}
+
+/*
+ * __wt_logmgr_compat_version --
+ * Set up the compatibility versions in the log manager. This is split out because it is called
+ *     much earlier than log subsystem creation on startup, so that we can verify the on-disk state
+ *     before modifying any files.
+ */
+void
+__wt_logmgr_compat_version(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ __logmgr_set_majmin(conn->req_max_major, conn->req_max_minor, &conn->log_req_max);
+ __logmgr_set_majmin(conn->req_min_major, conn->req_min_minor, &conn->log_req_min);
+}
+
+/*
* __logmgr_version --
* Set up the versions in the log manager.
*/
@@ -122,35 +164,7 @@ __logmgr_version(WT_SESSION_IMPL *session, bool reconfig)
downgrade = true;
}
- /*
- * Set up the maximum and minimum log version required if needed.
- */
- if (conn->req_max_major != WT_CONN_COMPAT_NONE) {
- if (conn->req_max_major == WT_LOG_V5_MAJOR)
- conn->log_req_max = WT_LOG_VERSION;
- else if (conn->req_max_major == WT_LOG_V4_MAJOR)
- if (conn->req_max_minor == WT_LOG_V4_MINOR)
- conn->log_req_max = 4;
- else if (conn->req_max_minor > WT_LOG_V2_MINOR)
- conn->log_req_max = 3;
- else
- conn->log_req_max = 2;
- else
- conn->log_req_max = 1;
- }
- if (conn->req_min_major != WT_CONN_COMPAT_NONE) {
- if (conn->req_min_major == WT_LOG_V5_MAJOR)
- conn->log_req_min = WT_LOG_VERSION;
- else if (conn->req_min_major == WT_LOG_V4_MAJOR)
- if (conn->req_min_minor == WT_LOG_V4_MINOR)
- conn->log_req_min = 4;
- else if (conn->req_min_minor > WT_LOG_V2_MINOR)
- conn->log_req_min = 3;
- else
- conn->log_req_min = 2;
- else
- conn->log_req_min = 1;
- }
+ __wt_logmgr_compat_version(session);
/*
* If the version is the same, there is nothing to do.
diff --git a/src/third_party/wiredtiger/src/conn/conn_reconfig.c b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
index 043cd19b661..1d32b811b7e 100644
--- a/src/third_party/wiredtiger/src/conn/conn_reconfig.c
+++ b/src/third_party/wiredtiger/src/conn/conn_reconfig.c
@@ -201,6 +201,14 @@ done:
conn->req_max_minor = max_minor;
conn->req_min_major = min_major;
conn->req_min_minor = min_minor;
+ /*
+     * Set up the log manager versions in the connection and verify any logs. We do this at the end
+     * of this function, but very early in the startup process, so that if we're starting from a
+     * backup and there are compatibility errors, we inform the user but leave the directory unchanged.
+ */
+ __wt_logmgr_compat_version(session);
+ if (!reconfig && !F_ISSET(conn, WT_CONN_SALVAGE))
+ WT_ERR(__wt_log_compat_verify(session));
err:
__wt_free(session, value);
diff --git a/src/third_party/wiredtiger/src/docs/durability.dox b/src/third_party/wiredtiger/src/docs/durability.dox
index 30683335978..68ad68d72e2 100644
--- a/src/third_party/wiredtiger/src/docs/durability.dox
+++ b/src/third_party/wiredtiger/src/docs/durability.dox
@@ -76,10 +76,13 @@ instead must disable log file removal using the \c log=(archive=false)
configuration to ::wiredtiger_open.
Log files may be removed or archived after a checkpoint has completed,
-as long as there's not a backup in progress. Immediately after the checkpoint
-has completed, only the most recent log file is needed for recovery, and all
-other log files can be removed or archived. Note that there must always
-be at least one log file for the database.
+as long as there's not a backup in progress. When performing @ref
+backup_incremental, WT_SESSION::truncate can be used to remove log files
+after completing each incremental backup.
+
+Immediately after the checkpoint has completed, only the most recent log file
+is needed for recovery, and all other log files can be removed or archived.
+Note that there must always be at least one log file for the database.
Open log cursors prevent WiredTiger from automatically removing log files.
Therefore, we recommend proactively closing log cursors when done with them.
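As a rough illustration of the incremental-backup workflow described above (a sketch, not part of
this patch; error handling and the actual file copy are elided, and the configuration strings
should be checked against the backup documentation), an application might remove already-copied
log files roughly like this:

    /* Minimal sketch, assuming "session" is an open WT_SESSION. */
    WT_CURSOR *backup_cursor;
    const char *filename;
    int ret;

    /* Open an incremental (log-based) backup cursor. */
    ret = session->open_cursor(session, "backup:", NULL, "target=(\"log:\")", &backup_cursor);

    /* Copy each log file the cursor returns to the backup directory. */
    while ((ret = backup_cursor->next(backup_cursor)) == 0) {
        ret = backup_cursor->get_key(backup_cursor, &filename);
        /* ... copy "filename" to the backup location ... */
    }

    /* Once the copies are durable, remove the log files returned by the cursor. */
    ret = session->truncate(session, "log:", backup_cursor, NULL, NULL);
    ret = backup_cursor->close(backup_cursor);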
diff --git a/src/third_party/wiredtiger/src/docs/eviction.dox b/src/third_party/wiredtiger/src/docs/eviction.dox
new file mode 100644
index 00000000000..1d04b3ba76d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/docs/eviction.dox
@@ -0,0 +1,107 @@
+/*! @m_page{{c,java},eviction, Eviction}
+
+All operations in WiredTiger are performed on data that has been read
+into a preconfigured amount of memory. WiredTiger uses that memory as a
+cache for the data on disk, and the data currently in memory forms the
+working set.
+
+Since memory is limited and can't hold all the data stored on disk,
+WiredTiger must continuously move data that is not currently being
+accessed out of memory, freeing space to read back in data that the
+application requests but that currently resides only on disk. This
+process is called eviction.
+
+@section overview Eviction overview
+
+WiredTiger eviction runs in the background with one eviction server
+thread and several eviction worker threads.
+
+1) The eviction server thread walks the btrees and adds the least
+recently used pages to the eviction queues until the queues are full.
+2) The eviction workers continuously remove pages from the eviction
+queues and try to evict them.
+
+Eviction has to obtain exclusive access to the page as there may be
+application threads still reading content on the page that is being
+evicted.
+
+There are three types of eviction in WiredTiger:
+
+1) Clean eviction
+2) History store eviction
+3) Update restore eviction
+
+@section clean_eviction Clean eviction
+
+In the case of clean eviction, there is no dirty content on the page
+and it is simply removed from memory. After that, the page exists
+solely on disk, unchanged.
+
+@section hs_eviction History store eviction
+
+For history store eviction, the page has been modified but all the
+changes are committed. Therefore, the page can be made clean and
+removed from memory once the dirty changes are written to a new disk
+image, the format in which WiredTiger stores data on disk. The process
+that builds the disk image is called reconciliation. In reconciliation,
+WiredTiger writes the newest committed value of each key to the disk
+image. All older values of the key are moved to the history store
+table. Finally, reconciliation flushes the new disk image to the data
+file on disk and eviction later removes the old page from memory.
+
+The history store table is the internal table WiredTiger uses to store
+historical values for all the user-created tables. Each entry in the
+history store represents a historical value of a key in a user-created
+table. Pages in the history store table need to be evicted and loaded
+back into memory as well. Unlike user-created tables, all the content
+on a history store page is committed, since the historical values it
+stores must all be committed, and each entry's update chain holds only
+one value plus a deletion marking the entry's expiration. Therefore,
+history store pages should always be clean after reconciliation and can
+be evicted from memory. In addition, having only one committed value
+per key in the history store prevents eviction from recursively moving
+content that is already in the history store back into the history
+store when evicting a history store page, because only the values after
+the first committed value are moved to the history store.
+
+@section update_restore_eviction Update restore eviction
+
+Update restore eviction largely overlaps with history store eviction.
+The difference is that the page has uncommitted changes, which
+WiredTiger cannot write to the disk image. Instead, in addition to
+writing the disk image to disk, eviction keeps a copy of the disk image
+in memory and restores the uncommitted changes onto that new image.
+
+Since usually only a subset of keys have uncommitted changes, eviction
+only needs to restore the keys that are still dirty after
+reconciliation. In addition, eviction may split a page into multiple
+pages, and some of those pages may be clean after reconciliation even
+though the original page had uncommitted changes. Therefore, WiredTiger
+tracks whether it needs to do update restore eviction at both the key
+level and the page level.
+
+At the key level, the key needs to be restored if it has uncommitted
+updates. For each new page, restoration is required if any of the
+keys on the new page needs to be restored. Once eviction decides to
+restore a page, it will make a copy of the disk image in memory
+during reconciliation.
+
+For the restored keys, eviction also frees all the values that have
+been written to the disk image or moved to the history store as there
+is no need to duplicate them on the update chains. For the clean keys
+that are not restored, their whole update chains are removed from
+memory along with the old disk image.
+
+@section exceptions Exceptions
+
+Eviction works differently for in-memory databases and fixed-length
+column stores. Since they don't support the history store, eviction
+only discards the update chains when all the values on the update
+chains are globally visible. For the same reason, we cannot free
+updates older than the update written to the disk image. Because an
+in-memory database has no disk storage, eviction has to keep all the
+reconciled disk images in memory.
+
+ */
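For context on the cache and eviction threads discussed in this page, the following is a minimal
sketch of how an application sizes the cache and the eviction worker pool at open time. The values
are illustrative only and the option names should be checked against the wiredtiger_open
configuration reference; "WT_HOME" is a hypothetical database directory.

    /* Minimal sketch: tune cache size and eviction thread counts when opening a connection. */
    WT_CONNECTION *conn;
    int ret;

    ret = wiredtiger_open("WT_HOME", NULL,
      "create,cache_size=2GB,"                  /* size of the in-memory cache */
      "eviction=(threads_min=2,threads_max=4)," /* range of eviction worker threads */
      "eviction_target=80,eviction_trigger=95", /* cache-use %: workers start / app threads help */
      &conn);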
diff --git a/src/third_party/wiredtiger/src/docs/programming.dox b/src/third_party/wiredtiger/src/docs/programming.dox
index 334ee5ce4c6..449e89eb231 100644
--- a/src/third_party/wiredtiger/src/docs/programming.dox
+++ b/src/third_party/wiredtiger/src/docs/programming.dox
@@ -40,6 +40,7 @@ each of which is ordered by one or more columns.
- @subpage backup
- @subpage compact
- @subpage checkpoint
+- @subpage eviction
- @subpage durability
- @subpage in_memory
- @subpage cursor_join
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index c342fb232ca..719faec9ede 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -2240,13 +2240,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
*/
__wt_cache_read_gen_bump(session, ref->page);
-#ifdef HAVE_DIAGNOSTIC
- __wt_seconds32(session, &session->op_5043_seconds);
-#endif
WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, previous_state, 0));
-#ifdef HAVE_DIAGNOSTIC
- session->op_5043_seconds = 0;
-#endif
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c
index b79a90c2b71..198f548873a 100644
--- a/src/third_party/wiredtiger/src/history/hs.c
+++ b/src/third_party/wiredtiger/src/history/hs.c
@@ -1239,3 +1239,123 @@ err:
__wt_free(session, upd);
return (ret);
}
+
+/*
+ * __wt_verify_history_store_tree --
+ *     Verify the history store. There can't be an entry in the history store unless the data store
+ *     holds the latest value for the respective key. If given a URI, limit the verification to the
+ *     corresponding btree.
+ */
+int
+__wt_verify_history_store_tree(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor, *data_cursor;
+ WT_DECL_ITEM(hs_key);
+ WT_DECL_ITEM(prev_hs_key);
+ WT_DECL_RET;
+ wt_timestamp_t hs_start_ts;
+ uint64_t hs_counter;
+ uint32_t btree_id, btree_id_given_uri, session_flags, prev_btree_id;
+ int exact, cmp;
+ char *uri_itr;
+ bool is_owner;
+
+ cursor = data_cursor = NULL;
+ btree_id_given_uri = 0; /* [-Wconditional-uninitialized] */
+ session_flags = 0; /* [-Wconditional-uninitialized] */
+ prev_btree_id = 0; /* [-Wconditional-uninitialized] */
+ uri_itr = NULL;
+ is_owner = false; /* [-Wconditional-uninitialized] */
+
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &prev_hs_key));
+
+ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner));
+ cursor = session->hs_cursor;
+
+ /*
+ * If a uri has been provided, limit verification to the corresponding btree by jumping to the
+ * first record for that btree in the history store. Otherwise scan the whole history store.
+ */
+ if (uri != NULL) {
+ ret = __wt_metadata_uri_to_btree_id(session, uri, &btree_id_given_uri);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "Unable to locate the URI %s in the metadata file", uri);
+
+ /*
+ * Position the cursor at the first record of the specified btree, or one after. It is
+ * possible there are no records in the history store for this btree.
+ */
+ cursor->set_key(cursor, btree_id_given_uri, hs_key, 0, 0, 0, 0);
+ ret = cursor->search_near(cursor, &exact);
+ if (ret == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ } else
+ ret = cursor->next(cursor);
+
+ /* We have the history store cursor positioned at the first record that we want to verify. */
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &btree_id, hs_key, &hs_start_ts, &hs_counter));
+
+ /* When limiting our verification to a uri, bail out if the btree-id doesn't match. */
+ if (uri != NULL && btree_id != btree_id_given_uri)
+ break;
+
+ /*
+ * Keep track of the previous comparison. The history store is stored in order, so we can
+         * avoid redundant comparisons. The previous btree ID isn't set until a data cursor is opened.
+ */
+ if (data_cursor == NULL || (prev_btree_id != btree_id)) {
+ /*
+ * Check whether this btree-id exists in the metadata. We do that by finding what uri
+ * this btree belongs to. Using this URI, verify the history store key with the data
+ * store.
+ */
+ if (data_cursor != NULL) {
+ WT_ERR(data_cursor->close(data_cursor));
+                /* Set data_cursor to NULL to avoid a double free. */
+ data_cursor = NULL;
+ }
+ /*
+             * Using the btree-id, find the metadata entry and extract the URI for this btree. Don't
+ * forget to free the copy of the URI returned.
+ *
+ * Re-purpose the previous-key buffer on error, safe because we're about to error out.
+ */
+ __wt_free(session, uri_itr);
+ if ((ret = __wt_metadata_btree_id_to_uri(session, btree_id, &uri_itr)) != 0)
+ WT_ERR_MSG(session, ret,
+ "Unable to find btree-id %" PRIu32
+ " in the metadata file for the associated history store key %s",
+ btree_id,
+ __wt_buf_set_printable(session, hs_key->data, hs_key->size, prev_hs_key));
+
+ WT_ERR(__wt_open_cursor(session, uri_itr, NULL, NULL, &data_cursor));
+ F_SET(data_cursor, WT_CURSOR_RAW_OK);
+ } else {
+ WT_ERR(__wt_compare(session, NULL, hs_key, prev_hs_key, &cmp));
+ if (cmp == 0)
+ continue;
+ }
+ WT_ERR(__wt_buf_set(session, prev_hs_key, hs_key->data, hs_key->size));
+ prev_btree_id = btree_id;
+
+ /* Re-purpose the previous-key buffer on error, safe because we're about to error out. */
+ data_cursor->set_key(data_cursor, hs_key);
+ if ((ret = data_cursor->search(data_cursor)) == WT_NOTFOUND)
+ WT_ERR_MSG(session, ret,
+ "In %s, the associated history store key %s was not found in the data store", uri_itr,
+ __wt_buf_set_printable(session, hs_key->data, hs_key->size, prev_hs_key));
+ WT_ERR(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+err:
+ if (data_cursor != NULL)
+ WT_TRET(data_cursor->close(data_cursor));
+ WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner));
+
+ __wt_scr_free(session, &hs_key);
+ __wt_scr_free(session, &prev_hs_key);
+ __wt_free(session, uri_itr);
+ return (ret);
+}
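The new history-store check above is presumably reached through the normal verification path. As a
minimal sketch (not taken from this patch, and using a hypothetical "table:mytable" URI), an
application would trigger verification of a single btree roughly like this:

    /* Minimal sketch: verify one table from application code, assuming "conn" is open. */
    WT_SESSION *session;
    int ret;

    ret = conn->open_session(conn, NULL, NULL, &session);
    /* Verifying a table walks its btree; with this change the keys found can also be
     * cross-checked against the history store. */
    ret = session->verify(session, "table:mytable", NULL);
    ret = session->close(session, NULL);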
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index dacc804be25..d1b427e805a 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -79,6 +79,10 @@ struct __wt_page_header {
/* A byte of padding, positioned to be added to the flags. */
uint8_t unused; /* 26: unused padding */
+
+#define WT_PAGE_VERSION_ORIG 0 /* Original version */
+#define WT_PAGE_VERSION_TS 1 /* Timestamps added */
+ uint8_t version; /* 27: version */
};
/*
* WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if the compiler inserts
@@ -339,9 +343,10 @@ struct __wt_page_modify {
WT_INSERT *ins; /* Insert list reference */
WT_ROW *ripcip; /* Original on-page reference */
WT_UPDATE *onpage_upd;
+ bool restore; /* Whether to restore this saved update chain */
} * supd;
uint32_t supd_entries;
- bool supd_restore;
+ bool supd_restore; /* Whether to restore saved update chains to this page */
/*
* Disk image was written: address, size and checksum. On subsequent reconciliations
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index 6f6ebb0d7f0..f6bd37c5124 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -19,8 +19,6 @@ struct __wt_process {
/* Locked: connection queue */
TAILQ_HEAD(__wt_connection_impl_qh, __wt_connection_impl) connqh;
- bool page_version_ts; /* timestamp version page formats */
-
/* Checksum functions */
#define __wt_checksum(chunk, len) __wt_process.checksum(chunk, len)
uint32_t (*checksum)(const void *, size_t);
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 218ecfc216e..01d01a97754 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -592,8 +592,8 @@ extern int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_debug_key_value(WT_SESSION_IMPL *session, WT_ITEM *key, WT_CELL_UNPACK *value)
- WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_debug_key_value(WT_SESSION_IMPL *session, WT_ITEM *key, uint64_t recno,
+ uint64_t rle, WT_CELL_UNPACK *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_mode_config(WT_SESSION_IMPL *session, const char *cfg[])
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size,
@@ -797,6 +797,8 @@ extern int __wt_log_allocfile(WT_SESSION_IMPL *session, uint32_t lognum, const c
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_close(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_log_compat_verify(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_extract_lognum(WT_SESSION_IMPL *session, const char *name, uint32_t *id)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, const char *file_prefix,
@@ -1674,6 +1676,7 @@ extern void __wt_log_slot_join(
WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot);
extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield);
+extern void __wt_logmgr_compat_version(WT_SESSION_IMPL *session);
extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
extern void __wt_lsm_manager_clear_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
extern void __wt_lsm_manager_free_work_unit(WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry);
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 116fe0e35ab..183d2288037 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -37,6 +37,12 @@ struct __wt_reconcile {
wt_timestamp_t max_ondisk_ts;
wt_timestamp_t min_skipped_ts;
+ /*
+ * FIXME: temporarily track the stable timestamp when reconciliation starts. Remove it when
+ * PM-1524 completes.
+ */
+ wt_timestamp_t stable_ts;
+
u_int updates_seen; /* Count of updates seen. */
u_int updates_unstable; /* Count of updates not visible_all. */
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
index c92abe34c5c..da321cca22e 100644
--- a/src/third_party/wiredtiger/src/include/session.h
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -53,6 +53,7 @@ typedef TAILQ_HEAD(__wt_cursor_list, __wt_cursor) WT_CURSOR_LIST;
*/
struct __wt_session_impl {
WT_SESSION iface;
+ WT_EVENT_HANDLER *event_handler; /* Application's event handlers */
void *lang_private; /* Language specific private storage */
@@ -62,13 +63,9 @@ struct __wt_session_impl {
const char *lastop; /* Last operation */
uint32_t id; /* UID, offset in session array */
+ uint64_t cache_wait_us; /* Wait time for cache for current operation */
uint64_t operation_start_us; /* Operation start */
uint64_t operation_timeout_us; /* Maximum operation period before rollback */
-#ifdef HAVE_DIAGNOSTIC
- uint32_t op_5043_seconds; /* Temporary debugging to catch WT-5043, discard after 01/2020. */
-#endif
-
- WT_EVENT_HANDLER *event_handler; /* Application's event handlers */
WT_DATA_HANDLE *dhandle; /* Current data handle */
@@ -84,6 +81,7 @@ struct __wt_session_impl {
struct timespec last_epoch; /* Last epoch time returned */
WT_CURSOR_LIST cursors; /* Cursors closed with the session */
+ u_int ncursors; /* Count of active file cursors. */
uint32_t cursor_sweep_position; /* Position in cursor_cache for sweep */
uint32_t cursor_sweep_countdown; /* Countdown to cursor sweep */
uint64_t last_cursor_sweep; /* Last sweep for dead cursors */
@@ -136,9 +134,9 @@ struct __wt_session_impl {
WT_TXN_ISOLATION isolation;
WT_TXN txn; /* Transaction state */
+
#define WT_SESSION_BG_SYNC_MSEC 1200000
WT_LSN bg_sync_lsn; /* Background sync operation LSN. */
- u_int ncursors; /* Count of active file cursors. */
void *block_manager; /* Block-manager support */
int (*block_manager_cleanup)(WT_SESSION_IMPL *);
@@ -148,8 +146,6 @@ struct __wt_session_impl {
u_int ckpt_handle_next; /* Next empty slot */
size_t ckpt_handle_allocated; /* Bytes allocated */
- uint64_t cache_wait_us; /* Wait time for cache for current operation */
-
/*
* Operations acting on handles.
*
diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i
index 1b3567df8bf..759b8338370 100644
--- a/src/third_party/wiredtiger/src/include/time.i
+++ b/src/third_party/wiredtiger/src/include/time.i
@@ -171,15 +171,6 @@ __wt_op_timer_start(WT_SESSION_IMPL *session)
session->operation_start_us = __wt_clock(session);
session->operation_timeout_us = timeout_us;
}
-
-#ifdef HAVE_DIAGNOSTIC
- /*
- * This is called at the beginning of each API call. We need to clear out any old values from
- * this debugging field so that we don't leave a stale value in there that may then give a false
- * positive.
- */
- session->op_5043_seconds = 0;
-#endif
}
/*
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 588368599ad..aedc94a96a2 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -99,16 +99,6 @@ typedef enum {
txn_state->pinned_id = saved_state.pinned_id; \
} while (0)
-struct __wt_named_snapshot {
- const char *name;
-
- TAILQ_ENTRY(__wt_named_snapshot) q;
-
- uint64_t id, pinned_id, snap_min, snap_max;
- uint64_t *snapshot;
- uint32_t snapshot_count;
-};
-
struct __wt_txn_state {
WT_CACHE_LINE_PAD_BEGIN
volatile uint64_t id;
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index ae870008f8b..60b64573184 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -752,9 +752,13 @@ __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **upd
continue;
upd_visible = __wt_txn_upd_visible_type(session, upd);
if (upd_visible == WT_VISIBLE_TRUE) {
- /* Don't consider tombstone updates for the history store during rollback to stable. */
+ /*
+ * A tombstone representing a stop time pair will have either a valid txn id or a valid
+             * timestamp. Ignore such tombstones in the history store based on session settings.
+ */
if (type == WT_UPDATE_TOMBSTONE && WT_IS_HS(S2BT(session)) &&
- F_ISSET(session, WT_SESSION_IGNORE_HS_TOMBSTONE))
+ F_ISSET(session, WT_SESSION_IGNORE_HS_TOMBSTONE) &&
+ (upd->start_ts != WT_TS_NONE || upd->txnid != WT_TXN_NONE))
continue;
*updp = upd;
return (0);
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 7b1d259e410..f939a421dac 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -1627,7 +1627,7 @@ struct __wt_session {
const char *name, const char *config);
/*!
- * Truncate a file, table or cursor range.
+     * Truncate a file, table, cursor range, or backup cursor.
*
* Truncate a table or file.
* @snippet ex_all.c Truncate a table
@@ -1649,14 +1649,20 @@ struct __wt_session {
* they do commit, and if there is a crash and recovery runs, the result
* may be different than what was in cache before the crash.
*
+ * Truncate a backup cursor. This operation removes all log files that
+ * have been returned by the backup cursor. It can be used to remove log
+ * files after copying them during @ref backup_incremental.
+ * @snippet ex_backup.c Truncate a backup cursor
+ *
* @param session the session handle
- * @param name the URI of the table or file to truncate
+ * @param name the URI of the table or file to truncate, or \c "log:"
+ * for a backup cursor
* @param start optional cursor marking the first record discarded;
* if <code>NULL</code>, the truncate starts from the beginning of
- * the object
+ * the object; must be provided when truncating a backup cursor
* @param stop optional cursor marking the last record discarded;
* if <code>NULL</code>, the truncate continues to the end of the
- * object
+ * object; ignored when truncating a backup cursor
* @configempty{WT_SESSION.truncate, see dist/api_data.py}
* @errors
*/
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
index 9b3bda96e7a..31b8b740ed9 100644
--- a/src/third_party/wiredtiger/src/include/wt_internal.h
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -255,8 +255,6 @@ struct __wt_named_encryptor;
typedef struct __wt_named_encryptor WT_NAMED_ENCRYPTOR;
struct __wt_named_extractor;
typedef struct __wt_named_extractor WT_NAMED_EXTRACTOR;
-struct __wt_named_snapshot;
-typedef struct __wt_named_snapshot WT_NAMED_SNAPSHOT;
struct __wt_optrack_header;
typedef struct __wt_optrack_header WT_OPTRACK_HEADER;
struct __wt_optrack_record;
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index b7b7ff66c58..963b1998289 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -1574,6 +1574,33 @@ err:
}
/*
+ * __wt_log_compat_verify --
+ * Verify the last log when opening for the compatibility settings. This is separate because we
+ * need to do it very early in the startup process.
+ */
+int
+__wt_log_compat_verify(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ uint32_t lastlog, lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ lastlog = 0;
+
+ WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ }
+ if (lastlog != 0)
+ WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, NULL, NULL));
+err:
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ return (ret);
+}
+
+/*
* __wt_log_open --
* Open the appropriate log file for the connection. The purpose is to find the last log file
* that exists, open it and set our initial LSNs to the end of that file. If none exist, call
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c
index ba3a1803f5a..d8f695fa1c8 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_fs.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c
@@ -462,7 +462,7 @@ __posix_file_read_mmap(
(void)__wt_atomic_addv32(&pfh->mmap_usecount, 1);
/*
- * If the I/O falls inside the mapped buffer, and the buffer is not being resized, we will use
+ * If the I/O falls inside the mapped buffer, and the buffer is not being re-sized, we will use
* the mapped buffer.
*/
mmap_success = false;
@@ -653,7 +653,7 @@ __posix_file_write_mmap(
(void)__wt_atomic_addv32(&pfh->mmap_usecount, 1);
/*
- * If the I/O falls inside the mapped buffer, and the buffer is not being resized, we will use
+ * If the I/O falls inside the mapped buffer, and the buffer is not being re-sized, we will use
* the mapped buffer.
*/
mmap_success = false;
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index ed8da3394b3..793ad6bdabe 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -12,7 +12,7 @@
* __rec_update_stable --
* Return whether an update is stable or not.
*/
-static bool
+static inline bool
__rec_update_stable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd)
{
return (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
@@ -25,20 +25,24 @@ __rec_update_stable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd)
* __rec_update_save --
* Save a WT_UPDATE list for later restoration.
*/
-static int
+static inline int
__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip,
- WT_UPDATE *onpage_upd, size_t upd_memsize)
+ WT_UPDATE *onpage_upd, bool supd_restore, size_t upd_memsize)
{
WT_SAVE_UPD *supd;
+ /* If nothing is committed, we must restore the update chain. */
+ WT_ASSERT(session, onpage_upd != NULL || supd_restore);
+ /* We can only write a standard update or a modify to the data store. */
+ WT_ASSERT(session, onpage_upd == NULL || onpage_upd->type == WT_UPDATE_STANDARD ||
+ onpage_upd->type == WT_UPDATE_MODIFY);
+
WT_RET(__wt_realloc_def(session, &r->supd_allocated, r->supd_next + 1, &r->supd));
supd = &r->supd[r->supd_next];
supd->ins = ins;
supd->ripcip = ripcip;
- WT_CLEAR(supd->onpage_upd);
- if (onpage_upd != NULL &&
- (onpage_upd->type == WT_UPDATE_STANDARD || onpage_upd->type == WT_UPDATE_MODIFY))
- supd->onpage_upd = onpage_upd;
+ supd->onpage_upd = onpage_upd;
+ supd->restore = supd_restore;
++r->supd_next;
r->supd_memsize += upd_memsize;
return (0);
@@ -57,6 +61,8 @@ __rec_append_orig_value(
WT_UPDATE *append, *tombstone;
size_t size, total_size;
+ WT_ASSERT(session, upd != NULL && unpack != NULL && unpack->type != WT_CELL_DEL);
+
for (;; upd = upd->next) {
/* Done if at least one self-contained update is globally visible. */
if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd))
@@ -89,36 +95,38 @@ __rec_append_orig_value(
*/
append = tombstone = NULL; /* -Wconditional-uninitialized */
total_size = size = 0; /* -Wconditional-uninitialized */
- if (unpack == NULL || unpack->type == WT_CELL_DEL)
- WT_RET(__wt_update_alloc(session, NULL, &append, &size, WT_UPDATE_TOMBSTONE));
- else {
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
- WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
- WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD));
- append->start_ts = append->durable_ts = unpack->start_ts;
- append->txnid = unpack->start_txn;
- total_size = size;
- /*
- * We need to append a TOMBSTONE before the onpage value if the onpage value has a valid
- * stop pair.
- *
- * Imagine a case we insert and delete a value respectively at timestamp 0 and 10, and later
- * insert it again at 20. We need the TOMBSTONE to tell us there is no value between 10 and
- * 20.
- */
- if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) {
- WT_ERR(__wt_update_alloc(session, NULL, &tombstone, &size, WT_UPDATE_TOMBSTONE));
- tombstone->txnid = unpack->stop_txn;
- tombstone->start_ts = unpack->stop_ts;
- tombstone->durable_ts = unpack->stop_ts;
- tombstone->next = append;
- total_size += size;
- }
+ /*
+ * We need to append a TOMBSTONE before the onpage value if the onpage value has a valid
+ * stop pair.
+ *
+     * Imagine a case where we insert and delete a value at timestamps 0 and 10 respectively, and
+     * later insert it again at 20. We need the TOMBSTONE to tell us there is no value between 10
+     * and 20.
+ */
+ if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) {
+ /* No need to append anything if the stop time pair is globally visible. */
+ if (__wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts))
+ return (0);
+ WT_ERR(__wt_update_alloc(session, NULL, &tombstone, &size, WT_UPDATE_TOMBSTONE));
+ tombstone->txnid = unpack->stop_txn;
+ tombstone->start_ts = unpack->stop_ts;
+ tombstone->durable_ts = unpack->durable_stop_ts;
+ total_size += size;
}
- if (tombstone != NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
+ WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD));
+ append->txnid = unpack->start_txn;
+ append->start_ts = unpack->start_ts;
+ append->durable_ts = unpack->durable_start_ts;
+ total_size += size;
+
+ if (tombstone != NULL) {
+ tombstone->next = append;
append = tombstone;
+ }
/* Append the new entry into the update list. */
WT_PUBLISH(upd->next, append);
@@ -129,7 +137,8 @@ err:
__wt_scr_free(session, &tmp);
/* Free append when tombstone allocation fails */
if (ret != 0) {
- __wt_free_update_list(session, &append);
+ __wt_free(session, append);
+ __wt_free(session, tombstone);
}
return (ret);
}
@@ -138,7 +147,7 @@ err:
* __rec_need_save_upd --
* Return if we need to save the update chain
*/
-static bool
+static inline bool
__rec_need_save_upd(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates)
{
@@ -173,10 +182,10 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_DECL_RET;
WT_PAGE *page;
WT_UPDATE *first_txn_upd, *first_upd, *upd, *last_upd;
- wt_timestamp_t checkpoint_timestamp, max_ts, tombstone_durable_ts;
+ wt_timestamp_t max_ts, tombstone_durable_ts;
size_t size, upd_memsize;
uint64_t max_txn, txnid;
- bool has_newer_updates, is_hs_page, upd_saved;
+ bool has_newer_updates, is_hs_page, supd_restore, upd_saved;
/*
* The "saved updates" return value is used independently of returning an update we can write,
@@ -193,7 +202,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
page = r->page;
first_txn_upd = upd = last_upd = NULL;
upd_memsize = 0;
- checkpoint_timestamp = S2C(session)->txn_global.checkpoint_timestamp;
max_ts = WT_TS_NONE;
tombstone_durable_ts = WT_TS_NONE;
max_txn = WT_TXN_NONE;
@@ -258,13 +266,12 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
/*
* FIXME-prepare-support: A temporary solution for not storing durable timestamp in the
- * cell. Properly fix this problem in PM-1524. It is currently not OK to write prepared
- * updates with durable timestamp larger than checkpoint timestamp to data store as we don't
- * store durable timestamp in the cell. However, it is OK to write them to the history store
- * as we store the durable timestamp in the history store value.
+ * cell. Properly fix this problem in PM-1524. Currently pin all the prepared updates with
+ * durable timestamp larger than stable timestamp in cache.
*/
- if (upd->durable_ts != upd->start_ts && upd->durable_ts > checkpoint_timestamp) {
+ if (upd->durable_ts != upd->start_ts && upd->durable_ts > r->stable_ts) {
has_newer_updates = true;
+ upd_select->upd = NULL;
continue;
}
@@ -397,7 +404,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
WT_ERR(__wt_page_cell_data_ref(session, page, vpack, tmp));
WT_ERR(__wt_update_alloc(session, tmp, &upd, &size, WT_UPDATE_STANDARD));
- upd->start_ts = upd->durable_ts = vpack->start_ts;
+ upd->durable_ts = vpack->durable_start_ts;
+ upd->start_ts = vpack->start_ts;
upd->txnid = vpack->start_txn;
WT_PUBLISH(last_upd->next, upd);
/* This is going in our update list so it should be accounted for in cache usage. */
@@ -455,13 +463,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
r->leave_dirty = true;
/*
- * We should restore the update chains to the new disk image if there are newer updates in
- * eviction.
- */
- if (has_newer_updates && F_ISSET(r, WT_REC_EVICT))
- r->cache_write_restore = true;
-
- /*
* The update doesn't have any further updates that need to be written to the history store,
* skip saving the update as saving the update will cause reconciliation to think there is work
* that needs to be done when there might not be.
@@ -469,7 +470,20 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* Additionally history store reconciliation is not set skip saving an update.
*/
if (__rec_need_save_upd(session, r, upd_select, has_newer_updates)) {
- WT_ERR(__rec_update_save(session, r, ins, ripcip, upd_select->upd, upd_memsize));
+ /*
+ * We should restore the update chains to the new disk image if there are newer updates in
+         * eviction, or for cases that don't support the history store, such as in-memory databases
+         * and fixed-length column stores.
+ */
+ supd_restore = F_ISSET(r, WT_REC_EVICT) &&
+ (has_newer_updates || F_ISSET(S2C(session), WT_CONN_IN_MEMORY) ||
+ page->type == WT_PAGE_COL_FIX);
+ if (supd_restore)
+ r->cache_write_restore = true;
+ WT_ERR(__rec_update_save(session, r, ins, ripcip,
+ upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE ? NULL :
+ upd_select->upd,
+ supd_restore, upd_memsize));
upd_saved = true;
}
@@ -484,9 +498,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v
* time there are saved updates and during reconciliation of a backing overflow record that will
* be physically removed once it's no longer needed.
*/
- if (upd_select->upd != NULL &&
- (upd_saved || (vpack != NULL && F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW) &&
- vpack->raw != WT_CELL_VALUE_OVFL_RM)))
+ if (vpack != NULL && vpack->type != WT_CELL_DEL && upd_select->upd != NULL && upd_saved)
WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack));
err:
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index fc17a1ab6ab..572c7374ca3 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -455,6 +455,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
WT_PAGE *page;
WT_RECONCILE *r;
WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t checkpoint_ts;
uint64_t ckpt_txn;
btree = S2BT(session);
@@ -648,6 +649,15 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO
r->update_modify_cbt.ref = ref;
r->update_modify_cbt.iface.value_format = btree->value_format;
+ /*
+ * FIXME: cache the stable timestamp used to check if the durable timestamps in prepared updates
+ * can be discarded (until PM-1524 completes and durable timestamps are stored in data pages).
+ */
+ WT_ORDERED_READ(r->stable_ts, S2C(session)->txn_global.stable_timestamp);
+ if ((checkpoint_ts = S2C(session)->txn_global.checkpoint_timestamp) != WT_TS_NONE &&
+ checkpoint_ts < r->stable_ts)
+ r->stable_ts = checkpoint_ts;
+
/*
* If we allocated the reconciliation structure and there was an error, clean up. If our caller
* passed in a structure, they own it.
@@ -1440,10 +1450,16 @@ __rec_supd_move(WT_SESSION_IMPL *session, WT_MULTI *multi, WT_SAVE_UPD *supd, ui
{
uint32_t i;
+ multi->supd_restore = false;
+
WT_RET(__wt_calloc_def(session, n, &multi->supd));
- for (i = 0; i < n; ++i)
+ for (i = 0; i < n; ++i) {
+ if (supd->restore)
+ multi->supd_restore = true;
multi->supd[i] = *supd++;
+ }
+
multi->supd_entries = n;
return (0);
}
@@ -1573,6 +1589,7 @@ __rec_split_write_header(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK
}
dsk->unused = 0;
+ dsk->version = WT_PAGE_VERSION_TS;
/* Clear the memory owned by the block manager. */
memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
@@ -1789,6 +1806,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk
}
multi->size = WT_STORE_SIZE(chunk->image.size);
multi->checksum = 0;
+ multi->supd_restore = false;
/* Set the key. */
if (btree->type == BTREE_ROW)
@@ -1843,13 +1861,10 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk
return (__wt_set_return(session, EBUSY));
/* If we need to restore the page to memory, copy the disk image. */
- if (r->cache_write_restore) {
- multi->supd_restore = true;
+ if (multi->supd_restore)
goto copy_image;
- }
- if (chunk->entries == 0)
- return (0);
+ WT_ASSERT(session, chunk->entries > 0);
}
/*
@@ -1892,7 +1907,7 @@ copy_image:
* If re-instantiating this page in memory (either because eviction wants to, or because we
* skipped updates to build the disk image), save a copy of the disk image.
*/
- if (F_ISSET(r, WT_REC_SCRUB) || (r->cache_write_restore && multi->supd != NULL))
+ if (F_ISSET(r, WT_REC_SCRUB) || multi->supd_restore)
WT_RET(__wt_memdup(session, chunk->image.data, chunk->image.size, &multi->disk_image));
return (0);
@@ -2190,7 +2205,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* eviction has decided to retain the page in memory because the latter can't handle
* update lists and splits can.
*/
- if (F_ISSET(r, WT_REC_IN_MEMORY) || r->cache_write_restore) {
+ if (F_ISSET(r, WT_REC_IN_MEMORY) || r->multi->supd_restore) {
WT_ASSERT(session, F_ISSET(r, WT_REC_IN_MEMORY) ||
(F_ISSET(r, WT_REC_EVICT) && r->leave_dirty && r->multi->supd_entries != 0));
goto split;
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index cdf5f93a9e9..7d93bf4bb10 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -22,12 +22,8 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd,
*stable_update_found = false;
for (upd = first_upd; upd != NULL; upd = upd->next) {
- /*
- * Updates with no timestamp will have a timestamp of zero and will never be rolled back. If
- * the table is configured for strict timestamp checking, assert that all more recent
- * updates were also rolled back.
- */
- if (upd->txnid == WT_TXN_ABORTED || upd->start_ts == WT_TS_NONE) {
+ /* Skip the updates that are aborted. */
+ if (upd->txnid == WT_TXN_ABORTED) {
if (upd == first_upd)
first_upd = upd->next;
} else if (rollback_timestamp < upd->durable_ts) {
@@ -51,6 +47,10 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd,
upd->txnid = WT_TXN_ABORTED;
WT_STAT_CONN_INCR(session, txn_rts_upd_aborted);
upd->durable_ts = upd->start_ts = WT_TS_NONE;
+ } else {
+ /* Valid update is found. */
+ WT_ASSERT(session, first_upd == upd);
+ break;
}
}
@@ -354,7 +354,16 @@ __rollback_abort_row_ondisk_kv(
__wt_timestamp_to_string(vpack->durable_start_ts, ts_string[0]),
__wt_timestamp_to_string(vpack->start_ts, ts_string[1]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
- return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true));
+ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
+ return (__rollback_row_ondisk_fixup_key(session, page, rip, rollback_timestamp, true));
+ else {
+ /*
+             * In-memory databases don't have a history store to provide a stable update, so remove
+ * the key.
+ */
+ WT_RET(__wt_upd_alloc_tombstone(session, &upd));
+ WT_STAT_CONN_INCR(session, txn_rts_keys_removed);
+ }
} else if (vpack->durable_stop_ts != WT_TS_NONE &&
vpack->durable_stop_ts > rollback_timestamp) {
/*
@@ -464,7 +473,8 @@ __rollback_abort_row_reconciled_page_internal(WT_SESSION_IMPL *session, const vo
image_local = tmp.data;
}
- page_flags = WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED;
+ /* Don't free the passed image later. */
+ page_flags = image != NULL ? 0 : WT_PAGE_DISK_ALLOC;
WT_ERR(__wt_page_inmem(session, NULL, image_local, page_flags, &mod_page));
tmp.mem = NULL;
WT_ROW_FOREACH (mod_page, rip, i)
@@ -576,10 +586,9 @@ __rollback_abort_newer_row_leaf(
__rollback_abort_newer_insert(session, insert, rollback_timestamp);
/*
- * If the configuration is not in-memory and no stable update found in the update list,
- * abort any on-disk value.
+     * If no stable update is found in the update list, abort any on-disk value.
*/
- if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !stable_update_found)
+ if (!stable_update_found)
WT_RET(__rollback_abort_row_ondisk_kv(session, page, rip, rollback_timestamp));
}
@@ -1110,9 +1119,11 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp
/*
* Don't use the connection's default session: we are working on data handles and (a) don't want
* to cache all of them forever, plus (b) can't guarantee that no other method will be called
- * concurrently.
+     * concurrently. Copy the parent session's no-logging option to the internal session to make
+     * sure that rollback to stable doesn't generate log records.
*/
- WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, &session));
+ WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true,
+ F_MASK(session, WT_SESSION_NO_LOGGING), &session));
F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE_FLAGS);
ret = __rollback_to_stable(session, cfg);
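For readers following the rollback-to-stable change above, here is a minimal standalone sketch of the revised update-chain scan: aborted updates are skipped, updates whose durable timestamp is newer than the rollback timestamp are aborted, and the scan stops at the first stable update. The types and constants below (UPDATE, TXN_ABORTED, TS_NONE) are simplified stand-ins, not WiredTiger's; the real __rollback_abort_newer_update also maintains first_upd, statistics and assertions.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TXN_ABORTED UINT64_MAX /* Stand-in for WT_TXN_ABORTED */
#define TS_NONE 0              /* Stand-in for WT_TS_NONE */

typedef struct update {
    uint64_t txnid;
    uint64_t start_ts;
    uint64_t durable_ts;
    struct update *next;
} UPDATE;

/* Abort updates newer than the rollback timestamp; stop at the first stable one. */
static bool
abort_newer_updates(UPDATE *first_upd, uint64_t rollback_timestamp)
{
    UPDATE *upd;
    bool stable_update_found;

    stable_update_found = false;
    for (upd = first_upd; upd != NULL; upd = upd->next) {
        if (upd->txnid == TXN_ABORTED)
            continue; /* Skip updates that are already aborted. */
        if (rollback_timestamp < upd->durable_ts) {
            /* Roll back: mark the update aborted and clear its timestamps. */
            upd->txnid = TXN_ABORTED;
            upd->durable_ts = upd->start_ts = TS_NONE;
        } else {
            /* The first non-aborted, stable update ends the scan. */
            stable_update_found = true;
            break;
        }
    }
    return (stable_update_found);
}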
diff --git a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
index 3e25ee1ebdf..ff59c66fbb7 100644
--- a/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
+++ b/src/third_party/wiredtiger/test/csuite/wt2853_perf/main.c
@@ -44,7 +44,8 @@ static void *thread_insert(void *);
static void *thread_get(void *);
#define BLOOM false
-#define MAX_GAP 7
+#define GAP_DISPLAY 5 /* Threshold for seconds of gap to be displayed */
+#define GAP_ERROR 25 /* Threshold for seconds of gap to be treated as error */
#define N_RECORDS 10000
#define N_INSERT 1000000
#define N_INSERT_THREAD 1
@@ -52,6 +53,10 @@ static void *thread_get(void *);
#define S64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789::"
#define S1024 (S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64 S64)
+#if GAP_ERROR < GAP_DISPLAY
+#error "GAP_ERROR must be >= GAP_DISPLAY"
+#endif
+
typedef struct {
char posturi[256];
char baluri[256];
@@ -187,7 +192,7 @@ main(int argc, char *argv[])
" took more than %d seconds.\n"
"This may indicate a real problem or a"
" particularly slow machine.\n",
- nfail, MAX_GAP);
+ nfail, GAP_ERROR);
testutil_assert(nfail == 0);
testutil_progress(opts, "cleanup starting");
testutil_cleanup(opts);
@@ -245,14 +250,16 @@ thread_insert(void *arg)
else
fprintf(stderr, ".");
__wt_seconds((WT_SESSION_IMPL *)session, &curtime);
- if ((elapsed = curtime - prevtime) > MAX_GAP) {
+ elapsed = curtime - prevtime;
+ if (elapsed > GAP_DISPLAY) {
testutil_progress(opts, "insert time gap");
fprintf(stderr,
"\n"
"GAP: %" PRIu64 " secs after %d inserts\n",
elapsed, i);
- threadargs->nfail++;
}
+ if (elapsed > GAP_ERROR)
+ threadargs->nfail++;
prevtime = curtime;
}
}
@@ -310,14 +317,16 @@ thread_get(void *arg)
testutil_check(session->rollback_transaction(session, NULL));
__wt_seconds((WT_SESSION_IMPL *)session, &curtime);
- if ((elapsed = curtime - prevtime) > MAX_GAP) {
+ elapsed = curtime - prevtime;
+ if (elapsed > GAP_DISPLAY) {
testutil_progress(opts, "get time gap");
fprintf(stderr,
"\n"
"GAP: %" PRIu64 " secs after %d gets\n",
elapsed, threadargs->njoins);
- threadargs->nfail++;
}
+ if (elapsed > GAP_ERROR)
+ threadargs->nfail++;
prevtime = curtime;
}
testutil_progress(opts, "get end");
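A compact sketch of the two-threshold scheme the test now uses, with the same GAP_DISPLAY/GAP_ERROR values as above: gaps over GAP_DISPLAY are only reported, and only gaps over GAP_ERROR count toward nfail. This is illustrative, not the test's code.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define GAP_DISPLAY 5 /* Seconds of gap worth reporting */
#define GAP_ERROR 25  /* Seconds of gap counted as a failure */

/* Report gaps above GAP_DISPLAY; bump the failure count only above GAP_ERROR. */
static int
check_gap(uint64_t prevtime, uint64_t curtime, int *nfailp)
{
    uint64_t elapsed;

    elapsed = curtime - prevtime;
    if (elapsed > GAP_DISPLAY)
        fprintf(stderr, "\nGAP: %" PRIu64 " secs\n", elapsed);
    if (elapsed > GAP_ERROR) {
        ++*nfailp;
        return (1);
    }
    return (0);
}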
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index f95d567105c..16b250f4a36 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -458,7 +458,7 @@ tasks:
posix_configure_flags: --enable-silent-rules --enable-strict --enable-diagnostic --disable-static
- func: "make check all"
vars:
- test_env_vars: ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer
+ test_env_vars: ASAN_OPTIONS=detect_leaks=1:abort_on_error=1:disable_coredump=0 ASAN_SYMBOLIZER_PATH=/opt/mongodbtoolchain/v3/bin/llvm-symbolizer TESTUTIL_SLOW_MACHINE=1
- name: make-check-linux-no-ftruncate-test
depends_on:
diff --git a/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh
index 147ad92a3ae..277c6e10610 100755
--- a/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh
+++ b/src/third_party/wiredtiger/test/evergreen/compatibility_test_for_mongodb_releases.sh
@@ -30,6 +30,7 @@ build_release()
# run_format:
# arg1: release
# arg2: access methods list
+# arg3: -B for compatibility testing
#############################################################
run_format()
{
@@ -46,6 +47,7 @@ run_format()
args+="data_source=table "
args+="in_memory=0 " # Interested in the on-disk format
args+="leak_memory=1 " # Faster runs
+ args+="logging=1 " # Test log compatibility
args+="logging_compression=snappy " # We only built with snappy, force the choice
args+="rebalance=0 " # Faster runs
args+="rows=1000000 "
@@ -56,7 +58,13 @@ run_format()
for am in $2; do
dir="RUNDIR.$am"
echo "./t running $am access method..."
- ./t -1q -h $dir "file_type=$am" $args
+ ./t -1q $3 -h $dir "file_type=$am" $args
+
+ # Remove the version string from the base configuration file. (MongoDB does not create
+ # a base configuration file, but format does, so we need to remove its version string
+ # to allow backward compatibility testing.)
+ (echo '/^version=/d'
+ echo w) | ed -s $dir/WiredTiger.basecfg > /dev/null
done
}
@@ -67,12 +75,12 @@ EXT+="ext/encryptors/rotn/.libs/libwiredtiger_rotn.so, "
EXT+="]"
#############################################################
-# verify_backward:
+# verify_release:
# arg1: release #1
# arg2: release #2
# arg3: access methods list
#############################################################
-verify_backward()
+verify_release()
{
echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
echo "Release \"$1\" verifying \"$2\""
@@ -106,14 +114,17 @@ cd "$top"
(run_format mongodb-4.0 "fix row var")
(run_format mongodb-4.2 "fix row var")
#(run_format mongodb-4.4 "row")
-(run_format "develop" "row")
+(run_format "develop" "row" "-B")
# Verify backward compatibility for supported access methods.
-(verify_backward mongodb-3.6 mongodb-3.4 "fix row var")
-(verify_backward mongodb-4.0 mongodb-3.6 "fix row var")
-(verify_backward mongodb-4.2 mongodb-4.0 "fix row var")
-#(verify_backward mongodb-4.4 mongodb-4.2 "fix row var")
-#(verify_backward develop mongodb-4.4 "row")
- (verify_backward develop mongodb-4.2 "fix row var")
+(verify_release mongodb-3.6 mongodb-3.4 "fix row var")
+(verify_release mongodb-4.0 mongodb-3.6 "fix row var")
+(verify_release mongodb-4.2 mongodb-4.0 "fix row var")
+(verify_release mongodb-4.4 mongodb-4.2 "fix row var")
+(verify_release develop mongodb-4.4 "row")
+(verify_release develop mongodb-4.2 "fix row var")
+
+# Verify forward compatibility for supported access methods.
+(verify_release mongodb-4.2 develop "row")
exit 0
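The ed one-liner above deletes the version= line from WiredTiger.basecfg in place so older releases can open the run directory. Purely to illustrate that filtering step (the function name and file handling below are made up for this sketch, not part of the test script), an equivalent in C might look like:

#include <stdio.h>
#include <string.h>

/* Copy src_path to dst_path, dropping any line that starts with "version=". */
static int
strip_version_line(const char *src_path, const char *dst_path)
{
    char line[1024];
    FILE *in, *out;

    if ((in = fopen(src_path, "r")) == NULL)
        return (1);
    if ((out = fopen(dst_path, "w")) == NULL) {
        (void)fclose(in);
        return (1);
    }
    while (fgets(line, (int)sizeof(line), in) != NULL)
        if (strncmp(line, "version=", strlen("version=")) != 0)
            (void)fputs(line, out);
    (void)fclose(in);
    (void)fclose(out);
    return (0);
}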
diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c
index f0123418888..9dbdb0c8e4d 100644
--- a/src/third_party/wiredtiger/test/format/backup.c
+++ b/src/third_party/wiredtiger/test/format/backup.c
@@ -344,7 +344,7 @@ backup(void *arg)
* Perform a full backup at somewhere under 10 seconds (that way there's at least one), then at
* larger intervals, optionally do incremental backups between full backups.
*/
- full = incr_full = true;
+ incr_full = true;
incremental = 0;
active_files_init(&active[0]);
active_files_init(&active[1]);
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index d6ce62da6a2..0c52292a060 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -30,6 +30,7 @@
#include "config.h"
static void config_backup(void);
+static void config_backward_compatible(void);
static void config_cache(void);
static void config_checkpoint(void);
static void config_checksum(void);
@@ -190,11 +191,13 @@ config_setup(void)
config_pct();
config_cache();
- /* Give in-memory and LSM configurations a final review. */
+    /* Give in-memory, LSM, and backward-compatible configurations a final review. */
if (g.c_in_memory != 0)
config_in_memory_reset();
if (DATASOURCE("lsm"))
config_lsm_reset();
+ if (g.backward_compatible != 0)
+ config_backward_compatible();
/*
* Key/value minimum/maximum are related, correct unless specified by the configuration.
@@ -275,6 +278,22 @@ config_backup(void)
config_single(cstr, false);
}
}
+
+/*
+ * config_backward_compatible --
+ * Backward compatibility configuration.
+ */
+static void
+config_backward_compatible(void)
+{
+ if (!g.backward_compatible)
+ return;
+
+ if (config_is_perm("disk.mmap_all"))
+ testutil_die(EINVAL, "-B option incompatible with mmap_all configuration");
+ config_single("disk.mmap_all=off", false);
+}
+
/*
* config_cache --
* Cache configuration.
@@ -747,67 +766,94 @@ config_pct(void)
static void
config_transaction(void)
{
- bool prepare_requires_ts;
+ /*
+ * WiredTiger cannot support relaxed isolation levels. Turn off everything but timestamps with
+ * snapshot isolation.
+ */
+ if ((!g.c_txn_timestamps && config_is_perm("transaction.timestamps")) ||
+ (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")))
+ testutil_die(EINVAL, "format limited to timestamp and snapshot-isolation testing");
+ if (!g.c_txn_timestamps)
+ config_single("transaction.timestamps=on", false);
+ if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
+ config_single("transaction.isolation=snapshot", false);
/*
- * We can't prepare a transaction if logging is configured or timestamps aren't configured.
- * Further, for repeatable reads to work in timestamp testing, all updates must be within a
- * snapshot-isolation transaction. Check for incompatible configurations, then let prepare and
- * timestamp drive the remaining configuration.
+ * Check the permanent configuration. We can't prepare a transaction if logging is configured or
+ * timestamps aren't configured. For repeatable reads to work in timestamp testing, all updates
+ * must be done in a snapshot isolation transaction.
*/
- prepare_requires_ts = false;
- if (g.c_prepare) {
- if (config_is_perm("ops.prepare")) {
- if (g.c_logging && config_is_perm("logging"))
- testutil_die(EINVAL, "prepare is incompatible with logging");
- if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps"))
- testutil_die(EINVAL, "prepare requires transaction timestamps");
- } else if ((g.c_logging && config_is_perm("logging")) ||
- (!g.c_txn_timestamps && config_is_perm("transaction.timestamps")))
- config_single("ops.prepare=off", false);
- if (g.c_prepare) {
- prepare_requires_ts = true;
- if (g.c_logging)
- config_single("logging=off", false);
- if (!g.c_txn_timestamps)
- config_single("transaction.timestamps=on", false);
- }
+ if (g.c_prepare && config_is_perm("ops.prepare")) {
+ if (g.c_logging && config_is_perm("logging"))
+ testutil_die(EINVAL, "prepare is incompatible with logging");
+ if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps"))
+ testutil_die(EINVAL, "prepare requires transaction timestamps");
+ if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
+ testutil_die(EINVAL, "prepare requires snapshot isolation");
+ if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
+ testutil_die(EINVAL, "prepare requires transaction frequency set to 100");
+ }
+ if (g.c_txn_timestamps && config_is_perm("transaction.timestamps")) {
+ if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
+ testutil_die(EINVAL, "timestamps require snapshot isolation");
+ if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
+ testutil_die(EINVAL, "timestamps require transaction frequency set to 100");
+ }
+ if (g.c_isolation_flag == ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) {
+ if (!g.c_txn_timestamps && config_is_perm("transaction.timestamps"))
+ testutil_die(EINVAL, "snapshot isolation requires timestamps");
+ if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
+ testutil_die(EINVAL, "snapshot isolation requires transaction frequency set to 100");
}
- if (g.c_txn_timestamps) {
- if (prepare_requires_ts || config_is_perm("transaction.timestamps")) {
- if (g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
- testutil_die(
- EINVAL, "transaction_timestamps or prepare require isolation=snapshot");
- if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
- testutil_die(
- EINVAL, "transaction_timestamps or prepare require transaction-frequency=100");
- } else if ((g.c_isolation_flag != ISOLATION_SNAPSHOT &&
- config_is_perm("transaction.isolation")) ||
- (g.c_txn_freq != 100 && config_is_perm("transaction.frequency")))
- config_single("transaction.timestamps=off", false);
+ /*
+     * The permanent configuration has no incompatible settings; adjust the temporary configuration
+     * as necessary. Prepare overrides timestamps, which in turn override isolation, for no reason
+     * other than prepare is the least-often configured and timestamps are the option we want to test the most.
+ */
+ if (g.c_prepare) {
+ if (g.c_logging)
+ config_single("logging=off", false);
+ if (!g.c_txn_timestamps)
+ config_single("transaction.timestamps=on", false);
+ if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
+ config_single("transaction.isolation=snapshot", false);
+ if (g.c_txn_freq != 100)
+ config_single("transaction.frequency=100", false);
}
if (g.c_txn_timestamps) {
if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
config_single("transaction.isolation=snapshot", false);
if (g.c_txn_freq != 100)
config_single("transaction.frequency=100", false);
- } else if (!config_is_perm("transaction.isolation"))
- switch (mmrand(NULL, 1, 4)) {
- case 1:
+ }
+ if (g.c_isolation_flag == ISOLATION_NOT_SET) {
+ switch (mmrand(NULL, 1, 20)) {
+ case 1: /* 5% */
config_single("transaction.isolation=random", false);
break;
- case 2:
+ case 2: /* 5% */
config_single("transaction.isolation=read-uncommitted", false);
break;
- case 3:
+ case 3: /* 5% */
config_single("transaction.isolation=read-committed", false);
break;
- case 4:
- default:
+ default: /* 85% */
config_single("transaction.isolation=snapshot", false);
break;
}
+ if (g.c_isolation_flag == ISOLATION_SNAPSHOT) {
+ if (!g.c_txn_timestamps)
+ config_single("transaction.timestamps=on", false);
+ if (g.c_txn_freq != 100)
+ config_single("transaction.frequency=100", false);
+ } else {
+ if (g.c_prepare)
+ config_single("ops.prepare=off", false);
+ if (g.c_txn_timestamps)
+ config_single("transaction.timestamps=off", false);
+ }
+ }
}
/*
@@ -946,7 +992,7 @@ config_reset(void)
CONFIG *cp;
if (!config_is_perm("transaction.isolation"))
- g.c_isolation_flag = 0;
+ g.c_isolation_flag = ISOLATION_NOT_SET;
/* Clear temporary allocated configuration data. */
for (cp = c; cp->name != NULL; ++cp) {
@@ -1013,6 +1059,9 @@ config_single(const char *s, bool perm)
u_int i;
const char *equalp, *vp1, *vp2;
+ while (__wt_isspace((u_char)*s))
+ ++s;
+
config_compat(&s);
if ((equalp = strchr(s, '=')) == NULL)
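To make the new isolation weighting in config_transaction concrete: a uniform draw from 1-20 gives each of the three non-snapshot choices a 5% chance and snapshot isolation the remaining 85%. A small self-contained sketch of that mapping (choose_isolation is a made-up name for illustration, not format's code):

#include <stdio.h>

/* Map a uniform draw in 1..20 to an isolation mode: 5%/5%/5%/85%. */
static const char *
choose_isolation(unsigned draw)
{
    switch (draw) {
    case 1:
        return ("random"); /* 1/20 = 5% */
    case 2:
        return ("read-uncommitted"); /* 5% */
    case 3:
        return ("read-committed"); /* 5% */
    default:
        return ("snapshot"); /* 17/20 = 85% */
    }
}

int
main(void)
{
    unsigned i;

    /* Print the mode selected by each possible draw. */
    for (i = 1; i <= 20; ++i)
        printf("%2u -> %s\n", i, choose_isolation(i));
    return (0);
}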
diff --git a/src/third_party/wiredtiger/test/format/config_compat.c b/src/third_party/wiredtiger/test/format/config_compat.c
index d3dbc14eb7d..0a5fe4424f7 100644
--- a/src/third_party/wiredtiger/test/format/config_compat.c
+++ b/src/third_party/wiredtiger/test/format/config_compat.c
@@ -50,7 +50,7 @@ static const char *list[] = {
"logging.compression", "logging_file_max=", "logging.file_max", "logging_prealloc=",
"logging.prealloc", "lsm_worker_threads=", "lsm.worker_threads", "major_timeout=",
"format.major_timeout", "memory_page_max=", "btree.memory_page_max", "merge_max=",
- "lsm.merge_max", "mmap=", "disk.mmap", "mmap_all=", "runs.mmap_all", "modify_pct=",
+ "lsm.merge_max", "mmap=", "disk.mmap", "mmap_all=", "disk.mmap_all", "modify_pct=",
"ops.pct.modify", "ops=", "runs.ops", "prefix_compression=", "btree.prefix_compression",
"prefix_compression_min=", "btree.prefix_compression_min", "prepare=", "ops.prepare",
"random_cursor=", "ops.random_cursor", "read_pct=", "ops.pct.read", "rebalance=", "ops.rebalance",
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 54414e744cd..7b7b9d0605e 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -86,8 +86,9 @@ typedef struct {
bool logging; /* log operations */
FILE *logfp; /* log file */
- bool replay; /* Replaying a run. */
- bool workers_finished; /* Operations completed */
+ bool backward_compatible; /* Backward compatibility testing */
+ bool replay; /* Replaying a run. */
+ bool workers_finished; /* Operations completed */
pthread_rwlock_t backup_lock; /* Backup running */
uint32_t backup_id; /* Block incremental id */
@@ -238,6 +239,7 @@ typedef struct {
#define ENCRYPT_ROTN_7 2
u_int c_encryption_flag; /* Encryption flag value */
+#define ISOLATION_NOT_SET 0
#define ISOLATION_RANDOM 1
#define ISOLATION_READ_UNCOMMITTED 2
#define ISOLATION_READ_COMMITTED 3
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 67e2821f52d..098c33be2aa 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -152,11 +152,14 @@ main(int argc, char *argv[])
/* Set values from the command line. */
home = NULL;
one_flag = quiet_flag = false;
- while ((ch = __wt_getopt(progname, argc, argv, "1C:c:h:lqrt:")) != EOF)
+ while ((ch = __wt_getopt(progname, argc, argv, "1BC:c:h:lqrt:")) != EOF)
switch (ch) {
case '1': /* One run */
one_flag = true;
break;
+ case 'B': /* Backward compatibility */
+ g.backward_compatible = true;
+ break;
case 'C': /* wiredtiger_open config */
g.config_open = __wt_optarg;
break;
@@ -394,11 +397,12 @@ static void
usage(void)
{
fprintf(stderr,
- "usage: %s [-1lqr] [-C wiredtiger-config]\n "
+ "usage: %s [-1Blqr] [-C wiredtiger-config]\n "
"[-c config-file] [-h home] [name=value ...]\n",
progname);
fprintf(stderr, "%s",
"\t-1 run once\n"
+ "\t-B create backward compatible configurations\n"
"\t-C specify wiredtiger_open configuration arguments\n"
"\t-c read test program configuration from a file\n"
"\t-h home (default 'RUNDIR')\n"
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index a01210b881a..e5b9fda4082 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -198,7 +198,10 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp)
CONFIG_APPEND(p, ",buffer_alignment=512");
#endif
- CONFIG_APPEND(p, ",mmap=%d,mmap_all=%d", g.c_mmap ? 1 : 0, g.c_mmap_all ? 1 : 0);
+ if (g.c_mmap)
+ CONFIG_APPEND(p, ",mmap=1");
+ if (g.c_mmap_all)
+ CONFIG_APPEND(p, ",mmap_all=1");
if (g.c_direct_io)
CONFIG_APPEND(p, ",direct_io=(data)");
@@ -448,13 +451,14 @@ void
wts_close(void)
{
WT_CONNECTION *conn;
- const char *config;
conn = g.wts_conn;
- config = g.c_leak_memory ? "leak_memory" : NULL;
+ if (g.backward_compatible)
+ testutil_check(conn->reconfigure(conn, "compatibility=(release=3.3)"));
+
+ testutil_check(conn->close(conn, g.c_leak_memory ? "leak_memory" : NULL));
- testutil_check(conn->close(conn, config));
g.wts_conn = NULL;
g.wt_api = NULL;
}
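The wts_close() change above downgrades the connection before closing it when running with -B, so the files left behind can be opened by an older release. A minimal sketch of that close path using the public WiredTiger API (error handling simplified; the 3.3 release string is the one the change uses):

#include <stdbool.h>
#include <stddef.h>
#include <wiredtiger.h>

static int
close_backward_compatible(WT_CONNECTION *conn, bool leak_memory)
{
    int ret;

    /* Ask WiredTiger to write objects an older release can read. */
    if ((ret = conn->reconfigure(conn, "compatibility=(release=3.3)")) != 0)
        return (ret);
    /* Optionally skip memory cleanup for faster test runs. */
    return (conn->close(conn, leak_memory ? "leak_memory" : NULL));
}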
diff --git a/src/third_party/wiredtiger/test/suite/test_bug023.py b/src/third_party/wiredtiger/test/suite/test_bug023.py
new file mode 100755
index 00000000000..7b0f55c3197
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_bug023.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from helper import copy_wiredtiger_home
+from suite_subprocess import suite_subprocess
+import os
+import shutil
+import wiredtiger, wttest
+
+# test_bug023.py
+# JIRA WT-5930: starting up a backup database with an error in wiredtiger_open
+# leaves the database in an incorrect state, so that a subsequent wiredtiger_open
+# without an error loses data.
+class test_bug023(wttest.WiredTigerTestCase, suite_subprocess):
+ '''Test backup, compatibility levels and an error opening the backup'''
+
+ conn_config = 'config_base=false,log=(enabled),compatibility=(release=3.2.0)'
+ conn_config_32_min = 'config_base=false,log=(enabled),compatibility=(require_min=3.2.0)'
+ conn_config_33_err = 'config_base=false,log=(enabled),compatibility=(require_min=3.3.0)'
+ dir='backup.dir'
+ nentries = 10
+ uri = 'file:bug023.wt'
+
+ def take_full_backup(self, dir):
+ # Open up the backup cursor, and copy the files. Do a full backup.
+ cursor = self.session.open_cursor('backup:', None, None)
+ self.pr('Full backup to ' + dir + ': ')
+ os.mkdir(dir)
+ while True:
+ ret = cursor.next()
+ if ret != 0:
+ break
+ bkup_file = cursor.get_key()
+ sz = os.path.getsize(bkup_file)
+ self.pr('Copy from: ' + bkup_file + ' (' + str(sz) + ') to ' + dir)
+ shutil.copy(bkup_file, dir)
+ self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
+ cursor.close()
+
+ def test_bug023(self):
+ '''Test backup and compatibility levels and an error opening the backup'''
+ self.session.create(self.uri, 'key_format=i,value_format=i')
+ c = self.session.open_cursor(self.uri)
+
+ self.session.begin_transaction()
+ for i in range(self.nentries):
+ c[i] = i
+ self.session.commit_transaction()
+ self.session.checkpoint()
+
+        # Add more entries after the checkpoint. They should be recovered.
+ self.session.begin_transaction()
+ for i in range(self.nentries):
+ c[i + self.nentries] = i
+ self.session.commit_transaction()
+ c.close()
+ orig_data = list(self.session.open_cursor(self.uri))
+
+ # Take a full backup.
+ self.take_full_backup(self.dir)
+ self.close_conn()
+
+ msg = '/Version incompatibility detected:/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.wiredtiger_open(self.dir, self.conn_config_33_err), msg)
+
+ self.pr('try opening error directory with correct config')
+        # After getting the error, we should be able to open the backup directory with the
+        # correct compatibility setting and still see our data.
+ self.conn = self.wiredtiger_open(self.dir, self.conn_config_32_min)
+ session = self.conn.open_session()
+ bkup_data = list(session.open_cursor(self.uri))
+
+ self.assertEqual(orig_data, bkup_data)
+
+if __name__ == '__main__':
+ wttest.run()
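test_bug023's take_full_backup() walks a backup cursor and copies each file it returns. For comparison, the same pattern in the C API follows the standard WiredTiger backup-cursor loop (copy_file() here is an assumed helper, not a WiredTiger function):

#include <wiredtiger.h>

int copy_file(const char *name, const char *dir); /* assumed helper, not shown */

static int
take_full_backup(WT_SESSION *session, const char *dir)
{
    WT_CURSOR *cursor;
    const char *filename;
    int ret;

    /* Open the backup cursor; it returns the names of files to copy. */
    if ((ret = session->open_cursor(session, "backup:", NULL, NULL, &cursor)) != 0)
        return (ret);
    while ((ret = cursor->next(cursor)) == 0) {
        if ((ret = cursor->get_key(cursor, &filename)) != 0)
            break;
        if ((ret = copy_file(filename, dir)) != 0)
            break;
    }
    if (ret == WT_NOTFOUND) /* Normal end of the cursor walk. */
        ret = 0;
    (void)cursor->close(cursor);
    return (ret);
}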
diff --git a/src/third_party/wiredtiger/test/suite/test_hs06.py b/src/third_party/wiredtiger/test/suite/test_hs06.py
index edef6368e9d..042d9c731cb 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs06.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs06.py
@@ -401,7 +401,7 @@ class test_hs06(wttest.WiredTigerTestCase):
self.assertEqual(cursor[self.create_key(i)], expected)
self.session.rollback_transaction()
- def test_hs_modify_birthmark_is_base_update(self):
+ def test_hs_modify_stable_is_base_update(self):
# Create a small table.
uri = "table:test_hs06"
create_params = 'key_format={},value_format=S'.format(self.key_format)
@@ -415,8 +415,8 @@ class test_hs06(wttest.WiredTigerTestCase):
'oldest_timestamp=' + timestamp_str(1) + ',stable_timestamp=' + timestamp_str(1))
# The base update is at timestamp 1.
- # When we history store evict these pages, the base update will be used as the birthmark since
- # it's the only thing behind the stable timestamp.
+        # When these pages are evicted to the history store, the base update is the only
+        # thing behind the stable timestamp.
cursor = self.session.open_cursor(uri)
for i in range(1, 10000):
self.session.begin_transaction()
@@ -457,8 +457,7 @@ class test_hs06(wttest.WiredTigerTestCase):
expected[300] = 'D'
expected = str().join(expected)
- # Go back and read. We should get the initial value with the 3 modifies applied on top.
- # Ensure that we're aware that the birthmark update could be the base update.
+ # Go back and read.
self.session.begin_transaction('read_timestamp=' + timestamp_str(4))
for i in range(1, 11):
self.assertEqual(cursor[self.create_key(i)], expected)
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable06.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable06.py
index fd2d2871522..6b02b0a0894 100755
--- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable06.py
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable06.py
@@ -118,12 +118,11 @@ class test_rollback_to_stable06(test_rollback_to_stable_base):
self.assertEqual(calls, 1)
self.assertEqual(keys_restored, 0)
self.assertGreater(pages_visited, 0)
+ self.assertGreaterEqual(keys_removed, 0)
if self.in_memory or self.prepare:
- self.assertEqual(keys_removed, 0)
self.assertEqual(upd_aborted, nrows * 4)
self.assertEqual(hs_removed, 0)
else:
- self.assertGreaterEqual(keys_removed, 0)
self.assertGreaterEqual(upd_aborted, 0)
self.assertGreaterEqual(hs_removed, nrows * 3)